In [1]:
!pip install pyspark



In [2]:
import os, sys
from pyspark.sql import SparkSession

In [3]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [4]:
data = [
    ("John", "IT", 45000),
    ("Max", "IT", 50000),
    ("Shawn", "HR", 35000),
    ("Nick", "HR", 25000),
    ("Tom", "IT", 75000),
    ("Matt", "IT", 85000),
    ("Jeff", "HR", 105000),
    ("Chris", "IT", 40000),
    ("Tom", "IT", 45000),
]

In [5]:
# local[x] - running in standalone mode, and here x is the CPU cores I want to utilize
spark = SparkSession.builder.master("local[1]").appName("Example_1").getOrCreate()

In [7]:
print(spark)

<pyspark.sql.session.SparkSession object at 0x0000018D662D7A30>


In [8]:
spark

In [9]:
sparkNewSession = SparkSession.newSession

In [10]:
sparkNewSession

<function pyspark.sql.session.SparkSession.newSession(self)>

In [12]:
rdd = spark.sparkContext.parallelize(data)

In [13]:
rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [14]:
df = rdd.toDF()

In [15]:
df

DataFrame[_1: string, _2: string, _3: bigint]

In [17]:
df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)



In [18]:
df.show()

+-----+---+------+
|   _1| _2|    _3|
+-----+---+------+
| John| IT| 45000|
|  Max| IT| 50000|
|Shawn| HR| 35000|
| Nick| HR| 25000|
|  Tom| IT| 75000|
| Matt| IT| 85000|
| Jeff| HR|105000|
|Chris| IT| 40000|
|  Tom| IT| 45000|
+-----+---+------+



In [19]:
columns = ['Name', 'Department', 'Salary']

In [20]:
df_2 = rdd.toDF(columns)

In [21]:
df_2

DataFrame[Name: string, Department: string, Salary: bigint]

In [22]:
df_2.show()

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| John|        IT| 45000|
|  Max|        IT| 50000|
|Shawn|        HR| 35000|
| Nick|        HR| 25000|
|  Tom|        IT| 75000|
| Matt|        IT| 85000|
| Jeff|        HR|105000|
|Chris|        IT| 40000|
|  Tom|        IT| 45000|
+-----+----------+------+



In [23]:
rdd.getNumPartitions()

1

In [24]:
rdd2 = spark.sparkContext.parallelize([i for i in range(1,101)], 10)

In [25]:
rdd2

ParallelCollectionRDD[15] at readRDDFromFile at PythonRDD.scala:274

In [26]:
rdd2.getNumPartitions()

10

In [27]:
rdd3 = spark.sparkContext.textFile('pyspark_intro.txt')

In [28]:
rdd3

pyspark_intro.txt MapPartitionsRDD[17] at textFile at NativeMethodAccessorImpl.java:0

In [29]:
rdd3.getNumPartitions()

1

In [30]:
rdd3.collect()

['Spark',
 '- Open Source Analytical Processing Engine',
 '- Immutable',
 '- Fault tolerant',
 '- Cache & Persistence',
 '- In-Memory',
 '- Real time processing using Streaming and Kafka',
 'PySpark - Spark Library written in Python',
 '',
 'Stream',
 '- continuous flow of data',
 '',
 'Buffering    - chunk of data that is coming from somewhere',
 'Streaming   - continuous flow of data coming from somewhere']

In [31]:
rdd4 = spark.sparkContext.wholeTextFiles('pyspark_intro.txt')

In [32]:
rdd4.collect()

[('file:/C:/Users/asus/NCU_PySpark/pyspark_intro.txt',
  'Spark\r\n- Open Source Analytical Processing Engine\r\n- Immutable\r\n- Fault tolerant\r\n- Cache & Persistence\r\n- In-Memory\r\n- Real time processing using Streaming and Kafka\r\nPySpark - Spark Library written in Python\r\n\r\nStream\r\n- continuous flow of data\r\n\r\nBuffering    - chunk of data that is coming from somewhere\r\nStreaming   - continuous flow of data coming from somewhere')]