# Getting Started with PySpark

## How to Build a Spark Session

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col # load libraries

In [9]:
# Start a Spark session for this section, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Intro to Spark Dataframes")
    .getOrCreate()
)

In [4]:
spark  # bare expression: the notebook displays the SparkSession created above

In [5]:
# Column names, in the same order as the fields of each record below.
cols = ["ID", "Name", "Age"]

# Sample records: one (ID, Name, Age) tuple per person.
data = [(1, "Mandy", 24), (2, "Robin", 50), (3, "Moly", 22)]

In [6]:
# Build a DataFrame from the in-memory records, using `cols` as the column names.
df = spark.createDataFrame(data=data, schema=cols)

In [8]:
df.show()  # data preview: prints the DataFrame rows as a text table

+---+-----+---+
| ID| Name|Age|
+---+-----+---+
|  1|Mandy| 24|
|  2|Robin| 50|
|  3| Moly| 22|
+---+-----+---+



In [10]:
spark.stop()  # stop the Spark session; the next section creates its own

## Intro to Data Wrangling in PySpark

In [10]:
# Start a Spark session for the data-wrangling section, or reuse an existing one.
spark = (
    SparkSession.builder
    .appName("Data Wrangling in PySpark")
    .getOrCreate()
)

In [15]:
# Column names, in the same order as the fields of each record below.
cols = ["ID", "Name", "Age"]

# Sample records as (ID, Name, Age) tuples. Two of them deliberately contain a
# missing value so the null-handling steps below have something to work on.
data = [
    (1, "Mandy", 24),
    (2, "Robin", 50),
    (3, "Moly", 22),
    (4, "Mandilona", 25),
    (5, "Rocky", None),  # missing Age
    (6, None, 14),       # missing Name
]

In [16]:
# Build the working DataFrame from the records, using `cols` as the column names.
df = spark.createDataFrame(data=data, schema=cols)

In [17]:
# Label the output, then print the unmodified rows (missing values included).
print("Original DataFrame:")
df.show()  # data preview

Original DataFrame:
+---+---------+----+
| ID|     Name| Age|
+---+---------+----+
|  1|    Mandy|  24|
|  2|    Robin|  50|
|  3|     Moly|  22|
|  4|Mandilona|  25|
|  5|    Rocky|NULL|
|  6|     NULL|  14|
+---+---------+----+



In [18]:
# Report the DataFrame dimensions: row count (a Spark action) and column count.
n_rows = df.count()
n_cols = len(df.columns)
print("Dataframe shape: {} rows, {} columns".format(n_rows, n_cols))

Dataframe shape: 6 rows, 3 columns


In [19]:
df.columns  # column names, as a plain Python list of strings

['ID', 'Name', 'Age']

In [20]:
# Rows whose Name is missing, collected into a list of Row objects.
df.where(col("Name").isNull()).collect()

[Row(ID=6, Name=None, Age=14)]

In [21]:
# Rows whose Age is missing, collected into a list of Row objects.
df.where(col("Age").isNull()).collect()

[Row(ID=5, Name='Rocky', Age=None)]

In [23]:
# Keep only complete records: drop every row whose Age or Name is NULL.
has_age = col("Age").isNotNull()
has_name = col("Name").isNotNull()
updated_df = df.where(has_age & has_name)
updated_df.show()

+---+---------+---+
| ID|     Name|Age|
+---+---------+---+
|  1|    Mandy| 24|
|  2|    Robin| 50|
|  3|     Moly| 22|
|  4|Mandilona| 25|
+---+---------+---+



In [24]:
# Rename the Age column so that its unit is part of the header.
renamed_df = updated_df.withColumnRenamed(existing="Age", new="Age (yrs)")
renamed_df.show()

+---+---------+---------+
| ID|     Name|Age (yrs)|
+---+---------+---------+
|  1|    Mandy|       24|
|  2|    Robin|       50|
|  3|     Moly|       22|
|  4|Mandilona|       25|
+---+---------+---------+



In [26]:
# Add a boolean column flagging people older than 30.
# The comparison is strict (>) so that it agrees with the "> 30 yrs" label:
# someone aged exactly 30 is flagged false. The previous `>= 30` contradicted
# the column name for that boundary value.
selected_ppl = renamed_df.withColumn("> 30 yrs", col("Age (yrs)") > 30)
selected_ppl.show()

+---+---------+---------+--------+
| ID|     Name|Age (yrs)|> 30 yrs|
+---+---------+---------+--------+
|  1|    Mandy|       24|   false|
|  2|    Robin|       50|    true|
|  3|     Moly|       22|   false|
|  4|Mandilona|       25|   false|
+---+---------+---------+--------+



In [27]:
spark.stop()  # stop the Spark session at the end of the notebook