# PySpark Data Wrangling
* Notebook by Adam Lang
* Date: 12/17/24

# Overview
* This notebook goes through some basic data wrangling techniques in PySpark.

In [1]:
## install
!pip install pyspark



In [2]:
## import
import pyspark

In [3]:
## read spark file
import pandas as pd
pd.read_csv("/content/drive/MyDrive/Springboard_Data_Science/Big_Mountain_Case_Study/ski_resort_data.csv")

Unnamed: 0,Name,Region,state,summit_elev,vertical_drop,base_elev,trams,fastEight,fastSixes,fastQuads,...,LongestRun_mi,SkiableTerrain_ac,Snow Making_ac,daysOpenLastYear,yearsOpen,averageSnowfall,AdultWeekday,AdultWeekend,projectedDaysOpen,NightSkiing_ac
0,Alyeska Resort,Alaska,Alaska,3939,2500,250,1,0.0,0,2,...,1.0,1610.0,113.0,150.0,60.0,669.0,65.0,85.0,150.0,550.0
1,Eaglecrest Ski Area,Alaska,Alaska,2600,1540,1200,0,0.0,0,0,...,2.0,640.0,60.0,45.0,44.0,350.0,47.0,53.0,90.0,
2,Hilltop Ski Area,Alaska,Alaska,2090,294,1796,0,0.0,0,0,...,1.0,30.0,30.0,150.0,36.0,69.0,30.0,34.0,152.0,30.0
3,Arizona Snowbowl,Arizona,Arizona,11500,2300,9200,0,0.0,1,0,...,2.0,777.0,104.0,122.0,81.0,260.0,89.0,89.0,122.0,
4,Sunrise Park Resort,Arizona,Arizona,11100,1800,9200,0,,0,1,...,1.2,800.0,80.0,115.0,49.0,250.0,74.0,78.0,104.0,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,Meadowlark Ski Lodge,Wyoming,Wyoming,9500,1000,8500,0,,0,0,...,1.5,300.0,,,9.0,,,,,
326,Sleeping Giant Ski Resort,Wyoming,Wyoming,7428,810,6619,0,0.0,0,0,...,1.0,184.0,18.0,61.0,81.0,310.0,42.0,42.0,77.0,
327,Snow King Resort,Wyoming,Wyoming,7808,1571,6237,0,,0,0,...,1.0,400.0,250.0,121.0,80.0,300.0,59.0,59.0,123.0,110.0
328,Snowy Range Ski & Recreation Area,Wyoming,Wyoming,9663,990,8798,0,0.0,0,0,...,0.7,75.0,30.0,131.0,59.0,250.0,49.0,49.0,,


# Create Spark Session

In [4]:
from pyspark.sql import SparkSession

In [5]:
## init spark session
spark = SparkSession.builder.appName('Experiment').getOrCreate()

In [6]:
## view spark session
spark

# Read Spark CSV

In [7]:
## create pyspark df
df_pyspark = spark.read.csv('/content/drive/MyDrive/Springboard_Data_Science/Big_Mountain_Case_Study/ski_resort_data.csv')

In [8]:
## view pyspark df
df_pyspark

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string, _c15: string, _c16: string, _c17: string, _c18: string, _c19: string, _c20: string, _c21: string, _c22: string, _c23: string, _c24: string, _c25: string, _c26: string]

Summary
* PySpark assigned `_c1` etc. as the column labels. It does this by default. We can fix this

In [9]:
## if we use header=True, inferSchema=True
df_pyspark = spark.read.csv('/content/drive/MyDrive/Springboard_Data_Science/Big_Mountain_Case_Study/ski_resort_data.csv',
                            header=True,
                            inferSchema=True)

In [10]:
df_pyspark.show()

+--------------------+-------------------+----------+-----------+-------------+---------+-----+---------+---------+---------+----+------+------+-------+------------+----+------------+-------------+-----------------+--------------+----------------+---------+---------------+------------+------------+-----------------+--------------+
|                Name|             Region|     state|summit_elev|vertical_drop|base_elev|trams|fastEight|fastSixes|fastQuads|quad|triple|double|surface|total_chairs|Runs|TerrainParks|LongestRun_mi|SkiableTerrain_ac|Snow Making_ac|daysOpenLastYear|yearsOpen|averageSnowfall|AdultWeekday|AdultWeekend|projectedDaysOpen|NightSkiing_ac|
+--------------------+-------------------+----------+-----------+-------------+---------+-----+---------+---------+---------+----+------+------+-------+------------+----+------------+-------------+-----------------+--------------+----------------+---------+---------------+------------+------------+-----------------+--------------+
|

In [11]:
#dtype
type(df_pyspark)

In [12]:
## head function in pyspark
df_pyspark.head(3)

[Row(Name='Alyeska Resort', Region='Alaska', state='Alaska', summit_elev=3939, vertical_drop=2500, base_elev=250, trams=1, fastEight=0, fastSixes=0, fastQuads=2, quad=2, triple=0, double=0, surface=2, total_chairs=7, Runs=76, TerrainParks=2, LongestRun_mi=1.0, SkiableTerrain_ac=1610, Snow Making_ac=113, daysOpenLastYear=150, yearsOpen=60, averageSnowfall=669, AdultWeekday=65.0, AdultWeekend=85.0, projectedDaysOpen=150, NightSkiing_ac=550),
 Row(Name='Eaglecrest Ski Area', Region='Alaska', state='Alaska', summit_elev=2600, vertical_drop=1540, base_elev=1200, trams=0, fastEight=0, fastSixes=0, fastQuads=0, quad=0, triple=0, double=4, surface=0, total_chairs=4, Runs=36, TerrainParks=1, LongestRun_mi=2.0, SkiableTerrain_ac=640, Snow Making_ac=60, daysOpenLastYear=45, yearsOpen=44, averageSnowfall=350, AdultWeekday=47.0, AdultWeekend=53.0, projectedDaysOpen=90, NightSkiing_ac=None),
 Row(Name='Hilltop Ski Area', Region='Alaska', state='Alaska', summit_elev=2090, vertical_drop=294, base_elev

In [13]:
## column info --> same as df.info()
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- state: string (nullable = true)
 |-- summit_elev: integer (nullable = true)
 |-- vertical_drop: integer (nullable = true)
 |-- base_elev: integer (nullable = true)
 |-- trams: integer (nullable = true)
 |-- fastEight: integer (nullable = true)
 |-- fastSixes: integer (nullable = true)
 |-- fastQuads: integer (nullable = true)
 |-- quad: integer (nullable = true)
 |-- triple: integer (nullable = true)
 |-- double: integer (nullable = true)
 |-- surface: integer (nullable = true)
 |-- total_chairs: integer (nullable = true)
 |-- Runs: integer (nullable = true)
 |-- TerrainParks: integer (nullable = true)
 |-- LongestRun_mi: double (nullable = true)
 |-- SkiableTerrain_ac: integer (nullable = true)
 |-- Snow Making_ac: integer (nullable = true)
 |-- daysOpenLastYear: integer (nullable = true)
 |-- yearsOpen: integer (nullable = true)
 |-- averageSnowfall: integer (nullable = true)
 |-- AdultWeekday: double

# Diving Deeper into PySpark Wrangling
* Here we will go nuts and bolts of pyspark.

## Create pyspark session

In [14]:
from pyspark.sql import SparkSession

In [15]:
## init session
spark = SparkSession.builder.appName('Dataframe').getOrCreate()

In [16]:
## view spark session
spark

# Multiple ways to read a pyspark dataset
* If you dont set `inferSchema=True` it will by default infer all datatypes are strings.

## Method 1 - `spark.read.option`

In [17]:
## read the dataset --> add .show() to see the df head
df_pyspark = spark.read.option("header","true").csv('/content/drive/MyDrive/Springboard_Data_Science/Big_Mountain_Case_Study/ski_resort_data.csv',
                                                    inferSchema=True)

In [18]:
## verify schema
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- state: string (nullable = true)
 |-- summit_elev: integer (nullable = true)
 |-- vertical_drop: integer (nullable = true)
 |-- base_elev: integer (nullable = true)
 |-- trams: integer (nullable = true)
 |-- fastEight: integer (nullable = true)
 |-- fastSixes: integer (nullable = true)
 |-- fastQuads: integer (nullable = true)
 |-- quad: integer (nullable = true)
 |-- triple: integer (nullable = true)
 |-- double: integer (nullable = true)
 |-- surface: integer (nullable = true)
 |-- total_chairs: integer (nullable = true)
 |-- Runs: integer (nullable = true)
 |-- TerrainParks: integer (nullable = true)
 |-- LongestRun_mi: double (nullable = true)
 |-- SkiableTerrain_ac: integer (nullable = true)
 |-- Snow Making_ac: integer (nullable = true)
 |-- daysOpenLastYear: integer (nullable = true)
 |-- yearsOpen: integer (nullable = true)
 |-- averageSnowfall: integer (nullable = true)
 |-- AdultWeekday: double

## Method 2 - `spark.read.csv`

In [19]:
df_pyspark = spark.read.csv('/content/drive/MyDrive/Springboard_Data_Science/Big_Mountain_Case_Study/ski_resort_data.csv',
                            header=True,
                            inferSchema=True)
# show data
df_pyspark.show()

+--------------------+-------------------+----------+-----------+-------------+---------+-----+---------+---------+---------+----+------+------+-------+------------+----+------------+-------------+-----------------+--------------+----------------+---------+---------------+------------+------------+-----------------+--------------+
|                Name|             Region|     state|summit_elev|vertical_drop|base_elev|trams|fastEight|fastSixes|fastQuads|quad|triple|double|surface|total_chairs|Runs|TerrainParks|LongestRun_mi|SkiableTerrain_ac|Snow Making_ac|daysOpenLastYear|yearsOpen|averageSnowfall|AdultWeekday|AdultWeekend|projectedDaysOpen|NightSkiing_ac|
+--------------------+-------------------+----------+-----------+-------------+---------+-----+---------+---------+---------+----+------+------+-------+------------+----+------------+-------------+-----------------+--------------+----------------+---------+---------------+------------+------------+-----------------+--------------+
|

In [20]:
## verify data schema
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- state: string (nullable = true)
 |-- summit_elev: integer (nullable = true)
 |-- vertical_drop: integer (nullable = true)
 |-- base_elev: integer (nullable = true)
 |-- trams: integer (nullable = true)
 |-- fastEight: integer (nullable = true)
 |-- fastSixes: integer (nullable = true)
 |-- fastQuads: integer (nullable = true)
 |-- quad: integer (nullable = true)
 |-- triple: integer (nullable = true)
 |-- double: integer (nullable = true)
 |-- surface: integer (nullable = true)
 |-- total_chairs: integer (nullable = true)
 |-- Runs: integer (nullable = true)
 |-- TerrainParks: integer (nullable = true)
 |-- LongestRun_mi: double (nullable = true)
 |-- SkiableTerrain_ac: integer (nullable = true)
 |-- Snow Making_ac: integer (nullable = true)
 |-- daysOpenLastYear: integer (nullable = true)
 |-- yearsOpen: integer (nullable = true)
 |-- averageSnowfall: integer (nullable = true)
 |-- AdultWeekday: double

In [21]:
## dtype -- dataframe is a data structure regardless of pyspark or python
type(df_pyspark)

## Columns in PySpark

In [22]:
## columns similar to pandas
df_pyspark.columns

['Name',
 'Region',
 'state',
 'summit_elev',
 'vertical_drop',
 'base_elev',
 'trams',
 'fastEight',
 'fastSixes',
 'fastQuads',
 'quad',
 'triple',
 'double',
 'surface',
 'total_chairs',
 'Runs',
 'TerrainParks',
 'LongestRun_mi',
 'SkiableTerrain_ac',
 'Snow Making_ac',
 'daysOpenLastYear',
 'yearsOpen',
 'averageSnowfall',
 'AdultWeekday',
 'AdultWeekend',
 'projectedDaysOpen',
 'NightSkiing_ac']

In [23]:
## head in pyspark --> result is a list
df_pyspark.head(3)

[Row(Name='Alyeska Resort', Region='Alaska', state='Alaska', summit_elev=3939, vertical_drop=2500, base_elev=250, trams=1, fastEight=0, fastSixes=0, fastQuads=2, quad=2, triple=0, double=0, surface=2, total_chairs=7, Runs=76, TerrainParks=2, LongestRun_mi=1.0, SkiableTerrain_ac=1610, Snow Making_ac=113, daysOpenLastYear=150, yearsOpen=60, averageSnowfall=669, AdultWeekday=65.0, AdultWeekend=85.0, projectedDaysOpen=150, NightSkiing_ac=550),
 Row(Name='Eaglecrest Ski Area', Region='Alaska', state='Alaska', summit_elev=2600, vertical_drop=1540, base_elev=1200, trams=0, fastEight=0, fastSixes=0, fastQuads=0, quad=0, triple=0, double=4, surface=0, total_chairs=4, Runs=36, TerrainParks=1, LongestRun_mi=2.0, SkiableTerrain_ac=640, Snow Making_ac=60, daysOpenLastYear=45, yearsOpen=44, averageSnowfall=350, AdultWeekday=47.0, AdultWeekend=53.0, projectedDaysOpen=90, NightSkiing_ac=None),
 Row(Name='Hilltop Ski Area', Region='Alaska', state='Alaska', summit_elev=2090, vertical_drop=294, base_elev

### Selecting Columns in PySpark
* The key here is the `.select()` function in pyspark.

In [24]:
## selecting columns
df_pyspark.show()

+--------------------+-------------------+----------+-----------+-------------+---------+-----+---------+---------+---------+----+------+------+-------+------------+----+------------+-------------+-----------------+--------------+----------------+---------+---------------+------------+------------+-----------------+--------------+
|                Name|             Region|     state|summit_elev|vertical_drop|base_elev|trams|fastEight|fastSixes|fastQuads|quad|triple|double|surface|total_chairs|Runs|TerrainParks|LongestRun_mi|SkiableTerrain_ac|Snow Making_ac|daysOpenLastYear|yearsOpen|averageSnowfall|AdultWeekday|AdultWeekend|projectedDaysOpen|NightSkiing_ac|
+--------------------+-------------------+----------+-----------+-------------+---------+-----+---------+---------+---------+----+------+------+-------+------------+----+------------+-------------+-----------------+--------------+----------------+---------+---------------+------------+------------+-----------------+--------------+
|

In [25]:
## lets select name column -- show() gives you entire column
df_pyspark.select('Name').show()

+--------------------+
|                Name|
+--------------------+
|      Alyeska Resort|
| Eaglecrest Ski Area|
|    Hilltop Ski Area|
|    Arizona Snowbowl|
| Sunrise Park Resort|
|Yosemite Ski & Sn...|
|       Bear Mountain|
|         Bear Valley|
|Boreal Mountain R...|
|         Dodge Ridge|
|    Donner Ski Ranch|
|Heavenly Mountain...|
|       June Mountain|
|            Kirkwood|
|Mammoth Mountain ...|
| Mt. Shasta Ski Park|
|       Mountain High|
|           Mt. Baldy|
|Northstar California|
|     Sierra-at-Tahoe|
+--------------------+
only showing top 20 rows



In [26]:
## checking type
type(df_pyspark.select('Name'))

In [27]:
## multiple column selections
df_pyspark.select(['Name','Region']).show()

+--------------------+-------------------+
|                Name|             Region|
+--------------------+-------------------+
|      Alyeska Resort|             Alaska|
| Eaglecrest Ski Area|             Alaska|
|    Hilltop Ski Area|             Alaska|
|    Arizona Snowbowl|            Arizona|
| Sunrise Park Resort|            Arizona|
|Yosemite Ski & Sn...|Northern California|
|       Bear Mountain|      Sierra Nevada|
|         Bear Valley|      Sierra Nevada|
|Boreal Mountain R...|      Sierra Nevada|
|         Dodge Ridge|      Sierra Nevada|
|    Donner Ski Ranch|      Sierra Nevada|
|Heavenly Mountain...|      Sierra Nevada|
|       June Mountain|      Sierra Nevada|
|            Kirkwood|      Sierra Nevada|
|Mammoth Mountain ...|      Sierra Nevada|
| Mt. Shasta Ski Park|      Sierra Nevada|
|       Mountain High|      Sierra Nevada|
|           Mt. Baldy|      Sierra Nevada|
|Northstar California|      Sierra Nevada|
|     Sierra-at-Tahoe|      Sierra Nevada|
+----------

In [28]:
## indexing a column the pandas method is different:
df_pyspark['Name']

Column<'Name'>

### Datatypes

In [29]:
## dtypes -- similar to pandas
df_pyspark.dtypes

[('Name', 'string'),
 ('Region', 'string'),
 ('state', 'string'),
 ('summit_elev', 'int'),
 ('vertical_drop', 'int'),
 ('base_elev', 'int'),
 ('trams', 'int'),
 ('fastEight', 'int'),
 ('fastSixes', 'int'),
 ('fastQuads', 'int'),
 ('quad', 'int'),
 ('triple', 'int'),
 ('double', 'int'),
 ('surface', 'int'),
 ('total_chairs', 'int'),
 ('Runs', 'int'),
 ('TerrainParks', 'int'),
 ('LongestRun_mi', 'double'),
 ('SkiableTerrain_ac', 'int'),
 ('Snow Making_ac', 'int'),
 ('daysOpenLastYear', 'int'),
 ('yearsOpen', 'int'),
 ('averageSnowfall', 'int'),
 ('AdultWeekday', 'double'),
 ('AdultWeekend', 'double'),
 ('projectedDaysOpen', 'int'),
 ('NightSkiing_ac', 'int')]

## Describe -- showing pyspark df stats


In [30]:
## .describe().show()
df_pyspark.describe().show()

+-------+--------------------+-------+-------+------------------+------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|summary|                Name| Region|  state|       summit_elev|     vertical_drop|         base_elev|              trams|           fastEight|          fastSixes|         fastQuads|              quad|            triple|            double|          surface|     total_chairs|              Runs|      TerrainParks|     LongestRun_mi| SkiableTerrain_ac|   Snow Making_ac|  daysOpenLastYear|         yearsOpen|   averageSnowfall|      AdultWeekday|      AdultWeekend| projectedDaysOpen|    NightSki

### Adding Spark DF columns

In [31]:
## adding columns --> column name, value assigned
df_pyspark.withColumn('Average Ticket Price',((df_pyspark['AdultWeekday']+df_pyspark['AdultWeekend'])/2))

DataFrame[Name: string, Region: string, state: string, summit_elev: int, vertical_drop: int, base_elev: int, trams: int, fastEight: int, fastSixes: int, fastQuads: int, quad: int, triple: int, double: int, surface: int, total_chairs: int, Runs: int, TerrainParks: int, LongestRun_mi: double, SkiableTerrain_ac: int, Snow Making_ac: int, daysOpenLastYear: int, yearsOpen: int, averageSnowfall: int, AdultWeekday: double, AdultWeekend: double, projectedDaysOpen: int, NightSkiing_ac: int, Average Ticket Price: double]

In [32]:
df_pyspark.show()

+--------------------+-------------------+----------+-----------+-------------+---------+-----+---------+---------+---------+----+------+------+-------+------------+----+------------+-------------+-----------------+--------------+----------------+---------+---------------+------------+------------+-----------------+--------------+
|                Name|             Region|     state|summit_elev|vertical_drop|base_elev|trams|fastEight|fastSixes|fastQuads|quad|triple|double|surface|total_chairs|Runs|TerrainParks|LongestRun_mi|SkiableTerrain_ac|Snow Making_ac|daysOpenLastYear|yearsOpen|averageSnowfall|AdultWeekday|AdultWeekend|projectedDaysOpen|NightSkiing_ac|
+--------------------+-------------------+----------+-----------+-------------+---------+-----+---------+---------+---------+----+------+------+-------+------------+----+------------+-------------+-----------------+--------------+----------------+---------+---------------+------------+------------+-----------------+--------------+
|

### Dropping Columns in pyspark

In [33]:
## drop columns -- assign to variable as not in place
df_pyspark = df_pyspark.drop('trams')

In [34]:
## show
df_pyspark.show()

+--------------------+-------------------+----------+-----------+-------------+---------+---------+---------+---------+----+------+------+-------+------------+----+------------+-------------+-----------------+--------------+----------------+---------+---------------+------------+------------+-----------------+--------------+
|                Name|             Region|     state|summit_elev|vertical_drop|base_elev|fastEight|fastSixes|fastQuads|quad|triple|double|surface|total_chairs|Runs|TerrainParks|LongestRun_mi|SkiableTerrain_ac|Snow Making_ac|daysOpenLastYear|yearsOpen|averageSnowfall|AdultWeekday|AdultWeekend|projectedDaysOpen|NightSkiing_ac|
+--------------------+-------------------+----------+-----------+-------------+---------+---------+---------+---------+----+------+------+-------+------------+----+------------+-------------+-----------------+--------------+----------------+---------+---------------+------------+------------+-----------------+--------------+
|      Alyeska Reso

### Rename Columns in pyspark

In [35]:
## rename cols -- existing + new column
df_pyspark = df_pyspark.withColumnRenamed('fastQuads','Quad_ChairLifts')
df_pyspark.show()

+--------------------+-------------------+----------+-----------+-------------+---------+---------+---------+---------------+----+------+------+-------+------------+----+------------+-------------+-----------------+--------------+----------------+---------+---------------+------------+------------+-----------------+--------------+
|                Name|             Region|     state|summit_elev|vertical_drop|base_elev|fastEight|fastSixes|Quad_ChairLifts|quad|triple|double|surface|total_chairs|Runs|TerrainParks|LongestRun_mi|SkiableTerrain_ac|Snow Making_ac|daysOpenLastYear|yearsOpen|averageSnowfall|AdultWeekday|AdultWeekend|projectedDaysOpen|NightSkiing_ac|
+--------------------+-------------------+----------+-----------+-------------+---------+---------+---------+---------------+----+------+------+-------+------------+----+------------+-------------+-----------------+--------------+----------------+---------+---------------+------------+------------+-----------------+--------------+
|

# PySpark Handling Missing Values
* Dropping columns
* Dropping rows
* Various parameters in dropping functionalities
* Handling missing values by Mean, Median, Mode

In [36]:
## create spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Practice').getOrCreate()

In [45]:
## read dataset
df_pyspark = spark.read.csv('/content/drive/MyDrive/Colab Notebooks/Deep Learning Notebooks/BERT Sentiment Analysis Project/Tweets.csv',
               header=True,
               inferSchema=True)

In [46]:
## show data
df_pyspark.show()

+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|            tweet_id|airline_sentiment|airline_sentiment_confidence|negativereason|negativereason_confidence|       airline|airline_sentiment_gold|           name|negativereason_gold|retweet_count|                text|tweet_coord|       tweet_created|      tweet_location|       user_timezone|
+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|  570306133677760513|          neutral|                         1.0|          NULL|                     NULL|Virgi

## Drop columns

In [47]:
## drop cols -- doesn't drop inplace
df_pyspark.drop('name').show()

+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|            tweet_id|airline_sentiment|airline_sentiment_confidence|negativereason|negativereason_confidence|       airline|airline_sentiment_gold|negativereason_gold|retweet_count|                text|tweet_coord|       tweet_created|      tweet_location|       user_timezone|
+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|  570306133677760513|          neutral|                         1.0|          NULL|                     NULL|Virgin America|                  NULL|               

## Dropping null values

In [48]:
## view spark df again
df_pyspark.show()

+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|            tweet_id|airline_sentiment|airline_sentiment_confidence|negativereason|negativereason_confidence|       airline|airline_sentiment_gold|           name|negativereason_gold|retweet_count|                text|tweet_coord|       tweet_created|      tweet_location|       user_timezone|
+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|  570306133677760513|          neutral|                         1.0|          NULL|                     NULL|Virgi

### Drops ALL ROWS with null values

In [49]:
## null values --> options: drop, fill, replace
df_pyspark.na.drop().show()

+------------------+-----------------+----------------------------+----------------+-------------------------+----------+----------------------+---------------+-------------------+-------------+--------------------+--------------------+--------------------+--------------+--------------------+
|          tweet_id|airline_sentiment|airline_sentiment_confidence|  negativereason|negativereason_confidence|   airline|airline_sentiment_gold|           name|negativereason_gold|retweet_count|                text|         tweet_coord|       tweet_created|tweet_location|       user_timezone|
+------------------+-----------------+----------------------------+----------------+-------------------------+----------+----------------------+---------------+-------------------+-------------+--------------------+--------------------+--------------------+--------------+--------------------+
|567778009013178368|         negative|                         1.0|Cancelled Flight|                      1.0|    Unit

### How == "all" --> Drops all NULL records

In [50]:
### how = "any", "all"
df_pyspark.na.drop(how="all").show()

+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|            tweet_id|airline_sentiment|airline_sentiment_confidence|negativereason|negativereason_confidence|       airline|airline_sentiment_gold|           name|negativereason_gold|retweet_count|                text|tweet_coord|       tweet_created|      tweet_location|       user_timezone|
+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|  570306133677760513|          neutral|                         1.0|          NULL|                     NULL|Virgi

### Threshold
* This means if the **threshold value is 2, at least 2 non-null values should be present.**
  * And so on and so forth....

In [51]:
### threshold =
df_pyspark.na.drop(how="any",
                   thresh=2).show()

+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|            tweet_id|airline_sentiment|airline_sentiment_confidence|negativereason|negativereason_confidence|       airline|airline_sentiment_gold|           name|negativereason_gold|retweet_count|                text|tweet_coord|       tweet_created|      tweet_location|       user_timezone|
+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|  570306133677760513|          neutral|                         1.0|          NULL|                     NULL|Virgi

### Subset
* Another param in drop
* Drops null records in the specified `subset` list or array given.

In [52]:
### Subset
df_pyspark.na.drop(how='any',
                   #thresh=3,
                   subset=['tweet_location']).show()

+------------------+-----------------+----------------------------+--------------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|          tweet_id|airline_sentiment|airline_sentiment_confidence|      negativereason|negativereason_confidence|       airline|airline_sentiment_gold|           name|negativereason_gold|retweet_count|                text|         tweet_coord|       tweet_created|      tweet_location|       user_timezone|
+------------------+-----------------+----------------------------+--------------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|570301083672813571|          neutral|                      0.6837|         

### Filling missing values
* Wherever there is a missing value it will fill with the string or value that you insert into `.na.fill()`

In [54]:
## fill missing values
df_pyspark.na.fill('Missing Values').show()

+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+--------------+--------------------+--------------------+--------------------+
|            tweet_id|airline_sentiment|airline_sentiment_confidence|negativereason|negativereason_confidence|       airline|airline_sentiment_gold|           name|negativereason_gold|retweet_count|                text|   tweet_coord|       tweet_created|      tweet_location|       user_timezone|
+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+--------------+--------------------+--------------------+--------------------+
|  570306133677760513|          neutral|                         1.0|Missing Values|           Missing Val

In [55]:
## can also specify which column to fill nulls --> example: 'negativereason'
df_pyspark.na.fill('Missing Values', 'negativereason').show()

+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|            tweet_id|airline_sentiment|airline_sentiment_confidence|negativereason|negativereason_confidence|       airline|airline_sentiment_gold|           name|negativereason_gold|retweet_count|                text|tweet_coord|       tweet_created|      tweet_location|       user_timezone|
+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|  570306133677760513|          neutral|                         1.0|Missing Values|                     NULL|Virgi

In [57]:
## can also specify which columns to fill nulls as a list of multiple cols
df_pyspark.na.fill('Missing Values', ['negativereason','negativereason_confidence']).show()

+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|            tweet_id|airline_sentiment|airline_sentiment_confidence|negativereason|negativereason_confidence|       airline|airline_sentiment_gold|           name|negativereason_gold|retweet_count|                text|tweet_coord|       tweet_created|      tweet_location|       user_timezone|
+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|  570306133677760513|          neutral|                         1.0|Missing Values|           Missing Values|Virgi

In [58]:
## view entire dataset
df_pyspark.show()

+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|            tweet_id|airline_sentiment|airline_sentiment_confidence|negativereason|negativereason_confidence|       airline|airline_sentiment_gold|           name|negativereason_gold|retweet_count|                text|tweet_coord|       tweet_created|      tweet_location|       user_timezone|
+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+
|  570306133677760513|          neutral|                         1.0|          NULL|                     NULL|Virgi

### Imputing missing values

In [62]:
from pyspark.sql.types import DoubleType

## columns were not numeric to begin with
# Cast the columns to DoubleType before applying the Imputer
df_pyspark = df_pyspark.withColumn("airline_sentiment_confidence", df_pyspark["airline_sentiment_confidence"].cast(DoubleType()))
df_pyspark = df_pyspark.withColumn("negativereason_confidence", df_pyspark["negativereason_confidence"].cast(DoubleType()))

from pyspark.ml.feature import Imputer

# Setup the Imputer (unchanged)
imputer = Imputer(
    inputCols=['airline_sentiment_confidence','negativereason_confidence'],
    outputCols=["{}_imputed".format(c) for c in ['airline_sentiment_confidence', 'negativereason_confidence']]
).setStrategy("median") ## mean, median, mode

# Fit and transform the DataFrame with the casted columns
imputer.fit(df_pyspark).transform(df_pyspark).show()

+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+--------------------+--------------------+------------------------------------+---------------------------------+
|            tweet_id|airline_sentiment|airline_sentiment_confidence|negativereason|negativereason_confidence|       airline|airline_sentiment_gold|           name|negativereason_gold|retweet_count|                text|tweet_coord|       tweet_created|      tweet_location|       user_timezone|airline_sentiment_confidence_imputed|negativereason_confidence_imputed|
+--------------------+-----------------+----------------------------+--------------+-------------------------+--------------+----------------------+---------------+-------------------+-------------+--------------------+-----------+--------------------+----------------

# Filter Operations in PySpark
* This includes:
  * Filter operation
  * &, |, ==
  * ~


In [66]:
## generate synthetic data
import pandas as pd

# Create a list of dictionaries, where each dictionary represents a row
data = [
    {'name': 'Alice', 'age': 25, 'experience': 3, 'salary': 60000},
    {'name': 'Bob', 'age': 30, 'experience': 5, 'salary': 75000},
    {'name': 'Charlie', 'age': 28, 'experience': 4, 'salary': 70000},
    {'name': 'David', 'age': 35, 'experience': 8, 'salary': 90000},
    {'name': 'Eve', 'age': 22, 'experience': 1, 'salary': 50000},
    {'name': 'Frank', 'age': 40, 'experience': 12, 'salary': 110000},
    {'name': 'Grace', 'age': 27, 'experience': 3, 'salary': 65000},
    {'name': 'Helen', 'age': 32, 'experience': 6, 'salary': 80000}
]

# Create a pandas DataFrame from the list of dictionaries
df = pd.DataFrame(data)

# Display the DataFrame
df.to_csv('test1.csv',index=False)

In [67]:
## setup spark session
from pyspark.sql import SparkSession

## init session
spark = SparkSession.builder.appName('dataframe').getOrCreate()

In [68]:
## load dataset into spark
data_path = '/content/test1.csv'
df_pyspark = spark.read.csv(data_path,
                            header=True,
                            inferSchema=True)
df_pyspark.show()

+-------+---+----------+------+
|   name|age|experience|salary|
+-------+---+----------+------+
|  Alice| 25|         3| 60000|
|    Bob| 30|         5| 75000|
|Charlie| 28|         4| 70000|
|  David| 35|         8| 90000|
|    Eve| 22|         1| 50000|
|  Frank| 40|        12|110000|
|  Grace| 27|         3| 65000|
|  Helen| 32|         6| 80000|
+-------+---+----------+------+



## Filter Operations

In [71]:
### salary of people less than or equal to 80,000
df_pyspark.filter("Salary<=80000").show()

+-------+---+----------+------+
|   name|age|experience|salary|
+-------+---+----------+------+
|  Alice| 25|         3| 60000|
|    Bob| 30|         5| 75000|
|Charlie| 28|         4| 70000|
|    Eve| 22|         1| 50000|
|  Grace| 27|         3| 65000|
|  Helen| 32|         6| 80000|
+-------+---+----------+------+



In [72]:
## filter and select cols
df_pyspark.filter("Salary<=80000").select(['name','age']).show()

+-------+---+
|   name|age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 28|
|    Eve| 22|
|  Grace| 27|
|  Helen| 32|
+-------+---+



In [73]:
### filter on 1 conditions
df_pyspark.filter(df_pyspark['Salary']<=80000).show()

+-------+---+----------+------+
|   name|age|experience|salary|
+-------+---+----------+------+
|  Alice| 25|         3| 60000|
|    Bob| 30|         5| 75000|
|Charlie| 28|         4| 70000|
|    Eve| 22|         1| 50000|
|  Grace| 27|         3| 65000|
|  Helen| 32|         6| 80000|
+-------+---+----------+------+



In [76]:
### filter on multiple conditions using &
df_pyspark.filter((df_pyspark['Salary']<=80000) &
                  (df_pyspark['Salary']>=70000)).show()

+-------+---+----------+------+
|   name|age|experience|salary|
+-------+---+----------+------+
|    Bob| 30|         5| 75000|
|Charlie| 28|         4| 70000|
|  Helen| 32|         6| 80000|
+-------+---+----------+------+



In [78]:
### filter on multiple conditions using |
df_pyspark.filter((df_pyspark['Salary']<=80000) |
                  (df_pyspark['Salary']>=70000)).show()

+-------+---+----------+------+
|   name|age|experience|salary|
+-------+---+----------+------+
|  Alice| 25|         3| 60000|
|    Bob| 30|         5| 75000|
|Charlie| 28|         4| 70000|
|  David| 35|         8| 90000|
|    Eve| 22|         1| 50000|
|  Frank| 40|        12|110000|
|  Grace| 27|         3| 65000|
|  Helen| 32|         6| 80000|
+-------+---+----------+------+



In [80]:
### filter on multiple conditions using inverse or NOT condition --> ~
df_pyspark.filter(~(df_pyspark['Salary']<=80000)).show()

+-----+---+----------+------+
| name|age|experience|salary|
+-----+---+----------+------+
|David| 35|         8| 90000|
|Frank| 40|        12|110000|
+-----+---+----------+------+



# PySpark GroupBy and Aggregate Functions

In [82]:
## generate synthetic data
import pandas as pd

# Create a list of dictionaries, where each dictionary represents a row
data = [
    {'name': 'Alice', 'Department': 'Data Science','salary': 60000},
    {'name': 'Bob', 'Department':'AI', 'salary': 75000},
    {'name': 'Charlie', 'Department': 'Computer Science', 'salary': 70000},
    {'name': 'David', 'Department': 'Data Science','salary': 90000},
    {'name': 'Eve', 'Department': 'AI', 'salary': 50000},
    {'name': 'Frank', 'Department': 'Computer Science', 'salary': 110000},
    {'name': 'Grace', 'Department': 'AI', 'salary': 65000},
    {'name': 'Helen', 'Department': 'Data Science', 'salary': 80000}
]

# Create a pandas DataFrame from the list of dictionaries
df = pd.DataFrame(data)

# Display the DataFrame
df.to_csv('job_synthetic_dataset.csv',index=False)

In [83]:
## setup spark session
from pyspark.sql import SparkSession

## spark variable
spark = SparkSession.builder.appName('Agg').getOrCreate()

In [84]:
## spark variable
spark

In [86]:
## read dataset
df_pyspark = spark.read.csv('/content/job_synthetic_dataset.csv',
                      header=True,
                      inferSchema=True)

In [87]:
# show data
df_pyspark.show()

+-------+----------------+------+
|   name|      Department|salary|
+-------+----------------+------+
|  Alice|    Data Science| 60000|
|    Bob|              AI| 75000|
|Charlie|Computer Science| 70000|
|  David|    Data Science| 90000|
|    Eve|              AI| 50000|
|  Frank|Computer Science|110000|
|  Grace|              AI| 65000|
|  Helen|    Data Science| 80000|
+-------+----------------+------+



In [88]:
## schema
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- salary: integer (nullable = true)



## GroupBy operation

In [94]:
## Groupby
### Grouped to find max salary
df_pyspark.groupBy('Name').sum().show()

+-------+-----------+
|   Name|sum(salary)|
+-------+-----------+
|  Grace|      65000|
|  Helen|      80000|
|Charlie|      70000|
|    Bob|      75000|
|  Alice|      60000|
|    Eve|      50000|
|  David|      90000|
|  Frank|     110000|
+-------+-----------+



In [95]:
### groupby departments which gives maximum salary
df_pyspark.groupBy('Department').sum().show()

+----------------+-----------+
|      Department|sum(salary)|
+----------------+-----------+
|              AI|     190000|
|Computer Science|     180000|
|    Data Science|     230000|
+----------------+-----------+



In [96]:
### groupby mean
df_pyspark.groupBy('Department').mean().show()

+----------------+------------------+
|      Department|       avg(salary)|
+----------------+------------------+
|              AI|63333.333333333336|
|Computer Science|           90000.0|
|    Data Science| 76666.66666666667|
+----------------+------------------+



In [97]:
### how many employees in each dept
df_pyspark.groupBy('Department').count().show()

+----------------+-----+
|      Department|count|
+----------------+-----+
|              AI|    3|
|Computer Science|    2|
|    Data Science|    3|
+----------------+-----+



In [98]:
## aggregate
df_pyspark.agg({'Salary': 'sum'}).show()

+-----------+
|sum(Salary)|
+-----------+
|     600000|
+-----------+



In [101]:
## min salary by name
df_pyspark.groupBy('name','department').min().show()

+-------+----------------+-----------+
|   name|      department|min(salary)|
+-------+----------------+-----------+
|    Bob|              AI|      75000|
|  Frank|Computer Science|     110000|
|  David|    Data Science|      90000|
|  Helen|    Data Science|      80000|
|Charlie|Computer Science|      70000|
|    Eve|              AI|      50000|
|  Alice|    Data Science|      60000|
|  Grace|              AI|      65000|
+-------+----------------+-----------+



In [102]:
## max salary by name
df_pyspark.groupBy('name','department').max().show()

+-------+----------------+-----------+
|   name|      department|max(salary)|
+-------+----------------+-----------+
|    Bob|              AI|      75000|
|  Frank|Computer Science|     110000|
|  David|    Data Science|      90000|
|  Helen|    Data Science|      80000|
|Charlie|Computer Science|      70000|
|    Eve|              AI|      50000|
|  Alice|    Data Science|      60000|
|  Grace|              AI|      65000|
+-------+----------------+-----------+



In [103]:
## mean salary by name
df_pyspark.groupBy('name','department').mean().show()

+-------+----------------+-----------+
|   name|      department|avg(salary)|
+-------+----------------+-----------+
|    Bob|              AI|    75000.0|
|  Frank|Computer Science|   110000.0|
|  David|    Data Science|    90000.0|
|  Helen|    Data Science|    80000.0|
|Charlie|Computer Science|    70000.0|
|    Eve|              AI|    50000.0|
|  Alice|    Data Science|    60000.0|
|  Grace|              AI|    65000.0|
+-------+----------------+-----------+

