In [1]:
# Echo the pre-existing `sc` object (presumably the SparkContext provided by the
# PySpark kernel - TODO confirm); it is stopped and replaced further down.
sc

In [2]:
# Echo the pre-existing `spark` object (presumably the SparkSession provided by
# the PySpark kernel - TODO confirm) before a new one is created below.
spark

#### 1.Create a new Spark Session Instance

In [3]:
# Stop the current context so that a new SparkContext with a custom
# configuration can be created in the following cell.
sc.stop()

In [4]:
from pyspark import SparkConf,SparkContext
# Build the configuration step by step: 4 local worker threads, named application.
config = SparkConf()
config.setMaster("local[4]")
config.setAppName("PysparkSession")

# Start a fresh SparkContext from that configuration.
sc = SparkContext(conf=config)

In [5]:
# Display the SparkContext that was just created from `config`.
sc


In [6]:
from pyspark.sql import SparkSession
# Configure the builder first, then obtain (or create) the session from it.
session_builder = SparkSession.builder.appName("PysparkSession")
spark = session_builder.getOrCreate()

In [7]:
# Display the SparkSession returned by getOrCreate().
spark

In [8]:
!hdfs dfs -mkdir /flights

mkdir: `/flights': File exists


In [9]:
!hdfs dfs -put /home/hadoop/Downloads/raw_flight_data.csv /flights

put: `/flights/raw_flight_data.csv': File exists


In [10]:
# Read the flights CSV with a header row and inferred column types.
# A path without a scheme resolves against the default filesystem (hdfs://localhost:9000).
flights_reader = spark.read.option("header", True).option("inferSchema", True)
flights_df = flights_reader.csv("/flights/raw_flight_data.csv")

In [11]:
# Fetch the first five records to the driver as a list of Row objects.
flights_df.take(5)

[Row(DayofMonth=19, DayOfWeek=5, Carrier='DL', OriginAirportID=11433, DestAirportID=13303, DepDelay=-3, ArrDelay=1),
 Row(DayofMonth=19, DayOfWeek=5, Carrier='DL', OriginAirportID=14869, DestAirportID=12478, DepDelay=0, ArrDelay=-8),
 Row(DayofMonth=19, DayOfWeek=5, Carrier='DL', OriginAirportID=14057, DestAirportID=14869, DepDelay=-4, ArrDelay=-15),
 Row(DayofMonth=19, DayOfWeek=5, Carrier='DL', OriginAirportID=15016, DestAirportID=11433, DepDelay=28, ArrDelay=24),
 Row(DayofMonth=19, DayOfWeek=5, Carrier='DL', OriginAirportID=11193, DestAirportID=12892, DepDelay=-6, ArrDelay=-11)]

In [12]:
# Print the first 20 rows (the default for show()) in tabular form.
flights_df.show(n=20)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          11193|        12892|      -6|     -11|
|        19|        5|     DL|          10397|        15016|      -1|     -19|
|        19|        5|     DL|          15016|        10397|       0|      -1|
|        19|        5|     DL|          10397|        14869|      15|      24|
|        19|        5|     DL|          10397|        10423|      33|      34|
|        19|        5|     DL|          11278|      

#### SELECT
* Used to select specific columns from the DataFrame

In [13]:
# Project four columns and preview the first five rows.
flights_df.select('DayofMonth', 'Carrier', 'OriginAirportID', 'DepDelay').show(n=5)

+----------+-------+---------------+--------+
|DayofMonth|Carrier|OriginAirportID|DepDelay|
+----------+-------+---------------+--------+
|        19|     DL|          11433|      -3|
|        19|     DL|          14869|       0|
|        19|     DL|          14057|      -4|
|        19|     DL|          15016|      28|
|        19|     DL|          11193|      -6|
+----------+-------+---------------+--------+
only showing top 5 rows



In [14]:
# List every distinct carrier code present in the data.
carriers_df = flights_df.select('Carrier')
carriers_df.distinct().show()

+-------+
|Carrier|
+-------+
|     UA|
|     AA|
|     EV|
|     B6|
|     DL|
|     OO|
|     F9|
|     YV|
|     US|
|     MQ|
|     HA|
|     AS|
|     FL|
|     VX|
|     WN|
|     9E|
+-------+



#### COLUMNS
* Show list of all DataFrame Columns

In [15]:
# `columns` is an attribute (not a method) holding the column names as a Python list.
flights_df.columns

['DayofMonth',
 'DayOfWeek',
 'Carrier',
 'OriginAirportID',
 'DestAirportID',
 'DepDelay',
 'ArrDelay']

#### COUNT
* Aggregate method to show count of values

In [16]:
# Returns the total number of rows in the DataFrame (including duplicates and
# rows with nulls).
flights_df.count()

2719418

In [17]:
# Number of distinct carrier codes
unique_carriers_df = flights_df.select('Carrier').distinct()
unique_carriers_df.count()

16

#### printSchema()
* Prints each column's name, data type, and nullability

In [18]:
# Print the schema that was inferred when the CSV was read (inferSchema=True).
flights_df.printSchema()

root
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Carrier: string (nullable = true)
 |-- OriginAirportID: integer (nullable = true)
 |-- DestAirportID: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- ArrDelay: integer (nullable = true)



#### WHERE
* `where()` keeps only the rows that satisfy a boolean condition; it works like the `filter()` method of an RDD.

In [19]:
# First record whose departure was delayed (DepDelay greater than zero).
departed_late_df = flights_df.where(flights_df['DepDelay'] > 0)
departed_late_df.first()

Row(DayofMonth=19, DayOfWeek=5, Carrier='DL', OriginAirportID=15016, DestAirportID=11433, DepDelay=28, ArrDelay=24)

In [20]:
# Combine two conditions with `&`; each comparison needs its own parentheses.
flights_df.where(
    (flights_df['DepDelay'] > 0) & (flights_df['ArrDelay'] > 0)
).show(n=5)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          10397|        14869|      15|      24|
|        19|        5|     DL|          10397|        10423|      33|      34|
|        19|        5|     DL|          11278|        10397|     323|     322|
|        19|        5|     DL|          11433|        11298|      22|      41|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 5 rows



#### FILTER

In [21]:
# filter() takes the same condition as where(); name each predicate for readability.
departed_late = flights_df.DepDelay > 0
arrived_late = flights_df.ArrDelay > 0
flights_df.filter(departed_late & arrived_late).show(5)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          10397|        14869|      15|      24|
|        19|        5|     DL|          10397|        10423|      33|      34|
|        19|        5|     DL|          11278|        10397|     323|     322|
|        19|        5|     DL|          11433|        11298|      22|      41|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 5 rows



#### isin()
* Keeps the rows whose column value exactly equals one of the listed values (a membership test, not pattern matching).

In [22]:
# Keep only the flights operated by one of the selected carriers.
selected_carriers = ['DL', 'F9', 'UA', '9E']
flights_df.where(flights_df['Carrier'].isin(selected_carriers)).show(n=5)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
|        19|        5|     DL|          15016|        11433|      28|      24|
|        19|        5|     DL|          11193|        12892|      -6|     -11|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 5 rows



In [23]:
# Number of flights operated by carrier F9.
f9_flights_df = flights_df.where(flights_df['Carrier'] == 'F9')
f9_flights_df.count()

35821

### Read airports.csv file as Spark DataFrame

In [24]:
# The file:// scheme reads from the local filesystem instead of the default (HDFS).
airports_reader = spark.read.options(header=True, inferSchema=True)
airports_df = airports_reader.csv('file:///home/hadoop/Downloads/airports.csv')

In [25]:
# Preview the airports lookup table (first 20 rows, long names truncated).
airports_df.show(n=20)

+----------+-----------+-----+--------------------+
|airport_id|       city|state|                name|
+----------+-----------+-----+--------------------+
|     10165|Adak Island|   AK|                Adak|
|     10299|  Anchorage|   AK|Ted Stevens Ancho...|
|     10304|      Aniak|   AK|       Aniak Airport|
|     10754|     Barrow|   AK|Wiley Post/Will R...|
|     10551|     Bethel|   AK|      Bethel Airport|
|     10926|    Cordova|   AK|Merle K Mudhole S...|
|     14709|  Deadhorse|   AK|   Deadhorse Airport|
|     11336| Dillingham|   AK|  Dillingham Airport|
|     11630|  Fairbanks|   AK|Fairbanks Interna...|
|     11997|   Gustavus|   AK|    Gustavus Airport|
|     12523|     Juneau|   AK|Juneau International|
|     12819|  Ketchikan|   AK|Ketchikan Interna...|
|     10245|King Salmon|   AK| King Salmon Airport|
|     10170|     Kodiak|   AK|      Kodiak Airport|
|     13970|   Kotzebue|   AK| Ralph Wien Memorial|
|     13873|       Nome|   AK|        Nome Airport|
|     14256|

### Join()
* To join two or more dataframes using condition.

In [26]:
# Attach the origin airport's details to every flight (inner join is the default).
flights_airport_df = flights_df.join(
    airports_df,
    on=airports_df.airport_id == flights_df.OriginAirportID,
    how='inner',
)

In [27]:
# Preview the joined result: all flight columns followed by all airport columns.
flights_airport_df.show()

+----------+---------+-------+---------------+-------------+--------+--------+----------+-----------------+-----+--------------------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|airport_id|             city|state|                name|
+----------+---------+-------+---------------+-------------+--------+--------+----------+-----------------+-----+--------------------+
|        19|        5|     DL|          11433|        13303|      -3|       1|     11433|          Detroit|   MI|Detroit Metro Way...|
|        19|        5|     DL|          14869|        12478|       0|      -8|     14869|   Salt Lake City|   UT|Salt Lake City In...|
|        19|        5|     DL|          14057|        14869|      -4|     -15|     14057|         Portland|   OR|Portland Internat...|
|        19|        5|     DL|          15016|        11433|      28|      24|     15016|        St. Louis|   MO|Lambert-St. Louis...|
|        19|        5|     DL|          11193|        1

In [32]:
# Inner join against a trimmed airports table that carries only the id and name.
airport_names_df = airports_df.select('airport_id', 'name')
flights_airport_df = flights_df.join(
    airport_names_df,
    on=airports_df.airport_id == flights_df.OriginAirportID,
    how='inner',
)

In [29]:
# Preview the join that keeps only airport_id and name from the airports table.
# NOTE(review): this cell's execution count (29) is lower than the join cell
# above it (32), so the cells were run out of order - re-run top to bottom.
flights_airport_df.show()

+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|airport_id|                name|
+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------------+
|        19|        5|     DL|          11433|        13303|      -3|       1|     11433|Detroit Metro Way...|
|        19|        5|     DL|          14869|        12478|       0|      -8|     14869|Salt Lake City In...|
|        19|        5|     DL|          14057|        14869|      -4|     -15|     14057|Portland Internat...|
|        19|        5|     DL|          15016|        11433|      28|      24|     15016|Lambert-St. Louis...|
|        19|        5|     DL|          11193|        12892|      -6|     -11|     11193|Cincinnati/Northe...|
|        19|        5|     DL|          10397|        15016|      -1|     -19|     10397|Hartsfield-Jackso...|
|

### 8.Drop Duplicates()
* Drop Duplicate Records from existing dataframes

In [30]:
# With no column subset given, rows are treated as duplicates only when every
# column matches; the original flights_df is left untouched.
flights_df1 = flights_df.dropDuplicates()

### Calculate the percentage of data lost after Drop Duplicates

In [31]:
# Each count() is a Spark action that launches its own job, so count every
# DataFrame exactly once and reuse the numbers.
total_rows = flights_df.count()
deduplicated_rows = flights_df1.count()

# Percentage of rows removed by dropDuplicates(); guard against an empty input.
perc = (total_rows - deduplicated_rows) / total_rows * 100 if total_rows else 0.0
perc

0.8249927006440348

### Describe 
* Use the describe() method to show summary statistics for the DataFrame columns (string columns only get count, min and max).
* The summary includes: count, mean, stddev, min, max

In [34]:
# Summary statistics of the de-duplicated data; the lower counts for DepDelay
# and ArrDelay in the output reveal missing values in those two columns.
flights_df1.describe().show()

+-------+------------------+------------------+-------+------------------+------------------+------------------+------------------+
|summary|        DayofMonth|         DayOfWeek|Carrier|   OriginAirportID|     DestAirportID|          DepDelay|          ArrDelay|
+-------+------------------+------------------+-------+------------------+------------------+------------------+------------------+
|  count|           2696983|           2696983|2696983|           2696983|           2696983|           2674774|           2673185|
|   mean|15.798996508320593| 3.900369412784582|   null|12742.459424846207| 12742.85937657004|10.618575625454712|6.7272897311633875|
| stddev| 8.801267199135454|1.9864582421701973|   null|1502.0359941370625|1501.9939589817989|36.198308432512704| 38.75007476808384|
|    min|                 1|                 1|     9E|             10140|             10140|               -63|               -94|
|    max|                31|                 7|     YV|             15376|  

### Summary
* Use the summary() method for a more detailed summary of the columns
* The summary includes: count, mean, stddev, min, max, and the quartiles Q1 (25%), Q2 (median, 50%) and Q3 (75%).

In [35]:
# Same statistics as describe(), plus percentile rows (the output shows a 25% row).
flights_df1.summary().show()

+-------+------------------+------------------+-------+------------------+------------------+------------------+------------------+
|summary|        DayofMonth|         DayOfWeek|Carrier|   OriginAirportID|     DestAirportID|          DepDelay|          ArrDelay|
+-------+------------------+------------------+-------+------------------+------------------+------------------+------------------+
|  count|           2696983|           2696983|2696983|           2696983|           2696983|           2674774|           2673185|
|   mean|15.798996508320593| 3.900369412784582|   null|12742.459424846207| 12742.85937657004|10.618575625454712|6.7272897311633875|
| stddev| 8.801267199135454|1.9864582421701973|   null|1502.0359941370625|1501.9939589817989|36.198308432512704| 38.75007476808384|
|    min|                 1|                 1|     9E|             10140|             10140|               -63|               -94|
|    25%|                 8|                 2|   null|             11292|  

### Select Categorical Columns

In [36]:
from pyspark.sql.types import IntegerType,StringType
from pyspark.sql.functions import *

In [39]:
# Treat every string-typed column as categorical.
string_fields = filter(
    lambda schema_field: isinstance(schema_field.dataType, StringType),
    flights_df1.schema.fields,
)
categorical_cols = [schema_field.name for schema_field in string_fields]

In [38]:
# Display the names of the string-typed columns collected above.
categorical_cols

['Carrier']

### col() , groupBy()
* 'col' function is used to refer to a column in DataFrame
* groupBy() - Method to group rows of DataFrame based on values of one or more columns

In [40]:
#In PySpark the 'col' function is used to refer to a column in a DataFrame
#It is used in column operations
from pyspark.sql.functions import col

In [41]:
# Count the flights that arrived more than 10 minutes late.
arrived_over_10_late = col('ArrDelay') > 10
flights_df1.where(arrived_over_10_late).count()

664460

### Get Frequency Values for each Categorical Column

In [43]:
# One frequency table (value -> number of rows) per categorical column.
for column in categorical_cols:
    frequency_table = flights_df1.groupBy(column).count()
    frequency_table.show()

+-------+------+
|Carrier| count|
+-------+------+
|     UA|286010|
|     AA|288910|
|     EV|157218|
|     B6|121875|
|     DL|381601|
|     OO|159639|
|     F9| 35736|
|     YV| 52740|
|     US|232955|
|     MQ|112113|
|     HA| 17424|
|     AS| 68544|
|     FL| 92674|
|     VX| 34726|
|     WN|575090|
|     9E| 79728|
+-------+------+



### isnull()
* Returns a boolean column that is true wherever the value is missing (null).

In [51]:
# Show the rows in which ArrDelay is missing.
missing_arrival_df = flights_df1.where(col('ArrDelay').isNull())
missing_arrival_df.show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        17|        3|     DL|          14869|        14771|    null|    null|
|        11|        4|     EV|          12266|        13871|    null|    null|
|        19|        5|     EV|          11618|        11433|    null|    null|
|        10|        3|     EV|          13930|        13851|    null|    null|
|         9|        2|     EV|          11292|        14107|    null|    null|
|        18|        4|     AA|          13303|        12892|    null|    null|
|        11|        4|     AA|          13930|        11292|    null|    null|
|        18|        4|     AA|          11298|        14107|    null|    null|
|        16|        2|     AA|          11278|        13930|    null|    null|
|        17|        3|     AA|          13930|      

In [50]:
# Show the number of missing values for one column
arr_delay_is_missing = col('ArrDelay').isNull()
flights_df1.where(arr_delay_is_missing).count()

23798

In [53]:
# isNull() produces a boolean column: true where the value is null
arr_delay_missing_flag = col('ArrDelay').isNull()
flights_df.select(arr_delay_missing_flag).show(n=20)

+------------------+
|(ArrDelay IS NULL)|
+------------------+
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
|             false|
+------------------+
only showing top 20 rows



In [60]:
from pyspark.sql.functions import when,isnull,count
# Count the nulls in every column of flights_df in a single pass.
# when(isnull(name), name) yields a value only on rows where the column is null
# (the second argument is a plain string, which when() treats as a literal), and
# count() tallies exactly those rows.
# The loop variable is named `column_name` so it does not shadow the imported
# `col` function, and the names come from the same DataFrame that is queried.
flights_df.select([
    count(when(isnull(column_name), column_name)).alias(column_name)
    for column_name in flights_df.columns
]).show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|         0|        0|      0|              0|            0|   27444|   29033|
+----------+---------+-------+---------------+-------------+--------+--------+



In [85]:
# Alternative Method - col(),isNull()

In [86]:
# Null count per column, computed with one filter-and-count per column.
d1 = dict.fromkeys(flights_df.columns)
for c in d1:
    rows_with_null = flights_df.filter(col(c).isNull())
    d1[c] = rows_with_null.count()
print(d1)

{'DayofMonth': 0, 'DayOfWeek': 0, 'Carrier': 0, 'OriginAirportID': 0, 'DestAirportID': 0, 'DepDelay': 27444, 'ArrDelay': 29033}


In [87]:
from pyspark.sql.functions import col,sum
# Cast each isNull() flag to 0/1 and add them up to get the null count per column.
# `sum` here is pyspark.sql.functions.sum (imported above), not the Python builtin.
# The column names are taken from flights_df itself so the query and the column
# list can never drift apart.
flights_df.select([
    sum(col(column).isNull().cast('int')).alias(column)
    for column in flights_df.columns
]).show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|         0|        0|      0|              0|            0|   27444|   29033|
+----------+---------+-------+---------------+-------------+--------+--------+



### fillna()
* Used to replace (fill) missing values of columns, e.g. with a measure of central tendency (mean, median, mode)
* Here it will replace missing values by 0s.

In [89]:
# Replacement value per column: missing delays become 0.
zero_fill_values = {'DepDelay': 0, 'ArrDelay': 0}
flights_df2 = flights_df1.fillna(zero_fill_values)

In [90]:
# Verify that no nulls remain after fillna(): every count should be 0.
remaining_null_counts = [
    sum(col(name).isNull().cast('int')).alias(name)
    for name in flights_df2.columns
]
flights_df2.select(remaining_null_counts).show()

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|         0|        0|      0|              0|            0|       0|       0|
+----------+---------+-------+---------------+-------------+--------+--------+



### dropna()
* dropna() method used to remove rows with null values

In [91]:
# Alternative to fillna(): build a copy without the rows that contain nulls.
# flights_df1 itself is not modified, and the cells below keep using flights_df1.
flights_df3 = flights_df1.dropna()

### Statistical Methods
* mean(),median(),stddev(),quartiles

In [94]:
# Mean departure delay, rounded to the nearest whole number.
rounded_mean = round(mean(col('DepDelay')))
flights_df1.select(rounded_mean.alias('DepDelay_Mean')).show()

+-------------+
|DepDelay_Mean|
+-------------+
|         11.0|
+-------------+



In [95]:
# Standard deviation of the departure delay, rounded to the nearest whole number.
rounded_std = round(stddev(col('DepDelay')))
flights_df1.select(rounded_std.alias('DepDelay_Std')).show()

+------------+
|DepDelay_Std|
+------------+
|        36.0|
+------------+



In [96]:
# Variance of the departure delay, rounded to the nearest whole number.
rounded_variance = round(variance(col('DepDelay')))
flights_df1.select(rounded_variance.alias('DepDelay_Variance')).show()

+-----------------+
|DepDelay_Variance|
+-----------------+
|           1310.0|
+-----------------+



In [99]:
# Pull the single aggregated value out of the one-row result as a Python number.
variance_df = flights_df1.select(round(variance(col('DepDelay'))).alias('DepDelay_Variance'))
variance_df.first()[0]

1310.0

### To Calculate Median()  - Use approxQuantile(),expr() 

In [97]:

# Approximate median (0.5 quantile) of ArrDelay; a smaller relativeError is more
# precise but more expensive to compute.
flights_df1.approxQuantile(col='ArrDelay', probabilities=[0.5], relativeError=0.0001)

[-3.0]

### groupBy() and agg()

In [102]:
# Without .show() the cell only echoes the resulting DataFrame's schema
# (see the output below); the next cell displays the actual rows.
flights_df1.groupBy('Carrier').agg(mean('DepDelay'))

DataFrame[Carrier: string, avg(DepDelay): double]

In [103]:
# Average departure delay for each carrier.
flights_by_carrier = flights_df1.groupBy('Carrier')
flights_by_carrier.agg(mean('DepDelay')).show()

+-------+------------------+
|Carrier|     avg(DepDelay)|
+-------+------------------+
|     UA|12.644186783024843|
|     AA|12.154096505870111|
|     EV| 14.52813602113455|
|     B6|12.675216069471794|
|     DL| 7.451940716867138|
|     OO| 7.954327121364983|
|     F9|12.142480802645592|
|     YV| 9.595018289496604|
|     US| 5.011879623272143|
|     MQ|15.612577198673192|
|     HA|1.5358414704192993|
|     AS|0.6606730403765751|
|     FL|10.227206113831024|
|     VX|14.416962353959326|
|     WN|12.930658050935614|
|     9E| 9.767838809034908|
+-------+------------------+



In [111]:
# Several aggregates in one agg() call, each given a readable column name.
mean_arrival_delay = mean('ArrDelay').alias('Mean_ArrDelay')
mean_departure_delay = mean('DepDelay').alias('Mean_DepDelay')
flights_df1.groupBy('Carrier').agg(mean_arrival_delay, mean_departure_delay).show()

+-------+--------------------+------------------+
|Carrier|       Mean_ArrDelay|     Mean_DepDelay|
+-------+--------------------+------------------+
|     UA|   5.207155029152466|12.644186783024843|
|     AA|    7.22786703097812|12.154096505870111|
|     EV|  10.501436641191532| 14.52813602113455|
|     B6|   9.679335778153199|12.675216069471794|
|     DL|  2.8085929091567747| 7.451940716867138|
|     OO|   6.447766785619039| 7.954327121364983|
|     F9|  12.870312237233028|12.142480802645592|
|     YV|   8.749505833107245| 9.595018289496604|
|     US|   3.957719324788726| 5.011879623272143|
|     MQ|   14.27679662028746|15.612577198673192|
|     HA|   1.534325271442523|1.5358414704192993|
|     AS|-0.27272328542814955|0.6606730403765751|
|     FL|   7.277437501357486|10.227206113831024|
|     VX|   9.678802215555043|14.416962353959326|
|     WN|   8.368672739670938|12.930658050935614|
|     9E|   4.931550031523011| 9.767838809034908|
+-------+--------------------+------------------+


### WithColumn
* All column operations - Used to create new column or modify an existing column in a DataFrame.
* withColumn is also used for applying transformation or calculation
* df.withColumn(colName,col)

In [112]:
# Derive a new column as the sum of the arrival and departure delays.
total_delay = col('ArrDelay') + col('DepDelay')
flights_df1.withColumn('TotalDelay', total_delay).show()

+----------+---------+-------+---------------+-------------+--------+--------+----------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|TotalDelay|
+----------+---------+-------+---------------+-------------+--------+--------+----------+
|         6|        1|     WN|          10821|        10140|       1|     -22|       -21|
|         8|        1|     AA|          11298|        10140|       0|       6|         6|
|        15|        1|     WN|          14747|        10140|      -6|       3|        -3|
|        27|        1|     AA|          11298|        10140|     113|     117|       230|
|         7|        2|     OO|          12266|        10140|      -3|     -11|       -14|
|        28|        2|     WN|          14107|        10140|      -3|       0|        -3|
|        30|        2|     OO|          12266|        10140|      -4|     -11|       -15|
|         1|        3|     EV|          12266|        10140|     -11|     -26|       -37|
|         

### Applying Conditional Logic

In [113]:
flights_df1.withColumn('IsDelay',when(col("DepDelay")>=0,'Delay').otherwise('No Delay')).show()

+----------+---------+-------+---------------+-------------+--------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay| IsDelay|
+----------+---------+-------+---------------+-------------+--------+--------+--------+
|         6|        1|     WN|          10821|        10140|       1|     -22|   Delay|
|         8|        1|     AA|          11298|        10140|       0|       6|   Delay|
|        15|        1|     WN|          14747|        10140|      -6|       3|No Delay|
|        27|        1|     AA|          11298|        10140|     113|     117|   Delay|
|         7|        2|     OO|          12266|        10140|      -3|     -11|No Delay|
|        28|        2|     WN|          14107|        10140|      -3|       0|No Delay|
|        30|        2|     OO|          12266|        10140|      -4|     -11|No Delay|
|         1|        3|     EV|          12266|        10140|     -11|     -26|No Delay|
|         3|        3|     OO|  

### expr()


In [121]:
from pyspark.sql.functions import expr
# Approximate median of a column through Spark SQL's percentile_approx at the
# 0.5 percentile. The column name is a variable so the f-string interpolates
# something meaningful and the expression can be reused for other columns.
median_column = 'ArrDelay'
median_expr = expr(f"percentile_approx({median_column},0.5)")

# agg() returns a one-row DataFrame; collect()[0][0] extracts the scalar value.
flights_df1.agg(median_expr.alias("Median")).collect()[0][0]

-3