# Create Schema

In [2]:
from pyspark.sql.types import *

#create session
appName = "data preprocessing in Spark"
spark = SparkSession \
    .builder \
    .appName(appName) \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

flightSchema = StructType([
  StructField("DayofMonth", IntegerType(), False),
  StructField("DayOfWeek", IntegerType(), False),
  StructField("Carrier", StringType(), False),
  StructField("OriginAirportID", IntegerType(), False),
  StructField("DestAirportID", IntegerType(), False),
  StructField("DepDelay", IntegerType(), False),
  StructField("ArrDelay", IntegerType(), False),
])

flights = spark.read.csv('dataset/raw-flight-data.csv', 
                         schema=flightSchema, header=True, sep=',')
flights.show(2)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 2 rows



In [3]:

airportSchema = StructType([
  StructField("airport_id", IntegerType(), False),
  StructField("city", StringType(), False),
  StructField("state", StringType(), False),
  StructField("name", StringType(), False),
])

airports = spark.read.csv('dataset/airports.csv', header=True, 
                          schema=airportSchema)
airports.show(2)

+----------+-----------+-----+--------------------+
|airport_id|       city|state|                name|
+----------+-----------+-----+--------------------+
|     10165|Adak Island|   AK|                Adak|
|     10299|  Anchorage|   AK|Ted Stevens Ancho...|
+----------+-----------+-----+--------------------+
only showing top 2 rows



# JOIN

In [5]:
flightsByOrigin = flights.join(airports,
                               flights.OriginAirportID == 
                               airports.airport_id).groupBy("city").count()
flightsByOrigin.show(3)

+--------------+-----+
|          city|count|
+--------------+-----+
|       Phoenix|90281|
|         Omaha|13537|
|Raleigh/Durham|28436|
+--------------+-----+
only showing top 3 rows



#### Handle duplicate data

In [7]:
#count the number of original data rows
n1 = flights.count()
print("number of original data rows: ", n1)
#count the number of data rows after deleting duplicated data
n2 = flights.dropDuplicates().count()
print("number of data rows after deleting duplicated data: ", n2)
n3 = n1 - n2
print("number of duplicated data: ", n3)

number of original data rows:  2719418
number of data rows after deleting duplicated data:  2696983
number of duplicated data:  22435


In [11]:
# Create Dataframe and delete duplicates
df = spark.createDataFrame([("Rony",27, 168), 
                            ("Rony",15, 165), 
                            ("Rony",27, 168)], 
                           ["name","age","height"])
df.show()
df.dropDuplicates().show()

+----+---+------+
|name|age|height|
+----+---+------+
|Rony| 27|   168|
|Rony| 15|   165|
|Rony| 27|   168|
+----+---+------+

+----+---+------+
|name|age|height|
+----+---+------+
|Rony| 27|   168|
|Rony| 15|   165|
+----+---+------+



In [15]:
#by name and age duplicate filter
df.dropDuplicates(['name','age']).show()


+----+---+------+
|name|age|height|
+----+---+------+
|Rony| 15|   165|
|Rony| 27|   168|
+----+---+------+



##Handle missing data
Delete row if there is at least one (column) missing data


In [17]:
flightsNoMissingValue = flights.dropDuplicates().dropna(
    how="any", subset=["ArrDelay", "DepDelay"])# use how="all" for all column missing data
numberOfMissingValueAny = n1 - flightsNoMissingValue.count()
print("number of missing value rows: ", numberOfMissingValueAny)

number of missing value rows:  46233


In [44]:
flightsNoMissingValue.show(5)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|         6|        1|     WN|          10821|        10140|       1|     -22|
|         8|        1|     AA|          11298|        10140|       0|       6|
|        15|        1|     WN|          14747|        10140|      -6|       3|
|        27|        1|     AA|          11298|        10140|     113|     117|
|         7|        2|     OO|          12266|        10140|      -3|     -11|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 5 rows



In [46]:
df = spark.createDataFrame([("Rony",27, 168), 
                            ("Rony",15, 165), 
                            ("Rony",None, 168)], 
                           ["name","age","height"])
df.dropDuplicates().show()
df.dropna().show()
df.dropna( how="all",  subset=["age"]).show()

+----+----+------+
|name| age|height|
+----+----+------+
|Rony|null|   168|
|Rony|  27|   168|
|Rony|  15|   165|
+----+----+------+

+----+---+------+
|name|age|height|
+----+---+------+
|Rony| 27|   168|
|Rony| 15|   165|
+----+---+------+

+----+---+------+
|name|age|height|
+----+---+------+
|Rony| 27|   168|
|Rony| 15|   165|
+----+---+------+



In [None]:
#Fill the missing data using mean value of each corresponding column data


In [45]:
#take mean value
meanArrDelay = flights.groupBy().avg("ArrDelay").take(1)[0][0]
print("mean ArrDelay: ", meanArrDelay)
meanDepDelay = flights.groupBy().avg("DepDelay").take(1)[0][0]
print("mean DepDelay: ", meanDepDelay)
#drop duplicated data and fill missing data with mean value
flightsCleanData=flights.fillna(
    {'ArrDelay': meanArrDelay, 'DepDelay': meanDepDelay})
#just for experiment
flights.groupBy().avg("ArrDelay").show()

mean ArrDelay:  6.63768791455498
mean DepDelay:  10.53686662649788
+----------------+
|   avg(ArrDelay)|
+----------------+
|6.63768791455498|
+----------------+



In [56]:
df = spark.createDataFrame([("Rony",27, 168), 
                            ("Rony",15, 165), 
                            ("toni",None, 168),
                           ("toni",70, 168)], 
                           ["name","age","height"])
df.show()

+----+----+------+
|name| age|height|
+----+----+------+
|Rony|  27|   168|
|Rony|  15|   165|
|toni|null|   168|
|toni|  70|   168|
+----+----+------+



In [80]:
flights.groupBy().avg("ArrDelay").show()

meanArrDelay = df.groupBy().avg("age").take(1)
print("mean ArrDelay: ", meanArrDelay)

meanArrDelay = df.groupBy().avg("age").take(2)[0][0]
print("mean ArrDelay: ", meanArrDelay)


+----------------+
|   avg(ArrDelay)|
+----------------+
|6.63768791455498|
+----------------+

mean ArrDelay:  [Row(avg(age)=37.333333333333336)]
mean ArrDelay:  37.333333333333336


In [81]:
df = spark.createDataFrame([("Rony",27, 168), 
                            ("Rony",15, 165), 
                            ("toni",None, 168),
                           ("toni",70, 168)], 
                           ["name","age","height"])
df.show()

+----+----+------+
|name| age|height|
+----+----+------+
|Rony|  27|   168|
|Rony|  15|   165|
|toni|null|   168|
|toni|  70|   168|
+----+----+------+



In [83]:
meanAge = df.groupBy().avg("age").take(1)[0][0]
print("mean Age: ", meanAge)


mean Age:  37.333333333333336


In [87]:
RonyFillAge=df.fillna(
    {'Age': meanAge})

RonyFillAge.show()
RonyFillAge.printSchema

+----+---+------+
|name|age|height|
+----+---+------+
|Rony| 27|   168|
|Rony| 15|   165|
|toni| 37|   168|
|toni| 70|   168|
+----+---+------+



<bound method DataFrame.printSchema of DataFrame[name: string, age: bigint, height: bigint]>

# Explore the statistic of our data¶


In [89]:
flightsCleanData.describe('DepDelay','ArrDelay').show()

+-------+------------------+-----------------+
|summary|          DepDelay|         ArrDelay|
+-------+------------------+-----------------+
|  count|           2719418|          2719418|
|   mean|10.531448640848888|6.630879842672218|
| stddev| 35.91695039008075|38.44200618946938|
|    min|               -63|              -94|
|    max|              1863|             1845|
+-------+------------------+-----------------+



In [None]:
We can also calculate the correlation between two variables to know whether the varible is related each other or not



In [90]:
correlation = flightsCleanData.corr('DepDelay', 'ArrDelay')
print("correlation between departure delay and arrival delay: ", 
      correlation)

correlation between departure delay and arrival delay:  0.9393538215572761
