# Initial setup

In [2]:
import findspark

findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col,isnan, when, count
import pyspark.sql.functions as F

from configparser import ConfigParser

config = ConfigParser()
# create your own config.ini in root of project folder to store project configurations
config.read('config.ini')

pathfile = config.get('main', 'dirty_csv')  

spark = SparkSession.builder \
    .config("spark.driver.memory", "15g") \
    .appName("SparkFlight").getOrCreate()



In [2]:
# data = spark.read.csv(pathfile, inferSchema='true', header='true', mode='PERMISSIVE', encoding='ISO-8859-1').limit(10**6).cache()
data = spark.read.csv(pathfile, inferSchema='true', header='true', mode='PERMISSIVE', encoding='ISO-8859-1')

### Analysis cells

In [67]:
data.show()

+-----------------+-------+--------+-------+----------+----------+--------------+----------------+---------+------------+---------+----------+--------+-------+----+--------+--------+---------+-----------------+-----+--------+------+-------------+-------+------+-------+-------------+------------+----+
|ActualElapsedTime|AirTime|ArrDelay|ArrTime|CRSArrTime|CRSDepTime|CRSElapsedTime|CancellationCode|Cancelled|CarrierDelay|DayOfWeek|DayofMonth|DepDelay|DepTime|Dest|Distance|Diverted|FlightNum|LateAircraftDelay|Month|NASDelay|Origin|SecurityDelay|TailNum|TaxiIn|TaxiOut|UniqueCarrier|WeatherDelay|Year|
+-----------------+-------+--------+-------+----------+----------+--------------+----------------+---------+------------+---------+----------+--------+-------+----+--------+--------+---------+-----------------+-----+--------+------+-------------+-------+------+-------+-------------+------------+----+
|               53|     32|      -8|   1642|      1650|      1545|            65|             

In [133]:
# check if read.csv(inferSchema='true') inferred the column types correctly
data.dtypes

[('ActualElapsedTime', 'string'),
 ('AirTime', 'string'),
 ('ArrDelay', 'string'),
 ('ArrTime', 'string'),
 ('CRSArrTime', 'int'),
 ('CRSDepTime', 'int'),
 ('CRSElapsedTime', 'string'),
 ('CancellationCode', 'string'),
 ('Cancelled', 'int'),
 ('CarrierDelay', 'string'),
 ('DayOfWeek', 'int'),
 ('DayofMonth', 'int'),
 ('DepDelay', 'string'),
 ('DepTime', 'string'),
 ('Dest', 'string'),
 ('Distance', 'string'),
 ('Diverted', 'int'),
 ('FlightNum', 'int'),
 ('LateAircraftDelay', 'string'),
 ('Month', 'int'),
 ('NASDelay', 'string'),
 ('Origin', 'string'),
 ('SecurityDelay', 'string'),
 ('TailNum', 'string'),
 ('TaxiIn', 'string'),
 ('TaxiOut', 'string'),
 ('UniqueCarrier', 'string'),
 ('WeatherDelay', 'string'),
 ('Year', 'int')]

In [37]:
data.count()

1000000

### {preprocess step} Filter out NA and Null values in all columns saved in var colnames

In [3]:
# filter out rows that have NA values in following columns that are assumed to be important for potential FDs:
colnames = ["ActualElapsedTime", "AirTime", "ArrDelay", "ArrTime", "CRSArrTime", "CRSDepTime", "CRSElapsedTime", "DayOfWeek", "DayofMonth", "DepDelay", "DepTime", "Dest", "Distance", "FlightNum", "Month", "Origin", "TailNum", "TaxiIn", "TaxiOut", "UniqueCarrier", "Year"]
# Specific delay columns like CarrierDelay and WeatherDelay with NA or Nul values need to have these values changed to 0 instead of filtered out.
# Tailnum still has some duplicated that do not have the same right UniqueCarrier
# also has some UNKNOW columns for TailNum. We keep these for soft dependency

# pyspark does not recognize csv "NA" strings as NaN values.
# This is reflected in the output of data.dtypes: columns that normally only have numeric values were not inferred as int type.
# Also remove these rows by filtering them out.
data_filt_nan = data.filter(data.ActualElapsedTime.isNotNull() & 
                            (data.ActualElapsedTime != "NA") &
                            data.AirTime.isNotNull() &
                            (data.AirTime != "NA") &
                            data.ArrDelay.isNotNull() &
                            (data.ArrDelay != "NA") &
                            data.ArrTime.isNotNull() &
                            (data.ArrTime != "NA") &              
                            data.CRSArrTime.isNotNull() & 
                            data.CRSDepTime.isNotNull() &
                            data.CRSElapsedTime.isNotNull() &
                            (data.CRSElapsedTime != "NA") &
                            data.DayOfWeek.isNotNull() &
                            data.DayofMonth.isNotNull() &
                            data.DepDelay.isNotNull() &
                            (data.DepDelay != "NA") &
                            data.DepTime.isNotNull() &
                            (data.DepTime != "NA") &
                            data.Dest.isNotNull() &
                            (data.Dest != "NA") &
                            data.Distance.isNotNull() &
                            (data.Distance != "NA") &
                            data.FlightNum.isNotNull() &
                            data.Month.isNotNull() &
                            data.Origin.isNotNull() &
                            (data.Origin != "NA") &
                            data.TailNum.isNotNull() &
                            (data.TailNum != "NA") &
                            data.TaxiIn.isNotNull() &
                            (data.TaxiIn != "NA") &
                            data.TaxiOut.isNotNull() &
                            (data.TaxiOut != "NA") &
                            data.UniqueCarrier.isNotNull() &
                            (data.UniqueCarrier != "NA") &
                            data.Year.isNotNull())

### Analysis cells

In [72]:
# this suggests the correct amount of rows are removed
data_filt_nan.count()

680914

In [70]:
# in the original dataframe there are still columns with NaN value that should be filtered out
data.select(*colnames).show()

+-----------------+-------+--------+-------+----------+----------+--------------+---------+----------+--------+-------+----+--------+---------+-----+------+-------+------+-------+-------------+----+
|ActualElapsedTime|AirTime|ArrDelay|ArrTime|CRSArrTime|CRSDepTime|CRSElapsedTime|DayOfWeek|DayofMonth|DepDelay|DepTime|Dest|Distance|FlightNum|Month|Origin|TailNum|TaxiIn|TaxiOut|UniqueCarrier|Year|
+-----------------+-------+--------+-------+----------+----------+--------------+---------+----------+--------+-------+----+--------+---------+-----+------+-------+------+-------+-------------+----+
|               53|     32|      -8|   1642|      1650|      1545|            65|        4|        10|       4|   1549| PIT|     205|      209|   10|   DCA| N443US|     7|     14|           US|2002|
|              164|    155|     -11|   1754|      1805|      1610|           175|        4|         2|       0|   1610| MCI|    1072|      109|   12|   MCO|   N755|     2|      7|           WN|1999|
|    

In [71]:
# NaN values in colnames should be filtered out
# TailNum still shows weird ascci signs sometimes.
data_filt_nan.select(*colnames).show(50)

+-----------------+-------+--------+-------+----------+----------+--------------+---------+----------+--------+-------+----+--------+---------+-----+------+-------+------+-------+-------------+----+
|ActualElapsedTime|AirTime|ArrDelay|ArrTime|CRSArrTime|CRSDepTime|CRSElapsedTime|DayOfWeek|DayofMonth|DepDelay|DepTime|Dest|Distance|FlightNum|Month|Origin|TailNum|TaxiIn|TaxiOut|UniqueCarrier|Year|
+-----------------+-------+--------+-------+----------+----------+--------------+---------+----------+--------+-------+----+--------+---------+-----+------+-------+------+-------+-------------+----+
|               53|     32|      -8|   1642|      1650|      1545|            65|        4|        10|       4|   1549| PIT|     205|      209|   10|   DCA| N443US|     7|     14|           US|2002|
|              164|    155|     -11|   1754|      1805|      1610|           175|        4|         2|       0|   1610| MCI|    1072|      109|   12|   MCO|   N755|     2|      7|           WN|1999|
|    

### {preprocess step} Take +- 15 mil rows now before any further transformations

In [4]:
data_limit = data_filt_nan.limit(15_000_000)

### Analysis cells

In [73]:
# data.printSchema()
# data.schema.names

['ActualElapsedTime',
 'AirTime',
 'ArrDelay',
 'ArrTime',
 'CRSArrTime',
 'CRSDepTime',
 'CRSElapsedTime',
 'CancellationCode',
 'Cancelled',
 'CarrierDelay',
 'DayOfWeek',
 'DayofMonth',
 'DepDelay',
 'DepTime',
 'Dest',
 'Distance',
 'Diverted',
 'FlightNum',
 'LateAircraftDelay',
 'Month',
 'NASDelay',
 'Origin',
 'SecurityDelay',
 'TailNum',
 'TaxiIn',
 'TaxiOut',
 'UniqueCarrier',
 'WeatherDelay',
 'Year']

In [53]:
# data_filt_nan

123395195

### {preprocess step} Transform certain existing columns that store minutes as string to timestamp type

In [5]:
# transform existing date columns to be able to use them for to_timestamp transformation
# this can then be used to calculate with timestamps for total time difference between two attributes (delta dependencies)
# example: transform 1 minute to hhm:mm -> 00:01
date_cols = ["Year", "Month", "DayofMonth", "CRSDepTime"]
data_timestamp = (data_limit.withColumn('Month', F.when(F.length(F.col('Month')) == 1, F.concat(F.lit('0'), F.col('Month'))).otherwise(F.col('Month')))
                    .withColumn('DayofMonth', F.when(F.length(F.col('DayofMonth')) == 1, F.concat(F.lit('0'), F.col('DayofMonth'))).otherwise(F.col('DayofMonth')))
                    .withColumn('CRSDepTime', F.when(F.length(F.col('CRSDepTime')) == 1, F.concat(F.lit('000'), F.col('CRSDepTime')))
                                        .when(F.length(F.col('CRSDepTime')) == 2, F.concat(F.lit('00'), F.col('CRSDepTime')))
                                        .when(F.length(F.col('CRSDepTime')) == 3, F.concat(F.lit('0'), F.col('CRSDepTime')))
                                        .otherwise(F.col('CRSDepTime')))
                    .withColumn('CRSDepTimeStamp', F.to_timestamp(F.concat(*date_cols), format='yyyyMMddHHmm'))
                 )

### Analysis cells

In [76]:
data_timestamp.show()

+-----------------+-------+--------+-------+----------+----------+--------------+----------------+---------+------------+---------+----------+--------+-------+----+--------+--------+---------+-----------------+-----+--------+------+-------------+-------+------+-------+-------------+------------+----+-------------------+
|ActualElapsedTime|AirTime|ArrDelay|ArrTime|CRSArrTime|CRSDepTime|CRSElapsedTime|CancellationCode|Cancelled|CarrierDelay|DayOfWeek|DayofMonth|DepDelay|DepTime|Dest|Distance|Diverted|FlightNum|LateAircraftDelay|Month|NASDelay|Origin|SecurityDelay|TailNum|TaxiIn|TaxiOut|UniqueCarrier|WeatherDelay|Year|    CRSDepTimeStamp|
+-----------------+-------+--------+-------+----------+----------+--------------+----------------+---------+------------+---------+----------+--------+-------+----+--------+--------+---------+-----------------+-----+--------+------+-------------+-------+------+-------+-------------+------------+----+-------------------+
|               53|     32|      -

### {preprocess step} Transform DepTime with minutes only to full Timestamp with a date as new column using DepDelay

In [6]:
# Deptimestamp calculating by using CRSDepTimeStamp and DepDelay because actual DepTime and CRSDepTime can differ by at most one full day = 1440 mins
data_deptime = data_timestamp.withColumn("DepTimestamp", (col("CRSDepTimeStamp").cast("long") + (col("DepDelay").cast("long"))*60).cast("timestamp"))



### Analysis cells

In [84]:
data_deptime.show(500)

+-----------------+-------+--------+-------+----------+----------+--------------+----------------+---------+------------+---------+----------+--------+-------+----+--------+--------+---------+-----------------+-----+--------+------+-------------+-------+------+-------+-------------+------------+----+-------------------+-------------------+
|ActualElapsedTime|AirTime|ArrDelay|ArrTime|CRSArrTime|CRSDepTime|CRSElapsedTime|CancellationCode|Cancelled|CarrierDelay|DayOfWeek|DayofMonth|DepDelay|DepTime|Dest|Distance|Diverted|FlightNum|LateAircraftDelay|Month|NASDelay|Origin|SecurityDelay|TailNum|TaxiIn|TaxiOut|UniqueCarrier|WeatherDelay|Year|    CRSDepTimeStamp|       DepTimestamp|
+-----------------+-------+--------+-------+----------+----------+--------------+----------------+---------+------------+---------+----------+--------+-------+----+--------+--------+---------+-----------------+-----+--------+------+-------------+-------+------+-------+-------------+------------+----+---------------

In [82]:
data_deptime.filter(data_deptime.DepDelay > 1300).show()

+-----------------+-------+--------+-------+----------+----------+--------------+----------------+---------+------------+---------+----------+--------+-------------------+----+--------+--------+---------+-----------------+-----+--------+------+-------------+-------+------+-------+-------------+------------+----+-------------------+
|ActualElapsedTime|AirTime|ArrDelay|ArrTime|CRSArrTime|CRSDepTime|CRSElapsedTime|CancellationCode|Cancelled|CarrierDelay|DayOfWeek|DayofMonth|DepDelay|            DepTime|Dest|Distance|Diverted|FlightNum|LateAircraftDelay|Month|NASDelay|Origin|SecurityDelay|TailNum|TaxiIn|TaxiOut|UniqueCarrier|WeatherDelay|Year|    CRSDepTimeStamp|
+-----------------+-------+--------+-------+----------+----------+--------------+----------------+---------+------------+---------+----------+--------+-------------------+----+--------+--------+---------+-----------------+-----+--------+------+-------------+-------+------+-------+-------------+------------+----+-------------------

### {preprocess step} Transform ArrTime with minutes only to full Timestamp with a date as new column by adding ActualElapsedTime to DepTimestamp 

In [7]:
data_arrtime = data_deptime.withColumn("ArrTimestamp", (col("DepTimestamp").cast("long") + (col("ActualElapsedTime").cast("long"))*60).cast("timestamp"))

### Analysis cells

In [87]:
data_arrtime.show(20)

+-----------------+-------+--------+-------+----------+----------+--------------+----------------+---------+------------+---------+----------+--------+-------+----+--------+--------+---------+-----------------+-----+--------+------+-------------+-------+------+-------+-------------+------------+----+-------------------+-------------------+-------------------+
|ActualElapsedTime|AirTime|ArrDelay|ArrTime|CRSArrTime|CRSDepTime|CRSElapsedTime|CancellationCode|Cancelled|CarrierDelay|DayOfWeek|DayofMonth|DepDelay|DepTime|Dest|Distance|Diverted|FlightNum|LateAircraftDelay|Month|NASDelay|Origin|SecurityDelay|TailNum|TaxiIn|TaxiOut|UniqueCarrier|WeatherDelay|Year|    CRSDepTimeStamp|       DepTimestamp|       ArrTimestamp|
+-----------------+-------+--------+-------+----------+----------+--------------+----------------+---------+------------+---------+----------+--------+-------+----+--------+--------+---------+-----------------+-----+--------+------+-------------+-------+------+-------+-------

### {preprocess step} Transform CRSArrTime with minutes only to full Timestamp with a date as new column by adding CRSElapsedTime to CRSDepTime 

In [8]:
# CRSArrTimestamp calculating by using CRSDepTimestamp and CRSElapsedTime
data_crs_arrtime = data_arrtime.withColumn("CRSArrTimestamp", (col("CRSDepTimestamp").cast("long") + (col("CRSElapsedTime").cast("long"))*60).cast("timestamp"))

### Analysis cells

In [89]:
data_crs_arrtime.show(20)

+-----------------+-------+--------+-------+----------+----------+--------------+----------------+---------+------------+---------+----------+--------+-------+----+--------+--------+---------+-----------------+-----+--------+------+-------------+-------+------+-------+-------------+------------+----+-------------------+-------------------+-------------------+-------------------+
|ActualElapsedTime|AirTime|ArrDelay|ArrTime|CRSArrTime|CRSDepTime|CRSElapsedTime|CancellationCode|Cancelled|CarrierDelay|DayOfWeek|DayofMonth|DepDelay|DepTime|Dest|Distance|Diverted|FlightNum|LateAircraftDelay|Month|NASDelay|Origin|SecurityDelay|TailNum|TaxiIn|TaxiOut|UniqueCarrier|WeatherDelay|Year|    CRSDepTimeStamp|       DepTimestamp|       ArrTimestamp|    CRSArrTimestamp|
+-----------------+-------+--------+-------+----------+----------+--------------+----------------+---------+------------+---------+----------+--------+-------+----+--------+--------+---------+-----------------+-----+--------+------+----

In [95]:
#  check if there are duplicates in DepTimestamp for delta dependency
data_crs_arrtime.groupBy("DepTimestamp").count().orderBy(col("count").desc()).show(500)

+-------------------+-----+
|       DepTimestamp|count|
+-------------------+-----+
|1999-06-21 23:57:00|    7|
|1999-03-02 23:59:00|    7|
|1995-11-08 00:00:00|    7|
|1999-08-31 23:59:00|    7|
|1999-03-01 23:57:00|    6|
|1999-03-15 00:00:00|    6|
|1999-03-07 23:58:00|    6|
|1997-12-16 23:59:00|    6|
|1995-11-12 23:59:00|    6|
|1999-03-09 23:59:00|    6|
|1995-12-02 23:59:00|    6|
|1996-01-06 00:00:00|    6|
|1995-03-08 07:00:00|    6|
|1999-10-06 23:58:00|    6|
|1999-03-02 23:57:00|    6|
|1999-03-01 23:59:00|    6|
|1995-12-02 00:00:00|    6|
|1999-11-16 23:59:00|    6|
|1997-11-30 23:59:00|    6|
|1999-03-01 00:00:00|    6|
|1998-01-26 00:00:00|    6|
|1999-11-17 00:00:00|    6|
|1999-11-05 23:59:00|    6|
|2004-04-08 09:30:00|    6|
|1999-11-05 23:58:00|    6|
|1995-11-16 00:00:00|    5|
|1999-02-28 23:57:00|    5|
|1999-10-01 23:58:00|    5|
|1998-01-24 00:00:00|    5|
|1999-02-28 23:56:00|    5|
|1996-12-07 00:00:00|    5|
|1995-08-11 00:00:00|    5|
|1998-01-29 23:58:00

In [96]:
#  check if there are duplicates in ArrTimestamp for delta dependency
data_crs_arrtime.groupBy("ArrTimestamp").count().orderBy(col("count").desc()).show(500)

+-------------------+-----+
|       ArrTimestamp|count|
+-------------------+-----+
|1999-09-27 17:30:00|    5|
|2006-10-04 16:30:00|    5|
|2004-09-18 19:43:00|    4|
|2001-08-08 11:24:00|    4|
|2007-02-06 09:20:00|    4|
|2004-07-03 16:54:00|    4|
|2004-02-05 18:51:00|    4|
|2005-01-05 15:48:00|    4|
|2001-05-08 15:49:00|    4|
|2006-07-19 17:40:00|    4|
|2004-04-29 13:06:00|    4|
|1997-07-27 11:34:00|    4|
|2001-09-02 09:46:00|    4|
|1999-10-31 15:08:00|    4|
|2006-03-26 15:50:00|    4|
|2000-12-05 23:20:00|    4|
|2006-03-10 18:04:00|    4|
|1998-06-25 21:32:00|    4|
|2004-04-22 11:45:00|    4|
|1997-06-14 19:22:00|    4|
|2008-09-01 14:23:00|    4|
|2001-01-18 20:12:00|    4|
|1997-11-17 13:02:00|    4|
|2008-03-02 13:56:00|    4|
|2004-01-14 12:50:00|    4|
|2006-06-04 16:22:00|    4|
|2007-09-03 18:21:00|    4|
|2005-08-28 10:35:00|    4|
|2003-07-15 18:39:00|    4|
|1997-11-26 19:29:00|    4|
|2007-03-27 12:01:00|    4|
|1996-11-17 21:42:00|    4|
|1997-10-10 18:10:00

### {preprocess step} Filter out rows that have have cancelled == 1 or Diverted == 1 as they leave out very valuable info
### This is done before removing these columns to filter out low information rows first

In [9]:
# Rows with cancelled == 1 or Diverted == 1 were probably already filtered because of filtered out NaN values for the important time columns
# Just to be sure
data_filt_div_canc = data_crs_arrtime.filter((data_crs_arrtime.Cancelled == 0) | (data_crs_arrtime.Diverted == 0))

### Analysis cells

In [123]:
# check if there are no rows left with cancelled == 1
data_filt_div_canc.orderBy(col("Cancelled").desc()).show(20)

+-----------------+-------+--------+-------+----------+----------+--------------+----------------+---------+------------+---------+----------+--------+-------+----+--------+--------+---------+-----------------+-----+--------+------+-------------+-------+------+-------+-------------+------------+----+-------------------+-------------------+-------------------+-------------------+
|ActualElapsedTime|AirTime|ArrDelay|ArrTime|CRSArrTime|CRSDepTime|CRSElapsedTime|CancellationCode|Cancelled|CarrierDelay|DayOfWeek|DayofMonth|DepDelay|DepTime|Dest|Distance|Diverted|FlightNum|LateAircraftDelay|Month|NASDelay|Origin|SecurityDelay|TailNum|TaxiIn|TaxiOut|UniqueCarrier|WeatherDelay|Year|    CRSDepTimeStamp|       DepTimestamp|       ArrTimestamp|    CRSArrTimestamp|
+-----------------+-------+--------+-------+----------+----------+--------------+----------------+---------+------------+---------+----------+--------+-------+----+--------+--------+---------+-----------------+-----+--------+------+----

### {preprocess step} Drop columns that have already been transformed to other columns or columns that seem less useful
- Original dataset has 28 columns which we want to try to reduce due to computational complexity
- The date columns like Year and Month have already been transformed to CRSDepTimeStamp
- Flightnumber only has a few thousand unique numbers over a million records and seems to have no correlation with others
- CancellationCode, Cancelled and Diverted seem to be very messy and occur only very rarely
    - Cancelled and Diverted only have two possible values: 1 and 0 and 1 only occurs very rarely
    - CancellationCode is also not always set when a flight is cancelled
- After this step number of cols is reduced from 28 -> 21

In [10]:
columns_to_drop = ["CancellationCode", "Cancelled", "Diverted", "Year", "Month", "DayOfWeek", "DayofMonth", "ArrTime", "CRSArrTime", "CRSDepTime", "DepTime", "FlightNum"]
# drop as many (mostly) useless columns as we can
data_drop_cols = data_filt_div_canc.drop(*columns_to_drop)

### Analysis cells

In [135]:
# data_drop_cols.show(20)
len(data_drop_cols.columns)

21

In [136]:
data_drop_cols.columns

['ActualElapsedTime',
 'AirTime',
 'ArrDelay',
 'CRSElapsedTime',
 'CarrierDelay',
 'DepDelay',
 'Dest',
 'Distance',
 'LateAircraftDelay',
 'NASDelay',
 'Origin',
 'SecurityDelay',
 'TailNum',
 'TaxiIn',
 'TaxiOut',
 'UniqueCarrier',
 'WeatherDelay',
 'CRSDepTimeStamp',
 'DepTimestamp',
 'ArrTimestamp',
 'CRSArrTimestamp']

### {preprocess step} Combine several specific delay columns to one column to reduce amount of columns
### Also cast columns that store minutes to int (turned out to be bigint)


In [11]:
# Combine LateAircraftDelay and CarrierDelay into one column.
# Combine NASDelay and WeatherDelay into one column
# Cast columns with string type to bigint (will get transformed later)
def transform_and_cast_cols(x):
    ActualElapsedTime= int(x.ActualElapsedTime)
    AirTime= int(x.AirTime)
    ArrDelay= int(x.ArrDelay)
    CRSElapsedTime= int(x.CRSElapsedTime)
    DepDelay= int(x.DepDelay)
    Dest= x.Dest
    Distance= int(x.Distance)
    LateAircraftDelay = x.LateAircraftDelay
    CarrierDelay = x.CarrierDelay
    
    if LateAircraftDelay == "NA":
        LateAircraftDelay = 0
    if CarrierDelay == "NA":
        CarrierDelay = 0 
    
    LateAircraftAndCarrierDelay = int(LateAircraftDelay) + int(CarrierDelay)
    
    NASDelay = x.NASDelay
    WeatherDelay = x.WeatherDelay
    if NASDelay == "NA":
        NASDelay = 0
    if WeatherDelay == "NA":
        WeatherDelay = 0 
    
    NASAndWeatherDelay= int(NASDelay) + int(WeatherDelay)
    Origin=x.Origin
    SecurityDelay = x.SecurityDelay
    if SecurityDelay == "NA":
        SecurityDelay = 0
        
    SecurityDelay= int(SecurityDelay)
    TailNum=x.TailNum
    TaxiIn=int(x.TaxiIn)
    TaxiOut=int(x.TaxiOut)
    UniqueCarrier=x.UniqueCarrier
    CRSDepTimeStamp=x.CRSDepTimeStamp
    DepTimestamp=x.DepTimestamp
    ArrTimestamp=x.ArrTimestamp
    CRSArrTimestamp=x.CRSArrTimestamp
    return (ActualElapsedTime,
            AirTime,
            ArrDelay,
            CRSElapsedTime,
            DepDelay,
            Dest,
            Distance,
            LateAircraftAndCarrierDelay,
            NASAndWeatherDelay,
            Origin,
            SecurityDelay,
            TailNum,
            TaxiIn,
            TaxiOut,
            UniqueCarrier,
            CRSDepTimeStamp,
            DepTimestamp,
            ArrTimestamp,
            CRSArrTimestamp)


In [12]:
# first transform dataframe to rdd as only rdd has map transform function
rdd_trans_n_cast_cols = data_drop_cols.rdd.map(lambda x: transform_and_cast_cols(x))  

In [13]:
# transform rdd back to dataframe
# have to specify columns again because rdd has to tabular structure
data_trans_n_cast_cols = rdd_trans_n_cast_cols.toDF(['ActualElapsedTime',
                                 'AirTime',
                                 'ArrDelay',
                                 'CRSElapsedTime',
                                 'DepDelay',
                                 'Dest',
                                 'Distance',
                                 'LateAircraftAndCarrierDelay',
                                 'NASAndWeatherDelay',
                                 'Origin',
                                 'SecurityDelay',
                                 'TailNum',
                                 'TaxiIn',
                                 'TaxiOut',
                                 'UniqueCarrier',
                                 'CRSDepTimeStamp',
                                 'DepTimestamp',
                                 'ArrTimestamp',
                                 'CRSArrTimestamp'])

### Analysis cells

In [150]:
data_trans_n_cast_cols.show(20)

+-----------------+-------+--------+--------------+--------+----+--------+---------------------------+------------------+------+-------------+-------+------+-------+-------------+-------------------+-------------------+-------------------+-------------------+
|ActualElapsedTime|AirTime|ArrDelay|CRSElapsedTime|DepDelay|Dest|Distance|LateAircraftAndCarrierDelay|NASAndWeatherDelay|Origin|SecurityDelay|TailNum|TaxiIn|TaxiOut|UniqueCarrier|    CRSDepTimeStamp|       DepTimestamp|       ArrTimestamp|    CRSArrTimestamp|
+-----------------+-------+--------+--------------+--------+----+--------+---------------------------+------------------+------+-------------+-------+------+-------+-------------+-------------------+-------------------+-------------------+-------------------+
|               53|     32|      -8|            65|       4| PIT|     205|                          0|                 0|   DCA|            0| N443US|     7|     14|           US|2002-10-10 15:45:00|2002-10-10 15:49:00|2

In [152]:
data_trans_n_cast_cols.dtypes

[('ActualElapsedTime', 'bigint'),
 ('AirTime', 'bigint'),
 ('ArrDelay', 'bigint'),
 ('CRSElapsedTime', 'bigint'),
 ('DepDelay', 'bigint'),
 ('Dest', 'string'),
 ('Distance', 'bigint'),
 ('LateAircraftAndCarrierDelay', 'bigint'),
 ('NASAndWeatherDelay', 'bigint'),
 ('Origin', 'string'),
 ('SecurityDelay', 'bigint'),
 ('TailNum', 'string'),
 ('TaxiIn', 'bigint'),
 ('TaxiOut', 'bigint'),
 ('UniqueCarrier', 'string'),
 ('CRSDepTimeStamp', 'timestamp'),
 ('DepTimestamp', 'timestamp'),
 ('ArrTimestamp', 'timestamp'),
 ('CRSArrTimestamp', 'timestamp')]

### {preprocess step} Cast bigint columns to int

In [14]:
int_col_list = ['ActualElapsedTime', 'AirTime', 'ArrDelay', 'CRSElapsedTime', 'DepDelay', 'Distance', 'LateAircraftAndCarrierDelay', 'NASAndWeatherDelay', 'SecurityDelay', 'TaxiIn', 'TaxiOut']
for col in int_col_list:
    data_trans_n_cast_cols = data_trans_n_cast_cols.withColumn(col, data_trans_n_cast_cols[col].cast('integer'))

### Analysis cells

In [157]:
data_trans_n_cast_cols.dtypes

[('ActualElapsedTime', 'int'),
 ('AirTime', 'int'),
 ('ArrDelay', 'int'),
 ('CRSElapsedTime', 'int'),
 ('DepDelay', 'int'),
 ('Dest', 'string'),
 ('Distance', 'int'),
 ('LateAircraftAndCarrierDelay', 'int'),
 ('NASAndWeatherDelay', 'int'),
 ('Origin', 'string'),
 ('SecurityDelay', 'int'),
 ('TailNum', 'string'),
 ('TaxiIn', 'int'),
 ('TaxiOut', 'int'),
 ('UniqueCarrier', 'string'),
 ('CRSDepTimeStamp', 'timestamp'),
 ('DepTimestamp', 'timestamp'),
 ('ArrTimestamp', 'timestamp'),
 ('CRSArrTimestamp', 'timestamp')]

In [158]:
data_trans_n_cast_cols.show(20)

+-----------------+-------+--------+--------------+--------+----+--------+---------------------------+------------------+------+-------------+-------+------+-------+-------------+-------------------+-------------------+-------------------+-------------------+
|ActualElapsedTime|AirTime|ArrDelay|CRSElapsedTime|DepDelay|Dest|Distance|LateAircraftAndCarrierDelay|NASAndWeatherDelay|Origin|SecurityDelay|TailNum|TaxiIn|TaxiOut|UniqueCarrier|    CRSDepTimeStamp|       DepTimestamp|       ArrTimestamp|    CRSArrTimestamp|
+-----------------+-------+--------+--------------+--------+----+--------+---------------------------+------------------+------+-------------+-------+------+-------+-------------+-------------------+-------------------+-------------------+-------------------+
|               53|     32|      -8|            65|       4| PIT|     205|                          0|                 0|   DCA|            0| N443US|     7|     14|           US|2002-10-10 15:45:00|2002-10-10 15:49:00|2

### Save dataframe to CSV again

In [15]:
# will partition the dataframe into several csv files stored under a .csv directory
# to load in the preprocessed csv again you may use the path to the .csv directory
# spark will automatically load in all csv parts included in the directory
data_trans_n_cast_cols.write.csv('preprocessed_data.csv', header = True)

# started at 19:26

### Test the saved preprocessed CSV

In [3]:
preproc_data = spark.read.csv('preprocessed_data.csv', inferSchema='true', header='true', mode='PERMISSIVE', encoding='ISO-8859-1').cache()

In [4]:
preproc_data.show(200)

2008-09-16 11:42:00|2008-09-16 12:19:00|2008-09-16 12:22:00|
|              290|    254|      -4|           289|      -5| LAX|    1946|                          0|                 0|   ATL|            0|   N829MH|     9|     27|           DL|2008-11-07 16:10:00|2008-11-07 16:05:00|2008-11-07 20:55:00|2008-11-07 20:59:00|
|              161|    143|      12|           175|      26| BDL|    1133|                          0|                 0|   PBI|            0|   -N382D|     3|     15|           DL|2001-03-29 11:00:00|2001-03-29 11:26:00|2001-03-29 14:07:00|2001-03-29 13:55:00|
|              313|    298|      25|           327|      39| SFO|    2254|                          0|                 0|   PIT|            0|   N625AU|     8|      7|           US|1997-05-23 14:50:00|1997-05-23 15:29:00|1997-05-23 20:42:00|1997-05-23 20:17:00|
|               68|     56|      -7|            75|       0| BWI|     328|                          0|                 0|   PVD|            0|   N660SW| 

In [5]:
preproc_data.count()

15000000

In [6]:
preproc_data.dtypes

[('ActualElapsedTime', 'int'),
 ('AirTime', 'int'),
 ('ArrDelay', 'int'),
 ('CRSElapsedTime', 'int'),
 ('DepDelay', 'int'),
 ('Dest', 'string'),
 ('Distance', 'int'),
 ('LateAircraftAndCarrierDelay', 'int'),
 ('NASAndWeatherDelay', 'int'),
 ('Origin', 'string'),
 ('SecurityDelay', 'int'),
 ('TailNum', 'string'),
 ('TaxiIn', 'int'),
 ('TaxiOut', 'int'),
 ('UniqueCarrier', 'string'),
 ('CRSDepTimeStamp', 'timestamp'),
 ('DepTimestamp', 'timestamp'),
 ('ArrTimestamp', 'timestamp'),
 ('CRSArrTimestamp', 'timestamp')]

In [6]:
len(preproc_data.columns)

19

In [188]:
import pyspark
sc = pyspark.SparkContext.getOrCreate();

res = []

for column in preproc_data.dtypes:
    for column2 in preproc_data.dtypes:
        if(column != column2):
            res.append((column[0], column2[0]))

rdd = sc.parallelize(preproc_data.head(100000), 4)
#Row(ActualElapsedTime=53, AirTime=32, ArrDelay=-8, CRSElapsedTime=65, DepDelay=4, Dest='PIT', Distance=205, LateAircraftAndCarrierDelay=0, NASAndWeatherDelay=0, Origin='DCA', SecurityDelay=0, TailNum='N443US', TaxiIn=7, TaxiOut=14, UniqueCarrier='US', CRSDepTimeStamp=datetime.datetime(2002, 10, 10, 15, 45), DepTimestamp=datetime.datetime(2002, 10, 10, 15, 49), ArrTimestamp=datetime.datetime(2002, 10, 10, 16, 42), CRSArrTimestamp=datetime.datetime(2002, 10, 10, 16, 50))

test = rdd.sample(False, 0.001, 1)

print(len(test.collect()))

#Row(ActualElapsedTime=53, AirTime=32, ArrDelay=-8, CRSElapsedTime=65, DepDelay=4, Dest='PIT', Distance=205, LateAircraftAndCarrierDelay=0, NASAndWeatherDelay=0, Origin='DCA', SecurityDelay=0, TailNum='N443US', TaxiIn=7, TaxiOut=14, UniqueCarrier='US', CRSDepTimeStamp=datetime.datetime(2002, 10, 10, 15, 45), DepTimestamp=datetime.datetime(2002, 10, 10, 15, 49), ArrTimestamp=datetime.datetime(2002, 10, 10, 16, 42), CRSArrTimestamp=datetime.datetime(2002, 10, 10, 16, 50))

mapped = test.flatMap(lambda x: [(((i[0], i[1]), x[i[0]]), ((i[1], x[i[1]]), True)) for i in res])

print(len(mapped.collect()))

#mapped = test.flatMap(lambda x: [
#    ((('ActualElapsedTime', 'AirTime'), x['ActualElapsedTime']), (('AirTime', x['AirTime']), True)), 
#    ((('ActualElapsedTime', 'ActualElapsedTime'), x['ActualElapsedTime']), (('ActualElapsedTime', x['ActualElapsedTime']), True))
#    ])
#((('ActualElapsedTime', 'AirTime'), 82), (('AirTime', 68), True))


grouped = mapped.reduceByKey(lambda x, y: y if(x == y) else (x[0], False))
#((('ActualElapsedTime', 'ActualElapsedTime'), 82), (('ActualElapsedTime', 82), True))


removedata = grouped.map(lambda x: (x[0][0], x[1][1]))
#(('ActualElapsedTime', 'ActualElapsedTime'), True)

grouptofd = removedata.reduceByKey(lambda x, y: x and y)
#(('ActualElapsedTime', 'ActualElapsedTime'), True)

print(grouptofd.collect())



81
27702
[(('ActualElapsedTime', 'ArrDelay'), False), (('ActualElapsedTime', 'Dest'), False), (('ActualElapsedTime', 'TaxiIn'), False), (('ActualElapsedTime', 'UniqueCarrier'), False), (('CRSElapsedTime', 'DepDelay'), False), (('CRSElapsedTime', 'LateAircraftAndCarrierDelay'), False), (('CRSElapsedTime', 'CRSDepTimeStamp'), False), (('CRSElapsedTime', 'DepTimestamp'), False), (('CRSElapsedTime', 'ArrTimestamp'), False), (('TaxiIn', 'ActualElapsedTime'), False), (('TaxiIn', 'ArrDelay'), False), (('TaxiIn', 'Dest'), False), (('TaxiIn', 'UniqueCarrier'), False), (('DepTimestamp', 'CRSElapsedTime'), True), (('DepTimestamp', 'DepDelay'), True), (('DepTimestamp', 'LateAircraftAndCarrierDelay'), True), (('DepTimestamp', 'CRSDepTimeStamp'), True), (('DepTimestamp', 'ArrTimestamp'), True), (('AirTime', 'Distance'), False), (('AirTime', 'Origin'), False), (('AirTime', 'TailNum'), False), (('AirTime', 'TaxiOut'), False), (('AirTime', 'CRSArrTimestamp'), False), (('ArrDelay', 'ActualElapsedTime'),

In [260]:
import time
from itertools import product

def generatePossibleFDs(lhsSize, functionalDependencies):
    columns = list(map(lambda x: x[0], preproc_data.dtypes))
    possibilities = list(product(columns, repeat=lhsSize+1))
    possibilities = list(filter(lambda x: len(set(x)) == len(x), possibilities))

    #TODO remove posibilities which fully contain already found functional dependencies to reduce the amount of posible FDs for lhs=3

    return possibilities

def getFlatMapLambda(lhsSize, possibleCombinations):
    if(lhsSize == 1):
        return lambda x: [(((i[0], i[1]), x[i[0]]), ((i[1], x[i[1]]), True)) for i in possibleCombinations]
    if(lhsSize == 2):
        return lambda x: [(((i[0], i[1], i[2]), x[i[0]], x[i[1]]), ((i[2], x[i[2]]), True)) for i in possibleCombinations]
    if(lhsSize == 3):
        return lambda x: [(((i[0], i[1], i[2], i[3]), x[i[0]], x[i[1]], x[i[2]]), ((i[3], x[i[3]]), True)) for i in possibleCombinations]

def sampleFD(samplerate, dataframe, possibleCombinations, parallelWorkers, lhsSize):
    if(len(possibleCombinations) == 0):
        return possibleCombinations
    
    rdd = sc.parallelize(dataframe, parallelWorkers)
    sampled = rdd.sample(True, samplerate) 
    mappedCombinations = sampled.flatMap(getFlatMapLambda(lhsSize, possibleCombinations))
    grouped = mappedCombinations.reduceByKey(lambda x, y: y if(x == y) else (x[0], False))
    removedata = grouped.map(lambda x: (x[0][0], x[1][1]))
    grouptofd = removedata.reduceByKey(lambda x, y: x and y)
    filtered = grouptofd.filter(lambda x: x[1])
    removeBool = filtered.map(lambda x: x[0])
    return removeBool.collect()

def findFDs(dataframe, parallelWorkers, sampleRates, lhsSizes):
    functionalDependencies = []
    for lhsSize in lhsSizes: 
        possibleCombinations = generatePossibleFDs(lhsSize, functionalDependencies)
        
        for sampleRate in sampleRates:
            print(f'Running samplingrate {sampleRate} over {len(possibleCombinations)} possible FDs')
            tic = time.perf_counter()
            possibleCombinations = sampleFD(samplerate=sampleRate, dataframe=dataframe.head(15000000), possibleCombinations=possibleCombinations, parallelWorkers=parallelWorkers, lhsSize=lhsSize)
            toc = time.perf_counter()
            print(f"Sampling took {toc - tic:0.4f} seconds")
        
        functionalDependencies.append(possibleCombinations)

    return functionalDependencies

In [None]:
combinations = findFDs(preproc_data, 16, [0.001, 0.01, 0.05, 0.15, 0.3, 1], [1,2])

print(combinations)

In [218]:
from collections import Counter

columns = []

for column in preproc_data.dtypes:
    columns.append(column[0])

possibilities = list(product(columns, columns, columns))
possibilities = list(filter(lambda x: len(set(x)) == len(x), possibilities))
len(possibilities)

5814

In [230]:
len(generatePossibleFDs(4))

1395360

In [None]:
Running samplingrate 0.001 over 5814 possible FDs
Sampling took 107.6598 seconds
Running samplingrate 0.01 over 2236 possible FDs
Sampling took 667.6351 seconds
Running samplingrate 0.05 over 1078 possible FDs
Sampling took 2987.7086 seconds
Running samplingrate 0.15 over 374 possible FDs
Sampling took 2607.5667 seconds
Running samplingrate 1 over 252 possible FDs