In [1]:
"""
This notebook was used to test the transformations of the dataset and to test machine learning models
"""

In [None]:
import findspark
findspark.init('/opt/spark')

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
spark = SparkSession.builder.appName('OnTimeProcess_byriel').getOrCreate()

In [3]:
data = spark.read.csv('final_2020_*.csv', header=True, inferSchema=True)

In [5]:
data.groupBy('Dest').count().count()

210

In [6]:
data.columns

['Dest',
 'ArrDelayMinutes',
 'Diverted',
 'Cancelled',
 'WeatherDelay',
 'temp',
 'dewpoint',
 'wind',
 'precip',
 'alti',
 'vis',
 'cloudCoverage',
 'cloudAlt',
 'weather',
 'ice']

In [7]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

cloudIndex = StringIndexer(inputCol='cloudCoverage', outputCol='cloudIndex')
cloudOnehot = StringIndexer(inputCol='cloudIndex', outputCol='cloudOnehot')
weatherIndex = StringIndexer(inputCol='weather', outputCol='weatherIndex')
weatherOnehot = StringIndexer(inputCol='weatherIndex', outputCol='weatherOnehot')

In [8]:
data = cloudIndex.fit(data).transform(data)
data = cloudOnehot.fit(data).transform(data)
data = weatherIndex.fit(data).transform(data)
data = weatherOnehot.fit(data).transform(data)

In [9]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=[
    'temp',
    'dewpoint',
    'wind',
    'precip',
    'alti',
    'vis',
    'cloudOnehot',
    'cloudAlt',
    'weatherOnehot',
    'ice'
], outputCol='features')

In [10]:
data = assembler.transform(data)

In [11]:
ml = data.select(['ArrDelayMinutes',
 'Diverted',
 'Cancelled',
 'WeatherDelay',
 'features'])

In [12]:
ml.groupBy('Diverted').count().show()

dones = ml.filter(ml['Diverted'] == 1).count()
dzeros = ml.filter(ml['Diverted'] == 0).count()

+--------+-----+
|Diverted|count|
+--------+-----+
|     0.0|88897|
|     1.0|  195|
+--------+-----+



In [13]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

divertWeights = udf(lambda x: dzeros/(dones+dzeros) if x == 1.0 else dones/(dones+dzeros), DoubleType())
ml = ml.withColumn('divertWeight', divertWeights(ml['Diverted']))

In [14]:
ml.groupBy('Cancelled').count().show()

cones = ml.filter(ml['Cancelled'] == 1).count()
czeros = ml.filter(ml['Cancelled'] == 0).count()

+---------+-----+
|Cancelled|count|
+---------+-----+
|      0.0|88513|
|      1.0|  579|
+---------+-----+



In [15]:
cancelWeights = udf(lambda x: czeros/(cones+czeros) if x == 1.0 else cones/(cones+czeros), DoubleType())
ml = ml.withColumn('cancelWeight', divertWeights(ml['Cancelled']))

In [16]:
isWxDelay = udf(lambda x: 0.0 if x == 0.0 else 1.0, DoubleType())
ml = ml.withColumn('hasWXDelay', isWxDelay(ml['WeatherDelay']))

In [17]:
ml.groupBy('hasWXDelay').count().show()

wones = ml.filter(ml['hasWXDelay'] == 1).count()
wzeros = ml.filter(ml['hasWXDelay'] == 0).count()

+----------+-----+
|hasWXDelay|count|
+----------+-----+
|       0.0|88259|
|       1.0|  833|
+----------+-----+



In [18]:
cancelWeights = udf(lambda x: wzeros/(wones+wzeros) if x == 1.0 else wones/(wones+wzeros), DoubleType())
ml = ml.withColumn('wxdWeight', divertWeights(ml['hasWXDelay']))

In [19]:
from pyspark.ml.classification import LogisticRegression

In [20]:
train, test = ml.randomSplit([0.7, 0.3], seed=101)

In [21]:
divertLR = LogisticRegression(featuresCol='features', labelCol='Diverted', weightCol='divertWeight')
cancelLR = LogisticRegression(featuresCol='features', labelCol='Cancelled', weightCol='cancelWeight')
delayLR = LogisticRegression(featuresCol='features', labelCol='hasWXDelay', weightCol='wxdWeight')

In [22]:
divertModel = divertLR.fit(train)
cancelModel = cancelLR.fit(train)
delayModel = delayLR.fit(train)

In [23]:
divertResult = divertModel.transform(test)
    #Area Under ROC: about .6
    #Accuracy: About 0.4
cancelResult = cancelModel.transform(test)
    #Area Under ROC: about 0
    #Accuracy: About .99
delayResult = delayModel.transform(test)
    #Area Under ROC: about .7
    #Accuracy: About 0.6

In [24]:
print('divert lr accuracy',divertModel.evaluate(test).accuracy)
print('divert roc', divertModel.evaluate(test).areaUnderROC)
print('cancel lr accuracy',cancelModel.evaluate(test).accuracy)
print("delay lr accuracy",delayModel.evaluate(test).accuracy)

divert 0.7859548039509129
divert roc 0.5762919276330207
cancel 0.024431307991619276
delay 0.00924124513618677


In [25]:
ml.show(truncate=False)

+---------------+--------+---------+------------+-----------------------------------------------+--------------------+--------------------+----------+--------------------+
|ArrDelayMinutes|Diverted|Cancelled|WeatherDelay|features                                       |divertWeight        |cancelWeight        |hasWXDelay|wxdWeight           |
+---------------+--------+---------+------------+-----------------------------------------------+--------------------+--------------------+----------+--------------------+
|0.0            |0.0     |0.0      |0.0         |(10,[2,4,5,6,7],[2.0,29.79,10.0,2.0,6000.0])   |0.002188748709199479|0.002188748709199479|0.0       |0.002188748709199479|
|9.0            |0.0     |0.0      |0.0         |(10,[2,4,5,6,7],[8.0,30.21,10.0,4.0,12000.0])  |0.002188748709199479|0.002188748709199479|0.0       |0.002188748709199479|
|18.0           |0.0     |0.0      |0.0         |(10,[2,4,5,6,7],[5.0,30.38,10.0,1.0,3700.0])   |0.002188748709199479|0.002188748709199479|0

In [26]:
from pyspark.ml.classification import RandomForestClassifier
#delay with random forests
delayForest = RandomForestClassifier(labelCol='hasWXDelay', weightCol='wxdWeight', maxBins = 100)
delayRF = delayForest.fit(train)
delayRFResult = delayRF.transform(test)

In [27]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='hasWXDelay', weightCol='wxdWeight', metricName='accuracy')

In [28]:
evaluator.evaluate(delayRFResult)

0.8040839223647489

In [30]:
#cancellations with random forest
cancelForest = RandomForestClassifier(labelCol='Cancelled', weightCol='cancelWeight', maxBins = 100)
cancelRF = cancelForest.fit(train)
cancelRFResult = cancelRF.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol='Cancelled', weightCol='cancelWeight', metricName='accuracy')
print(evaluator.evaluate(cancelRFResult))

0.7960611093168654


In [31]:
#diversions with random forest
divertForest = RandomForestClassifier(labelCol='Diverted', weightCol='divertWeight', maxBins = 100)
divertRF = divertForest.fit(train)
divertRFResult = divertRF.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol='Diverted', weightCol='divertWeight', metricName='accuracy')
print(evaluator.evaluate(delayRFResult))

0.4769990505334242


In [32]:
from pyspark.ml.classification import GBTClassifier

In [36]:
#cancellations with GBT
cancelGBT = GBTClassifier(labelCol='Cancelled', weightCol='cancelWeight', maxBins = 100)
cancelGB = cancelGBT.fit(train)
cancelGBTResult = cancelGB.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol='Cancelled', weightCol='cancelWeight', metricName='accuracy')
print(evaluator.evaluate(cancelGBTResult))

0.8113088997734172


In [38]:
#delay with GBT
delayGBT = GBTClassifier(labelCol='hasWXDelay', weightCol='wxdWeight', maxBins = 100)
delayGB = delayGBT.fit(train)
delayGBTResult = delayGB.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol='hasWXDelay', weightCol='wxdWeight', metricName='accuracy')
print(evaluator.evaluate(delayGBTResult))

0.7541158961473642


In [None]:
#diversions with GBT
divertGBT = GBTClassifier(labelCol='Diverted', weightCol='divertWeight', maxBins = 100)
divertGB = divertGBT.fit(train)
divertGBTResult = divertGB.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol='Diverted', weightCol='divertWeight', metricName='accuracy')
print(evaluator.evaluate(divertGBTResult))

In [40]:
from pyspark.ml.classification import DecisionTreeClassifier

In [42]:
#Delay with decision tree
delayTree = DecisionTreeClassifier(labelCol='hasWXDelay', weightCol='wxdWeight', maxBins = 100)
delayTM = delayTree.fit(train)
delayTreeResult = delayTM.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol='hasWXDelay', weightCol='wxdWeight', metricName='accuracy')
print(evaluator.evaluate(delayTreeResult))

0.7706661792350311


In [44]:
#Diversion with decition tree
divertTree = DecisionTreeClassifier(labelCol='Diverted', weightCol='divertWeight', maxBins = 100)
divertTM = divertTree.fit(train)
divertTreeResult = divertTM.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol='Diverted', weightCol='divertWeight', metricName='accuracy')
print(evaluator.evaluate(divertTreeResult))

0.5557437026590059


In [45]:
#Cancellation with decision tree
cancelTree = DecisionTreeClassifier(labelCol='Cancelled', weightCol='cancelWeight', maxBins = 100)
cancelTM = cancelTree.fit(train)
cancelTreeResult = cancelTM.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol='Cancelled', weightCol='cancelWeight', metricName='accuracy')
print(evaluator.evaluate(cancelTreeResult))

0.790692218774066


In [None]:
#Final choices:
#Diversion: Logistic Regression
#Cancel: GBT
#wxDelay: Random Forest 