<h1> Predicting airline delays with Spark and ML-Lib using pySpark </h1>
<br>
Adapted from http://nbviewer.ipython.org/github/ofermend/IPython-notebooks/blob/master/blog-part-2.ipynb

<h3>Pre-processing with PySpark</h3>

In [3]:
import os.path

AIRPORT = 'LAX'

base_dir = os.path.join('data')
input_path_2007 = os.path.join('flights', '2007.csv')
input_path_2008 = os.path.join('flights', '2008.csv')
file_name_2007 = os.path.join(base_dir, input_path_2007)
file_name_2008 = os.path.join(base_dir, input_path_2008)

raw_data_2007 = sc.textFile(file_name_2007)
raw_data_2008 = sc.textFile(file_name_2008)
header = raw_data_2007.take(1) 

# filter on Airport
filtered_data_2007 = (raw_data_2007
                        .filter(lambda line: ',' + AIRPORT + ',' in line)
                        # filter out cancelled flights
                        .filter(lambda line: ',,' in line)
                        .filter(lambda line: 'Year' not in line))
filtered_data_2008 = (raw_data_2008
                        .filter(lambda line: ',' + AIRPORT + ',' in line)
                        # filter out cancelled flights
                        .filter(lambda line: ',,' in line)
                        .filter(lambda line: 'Year' not in line))

# CRS = Computer Reservation System
# scheduled time as opposed to the actual time
print header
print filtered_data_2008.take(1)

[u'Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,TailNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Origin,Dest,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay']
[u'2008,1,3,4,1738,1715,1838,1820,WN,82,N499WN,60,65,42,18,23,LAS,LAX,236,6,12,0,,0,0,0,0,12,6']


In [4]:
holidays = ['01/01/2007', '01/15/2007', '02/19/2007', '05/28/2007', '06/07/2007', '07/04/2007',
      '09/03/2007', '10/08/2007' ,'11/11/2007', '11/22/2007', '12/25/2007',
      '01/01/2008', '01/21/2008', '02/18/2008', '05/22/2008', '05/26/2008', '07/04/2008',
      '09/01/2008', '10/13/2008' ,'11/11/2008', '11/27/2008', '12/25/2008']

In [5]:
import datetime

def days_from_nearest_holiday(year, month, day):
    diffs = []
    sample_date = datetime.date(year, month, day)
    for holiday in holidays:
        dt = datetime.datetime.strptime(holiday, '%m/%d/%Y').date()
        td = dt - sample_date
        diffs.append(abs(td.days))

    return min(diffs) * 1.0

def split_data(line):
    try:
        vals = line.split(',')
        features = []
        features.append(float(vals[15]))
        features.append(float(vals[1]))
        features.append(float(vals[2]))
        features.append(float(vals[3]))
        features.append(float(vals[5]) / 100)
        features.append(float(vals[18]))
        features.append(days_from_nearest_holiday(int(vals[0]), int(vals[1]), int(vals[2])))

        return (str(vals[0] + str(vals[1]) + str(vals[2])), features)
    except:
        return (str(vals[0] + str(vals[1]) + str(vals[2])), [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

In [6]:
features_2007 = filtered_data_2007.map(lambda line: split_data(line))
features_2008 = filtered_data_2008.map(lambda line: split_data(line))

print features_2007.take(1)

[('200711', [1.0, 1.0, 1.0, 1.0, 14.45, 1593.0, 0.0])]


<h3>Modeling with Spark and ML-Lib</h3>

In [7]:
from pyspark.mllib.regression import LabeledPoint
import numpy as np

def parse_data(tup):
    if tup[1][0] >= 15:
        return LabeledPoint(1.0, tup[1])
    else:
        return LabeledPoint(0.0, tup[1])
    
def scale(data):
    features_mean = data.map(lambda lp: lp.features).mean()
    return data.map(lambda lp: LabeledPoint(lp.label, (lp.features - features_mean) / 1.0))

train_data = features_2007.map(parse_data)
train_data.cache()
scaled_train_data = scale(train_data)
scaled_train_data.cache()

test_data = features_2008.map(parse_data)
test_data.cache()
scaled_test_data = scale(test_data)
scaled_test_data.cache()

print scaled_train_data.take(1)
print scaled_test_data.take(1)

[LabeledPoint(0.0, [-7.29720804563,0.0,-15.453017112,-2.81687181027,0.741804262984,931.822275593,-6.16121284899])]
[LabeledPoint(1.0, [8.56169456377,0.0284402310769,-13.5271811583,0.204710413272,3.87748037328,-327.113612798,-3.18619463783])]


In [21]:
def eval_metrics(lbl_pred):
    tp = float(lbl_pred.filter(lambda lp: lp[0]==1.0 and lp[1]==1.0).count())
    tn = float(lbl_pred.filter(lambda lp: lp[0]==0.0 and lp[1]==0.0).count())
    fp = float(lbl_pred.filter(lambda lp: lp[0]==1.0 and lp[1]==0.0).count())
    fn = float(lbl_pred.filter(lambda lp: lp[0]==0.0 and lp[1]==1.0).count())
    
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F_measure = 2 * precision * recall / (precision + recall)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    return([tp, tn, fp, fn], [precision, recall, F_measure, accuracy])

In [22]:
from pyspark.mllib.classification import LogisticRegressionWithSGD

model_lr = LogisticRegressionWithSGD.train(scaled_train_data, iterations=100)
labels_and_predictions = scaled_test_data.map(lambda lp: (model_lr.predict(lp.features), lp.label))
metrics = eval_metrics(labels_and_predictions)

print('Precision : %.2f' % round(metrics[1][0], 2))
print('Recall : %.2f' % round(metrics[1][1], 2))
print('F1 : %.2f' % round(metrics[1][2], 2))
print('Accuracy : %.2f' % round(metrics[1][3], 2))

Precision : 0.35
Recall : 0.83
F1 : 0.49
Accuracy : 0.54


In [23]:
from pyspark.mllib.classification import SVMWithSGD

model_svm = SVMWithSGD.train(scaled_train_data, iterations=100, step=1.0, regParam=0.01)
labels_and_predictions = scaled_test_data.map(lambda lp: (model_svm.predict(lp.features), lp.label))
metrics = eval_metrics(labels_and_predictions)

print('Precision : %.2f' % round(metrics[1][0], 2))
print('Recall : %.2f' % round(metrics[1][1], 2))
print('F1 : %.2f' % round(metrics[1][2], 2))
print('Accuracy : %.2f' % round(metrics[1][3], 2))

Precision : 0.38
Recall : 0.84
F1 : 0.53
Accuracy : 0.60


In [24]:
from pyspark.mllib.tree import DecisionTree

# Experimental
model_dt = DecisionTree.trainClassifier(scaled_train_data, 2, {}, 'gini', 10, 100)
# this detour is needed because of the spark context error 
# if we map the test data set like in the previous models
list_data = scaled_test_data.collect()
list_predictions = []
for lp in list_data:
    list_predictions.append((model_dt.predict(lp.features), lp.label))

labels_and_predictions = sc.parallelize(list_predictions)
metrics = eval_metrics(labels_and_predictions)

print('Precision : %.2f' % round(metrics[1][0], 2))
print('Recall : %.2f' % round(metrics[1][1], 2))
print('F1 : %.2f' % round(metrics[1][2], 2))
print('Accuracy : %.2f' % round(metrics[1][3], 2))

Precision : 1.00
Recall : 0.81
F1 : 0.89
Accuracy : 0.95


In [25]:
from pyspark.mllib.tree import RandomForest

# Experimental
model_rf = RandomForest.trainClassifier(scaled_train_data, 2, {}, 3, 'sqrt', 'gini', 10, 100)
# this detour is needed because of the spark context error 
# if we map the test data set like in the previous models
list_data = scaled_test_data.collect()
list_predictions = []
for lp in list_data:
    list_predictions.append((model_rf.predict(lp.features), lp.label))

labels_and_predictions = sc.parallelize(list_predictions)
metrics = eval_metrics(labels_and_predictions)

print('Precision : %.2f' % round(metrics[1][0], 2))
print('Recall : %.2f' % round(metrics[1][1], 2))
print('F1 : %.2f' % round(metrics[1][2], 2))
print('Accuracy : %.2f' % round(metrics[1][3], 2))

Precision : 1.00
Recall : 0.81
F1 : 0.89
Accuracy : 0.95


<h3>Building a richer model with weather data</h3>

In [26]:
base_dir = os.path.join('data')
input_path_LAX = os.path.join('flights', 'LAX.csv')
file_name_LAX = os.path.join(base_dir, input_path_LAX)

raw_data_LAX = sc.textFile(file_name_LAX)