In [1]:
# define imports and configurations
from IPython.display import HTML, display
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
import datetime
import pandas
import numpy as np
np.set_printoptions(threshold=np.nan)
from matplotlib import cm, pyplot as plt
from pomegranate import *
pandas.options.display.max_rows=500
from random import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
#pomegranate.utils.enable_gpu()

In [2]:
def load_data(year="2017", airport=None):
    data=pandas.read_csv("./data/processed/"+year+".csv")
    # define delay as DepDelay > 30 or ArrDelay>30 -- which implies traveler anxious at airport or miss connection flight
    data["Delay"] = ((data["DepDelay"] > 30) | (data["ArrDelay"] > 30) | data['Cancelled'])*1 

    # discretize ArrTime, DepTime, AirTime,Distance
    # ArrTime, DepTime are divided by 100, because 22:30 is sored as 2230.
    # AirTime is divided into hours
    # Distance is divided by 250
    data = data.sort_values(['Year', 'Month','DayofMonth','CRSDepTime'], ascending=[True, True, True, True])
    data["ArrTime"] = data["ArrTime"]//100
    data["DepTime"] = data["DepTime"]//100
    data["AirTime"] = data["AirTime"]//60
    data["Distance"] = data["Distance"] // 250
    label_set = pandas.factorize(data["UniqueCarrier"])
    data["UniqueCarrier"]=label_set[0]
    # for keep data from certain airport

    if airport:
        data = data.query("Origin==\'"+airport+"\'")
        # data = data[data["Origin"]=="IAH"]
    display(data.head())
    return data

# utility for model scoring
# the model has two states, 0 or 1. The state 0 and 1 may mean different things for each run, so it may need to be flipped.
def score_model(model, data, truth, flip=False):
    if flip:        
        return accuracy_score(model.predict(data),truth)
    else:
        return 1 - accuracy_score(model.predict(data),truth)
# score the model given validation serires
def score_model_given_series(model, validate_x, validate_y):
    # is 0 delay or is 1 delay
    zero_score = score_model(model, validate_x[0], validate_y[0])
    one_score = score_model(model, validate_x[0], validate_y[0], True)
    flip = None
#     if zero_score > one_score:
#         flip = False
#     else:
#         flip = True
    sum_score = 0
    for i in range(len(validate_x)):
        score = score_model(model, validate_x[i], validate_y[i], flip)
        sum_score += score
    print(sum_score/len(validate_x))

In [3]:
# splice_series to day
def splice_series(input_data, split=0.8):
    # split is the split ratio for train vs validate
    x = []
    y = []
    for year in input_data.Year.unique():
        for month in input_data.Month.unique():
            for day in input_data.DayofMonth.unique():
                hold = input_data[(input_data.Month == month) & (input_data.DayofMonth==day)& (input_data.Year==year)]
                hold = hold.sort_values(axis=0,by="CRSDepTime")[features]
                if len(hold>0):
                    x.append(hold.loc[:, hold.columns != 'Delay'].as_matrix().astype(int).tolist())
                    y.append(hold.loc[:, hold.columns == 'Delay'].as_matrix().astype(int).flatten())
    x = np.array(x) 
    y = np.array(y)
    indexer = np.random.permutation(x.shape[0])
    x = x[indexer]
    y = y[indexer]
    train_x= x[:int(x.shape[0]*split)]
    train_y = y[:int(x.shape[0]*split)]    
    validate_x = x[int(x.shape[0]*split):]
    validate_y = y[int(x.shape[0]*split):]    
    return train_x, train_y, validate_x, validate_y
# Train BayesianNetwork and Treat Each Flight as Independent Flight
def BN_preparation(input_data):
    input_data = input_data[['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier',
       'FlightNum', 'OriginAirportID', 'DestAirportID',
       'AirTime', 'Distance',
       'CRSDepTime', 'Delay']]
#     input_data["ArrTime"] = input_data["ArrTime"]//100
#     input_data["DepTime"].apply(lambda x: x//100)
    input_data["AirTime"].apply(lambda x: x//60)
    return input_data

In [4]:
data = load_data("data_all")

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,UniqueCarrier,FlightNum,OriginAirportID,Origin,Dest,DestAirportID,ArrTime,AirTime,Distance,DepDelay,ArrDelay,DepTime,CRSDepTime,Cancelled,Delay
21351723,2013,1,1,1,2,0,1568,12892,LAX,MSP,13487,5.0,3.0,6.0,-2.0,9.0,0.0,5,0.0,0
21294259,2013,1,1,1,2,1,1584,12892,LAX,FLL,11697,7.0,4.0,9.0,-9.0,4.0,0.0,15,0.0,0
21307557,2013,1,1,1,2,1,2400,12892,LAX,DFW,11298,5.0,2.0,4.0,3.0,1.0,0.0,15,0.0,0
21609023,2013,1,1,1,2,2,1496,14771,SFO,PHL,14100,8.0,4.0,10.0,-2.0,-23.0,0.0,15,0.0,0
21609059,2013,1,1,1,2,2,1539,14747,SEA,CLT,11057,7.0,4.0,9.0,-9.0,-15.0,0.0,15,0.0,0


In [22]:

train = BN_preparation(data[:int(data.shape[0]*0.8)])
test = BN_preparation(data[int(data.shape[0]*0.8):])
train_y = train.Delay.values
train_x = train.drop("Delay", axis=1).values
size = train_x.shape[0]//10
def train_clf(depth):
    clf = RandomForestClassifier(max_depth=depth, criterion='entropy')
    for i in range(10):
        clf.fit(train_x[size*i:size*(i+1)], train_y[size*i:size*(i+1)])
    y_pred = clf.predict(test[test.columns[:-1]])
    print("Depth %i"%depth, "F1 score : %f"%f1_score(y_pred, test.Delay) ,"Accuracy : %f"%accuracy_score(y_pred, test.Delay), "Recall : %f"%recall_score(y_pred, test.Delay), "Precision : %f"%precision_score(y_pred, test.Delay))
for i in range(10,50, 5):
    train_clf(i)

Depth 10 F1 score : 0.000271 Accuracy : 0.865080 Recall : 0.666667 Precision : 0.000135
Depth 15 F1 score : 0.004991 Accuracy : 0.864513 Recall : 0.274646 Precision : 0.002518
Depth 20 F1 score : 0.029441 Accuracy : 0.861771 Recall : 0.279803 Precision : 0.015538
Depth 25 F1 score : 0.067449 Accuracy : 0.855592 Recall : 0.262125 Precision : 0.038704
Depth 30 F1 score : 0.087067 Accuracy : 0.851879 Recall : 0.258550 Precision : 0.052348
Depth 35 F1 score : 0.081980 Accuracy : 0.853323 Recall : 0.263592 Precision : 0.048538
Depth 40 F1 score : 0.082223 Accuracy : 0.853062 Recall : 0.261471 Precision : 0.048782
Depth 45 F1 score : 0.078669 Accuracy : 0.853578 Recall : 0.260524 Precision : 0.046330


In [12]:

train = BN_preparation(data[:int(data.shape[0]*0.8)])
test = BN_preparation(data[int(data.shape[0]*0.8):])
train_positive = train[train.Delay==1]
train_negative = train[train.Delay==0]
s = train_positive.shape[0]
train_negative = train_negative[:s*4]

train = pandas.concat([train_negative, train_positive, train_positive])
train_y = train.Delay.values
train_x = train.drop("Delay", axis=1).values
size = train_x.shape[0]//10
def train_clf(depth):
    clf = RandomForestClassifier(max_depth=depth, criterion='entropy')
    for i in range(10):
        clf.fit(train_x[size*i:size*(i+1)], train_y[size*i:size*(i+1)])
    y_pred = clf.predict(test[test.columns[:-1]])
    print("Depth %i"%depth, "F1 score : %f"%f1_score(y_pred, test.Delay) ,"Accuracy : %f"%accuracy_score(y_pred, test.Delay), "Recall : %f"%recall_score(y_pred, test.Delay), "Precision : %f"%precision_score(y_pred, test.Delay))
for i in range(10,50, 5):
    train_clf(i)

Depth 10 F1 score : 0.237776 Accuracy : 0.134929 Recall : 0.134929 Precision : 1.000000
Depth 15 F1 score : 0.237776 Accuracy : 0.134929 Recall : 0.134929 Precision : 1.000000
Depth 20 F1 score : 0.237776 Accuracy : 0.134929 Recall : 0.134929 Precision : 1.000000
Depth 25 F1 score : 0.237776 Accuracy : 0.134929 Recall : 0.134929 Precision : 1.000000
Depth 30 F1 score : 0.237776 Accuracy : 0.134929 Recall : 0.134929 Precision : 1.000000
Depth 35 F1 score : 0.237776 Accuracy : 0.134929 Recall : 0.134929 Precision : 1.000000
Depth 40 F1 score : 0.237776 Accuracy : 0.134929 Recall : 0.134929 Precision : 1.000000
Depth 45 F1 score : 0.237776 Accuracy : 0.134929 Recall : 0.134929 Precision : 1.000000


In [10]:
dummy = np.random.randint(2, size=test.shape[0])
y_pred= dummy
print()

print( "F1 score : %f"%f1_score(y_pred, test.Delay) ,"Accuracy : %f"%accuracy_score(y_pred, test.Delay), "Recall : %f"%recall_score(y_pred, test.Delay), "Precision : %f"%precision_score(y_pred, test.Delay))


F1 score : 0.212756 Accuracy : 0.499975 Recall : 0.135072 Precision : 0.500758


In [11]:
dummy = np.random.random( size=test.shape[0])
y_pred= dummy> 0.13267
print(y_pred)

print( "F1 score : %f"%f1_score(y_pred, test.Delay) ,"Accuracy : %f"%accuracy_score(y_pred, test.Delay), "Recall : %f"%recall_score(y_pred, test.Delay), "Precision : %f"%precision_score(y_pred, test.Delay))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



F1 score : 0.233525 Accuracy : 0.231857 Recall : 0.134929 Precision : 0.867243


In [None]:
[[10, 0.]]