# Final Project

In [2]:
# define imports and configurations
from IPython.display import HTML, display
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
import datetime
import pandas
import numpy as np
np.set_printoptions(threshold=np.nan)
from matplotlib import cm, pyplot as plt
from pomegranate import *
pandas.options.display.max_rows=500
from random import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
#pomegranate.utils.enable_gpu()

In [3]:
def load_data(year="2017", airport=None):
    data=pandas.read_csv("./data/processed/"+year+".csv")
    # define delay as DepDelay > 30 or ArrDelay>30 -- which implies traveler anxious at airport or miss connection flight
    data["Delay"] = ((data["DepDelay"] > 30) | (data["ArrDelay"] > 30) | data['Cancelled'])*1 

    # discretize ArrTime, DepTime, AirTime,Distance
    # ArrTime, DepTime are divided by 100, because 22:30 is sored as 2230.
    # AirTime is divided into hours
    # Distance is divided by 250
    data = data.sort_values(['Year', 'Month','DayofMonth','CRSDepTime'], ascending=[True, True, True, True])
    data["ArrTime"] = data["ArrTime"]//100
    data["DepTime"] = data["DepTime"]//100
    data["AirTime"] = data["AirTime"]//60
    data["Distance"] = data["Distance"] // 250
    label_set = pandas.factorize(data["UniqueCarrier"])
    data["UniqueCarrier"]=label_set[0]
    # for keep data from certain airport

    if airport:
        data = data.query("Origin==\'"+airport+"\'")
        # data = data[data["Origin"]=="IAH"]
    display(data.head())
    return data

# utility for model scoring
# the model has two states, 0 or 1. The state 0 and 1 may mean different things for each run, so it may need to be flipped.
def score_model(model, data,truth, flip=False):
    y_pred = model.predict(data)
    print("F1 score : %f"%f1_score(y_pred, truth) , 
           "Accuracy : %f"%accuracy_score(y_pred, truth), 
           "Recall : %f"%recall_score(y_pred, truth),
           "Precision : %f"%precision_score(y_pred, truth))
# score the model given validation serires
def score_model_given_series(model, validate_x, validate_y):
    # is 0 delay or is 1 delay
    zero_score = score_model(model, validate_x[0], validate_y[0])
    one_score = score_model(model, validate_x[0], validate_y[0], True)
    flip = None
#     if zero_score > one_score:
#         flip = False
#     else:
#         flip = True
    sum_score = 0
    for i in range(len(validate_x)):
        score = score_model(model, validate_x[i], validate_y[i], flip)
        sum_score += score
    print(sum_score/len(validate_x))

In [4]:
# splice_series to day
def splice_series(input_data, split=0.8):
    # split is the split ratio for train vs validate
    x = []
    y = []
    for year in input_data.Year.unique():
        for month in input_data.Month.unique():
            for day in input_data.DayofMonth.unique():
                hold = input_data[(input_data.Month == month) & (input_data.DayofMonth==day)& (input_data.Year==year)]
                hold = hold.sort_values(axis=0,by="CRSDepTime")[features]
                if len(hold>0):
                    x.append(hold.loc[:, hold.columns != 'Delay'].as_matrix().astype(int).tolist())
                    y.append(hold.loc[:, hold.columns == 'Delay'].as_matrix().astype(int).flatten())
    x = np.array(x) 
    y = np.array(y)
    indexer = np.random.permutation(x.shape[0])
    x = x[indexer]
    y = y[indexer]
    train_x= x[:int(x.shape[0]*split)]
    train_y = y[:int(x.shape[0]*split)]    
    validate_x = x[int(x.shape[0]*split):]
    validate_y = y[int(x.shape[0]*split):]    
    return train_x, train_y, validate_x, validate_y
# Train BayesianNetwork and Treat Each Flight as Independent Flight
def BN_preparation(input_data):
#     input_data = input_data[['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier',
#        'FlightNum', 'OriginAirportID', 'DestAirportID',
#        'AirTime', 'Distance',
#        'CRSDepTime', 'Delay']]
#     input_data["ArrTime"] = input_data["ArrTime"]//100
#     input_data["DepTime"].apply(lambda x: x//100)
    input_data["AirTime"].apply(lambda x: x//60)
    return input_data

In [5]:
data = load_data("2017","ATL")


Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,UniqueCarrier,FlightNum,OriginAirportID,Origin,Dest,DestAirportID,ArrTime,AirTime,Distance,DepDelay,ArrDelay,DepTime,CRSDepTime,Cancelled,Delay
298511,2017,1,1,1,7,9,2916,10397,ATL,MDW,13232,6.0,1.0,2.0,-4.0,-7.0,5.0,530,0.0,0
298458,2017,1,1,1,7,9,245,10397,ATL,DEN,11292,6.0,2.0,4.0,-1.0,-16.0,5.0,550,0.0,0
414067,2017,1,1,1,7,3,224,10397,ATL,EWR,11618,8.0,1.0,2.0,-1.0,-8.0,5.0,600,0.0,0
298474,2017,1,1,1,7,9,1232,10397,ATL,HOU,12191,7.0,1.0,2.0,12.0,-5.0,6.0,610,0.0,0
51277,2017,1,1,1,7,0,1462,10397,ATL,LGA,12953,8.0,1.0,3.0,-6.0,-23.0,6.0,612,0.0,0


In [6]:
train = BN_preparation(data[:int(data.shape[0]*0.8)])
test = BN_preparation(data[int(data.shape[0]*0.8):])

display(train.head())



Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,UniqueCarrier,FlightNum,OriginAirportID,Origin,Dest,DestAirportID,ArrTime,AirTime,Distance,DepDelay,ArrDelay,DepTime,CRSDepTime,Cancelled,Delay
298511,2017,1,1,1,7,9,2916,10397,ATL,MDW,13232,6.0,1.0,2.0,-4.0,-7.0,5.0,530,0.0,0
298458,2017,1,1,1,7,9,245,10397,ATL,DEN,11292,6.0,2.0,4.0,-1.0,-16.0,5.0,550,0.0,0
414067,2017,1,1,1,7,3,224,10397,ATL,EWR,11618,8.0,1.0,2.0,-1.0,-8.0,5.0,600,0.0,0
298474,2017,1,1,1,7,9,1232,10397,ATL,HOU,12191,7.0,1.0,2.0,12.0,-5.0,6.0,610,0.0,0
51277,2017,1,1,1,7,0,1462,10397,ATL,LGA,12953,8.0,1.0,3.0,-6.0,-23.0,6.0,612,0.0,0


In [12]:

BN_data = data[["OriginAirportID","CRSDepTime", "UniqueCarrier", "Delay", "FlightNum"]]
# train = BN_data.as_matrix()

train_positive = BN_data[BN_data.Delay==1]
train_negative = BN_data[BN_data.Delay==0]
s = train_positive.shape[0]
train_negative = train_negative[:s*2]


test_x = BN_data[:int(BN_data.shape[0]*0.8)]
test_y = BN_data[:int(BN_data.shape[0]*0.8)]
test_x.Delay=None

test_x = test_x.as_matrix()

train = pandas.concat([train_negative, train_positive, train_positive])
model = BayesianNetwork.from_samples(BN_data, algorithm='chow-liu',n_jobs=-1)
model.fit(BN_data.as_matrix())
model.bake()
print("Training Done")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Training Done


In [None]:
# score_model(model.predict(test_x), test_y)
y_pred = model.predict(test_x)[:,3]
truth = test_y[:100].Delay
print("F1 score : %f" % f1_score(y_pred, truth),
          "Accuracy : %f" % accuracy_score(y_pred, truth),
          "Recall : %f" % recall_score(y_pred, truth),
          "Precision : %f" % precision_score(y_pred, truth))


In [None]:
text_file = open("model.json", "w")
text_file.write(model.to_json())
text_file.close()
sample = train[:1].values.astype(int)[0].tolist()
sample[-1]=None
model.predict([sample])

In [None]:

train = BN_preparation(data[:int(data.shape[0]*0.8)])
test = BN_preparation(data[int(data.shape[0]*0.8):])
train_y = train.Delay.values
train_x = train.drop("Delay", axis=1).values
size = train_x.shape[0]//10
def train_clf(depth):
    clf = RandomForestClassifier(max_depth=50, criterion='entropy')
    for i in range(10):
        clf.fit(train_x[size*i:size*(i+1)], train_y[size*i:size*(i+1)])
   
    print(depth, accuracy_score(clf.predict(test[test.columns[:-1]]),test.Delay))
for i in range(10,80, 5):
    train_clf(i)

In [None]:
# y_pred = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
y_pred = model.predict(test_x[:1000])[:, 3].astype(int)
truth = test_y[:1000].Delay.as_matrix().astype(int).tolist()
print("F1 score : %f" % f1_score(y_pred, truth),
          "Accuracy : %f" % accuracy_score(y_pred, truth),
          "Recall : %f" % recall_score(y_pred, truth),
          "Precision : %f" % precision_score(y_pred, truth))

In [24]:
y_pred = y_pred.astype(int)
truth = test_y[:100].Delay.as_matrix().astype(int).tolist()
print("F1 score : %f" % f1_score(y_pred, truth),
          "Accuracy : %f" % accuracy_score(y_pred, truth),
          "Recall : %f" % recall_score(y_pred, truth),
          "Precision : %f" % precision_score(y_pred, truth))

F1 score : 0.000000 Accuracy : 0.920000 Recall : 0.000000 Precision : 0.000000


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [None]:
np.mean(train_y.Delay)