In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import joblib
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [2]:
data = pd.read_csv("flights_sample.csv")

In [3]:
data = data[["MONTH","DAY","DAY_OF_WEEK","AIRLINE","FLIGHT_NUMBER","DESTINATION_AIRPORT",
             "ORIGIN_AIRPORT","AIR_TIME", "DEPARTURE_TIME","DISTANCE","ARRIVAL_DELAY"]]
data.dropna(inplace=True)

In [4]:
# flight with delay greater than 15 min is considered "DELAYED" at arrival
data["ARRIVAL_DELAY"] = (data["ARRIVAL_DELAY"] > 15) * 1

In [5]:
data.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DESTINATION_AIRPORT,ORIGIN_AIRPORT,AIR_TIME,DEPARTURE_TIME,DISTANCE,ARRIVAL_DELAY
0,12,14,1,EV,6131,ORD,ATW,38.0,1412.0,160,1
1,3,23,1,OO,5292,ORD,DLH,97.0,554.0,397,1
2,1,27,2,UA,1262,LAX,ORD,231.0,1317.0,1744,0
3,12,5,6,UA,756,DEN,SFO,120.0,615.0,967,1
4,5,19,2,DL,1703,TPA,DTW,127.0,1021.0,983,0


In [6]:
# label encoding:
cols = ["AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT", "ORIGIN_AIRPORT"]
le = {}
for col in cols:
    le[col] = LabelEncoder()
    data[col] = le[col].fit_transform(data[col])

In [7]:
data.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DESTINATION_AIRPORT,ORIGIN_AIRPORT,AIR_TIME,DEPARTURE_TIME,DISTANCE,ARRIVAL_DELAY
0,12,14,1,4,6108,533,325,38.0,1412.0,160,1
1,3,23,1,9,5284,533,394,97.0,554.0,397,1
2,1,27,2,10,1258,481,532,231.0,1317.0,1744,0
3,12,5,6,10,753,390,582,120.0,615.0,967,1
4,5,19,2,3,1699,608,397,127.0,1021.0,983,0


In [8]:
# imbalanced target class
data.ARRIVAL_DELAY.value_counts()

0    657066
1    142777
Name: ARRIVAL_DELAY, dtype: int64

In [9]:
# downsampling class 0 to have a balanced target:
delay = data[data.ARRIVAL_DELAY==0].sample(n=142777)
delay = delay.append(data[data.ARRIVAL_DELAY==1])

In [10]:
delay.ARRIVAL_DELAY.value_counts()

1    142777
0    142777
Name: ARRIVAL_DELAY, dtype: int64

In [11]:
X_train, X_test, y_train, y_test = train_test_split(np.array(delay.drop(["ARRIVAL_DELAY"], axis=1)), np.array(delay["ARRIVAL_DELAY"]),
                                                random_state=10, test_size=0.25)

Classification

In [32]:
clf = xgb.XGBClassifier(n_estimators=500, max_depth=10, n_jobs=3)
clf.fit(X_train, y_train)
# rgr = xgb.XGBRegressor(n_estimators=500, learning_rate=0.15, n_jobs=3)
# rgr.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=3, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [33]:
preds = clf.predict(X_test)

  if diff:


In [34]:
metrics.accuracy_score(y_test, preds)

0.7091988961884884

In [35]:
def auc(m, X_train, X_test, y_train, y_test): 
    return (metrics.roc_auc_score(y_train, m.predict_proba(X_train)[:,1]), metrics.roc_auc_score(y_test, m.predict_proba(X_test)[:,1]))

In [36]:
auc(clf, X_train, X_test, y_train, y_test)

(0.9605120505996017, 0.7848869884606319)

In [203]:
# running a test to simulate a call to an api endpoint that would run the model

flo_dico = {'MONTH': 8, 'DAY': 1, 'DAY_OF_WEEK': 6, 'AIRLINE': 'WN', 'FLIGHT_NUMBER': 4330, 
           'DESTINATION_AIRPORT': 'LAX', 'ORIGIN_AIRPORT': 'JFK', 'AIRTIME': 300.0,
           'DEPARTURE_TIME': 1151, 'DISTANCE': 1600}
for item in flo_dico.keys():
    print(item)
    if item in cols:
        flo_dico[item] = le[item].transform([flo_dico[item]])[0]
pred = clf.predict([list(flo_dico.values())])

MONTH
DAY
DAY_OF_WEEK
AIRLINE
FLIGHT_NUMBER
DESTINATION_AIRPORT
ORIGIN_AIRPORT
AIRTIME
DEPARTURE_TIME
DISTANCE


  if diff:


In [204]:
pred

array([1], dtype=int64)

In [205]:
print("ratio of delayed in preds: {}".format(sum(preds) / len(preds)))
print("ratio of delayed in actual: {}".format(sum(y_test) / len(y_test)))

ratio of delayed in preds: 0.49259689868186973
ratio of delayed in actual: 0.49981089523596073


In [208]:
# cross validation:
clf = xgb.XGBClassifier(n_estimators=200, max_depth=20, n_jobs=3)
X = np.array(delay.drop(["ARRIVAL_DELAY"], axis=1))
y = np.array(delay["ARRIVAL_DELAY"])
skf = StratifiedKFold(n_splits=3)
cross_val_score(clf, X, y, cv=skf)

  if diff:
  if diff:
  if diff:


array([0.70847604, 0.70555976, 0.71004581])

In [38]:
# saving the classifier
joblib.dump(clf, 'delay.pkl', compress=True)
# saving the encoder
joblib.dump(le, 'le.pkl', compress=True)

['le.pkl']