In [83]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib.lines as mlines
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_predict, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV 
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelBinarizer
from bs4 import BeautifulSoup
from requests import get
import re

In [21]:
train = pd.read_csv("flight_delays_train.csv")
test = pd.read_csv("flight_delays_test.csv")
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [3]:
X_train, y_train = (train[["Distance", "DepTime"]].values, 
train["dep_delayed_15min"].map({"Y": 1, "N": 0}).values)
X_test = test[["Distance", "DepTime"]].values

X_train_part, X_valid, y_train_part, y_valid = (
    train_test_split(X_train, y_train, test_size=0.3, random_state=17))

scaler = StandardScaler()
X_train_part = scaler.fit_transform(X_train_part)
X_valid = scaler.transform(X_valid)

In [4]:
logit = LogisticRegression()
logit.fit(X_train_part, y_train_part)
logit_valid_pred = logit.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, logit_valid_pred)

0.6795691465352607

In [5]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

logit.fit(X_train_scaled, y_train)
logit_test_pred = logit.predict_proba(X_test_scaled)[:,1]
pd.Series(logit_test_pred, name="dep_delayed_15min").to_csv('logit_2feat.csv', index_label='id', header=True)

## The second benchmark in the leaderboard was achieved as follows:

Features Distance and DepTime were taken unchanged

A feature Flight was created from features Origin and Dest

Features Month, DayofMonth, DayOfWeek, UniqueCarrier and Flight were transformed with OHE (LabelBinarizer)
A holdout set was allocated

Logistic regression and gradient boosting (xgboost) were trained. The hyperparameters of xgboost were tuned with cross-validation. First, the hyperparameters responsible for the complexity of the model were optimized, then the number of trees was fixed at 500 and the gradient descent step was tuned.

Predictions of models (predicted probabilities) were made with cross-validation using cross_val_predict. A linear mixture of logistic regression and gradient boosting responses was set in the form  w1∗plogit+(1−w1)∗pxgbw1∗plogit+(1−w1)∗pxgb , where  plogitplogit is a probability of class 1, predicted by logistic regression, and  pxgbpxgb - by xgboost.  w1w1  weight was selected manually.

A similar combination of responses of the two models was taken as a prediction on the test set, but this time the models were trained on the hole train set.

Following the same steps is not mandatory. That’s just a description of how the result was achieved by the author of this assignment. Perhaps you might not want to follow the same steps, and instead, let’s say will add a couple of good features and train a random forest of a thousand trees.

Good luck!

In [22]:
train['Flight'] = train['Origin'].map(str)+'-'+train['Dest'].map(str)

In [34]:
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,Flight
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N,ATL-DFW
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N,PIT-MCO
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N,RDU-CLE
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N,DEN-MEM
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y,MDW-OMA


In [30]:
train_transfotmed = train[["Month", "DayofMonth", "DayOfWeek", "DepTime", "UniqueCarrier", "Flight"]].copy()

In [31]:
data = pd.get_dummies(train_transfotmed)

In [32]:
data.head()

Unnamed: 0,DepTime,Month_c-1,Month_c-10,Month_c-11,Month_c-12,Month_c-2,Month_c-3,Month_c-4,Month_c-5,Month_c-6,...,Flight_XNA-IAH,Flight_XNA-LAX,Flight_XNA-LGA,Flight_XNA-ORD,Flight_XNA-SLC,Flight_YAK-CDV,Flight_YAK-JNU,Flight_YUM-IPL,Flight_YUM-LAX,Flight_YUM-PHX
0,1934,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1548,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1422,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1015,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1828,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
data.shape

(100000, 4502)

In [33]:
for row in data.columns:
    print(row)

DepTime
Month_c-1
Month_c-10
Month_c-11
Month_c-12
Month_c-2
Month_c-3
Month_c-4
Month_c-5
Month_c-6
Month_c-7
Month_c-8
Month_c-9
DayofMonth_c-1
DayofMonth_c-10
DayofMonth_c-11
DayofMonth_c-12
DayofMonth_c-13
DayofMonth_c-14
DayofMonth_c-15
DayofMonth_c-16
DayofMonth_c-17
DayofMonth_c-18
DayofMonth_c-19
DayofMonth_c-2
DayofMonth_c-20
DayofMonth_c-21
DayofMonth_c-22
DayofMonth_c-23
DayofMonth_c-24
DayofMonth_c-25
DayofMonth_c-26
DayofMonth_c-27
DayofMonth_c-28
DayofMonth_c-29
DayofMonth_c-3
DayofMonth_c-30
DayofMonth_c-31
DayofMonth_c-4
DayofMonth_c-5
DayofMonth_c-6
DayofMonth_c-7
DayofMonth_c-8
DayofMonth_c-9
DayOfWeek_c-1
DayOfWeek_c-2
DayOfWeek_c-3
DayOfWeek_c-4
DayOfWeek_c-5
DayOfWeek_c-6
DayOfWeek_c-7
UniqueCarrier_AA
UniqueCarrier_AQ
UniqueCarrier_AS
UniqueCarrier_B6
UniqueCarrier_CO
UniqueCarrier_DH
UniqueCarrier_DL
UniqueCarrier_EV
UniqueCarrier_F9
UniqueCarrier_FL
UniqueCarrier_HA
UniqueCarrier_HP
UniqueCarrier_MQ
UniqueCarrier_NW
UniqueCarrier_OH
UniqueCarrier_OO
UniqueCarrie

Flight_DCA-CVG
Flight_DCA-DEN
Flight_DCA-DFW
Flight_DCA-DTW
Flight_DCA-EWR
Flight_DCA-FLL
Flight_DCA-HSV
Flight_DCA-IAH
Flight_DCA-IND
Flight_DCA-JAN
Flight_DCA-JAX
Flight_DCA-JFK
Flight_DCA-LAS
Flight_DCA-LAX
Flight_DCA-LEX
Flight_DCA-LGA
Flight_DCA-MCI
Flight_DCA-MCO
Flight_DCA-MDW
Flight_DCA-MEM
Flight_DCA-MHT
Flight_DCA-MIA
Flight_DCA-MLB
Flight_DCA-MSP
Flight_DCA-MSY
Flight_DCA-ORD
Flight_DCA-PBI
Flight_DCA-PHL
Flight_DCA-PHX
Flight_DCA-PIT
Flight_DCA-PVD
Flight_DCA-PWM
Flight_DCA-RDU
Flight_DCA-ROC
Flight_DCA-RSW
Flight_DCA-SEA
Flight_DCA-SLC
Flight_DCA-STL
Flight_DCA-SYR
Flight_DCA-TPA
Flight_DEN-ABQ
Flight_DEN-ANC
Flight_DEN-ASE
Flight_DEN-ATL
Flight_DEN-AUS
Flight_DEN-BHM
Flight_DEN-BIL
Flight_DEN-BIS
Flight_DEN-BNA
Flight_DEN-BOI
Flight_DEN-BOS
Flight_DEN-BUR
Flight_DEN-BWI
Flight_DEN-BZN
Flight_DEN-CAK
Flight_DEN-CID
Flight_DEN-CLE
Flight_DEN-CLT
Flight_DEN-CMH
Flight_DEN-COD
Flight_DEN-COS
Flight_DEN-CPR
Flight_DEN-CVG
Flight_DEN-DAY
Flight_DEN-DCA
Flight_DEN-DFW
Flight_DEN

Flight_IND-PHL
Flight_IND-PHX
Flight_IND-PIE
Flight_IND-RSW
Flight_IND-SEA
Flight_IND-SFO
Flight_IND-SRQ
Flight_IND-TPA
Flight_IPL-LAX
Flight_IPL-SAN
Flight_IPL-YUM
Flight_ISO-ATL
Flight_ISP-ATL
Flight_ISP-BNA
Flight_ISP-BWI
Flight_ISP-CLE
Flight_ISP-CVG
Flight_ISP-FLL
Flight_ISP-LAS
Flight_ISP-MCO
Flight_ISP-MDW
Flight_ISP-PBI
Flight_ISP-RSW
Flight_ISP-TPA
Flight_ITO-HNL
Flight_ITO-OGG
Flight_IYK-LAX
Flight_JAC-ATL
Flight_JAC-BIL
Flight_JAC-DEN
Flight_JAC-DFW
Flight_JAC-MSP
Flight_JAC-ORD
Flight_JAC-SLC
Flight_JAN-ATL
Flight_JAN-BWI
Flight_JAN-CVG
Flight_JAN-DCA
Flight_JAN-DFW
Flight_JAN-HOU
Flight_JAN-IAH
Flight_JAN-MCO
Flight_JAN-MDW
Flight_JAN-MEM
Flight_JAX-ATL
Flight_JAX-BHM
Flight_JAX-BNA
Flight_JAX-BOS
Flight_JAX-BWI
Flight_JAX-CLE
Flight_JAX-CLT
Flight_JAX-CVG
Flight_JAX-DCA
Flight_JAX-DFW
Flight_JAX-DTW
Flight_JAX-EWR
Flight_JAX-FLL
Flight_JAX-HOU
Flight_JAX-IAD
Flight_JAX-IAH
Flight_JAX-IND
Flight_JAX-JFK
Flight_JAX-LGA
Flight_JAX-MSP
Flight_JAX-MSY
Flight_JAX-ORD
Flight_JAX

Flight_OGG-SAN
Flight_OGG-SEA
Flight_OGG-SFO
Flight_OGG-SJC
Flight_OGG-SLC
Flight_OGG-SNA
Flight_OKC-ATL
Flight_OKC-CVG
Flight_OKC-DAL
Flight_OKC-DEN
Flight_OKC-DFW
Flight_OKC-DTW
Flight_OKC-EWR
Flight_OKC-HOU
Flight_OKC-IAH
Flight_OKC-LAS
Flight_OKC-MCI
Flight_OKC-MCO
Flight_OKC-MEM
Flight_OKC-ORD
Flight_OKC-PHX
Flight_OKC-SLC
Flight_OKC-STL
Flight_OMA-ATL
Flight_OMA-CVG
Flight_OMA-DEN
Flight_OMA-DFW
Flight_OMA-EWR
Flight_OMA-IAH
Flight_OMA-LAS
Flight_OMA-MDW
Flight_OMA-MSP
Flight_OMA-ORD
Flight_OMA-PHX
Flight_OMA-SLC
Flight_OMA-STL
Flight_OME-ANC
Flight_OME-OTZ
Flight_ONT-ATL
Flight_ONT-BNA
Flight_ONT-DEN
Flight_ONT-DFW
Flight_ONT-IAH
Flight_ONT-JFK
Flight_ONT-LAS
Flight_ONT-LAX
Flight_ONT-MSP
Flight_ONT-OAK
Flight_ONT-PDX
Flight_ONT-PHX
Flight_ONT-SEA
Flight_ONT-SFO
Flight_ONT-SJC
Flight_ONT-SLC
Flight_ONT-SMF
Flight_ORD-ABE
Flight_ORD-ABQ
Flight_ORD-ALB
Flight_ORD-ANC
Flight_ORD-ATL
Flight_ORD-ATW
Flight_ORD-AUS
Flight_ORD-AVP
Flight_ORD-AZO
Flight_ORD-BDL
Flight_ORD-BHM
Flight_ORD

Flight_SLC-JAC
Flight_SLC-JFK
Flight_SLC-KOA
Flight_SLC-LAS
Flight_SLC-LAX
Flight_SLC-LGA
Flight_SLC-LGB
Flight_SLC-LIT
Flight_SLC-LWS
Flight_SLC-MCI
Flight_SLC-MCO
Flight_SLC-MDW
Flight_SLC-MEM
Flight_SLC-MFR
Flight_SLC-MIA
Flight_SLC-MKE
Flight_SLC-MRY
Flight_SLC-MSO
Flight_SLC-MSP
Flight_SLC-MSY
Flight_SLC-OAK
Flight_SLC-OGG
Flight_SLC-OKC
Flight_SLC-OMA
Flight_SLC-ONT
Flight_SLC-ORD
Flight_SLC-PDX
Flight_SLC-PHL
Flight_SLC-PHX
Flight_SLC-PIH
Flight_SLC-PSC
Flight_SLC-PSP
Flight_SLC-RAP
Flight_SLC-RDM
Flight_SLC-RDU
Flight_SLC-RNO
Flight_SLC-SAN
Flight_SLC-SAT
Flight_SLC-SBA
Flight_SLC-SEA
Flight_SLC-SFO
Flight_SLC-SGF
Flight_SLC-SGU
Flight_SLC-SJC
Flight_SLC-SMF
Flight_SLC-SNA
Flight_SLC-STL
Flight_SLC-SUN
Flight_SLC-TPA
Flight_SLC-TUL
Flight_SLC-TUS
Flight_SLC-TWF
Flight_SLC-WYS
Flight_SLC-XNA
Flight_SMF-ACV
Flight_SMF-ATL
Flight_SMF-BUR
Flight_SMF-DEN
Flight_SMF-DFW
Flight_SMF-HNL
Flight_SMF-IAD
Flight_SMF-IAH
Flight_SMF-JFK
Flight_SMF-LAS
Flight_SMF-LAX
Flight_SMF-LGB
Flight_SMF

In [39]:
X_train_part_a, X_valid_a, y_train_part_a, y_valid_a = (
    train_test_split(data, \
                     train["dep_delayed_15min"].map({"Y": 1, "N": 0}).values, \
                     test_size=0.3, \
                     random_state=17))

In [75]:
logit = LogisticRegression()
# logit.fit(X_train_part_a, y_train_part_a)
# logit_valid_pred_a = logit.predict_proba(X_valid_a)[:, 1]
# roc_auc_score(y_valid_a, logit_valid_pred_a)

In [77]:
cross_val_predict??

In [79]:
cross_val_predict(logit, X_train_part_a, y_train_part_a, method="predict_proba")[:, 1]

array([0.09471167, 0.10012498, 0.01202324, ..., 0.16593917, 0.2431678 ,
       0.13067912])

In [84]:
logit = LogisticRegressionCV(
    Cs=list(np.power(10.0, np.arange(-10, 10)))
    ,penalty='l2'
    ,scoring='roc_auc'
    ,cv=3
    ,random_state=77
    ,max_iter=1000)
logit.fit(X_train_part_a, y_train_part_a)
y_pred_logit = logit.predict(X_valid_a)
roc_auc_score(y_pred_logit, y_valid_a)

0.7750159747455962

In [89]:
params = {
    "max_depth": [5,6,7],
    "n_estimators": [10],
   "subsample": [0.8, 0.9, 1],
   "colsample_bytree": [0.8, 0.9, 1]
}

In [None]:
clf = GridSearchCV(xgb, params, n_jobs=-1)
clf.fit(X_train_part_a, y_train_part_a)
print(clf.best_params_)
y_pred_xgb = clf.predict(X_valid_a)
roc_auc_score(y_pred_xgb, y_valid_a)