In [115]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelBinarizer

In [5]:
train = pd.read_csv("flight_delays_train.csv")
test = pd.read_csv("flight_delays_test.csv")
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [6]:
test.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


In [10]:
X_train, y_train = (train[["Distance", "DepTime"]].values, 
train["dep_delayed_15min"].map({"Y": 1, "N": 0}).values)
X_test = test[["Distance", "DepTime"]].values

X_train_part, X_valid, y_train_part, y_valid = (
    train_test_split(X_train, y_train, test_size=0.3, random_state=17))

scaler = StandardScaler()
X_train_part = scaler.fit_transform(X_train_part)
X_valid = scaler.transform(X_valid)

In [14]:
X_train_part

array([[-0.29753043, -0.40781684],
       [-0.20531696, -1.46458194],
       [-1.09265409, -1.71669289],
       ..., 
       [-0.94476457, -0.82800177],
       [ 1.1657059 ,  0.12581801],
       [-0.76033764, -0.90153413]])

In [17]:
logit = LogisticRegression()
logit.fit(X_train_part, y_train_part)
logit_valid_pred = logit.predict_proba(X_valid)[:, 1]
roc_auc_score(y_valid, logit_valid_pred)

0.67956914653526068

In [19]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

logit.fit(X_train_scaled, y_train)
logit_test_pred = logit.predict_proba(X_test_scaled)[:,1]
pd.Series(logit_test_pred, name="dep_delayed_15min").to_csv('logit_2feat.csv', index_label='id', header=True)

In [96]:
def get_int_value(value):
    if type(value) != str:
        return value
    return int(value.replace("c-", ""))

def to_sin(value, max_value):
    angle = 2. * np.pi * get_int_value(value) / max_value
    return np.sin(angle)

def to_cos(value, max_value):
    angle = 2. * np.pi * get_int_value(value) / max_value
    return np.cos(angle)

def convert_time(t):
    h = t / 100
    m = float(t % 100) / 60
    return h + m

In [99]:
def preprocess(X):
    X["Flight"] = X["Origin"] + "-" + X["Dest"]
    X = X.drop(["Origin", "Dest"], axis=1)
    X["DayOfWeekSin"] = X["DayOfWeek"].apply(lambda x: to_sin(x, 7))
    X["DayOfWeekCos"] = X["DayOfWeek"].apply(lambda x: to_cos(x, 7))
    X = X.drop(["DayOfWeek"], axis=1)
    
    X["DayOfMonthSin"] = X["DayofMonth"].apply(lambda x: to_sin(x, 31))
    X["DayOfMonthCos"] = X["DayofMonth"].apply(lambda x: to_cos(x, 31))
    X = X.drop(["DayofMonth"], axis=1)
    
    X["MonthSin"] = X["Month"].apply(lambda x: to_sin(x, 12))
    X["MonthCos"] = X["Month"].apply(lambda x: to_cos(x, 12))
    X = X.drop(["Month"], axis=1)
    
    X["DepTimeSin"] = X["DepTime"].apply(lambda x: to_sin(convert_time(x), 24))
    X["DepTimeCos"] = X["DepTime"].apply(lambda x: to_cos(convert_time(x), 24))
    X = X.drop(["DepTime"], axis=1)
    
    X = pd.get_dummies(X, columns=["Flight", "UniqueCarrier"])
    
    if "dep_delayed_15min" in X.columns:
        X["dep_delayed_15min"] = X["dep_delayed_15min"].map({"Y": 1, "N":0})
    
    return X

In [100]:
X = preprocess(train.copy())
X.head()

Unnamed: 0,Distance,dep_delayed_15min,DayOfWeekSin,DayOfWeekCos,DayOfMonthSin,DayOfMonthCos,MonthSin,MonthCos,DepTimeSin,DepTimeCos,...,UniqueCarrier_MQ,UniqueCarrier_NW,UniqueCarrier_OH,UniqueCarrier_OO,UniqueCarrier_TZ,UniqueCarrier_UA,UniqueCarrier_US,UniqueCarrier_WN,UniqueCarrier_XE,UniqueCarrier_YV
0,732,0,-2.449294e-16,1.0,-0.897805,-0.440394,-0.866025,-0.5,-0.91706,0.398749,...,0,0,0,0,0,0,0,0,0,0
1,834,0,0.4338837,-0.900969,-0.790776,-0.612106,0.866025,-0.5,-0.838671,-0.544639,...,0,0,0,0,0,0,1,0,0,0
2,416,0,-0.9749279,-0.222521,0.394356,0.918958,-1.0,-1.83697e-16,-0.580703,-0.814116,...,0,0,0,0,0,0,0,0,1,0
3,872,0,-0.7818315,0.62349,-0.937752,0.347305,-0.5,0.8660254,0.442289,-0.896873,...,0,0,0,1,0,0,0,0,0,0
4,423,1,-0.7818315,0.62349,0.988468,0.151428,-0.866025,0.5,-0.992546,0.121869,...,0,0,0,0,0,0,0,1,0,0


In [106]:
X_train, X_test, y_train, y_test = train_test_split(X.drop(["dep_delayed_15min"], axis=1), X["dep_delayed_15min"], test_size=0.3, random_state=17)

In [107]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [112]:
logit = LogisticRegression(n_jobs=-1)
logit.fit(X_train_scaled, y_train)
y_pred = logit.predict_proba(X_test_scaled)[:, 1]

ValueError: continuous format is not supported

In [114]:
roc_auc_score(y_test, y_pred)

0.65758740847224617

In [122]:
cl = XGBClassifier()
cl.fit(X_train, y_train)
y_pred = cl.predict(X_test)
roc_auc_score(y_test, y_pred)

0.51851540473506119