In [1]:
# Add ./source_code to the module search path so project-local modules can be
# imported (presumably where `classification_helpers` lives — confirm).
# The path is relative, so it resolves against the kernel's working directory.
import sys
sys.path.append('./source_code')

In [2]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score, \
    KFold, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, \
    roc_auc_score, roc_curve, confusion_matrix
import xgboost as xgb
import classification_helpers as ch

%config InlineBackend.figure_format = 'png'
%matplotlib inline
sns.set(color_codes=True)
plt.style.use('seaborn-colorblind')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.precision', 2)
pd.set_option('display.max_info_columns', 200)

In [3]:
# Load the sample data object that features and target are selected from later.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a
# trusted source.
with open('pickles/combined_sample_weather_df5.pickle', mode='rb') as sample_file:
    sample_df = pickle.load(sample_file)

In [4]:
# Disabled: would load a separate `cv_records_df` from its own pickle.
# Nothing in the visible cells reads `cv_records_df` (presumably superseded by
# `records_df` — confirm before deleting).
# with open('pickles/cv_records_df.pickle', 'rb') as read_file:
#     cv_records_df = pickle.load(read_file)

In [5]:
# Load the running results table that the validation helpers take as input
# and return updated.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a
# trusted source.
with open('pickles/records_df.pickle', mode='rb') as records_file:
    records_df = pickle.load(records_file)

In [6]:
# Disabled checkpoint: uncomment to write `records_df` back to
# 'pickles/records_df.pickle' — the same file the notebook loads it from —
# overwriting the stored copy.
# with open('pickles/records_df.pickle', 'wb') as to_write:
#     pickle.dump(records_df, to_write)

In [7]:
# Column-name groups used to assemble the model's feature matrix.

# Historical-average numeric columns.
base_numerical_features = [
    'airline_avg_arrival_delay',
    'origin_airport_avg_flights_per_day',
    'origin_airport_avg_departure_delay',
    'destination_airport_avg_flights_per_day',
    'destination_airport_avg_arrival_delay',
    'month_avg_delay',
    'day_of_week_avg_delay',
]

# Weather columns come in pairs: one per flight endpoint, distinguished by an
# '_origin' / '_destination' suffix. All origin columns are listed first,
# followed by all destination columns.
weather_numerical_features = [
    measure + '_' + endpoint
    for endpoint in ('origin', 'destination')
    for measure in ('Precipitation', 'Snow Depth', 'Visibility', 'Wind Speed')
]

weather_categorical_features = [
    'is_' + condition + '_' + endpoint
    for endpoint in ('origin', 'destination')
    for condition in ('Clear', 'Partially cloudy', 'Rain', 'Snow', 'Overcast')
]

# Airline-name columns followed by holiday-period and night-time flags.
base_categorical_features = [
    'American Airlines Inc.',
    'American Eagle Airlines Inc.',
    'Atlantic Southeast Airlines',
    'Delta Air Lines Inc.',
    'Frontier Airlines Inc.',
    'Hawaiian Airlines Inc.',
    'JetBlue Airways',
    'Skywest Airlines Inc.',
    'Southwest Airlines Co.',
    'Spirit Air Lines',
    'US Airways Inc.',
    'United Air Lines Inc.',
    'Virgin America',
    'is_thanksgiving_period',
    'is_xmas_period',
    'is_night_departure',
    'is_night_arrival',
]

target = ['is_not_on_time']

# Final feature set, in the order the columns are handed to the model.
# NOTE(review): weather_categorical_features is defined above but is not part
# of this list — confirm whether that exclusion is intentional.
all_features = [
    *base_numerical_features,
    *base_categorical_features,
    *weather_numerical_features,
]

In [8]:
# Pull the feature columns and the target out of the sample using the
# project helper.
X, y = ch.feature_target_selection(all_features, target, sample_df)

# Two-stage split via project helpers: first hold out a test set, then split
# the remainder into train and validation. The target is passed as
# `y.values.ravel()` (presumably flattening a single-column frame to 1-D —
# TODO confirm). Split sizes and seeding live inside `ch` and are not visible here.
X_train_val, X_test, y_train_val, y_test = ch.initial_split(X, y.values.ravel())
X_train, X_val, y_train, y_val = ch.second_split(X_train_val, y_train_val)

# Logistic regression with no regularisation penalty.
# NOTE(review): the string 'none' is deprecated in newer scikit-learn releases
# in favour of `penalty=None`; older releases only accept the string. Check the
# installed version before changing it.
lm = LogisticRegression(penalty='none', max_iter=10000)
# NOTE(review): no random_state is passed, so random-forest results are not
# reproducible between runs.
rf = RandomForestClassifier()

In [9]:
# Disabled XGBoost experiment, kept for reference: a gradient-boosted
# classifier fitted with early stopping while tracking AUC on the train and
# validation splits.
# NOTE(review): newer xgboost releases expect `eval_metric` and
# `early_stopping_rounds` on the XGBClassifier constructor rather than on
# `fit()` — check the installed version before re-enabling this cell.
# gbm = xgb.XGBClassifier(
#     n_estimators=1000, 
#     max_depth=6,
#     objective='binary:logistic',
#     learning_rate=.1,
#     subsample=.7,
#     min_child_weight=1,
#     colsample_bytree=1
# )

# eval_set = [(X_train, y_train), (X_val, y_val)]

# gbm.fit(
#     X_train, y_train, 
#     eval_set=eval_set,
#     eval_metric='auc',
#     early_stopping_rounds=20,
#     verbose=True
# )

In [10]:
# Run the project's cross-validation helper for the logistic regression on the
# combined train+validation split (scale=True) and keep the returned records table.
# NOTE(review): records_df is both an input and the assignment target, so
# re-running this cell presumably adds another row each time — confirm in ch.cv.
records_df = ch.cv(lm, X_train_val, y_train_val, records_df, scale=True)

CROSS VALIDATION
Model name: LogisticRegression
Hyperparameters: max_iter=10000, penalty='none'

Train F1:                                 0.11
Val F1:                                   0.11
Train precision:                          0.60
Val precision:                            0.60
Train recall:                             0.06
Val recall:                               0.06
Train accuracy:                           0.80
Val accuracy:                             0.80
Train AUC:                                0.67
Val AUC:                                  0.67

Feature coefficients:

airline_avg_arrival_delay                 0.08
origin_airport_avg_flights_per_day        0.02
origin_airport_avg_departure_delay        0.12
destination_airport_avg_flights_per_day   0.05
destination_airport_avg_arrival_delay     0.12
month_avg_delay                           0.22
day_of_week_avg_delay                     0.05
American Airlines Inc.                   -0.02
American Eagle Airlines Inc.     

In [9]:
# Single train/validation evaluation of the logistic regression at a 0.5
# decision threshold. The helper's first return value is discarded; the second
# replaces the records table.
# NOTE(review): records_df is both an input and the assignment target, so this
# cell is not idempotent; re-run the notebook top to bottom before trusting the
# saved outputs.
_, records_df = ch.simple_validate(
    lm,
    X_train, X_val,
    y_train, y_val,
    records_df,
    threshold=0.5,
    scale=True,
)

SIMPLE VALIDATION
Model name: LogisticRegression
Hyperparameters: max_iter=10000, penalty='none'

Train F1:                                 0.11
Val F1:                                   0.11
Train precision:                          0.60
Val precision:                            0.59
Train recall:                             0.06
Val recall:                               0.06
Train accuracy:                           0.80
Val accuracy:                             0.80
Train AUC:                                0.52
Val AUC:                                  0.52

Feature coefficients:

airline_avg_arrival_delay                 0.07
origin_airport_avg_flights_per_day        0.03
origin_airport_avg_departure_delay        0.12
destination_airport_avg_flights_per_day   0.04
destination_airport_avg_arrival_delay     0.12
month_avg_delay                           0.23
day_of_week_avg_delay                     0.04
American Airlines Inc.                   -0.03
American Eagle Airlines Inc.    

In [11]:
# Display the accumulated results table (one row per recorded validation run).
records_df

Unnamed: 0,model,validation_type,iteration_desc,feature_engineering,hyperparameter_tuning,train_f1,val_f1,train_precision,val_precision,train_recall,val_recall,train_accuracy,val_accuracy,train_AUC,val_AUC
0,LogisticRegression,simple,testing,,"max_iter=10000, penalty='none'",0.11,0.11,0.6,0.59,0.06,0.06,0.8,0.8,0.53,0.53
1,LogisticRegression,simple,testing,,"max_iter=10000, penalty='none'",0.11,0.11,0.6,0.59,0.06,0.06,0.8,0.8,0.53,0.53
2,RandomForestClassifier,simple,testing,,,0.97,0.21,0.99,0.58,0.94,0.13,0.99,0.81,0.97,0.55
3,XGBClassifier,simple,testing,,"base_score=0.5, booster=None, colsample_byleve...",0.37,0.23,0.89,0.61,0.23,0.14,0.84,0.81,0.61,0.56
4,LogisticRegression,cv,testing,,"max_iter=10000, penalty='none'",0.11,0.11,0.6,0.6,0.06,0.06,0.8,0.8,0.67,0.67
