In [1]:
import sys
# Add ./source_code to the module search path. The entry is relative, so it is
# resolved against the kernel's working directory when imports run.
# NOTE(review): presumably this is where classification_helpers lives - confirm.
sys.path.append('./source_code')

In [2]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, learning_curve, cross_val_score, \
    KFold, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, \
    roc_auc_score, roc_curve, confusion_matrix
import xgboost as xgb
import classification_helpers as ch

%config InlineBackend.figure_format = 'png'
%matplotlib inline
sns.set(color_codes=True)
plt.style.use('seaborn-colorblind')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.precision', 2)
pd.set_option('display.max_info_columns', 200)

In [3]:
# Load the pickled sample that later cells pass to ch.feature_target_selection
# as the source of features and target.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('pickles/combined_sample_weather_df5.pickle', 'rb') as read_file:
    sample_df = pickle.load(read_file)

In [4]:
# Load the previously saved cv_records_df. In the visible cells it is only
# referenced by a commented-out ch.cv(...) call.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('pickles/cv_records_df.pickle', 'rb') as read_file:
    cv_records_df = pickle.load(read_file)

In [5]:
# Load the previously saved records_df, the table of per-model validation
# metrics that ch.simple_validate receives and returns further down.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open('pickles/records_df.pickle', 'rb') as read_file:
    records_df = pickle.load(read_file)

In [6]:
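# Disabled: uncommenting these lines overwrites pickles/records_df.pickle with
# the current in-memory records_df.
# with open('pickles/records_df.pickle', 'wb') as to_write:
#     pickle.dump(records_df, to_write)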

In [7]:
# Column-name groups used to assemble the model inputs and the target.

# Historical averages keyed by airline, airport, month and weekday.
base_numerical_features = [
    'airline_avg_arrival_delay',
    'origin_airport_avg_flights_per_day',
    'origin_airport_avg_departure_delay',
    'destination_airport_avg_flights_per_day',
    'destination_airport_avg_arrival_delay',
    'month_avg_delay',
    'day_of_week_avg_delay',
]

# Weather measurements exist once per end of the flight: every measure is
# listed for the origin first, then for the destination.
weather_numerical_features = [
    measure + '_' + side
    for side in ('origin', 'destination')
    for measure in ('Precipitation', 'Snow Depth', 'Visibility', 'Wind Speed')
]

# Weather-condition indicator columns, again origin first and then destination.
# Note: this list is defined here but is NOT added to all_features below.
weather_categorical_features = [
    'is_' + condition + '_' + side
    for side in ('origin', 'destination')
    for condition in ('Clear', 'Partially cloudy', 'Rain', 'Snow', 'Overcast')
]

# Airline name columns followed by holiday-period and night-flight flags.
base_categorical_features = [
    'American Airlines Inc.',
    'American Eagle Airlines Inc.',
    'Atlantic Southeast Airlines',
    'Delta Air Lines Inc.',
    'Frontier Airlines Inc.',
    'Hawaiian Airlines Inc.',
    'JetBlue Airways',
    'Skywest Airlines Inc.',
    'Southwest Airlines Co.',
    'Spirit Air Lines',
    'US Airways Inc.',
    'United Air Lines Inc.',
    'Virgin America',
    'is_thanksgiving_period',
    'is_xmas_period',
    'is_night_departure',
    'is_night_arrival',
]

# Single-element list naming the label column.
target = ['is_not_on_time']

# Feature set for this iteration: base numerical, base categorical, then
# numerical weather columns, in that order.
all_features = [
    *base_numerical_features,
    *base_categorical_features,
    *weather_numerical_features,
]

In [8]:
# Pull the selected feature columns and the target out of the sample, then
# split twice: first into train+validation vs. test, then train vs. validation.
X, y = ch.feature_target_selection(all_features, target, sample_df)

# y.values.ravel() flattens the target to a 1-D array before it is split.
# NOTE(review): split proportions and seeding live inside ch.initial_split /
# ch.second_split and are not visible here - confirm they are seeded.
X_train_val, X_test, y_train_val, y_test = ch.initial_split(X, y.values.ravel())
X_train, X_val, y_train, y_val = ch.second_split(X_train_val, y_train_val)

# scaler = StandardScaler()
# X_train_val_scaled = scaler.fit_transform(X_train_val)

# Unregularised logistic regression with a high iteration cap.
# NOTE(review): the string penalty='none' is deprecated in newer scikit-learn
# releases in favour of penalty=None - check the pinned version before upgrading.
lm = LogisticRegression(penalty='none', max_iter=10000)
# Seed the forest so that repeated runs of this cell build the same trees and
# the metrics written to records_df are reproducible.
rf = RandomForestClassifier(random_state=42)

In [14]:
# Gradient-boosted tree classifier: up to 1000 boosting rounds of depth-6 trees
# with a 0.1 learning rate, each round sampling 70% of rows and all columns.
gbm_params = {
    'n_estimators': 1000,
    'max_depth': 6,
    'objective': 'binary:logistic',
    'learning_rate': .1,
    'subsample': .7,
    'min_child_weight': 1,
    'colsample_bytree': 1,
}
gbm = xgb.XGBClassifier(**gbm_params)

# Evaluation order matters: the training split is reported as validation_0 and
# the validation split as validation_1. Early stopping follows the last entry.
eval_set = [(X_train, y_train), (X_val, y_val)]

# Track AUC on both splits each round and stop once validation AUC has not
# improved for 20 consecutive rounds. The fitted model is the cell's output.
gbm.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric='auc',
    early_stopping_rounds=20,
    verbose=True,
)

[0]	validation_0-auc:0.66351	validation_1-auc:0.64686
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 20 rounds.
[1]	validation_0-auc:0.67172	validation_1-auc:0.65390
[2]	validation_0-auc:0.67705	validation_1-auc:0.65875
[3]	validation_0-auc:0.68059	validation_1-auc:0.66291
[4]	validation_0-auc:0.68244	validation_1-auc:0.66469
[5]	validation_0-auc:0.68482	validation_1-auc:0.66568
[6]	validation_0-auc:0.68704	validation_1-auc:0.66768
[7]	validation_0-auc:0.68831	validation_1-auc:0.66801
[8]	validation_0-auc:0.69048	validation_1-auc:0.66880
[9]	validation_0-auc:0.69202	validation_1-auc:0.66970
[10]	validation_0-auc:0.69409	validation_1-auc:0.67133
[11]	validation_0-auc:0.69608	validation_1-auc:0.67208
[12]	validation_0-auc:0.69797	validation_1-auc:0.67305
[13]	validation_0-auc:0.69920	validation_1-auc:0.67431
[14]	validation_0-auc:0.70072	validation_1-auc:0.67572
[15]	validation_0-auc:0.7026

[146]	validation_0-auc:0.78299	validation_1-auc:0.69881
[147]	validation_0-auc:0.78326	validation_1-auc:0.69875
[148]	validation_0-auc:0.78363	validation_1-auc:0.69869
[149]	validation_0-auc:0.78404	validation_1-auc:0.69865
[150]	validation_0-auc:0.78452	validation_1-auc:0.69893
[151]	validation_0-auc:0.78470	validation_1-auc:0.69888
[152]	validation_0-auc:0.78510	validation_1-auc:0.69894
[153]	validation_0-auc:0.78531	validation_1-auc:0.69886
[154]	validation_0-auc:0.78574	validation_1-auc:0.69886
[155]	validation_0-auc:0.78651	validation_1-auc:0.69922
[156]	validation_0-auc:0.78696	validation_1-auc:0.69906
[157]	validation_0-auc:0.78718	validation_1-auc:0.69919
[158]	validation_0-auc:0.78751	validation_1-auc:0.69910
[159]	validation_0-auc:0.78797	validation_1-auc:0.69894
[160]	validation_0-auc:0.78846	validation_1-auc:0.69889
[161]	validation_0-auc:0.78874	validation_1-auc:0.69890
[162]	validation_0-auc:0.78892	validation_1-auc:0.69890
[163]	validation_0-auc:0.78934	validation_1-auc:

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.7,
              tree_method=None, validate_parameters=False, verbosity=None)

In [10]:
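# Disabled cross-validation run. X_train_val_scaled is only produced by the
# commented-out StandardScaler lines in the split cell, so re-enable those first.
# cv_records_df = ch.cv(gbm, X_train_val_scaled, y_train_val, cv_records_df)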

In [16]:
# Evaluate gbm on the train/validation split at a 0.5 decision threshold and
# rebind records_df to the second return value; the first is discarded.
# NOTE(review): the helper's internals are not visible here. scale=True
# presumably standardises the features (and may refit the model) - confirm,
# since gbm was fitted above on unscaled X_train.
# NOTE(review): the printed AUC (0.61 train / 0.56 val) is far below the AUC
# logged during fitting (~0.78 / ~0.70) - check how simple_validate computes it.
# NOTE(review): rows 0 and 1 of the records table are identical, which suggests
# each run appends a row - re-running this cell may add duplicates.
_, records_df = ch.simple_validate(gbm, X_train, X_val,
                                   y_train, y_val,
                                   records_df, threshold=0.5,
                                   scale=True)

Train F1:                                 0.37
Val F1:                                   0.22
Train precision:                          0.89
Val precision:                            0.61
Train recall:                             0.23
Val recall:                               0.14
Train accuracy:                           0.84
Val accuracy:                             0.81
Train AUC:                                0.61
Val AUC:                                  0.56
Validation type: simple
Iteration description: testing
Feature engineering: 


In [17]:
# Display the accumulated validation records, one row per recorded model run.
records_df

Unnamed: 0,model,validation_type,iteration_desc,feature_engineering,hyperparameter_tuning,train_f1,val_f1,train_precision,val_precision,train_recall,val_recall,train_accuracy,val_accuracy,train_AUC,val_AUC
0,LogisticRegression,simple,testing,,"max_iter=10000, penalty='none'",0.11,0.11,0.6,0.59,0.06,0.06,0.8,0.8,0.53,0.53
1,LogisticRegression,simple,testing,,"max_iter=10000, penalty='none'",0.11,0.11,0.6,0.59,0.06,0.06,0.8,0.8,0.53,0.53
2,RandomForestClassifier,simple,testing,,,0.97,0.21,0.99,0.58,0.94,0.13,0.99,0.81,0.97,0.55
3,XGBClassifier,simple,testing,,"base_score=0.5, booster=None, colsample_byleve...",0.37,0.23,0.89,0.61,0.23,0.14,0.84,0.81,0.61,0.56


In [19]:
# Print the untruncated cell at row position 3, column position 4. In the
# table displayed above, that is the XGBClassifier row's hyperparameter_tuning.
print(records_df.iat[3, 4])

base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.7,
              tree_method=None, validate_parameters=False, verbosity=None
