In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action = 'ignore')
import io

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import roc_curve, RocCurveDisplay, roc_auc_score
from sklearn.metrics import plot_confusion_matrix, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline


from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv('./Data/final_table.csv', index_col = 'flightdate')
df.head()

Unnamed: 0_level_0,year,quarter,month,dayofmonth,dayofweek,dot_id_reporting_airline,iata_code_reporting_airline,flight_number_reporting_airline,originairportid,originairportseqid,...,distance,distancegroup,carrierdelay,weatherdelay,nasdelay,securitydelay,lateaircraftdelay,airline,origincity,destcity
flightdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11/06/19,2019,2,6,11,2,20363,9E,3459,10397,1039707,...,192,1,0.0,0.0,0.0,0.0,214.0,Endeavor Air Inc.,Atlanta,Columbia
10/04/18,2018,2,4,10,2,20363,9E,5198,10397,1039707,...,780,4,0.0,0.0,0.0,0.0,0.0,Endeavor Air Inc.,Atlanta,White Plains
01/10/19,2019,4,10,1,2,20363,9E,3358,12953,1295304,...,544,3,4.0,0.0,0.0,0.0,11.0,Endeavor Air Inc.,New York,Charlotte
27/07/18,2018,3,7,27,5,20363,9E,5388,10821,1082106,...,255,2,0.0,0.0,94.0,0.0,18.0,Endeavor Air Inc.,Baltimore,Raleigh/Durham
04/03/19,2019,1,3,4,1,20363,9E,5179,14122,1412202,...,340,2,0.0,27.0,3.0,0.0,0.0,Endeavor Air Inc.,Pittsburgh,New York


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 86761 entries, 11/06/19 to 09/02/18
Data columns (total 49 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   year                             86761 non-null  int64  
 1   quarter                          86761 non-null  int64  
 2   month                            86761 non-null  int64  
 3   dayofmonth                       86761 non-null  int64  
 4   dayofweek                        86761 non-null  int64  
 5   dot_id_reporting_airline         86761 non-null  int64  
 6   iata_code_reporting_airline      86761 non-null  object 
 7   flight_number_reporting_airline  86761 non-null  int64  
 8   originairportid                  86761 non-null  int64  
 9   originairportseqid               86761 non-null  int64  
 10  origincitymarketid               86761 non-null  int64  
 11  originstatefips                  86761 non-null  int64  
 12  originstatena

In [4]:
df = pd.get_dummies(df, columns = ['iata_code_reporting_airline',
                                  'originstatename', 'deststatename', 'airline', 'origincity', 'destcity'])

In [5]:
df.head()

Unnamed: 0_level_0,year,quarter,month,dayofmonth,dayofweek,dot_id_reporting_airline,flight_number_reporting_airline,originairportid,originairportseqid,origincitymarketid,...,destcity_West Yellowstone,destcity_White Plains,destcity_Wichita,destcity_Wichita Falls,destcity_Williston,destcity_Wilmington,destcity_Worcester,destcity_Wrangell,destcity_Yakutat,destcity_Yuma
flightdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11/06/19,2019,2,6,11,2,20363,3459,10397,1039707,30397,...,0,0,0,0,0,0,0,0,0,0
10/04/18,2018,2,4,10,2,20363,5198,10397,1039707,30397,...,0,1,0,0,0,0,0,0,0,0
01/10/19,2019,4,10,1,2,20363,3358,12953,1295304,31703,...,0,0,0,0,0,0,0,0,0,0
27/07/18,2018,3,7,27,5,20363,5388,10821,1082106,30852,...,0,0,0,0,0,0,0,0,0,0
04/03/19,2019,1,3,4,1,20363,5179,14122,1412202,30198,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 86761 entries, 11/06/19 to 09/02/18
Columns: 868 entries, year to destcity_Yuma
dtypes: float64(22), int64(21), uint8(825)
memory usage: 97.4+ MB


### Preprocessing

In [7]:
x = df.drop(['arrdel15'],axis= 1)
y = df.arrdel15

#train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, random_state = 42)


print(x_train.shape, y_train.shape)

(65070, 867) (65070,)


In [8]:
def score(model,X_train, X_test):
    
    """
    This function scores the model by returning the confusion matrix and classification report of the model
    
    """
    
    # Get predictions
    preds = model.predict(X_test)
    cm = confusion_matrix(y_test, preds)
    ConfusionMatrixDisplay(cm, display_labels=['WNV Not Present','WNV Present']).plot(cmap='GnBu')

    print("Training Score:",model.score(X_train,y_train))
    print("Testing Score:",model.score(X_test,y_test))
    print("\n")
    print(classification_report(y_test, preds)) 


In [9]:
def fimportance_df(list_of_features,list_of_feature_scores):
    
    """
    This function returns a dataframe, ranking the top features as the top rows.
    
    """
    abs_list = [abs(i) for i in list_of_feature_scores]
    fimportance = zip(list_of_features, list_of_feature_scores, abs_list)
    df = pd.DataFrame(fimportance,columns=['Feature','Feature_Significance','Feature_Significance_Abs'])
    df = df.sort_values("Feature_Significance_Abs",ascending=False).reset_index(drop=True)
    return df[['Feature','Feature_Significance']]

In [10]:
def fimportance(list_of_features, list_of_feature_scores):

    """
    This function plots the feature importance/coefficients as horizontal bar plot.
    
    It takes in dataframe generated from the fimportance_df function.
    
    """    
    
    # Initialize the matplotlib figure
    df=fimportance_df(list_of_features, list_of_feature_scores)
    
    with sns.axes_style("whitegrid"):
        f, ax = plt.subplots(figsize=(8, 12))
        
        # Plot the features
        sns.set_color_codes("muted")
        hbar = sns.barplot(x="Feature_Significance",
                            y="Feature",
                            data=df,
                            color="b")
        for bar in hbar.patches:
   
        # Using Matplotlib's annotate function and
        # passing the coordinates where the annotation shall be done
        # x-coordinate: bar.get_width()
        # y-coordinate: bar.get_y()+bar.get_height()/2
        # free space to be left to make graph pleasing: (0, 0)
        # ha and va stand for the horizontal and vertical alignment
            hbar.annotate(format(bar.get_width(), '.3f'), # text is width of bar
                      (bar.get_width(), bar.get_y()+bar.get_height()/2), # coordinates of each annotation
                      ha='left', va='center',
                       size=10, xytext=(2, 0),
                       textcoords='offset points')
          # Add axis label
    ax.set(title= "Feature Significance",
           xlabel="Score",
          ylabel="Feature")
    ax.xaxis.get_label().set_fontsize(20)
    ax.yaxis.get_label().set_fontsize(20)
    ax.title.set_fontsize(30)
    sns.despine(left=True, bottom=True)


In [12]:
def roc(model, model_name, X_train, X_test): 
    
    """
    This function generates the ROC/AUC curve of the model.
    
    Due to the imbalanced nature of the data, the preferred metrics for this project is ROC/AUC.
    
    """      
    
    plt.figure(figsize=(16,10))
    
    # Predict probabilities from X_test
    df = pd.DataFrame(model.predict_proba(X_test))
    preds = df.drop(columns=0)
    
    # 
    fpr, tpr, _ = roc_curve(y_test, preds)
    auc = roc_auc_score(y_test, preds)

    df2 = pd.DataFrame(model.predict_proba(X_train))
    preds2 = df2.drop(columns=0)
    fpr2, tpr2, _ = roc_curve(y_train, preds2)
    auc2 = roc_auc_score(y_train, preds2)
    
    with sns.axes_style(style="darkgrid"):
        plt.plot(fpr,tpr,color='orange',label="%s test, auc=%s" %(model_name, format(auc,".3f")))
        plt.plot(fpr2,tpr2,color='blue',label="%s train, auc=%s" %(model_name, format(auc2,".3f")))
        plt.plot(y_test,y_test,color='gray',linestyle=':') #baseline 50%
    plt.legend(loc='lower right')    
    
    sns.despine(left=True,bottom=True)

In [13]:
preprocess_pipe = make_pipeline(MinMaxScaler())

In [14]:
# preprocess data through pipeline for modelling

x_train_t = preprocess_pipe.fit_transform(x_train)
x_test_t = preprocess_pipe.transform(x_test)

pca = PCA(n_components=2)
enn = EditedNearestNeighbours()
smote = SMOTE(random_state=0)
knn = KNeighborsClassifier(n_neighbors=1)

In [15]:
#create pipeline
lr_pipe = make_pipeline(SMOTE(random_state = 42), 
                        LogisticRegression(max_iter=200, random_state = 42)
                       )
    

# define grid params for logistic regression
lr_params = {
    'logisticregression__C': [0.02, 0.05, 0.1],
    'logisticregression__solver' : ['newton-cg','saga', 'sag', 'lbfgs'],
    'logisticregression__multi_class': ['multinomial', 'ovr', 'auto'],
    'logisticregression__penalty':['l1','l2','elasticnet'],
    'logisticregression__l1_ratio':[None, 0.25,0.5,0.75],
    'logisticregression__class_weight':[None, 'balanced']
}
    

# Instantiate GridSearchCV
# Due to imbalanced data, scoring = roc & auc
grid_lr = GridSearchCV(lr_pipe, param_grid = lr_params, verbose = 1, n_jobs=-1, cv=5, scoring='roc_auc')

In [16]:
preprocess_pipe = make_pipeline(MinMaxScaler())

In [17]:
# preprocess data through pipeline for modelling

x_train_t = preprocess_pipe.fit_transform(x_train)
x_test_t = preprocess_pipe.transform(x_test)

In [None]:
grid_lr.fit(x_train_t,y_train)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits
