In [48]:
#import necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from time import time
import re
from numpy.random import normal
from scipy.stats import kurtosis, skew
import math
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import scipy.stats as stats
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import roc_curve, auc, plot_confusion_matrix, mean_squared_error, mean_absolute_error, classification_report, precision_score, recall_score, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from six import StringIO 
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from IPython.display import Image
import pydotplus
import warnings
warnings.filterwarnings('ignore')

flights = pd.read_csv('final_data.csv', index_col=0)

In [2]:
flights.head()

Unnamed: 0,SCHEDULED_DEPARTURE,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,...,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12,DELAYED
0,1340,56,128,1436,-13,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1910,155,867,2145,-12,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,630,110,672,820,-8,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
3,810,100,546,950,152,0,0,0,152,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,1000,498,3904,1318,-2,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1


### Functions Used for Project

In [3]:
#write function to train and predict model, then print score results 

def model_fit(smotex, smotey, trainx, trainy, testx, testy, model):  
    
    model.fit(smotex, smotey)

    test_prediction = model.predict(testx)
    train_prediction = model.predict(trainx)
    
    print("\n")
    
    print("Train Score Results")
    print(classification_report(trainy, train_prediction))       
    print(f'Train Set Accuracy: {accuracy_score(trainy, train_prediction):.4f}')
    print(f'Train Set Precision: {precision_score(trainy, train_prediction):.4f}')
    print(f'Train Set Recall: {recall_score(trainy, train_prediction):.4f}')
    print(f'Train Set F1-Score: {f1_score(trainy, train_prediction):.4f}')

    print("\n")
    
    print("Test Score Results")    
    print(classification_report(testy, test_prediction))       
    print(f'Test Set Accuracy: {accuracy_score(testy, test_prediction):.4f}')
    print(f'Test Set Precision: {precision_score(testy, test_prediction):.4f}')
    print(f'Test Set Recall: {recall_score(testy, test_prediction):.4f}')
    print(f'Test Set F1-Score: {f1_score(testy, test_prediction):.4f}')
    
    print("\n")
    
    fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True)
    trainmatrix = plot_confusion_matrix(model, trainx, trainy, cmap=plt.cm.Blues, ax=ax1)
    testmatrix = plot_confusion_matrix(model, testx, testy, cmap=plt.cm.Blues, ax=ax2)
    ax1.set_title('Train Confusion Matrix')
    ax2.set_title('Test Confusion Matrix')
    plt.tight_layout()
    plt.show()

In [4]:
#Create graph that shows the feature importances per tree 
def plot_feature_importances(model):
    n_features = X_train.shape[1]
    plt.figure(figsize=(8,8))
    plt.barh(range(n_features), model.feature_importances_, align='center') 
    plt.yticks(np.arange(n_features), X_train.columns.values) 
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

In [5]:
# function for plotting a feature importance histogram (very similar to function above - may not need both)
def features_plot(df, title):
    df.columns = ['feature', 'feature_importance']
    sns.barplot(x = 'feature_importance', y = 'feature', data = df, orient = 'h', color = 'blue') \
       .set_title(title, fontsize = 20)

In [6]:
#Function for creating a feature importance dataframe
def features_df(column_names, importances):
    df = pd.DataFrame({'feature': column_names,
                       'feature_importance': importances}) \
           .sort_values('feature_importance', ascending = False) \
           .reset_index(drop = True)
    return df

### Train Test Split

In [49]:
y = flights['DELAYED']
X = flights.drop(['DELAYED'], axis=1)

In [50]:
#Investigate the value counts of our y-variable
y.value_counts(normalize=True)

0    0.6284
1    0.3716
Name: DELAYED, dtype: float64

We see that there is quite a bit of a class imbalance in our target variable; 62.8% of our flights were not delayed while only 37.1% were delayed. As a result, we will apply a SMOTE technique in order to create synthesized data in order to improve our model. 

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Test smote below with category dummies

In [53]:
dummies = ['AIRLINE_Alaska Airlines Inc.',
       'AIRLINE_American Airlines Inc.',
       'AIRLINE_American Eagle Airlines Inc.',
       'AIRLINE_Atlantic Southeast Airlines', 'AIRLINE_Delta Air Lines Inc.',
       'AIRLINE_Frontier Airlines Inc.', 'AIRLINE_Hawaiian Airlines Inc.',
       'AIRLINE_JetBlue Airways', 'AIRLINE_Skywest Airlines Inc.',
       'AIRLINE_Southwest Airlines Co.', 'AIRLINE_Spirit Air Lines',
       'AIRLINE_US Airways Inc.', 'AIRLINE_United Air Lines Inc.',
       'AIRLINE_Virgin America', 'ORIGIN_AIRPORT_TYPE_High',
       'ORIGIN_AIRPORT_TYPE_Low', 'ORIGIN_AIRPORT_TYPE_Medium',
       'ORIGIN_AIRPORT_TYPE_Very Low', 'DESTINATION_AIRPORT_TYPE_High',
       'DESTINATION_AIRPORT_TYPE_Low', 'DESTINATION_AIRPORT_TYPE_Medium',
       'DESTINATION_AIRPORT_TYPE_Very Low', 'DAY_OF_WEEK_1', 'DAY_OF_WEEK_2',
       'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5', 'DAY_OF_WEEK_6',
       'DAY_OF_WEEK_7', 'MONTH_1', 'MONTH_2', 'MONTH_3', 'MONTH_4', 'MONTH_5',
       'MONTH_6', 'MONTH_7', 'MONTH_8', 'MONTH_9', 'MONTH_10', 'MONTH_11',
       'MONTH_12']

In [54]:
X_train[dummies] = X_train[dummies].astype('category')
X_test[dummies] = X_test[dummies].astype('category')

In [56]:
# Previous original class distribution
print('Original class distribution: \n')
print(y.value_counts())
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_sample(X_train, y_train) 
# Preview synthetic sample class distribution
print('-----------------------------------------')
print('Synthetic sample class distribution: \n')
print(pd.Series(y_train_resampled).value_counts()) 

Original class distribution: 

0    6284
1    3716
Name: DELAYED, dtype: int64
-----------------------------------------
Synthetic sample class distribution: 

1    4729
0    4729
Name: DELAYED, dtype: int64


In [65]:
X_train_resampled.isna().sum()

SCHEDULED_DEPARTURE                       0
SCHEDULED_TIME                            0
DISTANCE                                  0
SCHEDULED_ARRIVAL                         0
ARRIVAL_DELAY                             0
AIR_SYSTEM_DELAY                          0
SECURITY_DELAY                            0
AIRLINE_DELAY                             0
LATE_AIRCRAFT_DELAY                       0
WEATHER_DELAY                             0
AIRLINE_Alaska Airlines Inc.             86
AIRLINE_American Airlines Inc.          441
AIRLINE_American Eagle Airlines Inc.    172
AIRLINE_Atlantic Southeast Airlines     268
AIRLINE_Delta Air Lines Inc.            451
AIRLINE_Frontier Airlines Inc.           55
AIRLINE_Hawaiian Airlines Inc.           21
AIRLINE_JetBlue Airways                 170
AIRLINE_Skywest Airlines Inc.           262
AIRLINE_Southwest Airlines Co.          689
AIRLINE_Spirit Air Lines                 71
AIRLINE_US Airways Inc.                  97
AIRLINE_United Air Lines Inc.   

In [69]:
X_train_resampled = X_train_resampled.fillna(value=0)

In [70]:
X_train_resampled.isna().sum()

SCHEDULED_DEPARTURE                     0
SCHEDULED_TIME                          0
DISTANCE                                0
SCHEDULED_ARRIVAL                       0
ARRIVAL_DELAY                           0
AIR_SYSTEM_DELAY                        0
SECURITY_DELAY                          0
AIRLINE_DELAY                           0
LATE_AIRCRAFT_DELAY                     0
WEATHER_DELAY                           0
AIRLINE_Alaska Airlines Inc.            0
AIRLINE_American Airlines Inc.          0
AIRLINE_American Eagle Airlines Inc.    0
AIRLINE_Atlantic Southeast Airlines     0
AIRLINE_Delta Air Lines Inc.            0
AIRLINE_Frontier Airlines Inc.          0
AIRLINE_Hawaiian Airlines Inc.          0
AIRLINE_JetBlue Airways                 0
AIRLINE_Skywest Airlines Inc.           0
AIRLINE_Southwest Airlines Co.          0
AIRLINE_Spirit Air Lines                0
AIRLINE_US Airways Inc.                 0
AIRLINE_United Air Lines Inc.           0
AIRLINE_Virgin America            

### 1. KNN

In [71]:
#Scale the data
scaler = StandardScaler()  
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train_resampled)  
X_test_scaled = scaler.transform(X_test)

In [72]:
#Fit and predict the vanilla model
knnmodel = KNeighborsClassifier()

In [73]:
#Apply the model and print metrics
model_fit(X_train_resampled, y_train_resampled, X_train_scaled, y_train, X_test_scaled, y_test, knnmodel)



Train Score Results


ValueError: Found input variables with inconsistent numbers of samples: [7500, 9458]

### 2. Bayes Classification Model

In [38]:
#Instansiate the vanilla model
bayes = GaussianNB()

In [39]:
bayes.fit(X_train_resampled, y_train_resampled)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
#Apply the modelfit function
model_fit(X_train_resampled, y_train_resampled, X_train, y_train, X_test, y_test, bayes)

### 3. Decision Tree Classifier

In [None]:
#Instansiate the vanilla model 
decisiontree = DecisionTreeClassifier(random_state=42)

In [None]:
#Apply the model fit function 
model_fit(X_train_resampled, y_train_resampled, X_train, y_train, X_test, y_test, decisiontree)

In [None]:
#Code for plotting decision tree


# dot_data = StringIO()
# export_graphviz(decisiontree3, out_file=dot_data,  
#                 special_characters=True)
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# Image(graph.create_png())

### 4. Ensemble Methods

#### a. Bagging Method

In [None]:
#Instansiate the vanilla model
bagged_tree = BaggingClassifier(DecisionTreeClassifier(random_state=42), n_estimators=20)

In [None]:
#Apply the model prediction
model_fit(X_train_resampled, y_train_resampled, X_train, y_train, X_test, y_test, bagged_tree)

#### b. Random Forest

In [None]:
#Instansiate the vanilla model 
randomforest = RandomForestClassifier(random_state=42)

In [None]:
model_fit(X_train_resampled, y_train_resampled, X_train, y_train, X_test, y_test, randomforest)

In [None]:
#Look at another performance measure: mean 3-Fold cross validation score 
mean_rf_cv_score = np.mean(cross_val_score(randomforest, X_train, y_train, cv=3))
mean_rf_cv_score

### 5. Boosting

#### a. Adaboost

In [None]:
#Instansiate the vanilla model
adb = AdaBoostClassifier(random_state=42)

In [None]:
#Fit and predict model and show results
model_fit(X_train_resampled, y_train_resampled, X_train, y_train, X_test, y_test, adb)

In [None]:
#Look at another performance measure: mean 3-Fold cross validation score 
adb_mean_cv = cross_val_score(adb, X, y, cv=3).mean()

print(f"Mean Adaboost Cross-Val Score (k=3): {adb_mean_cv: .2%}")

#### b. Gradient Boosting

In [None]:
#Instansiate the vanilla model
gb = GradientBoostingClassifier(random_state=42)

In [None]:
#Fit and predict model and show results
model_fit(X_train_resampled, y_train_resampled, X_train, y_train, X_test, y_test, gb)

In [None]:
#Look at another performance measure: mean 3-Fold cross validation score 
gb_mean_cv = cross_val_score(gb, X, y, cv=3).mean()

print(f"Mean Adaboost Cross-Val Score (k=3): {gb_mean_cv: .2%}")

#### c. XGBoost

In [None]:
#Instansiate the vanilla model
xgb = XGBClassifier()

In [None]:
#Fit and predict model and show results
model_fit(X_train_resampled, y_train_resampled, X_train, y_train, X_test, y_test, xgb)

### 6. Support Vector Machines

In [None]:
#Instansiate the vanilla model
svclassifier = SVC()

In [None]:
#Fit and predict model and show results
model_fit(X_train_resampled, y_train_resampled, X_train, y_train, X_test, y_test, svclassifier)