In [12]:
#import necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from time import time
import re
from numpy.random import normal
from scipy.stats import kurtosis, skew
import math
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import scipy.stats as stats
from sklearn import tree
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import roc_curve, auc, plot_confusion_matrix, mean_squared_error, mean_absolute_error, classification_report, precision_score, recall_score, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from six import StringIO 
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from sklearn.pipeline import Pipeline
from IPython.display import Image
import pydotplus
import warnings
warnings.filterwarnings('ignore')

flights = pd.read_csv('test.csv', index_col=0)

In [2]:
#write function to train and predict model, then print score results 

def model_fit(smotex, smotey, trainx, trainy, testx, testy, model):  
    
    model.fit(smotex, smotey)

    test_prediction = model.predict(testx)
    train_prediction = model.predict(trainx)
    
    print("\n")
    
    print("Train Score Results")
    print(classification_report(trainy, train_prediction))       
    print(f'Train Set Accuracy: {accuracy_score(trainy, train_prediction):.4f}')
    print(f'Train Set Precision: {precision_score(trainy, train_prediction):.4f}')
    print(f'Train Set Recall: {recall_score(trainy, train_prediction):.4f}')
    print(f'Train Set F1-Score: {f1_score(trainy, train_prediction):.4f}')

    print("\n")
    
    print("Test Score Results")    
    print(classification_report(testy, test_prediction))       
    print(f'Test Set Accuracy: {accuracy_score(testy, test_prediction):.4f}')
    print(f'Test Set Precision: {precision_score(testy, test_prediction):.4f}')
    print(f'Test Set Recall: {recall_score(testy, test_prediction):.4f}')
    print(f'Test Set F1-Score: {f1_score(testy, test_prediction):.4f}')
    
    print("\n")
    
    fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True)
    trainmatrix = plot_confusion_matrix(model, trainx, trainy, cmap=plt.cm.Blues, ax=ax1)
    testmatrix = plot_confusion_matrix(model, testx, testy, cmap=plt.cm.Blues, ax=ax2)
    ax1.set_title('Train Confusion Matrix')
    ax2.set_title('Test Confusion Matrix')
    plt.tight_layout()
    plt.show()

In [3]:
flights.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 23 columns):
MONTH                       10000 non-null int64
DAY_OF_WEEK                 10000 non-null int64
AIRLINE_CODE                10000 non-null object
ORIGIN_AIRPORT_CODE         10000 non-null object
DESTINATION_AIRPORT_CODE    10000 non-null object
SCHEDULED_DEPARTURE         10000 non-null int64
SCHEDULED_TIME              10000 non-null int64
DISTANCE                    10000 non-null int64
SCHEDULED_ARRIVAL           10000 non-null int64
ARRIVAL_DELAY               10000 non-null int64
AIR_SYSTEM_DELAY            10000 non-null int64
SECURITY_DELAY              10000 non-null int64
AIRLINE_DELAY               10000 non-null int64
LATE_AIRCRAFT_DELAY         10000 non-null int64
WEATHER_DELAY               10000 non-null int64
AIRLINE                     10000 non-null object
AIRPORT_CODE                10000 non-null object
CITY                        10000 non-null object
STAT

In [3]:
category = ['AIRLINE_CODE', 'ORIGIN_AIRPORT_CODE', 'DESTINATION_AIRPORT_CODE',
           'AIRPORT_CODE', 'CITY', 'STATE', 'COUNTRY']
flights[category]=flights[category].astype('str')

In [4]:
floats = ['MONTH', 'DAY_OF_WEEK', 'SCHEDULED_DEPARTURE', 'SCHEDULED_TIME',
       'DISTANCE', 'SCHEDULED_ARRIVAL', 'ARRIVAL_DELAY', 'AIR_SYSTEM_DELAY',
       'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
       'WEATHER_DELAY']
flights[floats]=flights[floats].astype('int')

In [5]:
dummies = ['AIRLINE', 
            'ORIGIN_AIRPORT_TYPE', 'DESTINATION_AIRPORT_TYPE']
flights[dummies]=flights[dummies].astype('category')

In [6]:
flights = flights.drop(['AIRLINE_CODE', 'AIRPORT_CODE', 'ORIGIN_AIRPORT_CODE', 'DESTINATION_AIRPORT_CODE',
                      'CITY', 'STATE', 'COUNTRY'], axis=1)

In [7]:
flights.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 16 columns):
MONTH                       10000 non-null int64
DAY_OF_WEEK                 10000 non-null int64
SCHEDULED_DEPARTURE         10000 non-null int64
SCHEDULED_TIME              10000 non-null int64
DISTANCE                    10000 non-null int64
SCHEDULED_ARRIVAL           10000 non-null int64
ARRIVAL_DELAY               10000 non-null int64
AIR_SYSTEM_DELAY            10000 non-null int64
SECURITY_DELAY              10000 non-null int64
AIRLINE_DELAY               10000 non-null int64
LATE_AIRCRAFT_DELAY         10000 non-null int64
WEATHER_DELAY               10000 non-null int64
AIRLINE                     10000 non-null category
ORIGIN_AIRPORT_TYPE         10000 non-null category
DESTINATION_AIRPORT_TYPE    10000 non-null category
DELAYED                     10000 non-null int64
dtypes: category(3), int64(13)
memory usage: 1.1 MB


In [8]:
y = flights['DELAYED']
X = flights.drop(['DELAYED'], axis=1)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7500 entries, 4901 to 7270
Data columns (total 15 columns):
MONTH                       7500 non-null int64
DAY_OF_WEEK                 7500 non-null int64
SCHEDULED_DEPARTURE         7500 non-null int64
SCHEDULED_TIME              7500 non-null int64
DISTANCE                    7500 non-null int64
SCHEDULED_ARRIVAL           7500 non-null int64
ARRIVAL_DELAY               7500 non-null int64
AIR_SYSTEM_DELAY            7500 non-null int64
SECURITY_DELAY              7500 non-null int64
AIRLINE_DELAY               7500 non-null int64
LATE_AIRCRAFT_DELAY         7500 non-null int64
WEATHER_DELAY               7500 non-null int64
AIRLINE                     7500 non-null category
ORIGIN_AIRPORT_TYPE         7500 non-null category
DESTINATION_AIRPORT_TYPE    7500 non-null category
dtypes: category(3), int64(12)
memory usage: 784.8 KB


In [13]:
X_train.shape

(7500, 15)

# test smote THEN ohe - issue with string variables

In [10]:
# Previous original class distribution
print('Original class distribution: \n')
print(y.value_counts())
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_sample(X_train, y_train) 
# Preview synthetic sample class distribution
print('-----------------------------------------')
print('Synthetic sample class distribution: \n')
print(pd.Series(y_train_resampled).value_counts()) 

Original class distribution: 

0    6284
1    3716
Name: DELAYED, dtype: int64


ValueError: could not convert string to float: 'Southwest Airlines Co.'