In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
# Read the 2007 airline on-time records into memory.
df_raw = pd.read_csv('..//Datasets//airline_2007.csv')

# The full file holds well over 6 million rows, which makes exploration and first-pass
# modeling slow. For now we work on a fixed-size random sample (drawn without replacement
# and seeded so it is repeatable); the complete dataset is used again for the full model.
df_subset = df_raw.sample(n=10000, random_state=42)
df_subset.tail()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
1888104,2007,4,20,5,1856.0,1845,2041.0,2035,WN,2215,...,6,9,0,,0,0,0,0,0,0
6548658,2007,11,26,1,1353.0,1335,1535.0,1517,DL,1412,...,4,17,0,,0,0,0,0,0,18
976653,2007,2,6,2,1226.0,1200,1329.0,1310,MQ,3164,...,3,14,0,,0,0,0,0,0,19
6411482,2007,11,5,1,905.0,900,1037.0,1031,OO,4032,...,10,13,0,,0,0,0,0,0,0
5511972,2007,9,25,2,1036.0,1040,1141.0,1145,AA,1675,...,2,14,0,,0,0,0,0,0,0


In [32]:
# Columns we do not model on. Origin and Dest are left out rather than one-hot encoded
# because that would add over 600 features; DayofMonth is left out on the assumption that
# the Month dummies capture most of the holiday effect.
unused_columns = ['UniqueCarrier', 'FlightNum', 'TailNum', 'Year', 'CancellationCode',
                  'Origin', 'Dest', 'DayofMonth']

# Keep only flights that were neither cancelled nor diverted. After this filter the two
# flag columns hold a single value (0), so they are dropped as well.
completed = (df_subset['Cancelled'] == 0) & (df_subset['Diverted'] == 0)
df_features = df_subset.loc[completed].drop(unused_columns + ['Cancelled', 'Diverted'], axis=1)

# One-hot encode the calendar features.
df_features = pd.get_dummies(df_features, columns=['DayOfWeek', 'Month'])

# NOTE(review): the remaining columns include values that are only known once the flight
# has landed (e.g. ArrTime, ActualElapsedTime, the *Delay cause columns) and that closely
# track ArrDelay. Confirm they are meant to be features; they may leak the target.

# Binarize the target: 1 = arrived more than 30 minutes late, 0 = otherwise.
# Rows with a missing ArrDelay are removed *first*: NaN > 30 evaluates to False, so
# binarizing before dropna would silently label unknown delays as on time.
df_features = (
    df_features
    .dropna(subset=['ArrDelay'])
    .assign(ArrDelay=lambda flights: (flights['ArrDelay'] > 30).astype(int))
    .dropna()
)
df_features.head()

Unnamed: 0,DepTime,CRSDepTime,ArrTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
5323722,1743.0,1739,1821.0,1819,98.0,100.0,73.0,0,4.0,483,...,0,0,0,0,0,0,1,0,0,0
7332575,635.0,640,819.0,825,224.0,225.0,204.0,0,-5.0,1476,...,0,0,0,0,0,0,0,0,0,1
7176659,626.0,625,720.0,723,114.0,118.0,98.0,0,1.0,613,...,0,0,0,0,0,0,0,0,0,1
6568756,1020.0,1020,1142.0,1133,82.0,73.0,55.0,0,0.0,164,...,0,0,0,0,0,0,0,0,1,0
7246298,955.0,950,1058.0,1045,63.0,55.0,30.0,0,5.0,134,...,0,0,0,0,0,0,0,0,0,1


In [27]:
# Count the rows in each ArrDelay class, then report the share that is negative (label 0).
delay_counts = df_features['ArrDelay'].value_counts()
print(delay_counts)

class_balance = delay_counts[0] / delay_counts.sum()
print('\nPercentage of negative examples: \n{:.1f}%'.format(class_balance * 100))

0    8306
1    1466
Name: ArrDelay, dtype: int64

Percentage of negative examples: 
85.0%


We can see that we have a pretty big class imbalance. We will need to undersample the majority class, the negative examples.

In [57]:
# One seed drives both the undersampling and the train/test split, so re-running this
# cell selects the same rows every time.
RANDOM_STATE = 42
sampler = np.random.RandomState(RANDOM_STATE)

# Boolean masks for each class (ArrDelay is already 0/1 at this point)
is_positive = df_features['ArrDelay'] == 1
is_negative = df_features['ArrDelay'] == 0

# The number of positive examples; this is how many negatives we keep
sample_size = int(is_positive.sum())

# Index labels of each class
negative_indices = df_features.index[is_negative.values]
positive_indices = df_features.index[is_positive.values]

# Choose a random undersample from the majority class, the negative examples (planes that
# are on time). This assumes negatives outnumber positives: with replace=False, choice()
# raises if sample_size exceeds the number of negatives.
random_indices = sampler.choice(negative_indices, sample_size, replace=False)

# Combine the indices
undersampled_indices = positive_indices.union(random_indices)

# Split the resampled rows into features and target
X_resampled = df_features.loc[undersampled_indices].drop('ArrDelay', axis=1)
y_resampled = df_features.loc[undersampled_indices, 'ArrDelay']

# Create a train/test split from the resampled data
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=RANDOM_STATE)

1466


In [58]:
# Sanity check: the training labels should now be split roughly evenly between classes.
train_counts = y_train.value_counts()
print(train_counts)

class_balance = train_counts[0] / train_counts.sum()
print('\nPercentage of negative examples: \n{:.1f}%'.format(class_balance * 100))

0    1187
1    1158
Name: ArrDelay, dtype: int64

Percentage of negative examples: 
50.6%


Excellent. Now we will initialize and run the SVC and Random Forest classifiers.

In [59]:
# Initialize Random Forest
# random_state is fixed so the forest, and therefore every score printed below, is the
# same on each run of this cell.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# Time the fit on the training split together with 3-fold cross-validation on it
start_time = time.time()
rfc.fit(X_train, y_train)
cv_scores_rfc = cross_val_score(rfc, X_train, y_train, cv=3)
print('Fit & CV took ---{:.3} seconds--- \n'.format(time.time() - start_time))

print('Cross validated scores: \n', cv_scores_rfc, '\n')
# Accuracy on the held-out test split
print('Test score: \n{:.4f}'.format(rfc.score(X_test, y_test)))

Fit & CV took ---2.04 seconds--- 

Cross validated scores: 
 [ 0.95652174  0.98721228  0.97823303] 

Test score: 
0.9847


In [60]:
# Support vector classifier with scikit-learn's default settings.
# NOTE(review): the features are passed in unscaled; presumably that hurts this model,
# so consider standardizing them before drawing conclusions about SVC. TODO confirm.
svc_clf = SVC()

# Time the fit on the training split together with 2-fold cross-validation on it
start_time = time.time()
svc_clf.fit(X_train, y_train)
cv_scores_svc = cross_val_score(svc_clf, X_train, y_train, cv=2)
elapsed = time.time() - start_time

print('Fit & CV took ---{:.3} seconds--- \n'.format(elapsed))
print('Cross validated scores: \n', cv_scores_svc, '\n')

# Accuracy on the held-out test split
test_accuracy = svc_clf.score(X_test, y_test)
print('Test score: \n{:.2f}'.format(test_accuracy))

Fit & CV took ---3.02 seconds--- 

Cross validated scores: 
 [ 0.50639386  0.5059727 ] 

Test score: 
0.48


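I think it's pretty obvious which classifier we should use. I want to note that I originally ran the above two cells before I performed undersampling. The Random Forest attained similar scores, while the SVC model attained a score of 0.85, which is the same as the share of negative examples in the data, i.e. what a model that always predicts the majority class would score. This, along with the above results (about 0.5 on the balanced data), lets me conclude that the SVC, as configured here, is about as good as randomly guessing.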

Moving forward, I will be tuning the hyperparameters for the random forest on the full dataset. My only worry is that this will become too memory intensive, in which case I will have to switch to logistic regression. The next couple of cells are copy/pasted from above, so you can skip them. The only difference is that we won't be taking a subset like before.

# Begin reused code

In [61]:
# Same feature preparation as for the subset, applied to the full dataset (df_raw).

# Columns we do not model on. Origin and Dest are left out rather than one-hot encoded
# because that would add over 600 features; DayofMonth is left out on the assumption that
# the Month dummies capture most of the holiday effect.
unused_columns = ['UniqueCarrier', 'FlightNum', 'TailNum', 'Year', 'CancellationCode',
                  'Origin', 'Dest', 'DayofMonth']

# Keep only flights that were neither cancelled nor diverted. After this filter the two
# flag columns hold a single value (0), so they are dropped as well.
completed = (df_raw['Cancelled'] == 0) & (df_raw['Diverted'] == 0)
df_features = df_raw.loc[completed].drop(unused_columns + ['Cancelled', 'Diverted'], axis=1)

# One-hot encode the calendar features.
df_features = pd.get_dummies(df_features, columns=['DayOfWeek', 'Month'])

# NOTE(review): the remaining columns include values that are only known once the flight
# has landed (e.g. ArrTime, ActualElapsedTime, the *Delay cause columns) and that closely
# track ArrDelay. Confirm they are meant to be features; they may leak the target.

# Binarize the target: 1 = arrived more than 30 minutes late, 0 = otherwise.
# Rows with a missing ArrDelay are removed *first*: NaN > 30 evaluates to False, so
# binarizing before dropna would silently label unknown delays as on time.
df_features = (
    df_features
    .dropna(subset=['ArrDelay'])
    .assign(ArrDelay=lambda flights: (flights['ArrDelay'] > 30).astype(int))
    .dropna()
)
df_features.head()

Unnamed: 0,DepTime,CRSDepTime,ArrTime,CRSArrTime,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,...,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12
0,1232.0,1225,1341.0,1340,69.0,75.0,54.0,0,7.0,389,...,0,0,0,0,0,0,0,0,0,0
1,1918.0,1905,2043.0,2035,85.0,90.0,74.0,0,13.0,479,...,0,0,0,0,0,0,0,0,0,0
2,2206.0,2130,2334.0,2300,88.0,90.0,73.0,1,36.0,479,...,0,0,0,0,0,0,0,0,0,0
3,1230.0,1200,1356.0,1330,86.0,90.0,75.0,0,30.0,479,...,0,0,0,0,0,0,0,0,0,0
4,831.0,830,957.0,1000,86.0,90.0,74.0,0,1.0,479,...,0,0,0,0,0,0,0,0,0,0


In [62]:
# Count the rows in each ArrDelay class, then report the share that is negative (label 0).
delay_counts = df_features['ArrDelay'].value_counts()
print(delay_counts)

class_balance = delay_counts[0] / delay_counts.sum()
print('\nPercentage of negative examples: \n{:.1f}%'.format(class_balance * 100))

0    6208745
1    1066543
Name: ArrDelay, dtype: int64

Percentage of negative examples: 
85.3%


In [63]:
# One seed drives both the undersampling and the train/test split, so re-running this
# cell selects the same rows every time.
RANDOM_STATE = 42
sampler = np.random.RandomState(RANDOM_STATE)

# Boolean masks for each class (ArrDelay is already 0/1 at this point)
is_positive = df_features['ArrDelay'] == 1
is_negative = df_features['ArrDelay'] == 0

# The number of positive examples; this is how many negatives we keep.
# Series.sum() is vectorized, unlike the builtin sum(), which iterates in Python and is
# slow on the millions of rows in the full dataset.
sample_size = int(is_positive.sum())

# Index labels of each class
negative_indices = df_features.index[is_negative.values]
positive_indices = df_features.index[is_positive.values]

# Choose a random undersample from the majority class, the negative examples (planes that
# are on time). This assumes negatives outnumber positives: with replace=False, choice()
# raises if sample_size exceeds the number of negatives.
random_indices = sampler.choice(negative_indices, sample_size, replace=False)

# Combine the indices
undersampled_indices = positive_indices.union(random_indices)

# Split the resampled rows into features and target
X_resampled = df_features.loc[undersampled_indices].drop('ArrDelay', axis=1)
y_resampled = df_features.loc[undersampled_indices, 'ArrDelay']

# Create a train/test split from the resampled data
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=RANDOM_STATE)

In [64]:
# Sanity check: the training labels should now be split roughly evenly between classes.
train_counts = y_train.value_counts()
print(train_counts)

class_balance = train_counts[0] / train_counts.sum()
print('\nPercentage of negative examples: \n{:.1f}%'.format(class_balance * 100))

1    853493
0    852975
Name: ArrDelay, dtype: int64

Percentage of negative examples: 
50.0%


# New code

In [None]:
# Initialize Random Forest for the full (undersampled) dataset.
# max_depth=5 keeps the trees shallow and n_jobs=3 builds them on three workers;
# random_state is fixed so the scores below are the same on each run.
rfc = RandomForestClassifier(n_estimators=100, n_jobs=3, max_depth=5, random_state=42)

start_time = time.time()
rfc.fit(X_train, y_train)
# Accuracy on the data the model was fit on. No cross-validation is run in this cell, so
# this is reported as a training score rather than reusing the CV scores of the earlier
# subset model.
score = rfc.score(X_train, y_train)
print('Fit & training score took ---{:.3} seconds--- \n'.format(time.time() - start_time))
print('Training score: \n{:.4f} \n'.format(score))
# Accuracy on the held-out test split
print('Test score: \n{:.4f}'.format(rfc.score(X_test, y_test)))

Now I'm pretty happy with those results, and I think that a gridsearch could help us pinpoint the best hyperparameters, but with each run taking more than ten minutes, I want to talk to Katherine about strategies for hyperparameter tuning before I struggle through it ten minutes at a time. For now though, 0.96 is pretty good.