In [8]:
import pandas as pd
import numpy as np

In [9]:
# Only the columns listed here are loaded from each CSV (passed as `usecols`).
col_list = ["Mkt_Ccode", "Dep_Airport", "Arr_Airport", "Day_Week", "month", "Sched_Dep_Time_OAG_Block", "Arr_Delay_Time_Actual_Flag"]
names = ["main_airline.csv", "main_airline_2018.csv"]

# Read every file in `names`. Iterating over the list itself (rather than a
# hard-coded range) means adding or removing a file needs no other change.
df_list = []
for name in names:
    url = f"/content/drive/MyDrive/Resources/{name}"
    df_list.append(pd.read_csv(url, usecols = col_list, low_memory = False))

# Stack the per-file frames into one frame with a fresh 0..n-1 index.
df = pd.concat(df_list, ignore_index = True)

# Drop the reference to the per-file frames; only `df` is used from here on.
del df_list

In [5]:
# If you are using the entire data set...

# y1 = df[["Arr_Delay_Time_Actual_Flag"]]
# X = df[["Mkt_Ccode", "Dep_Airport", "Arr_Airport", "Day_Week", "month", "Sched_Dep_Time_OAG_Block"]]

If you take a subsample, you will need to drop several rare airports (the next cell removes `IFP` and `YNG`). A sample size of roughly 350K or more seems to be enough to keep every other airport in the sample, but you must check this for your own sample.

In [10]:
# Remove every flight that departs from or arrives at one of the rare airports.
rare_airports = ['IFP', 'YNG']
touches_rare_airport = df['Dep_Airport'].isin(rare_airports) | df['Arr_Airport'].isin(rare_airports)
df = df.loc[~touches_rare_airport]

In [11]:
# If you are taking a subsample...

# The target flag is kept next to the predictors here; it is split off into
# y1 only after the rows have been sampled.
subsample_columns = [
    "Arr_Delay_Time_Actual_Flag",
    "Mkt_Ccode",
    "Dep_Airport",
    "Arr_Airport",
    "Day_Week",
    "month",
    "Sched_Dep_Time_OAG_Block",
]
X = df[subsample_columns]

In [12]:
# One-hot encode the categorical columns of X.
# The dummy dtype is pinned to uint8 so the encoded columns are small integers
# regardless of the installed pandas version (newer releases default to bool).
X_dummy = pd.get_dummies(X, dtype = np.uint8)

In [None]:
# Display (rows, columns) of the one-hot encoded frame as a quick size check.
X_dummy.shape

(15949502, 787)

In [13]:
# Draw a fixed-size random subsample of the encoded rows.
# random_state makes the draw repeatable, so a Restart & Run All yields the
# same rows (45 is the same seed value the train/test split uses).
X_dummy2 = X_dummy.sample(n = 5000000, random_state = 45)

In [None]:
# Display (rows, columns) of the subsample.
# NOTE(review): the output recorded below shows 1,000,000 rows, while the
# sampling cell requests 5,000,000 -- the saved output looks stale; re-run to refresh it.
X_dummy2.shape

(1000000, 787)

In [14]:
# Per-column totals of the subsample, as a plain Python list in column order.
column_totals = X_dummy2.sum(axis = 0)
dummy_count = column_totals.tolist()

In [15]:
# Collect the positions of columns whose total is exactly 0 in the subsample.
li_new = []
for column_position, column_total in enumerate(dummy_count):
    if column_total == 0:
        li_new.append(column_position)

print(li_new)

[]


In [16]:
# Split the sampled frame into the target (y1, kept as a one-column frame)
# and the predictors (X_dummy2 without the target column).
target_column = "Arr_Delay_Time_Actual_Flag"
y1 = X_dummy2[[target_column]]
X_dummy2 = X_dummy2.drop(columns = [target_column])

In [None]:
# Write the first predictor row, laid out as a one-row table, to a CSV file.
first_row = X_dummy2.iloc[0, :]
first_row.to_frame().T.to_csv("first_row_revised.csv")

In [17]:
from sklearn.metrics import *

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow import keras

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *

In [21]:
from keras.wrappers.scikit_learn import KerasClassifier

If you are doing a grid search, skip the train/test split: `GridSearchCV` cross-validates on the whole sample.
In that case you only need to label-encode `y1`, not `y_train` and `y_test`.

In [None]:
# Step 1: Label-encode data set
# Flatten the one-column target frame to 1-D, then learn the label mapping
# and apply it in a single call.
label_encoder = LabelEncoder()
y1_flat = np.ravel(y1, order = "c")
encoded_y1 = label_encoder.fit_transform(y1_flat)

In [None]:
# Step 2: Convert encoded labels to one-hot-encoding
# (this is the target layout used with the softmax output and
# categorical_crossentropy loss that create_model compiles)
y1_categorical = to_categorical(encoded_y1)

In [None]:
def create_model(learning_rate = 0.01, input_dim = 786):
    """Build and compile the classifier used by the grid search.

    The network is one hidden Dense layer of 40 ReLU units followed by a
    2-unit softmax output, compiled with categorical cross-entropy and the
    Adam optimizer.

    Parameters
    ----------
    learning_rate : float
        Learning rate passed to the Adam optimizer.
    input_dim : int
        Number of input features. The default (786) is the width that was
        previously hard-coded; it must match the number of predictor columns.

    Returns
    -------
    The compiled Sequential model.
    """
    model = Sequential()
    model.add(Dense(40, input_dim = input_dim))
    # An "activation" is just a non-linear function applied to the output.
    # (A second, back-to-back ReLU was removed: applying ReLU to a ReLU output
    # changes nothing.)
    model.add(Activation('relu'))
    # model.add(Dropout(0.1))   # Dropout helps protect the model from memorizing or "overfitting" the training data
    # model.add(Dense(20))
    model.add(Dense(2))
    # Softmax turns the two outputs into class probabilities that sum to 1.
    model.add(Activation('softmax'))
    # Use the `keras` module imported with the other Keras imports; the `tf`
    # alias is only created further down the notebook, so relying on it here
    # fails on a fresh kernel.
    model.compile(loss='categorical_crossentropy', optimizer = keras.optimizers.Adam(learning_rate = learning_rate), metrics=['accuracy', 'AUC'])
    return model

In [None]:
# Wrap the Keras builder so scikit-learn can drive it as an estimator.
model = KerasClassifier(build_fn = create_model, epochs = 30, batch_size = 32)

# Candidate learning rates; each one is forwarded to create_model by name.
learning_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
param_grid = {"learning_rate": learning_rate}

# Search over the grid on the subsample and report the best setting found.
grid = GridSearchCV(estimator = model, param_grid = param_grid)
grid_result = grid.fit(X_dummy2, y1_categorical)
best_score, best_params = grid_result.best_score_, grid_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 2

Train/test/split for evaluation

In [22]:
from sklearn.model_selection import train_test_split



The first split (commented out) is for when you are using all the data. The second one is for when you have subsampled.

In [13]:
# X_train, X_test, y_train, y_test = train_test_split(X_dummy, y1, random_state = 45)

In [23]:
# Hold out a test set from the subsample. random_state fixes the shuffle so the
# split is repeatable; no test_size is given, so scikit-learn's default applies.
X_train, X_test, y_train, y_test = train_test_split(X_dummy2, y1, random_state = 45)

In [24]:
# Step 1: Label-encode data set
# The encoder is fitted once, on the training labels only, and that same
# mapping is then applied to both splits. (A second fit on the test labels
# would discard the training fit and leave an encoder that knows only the
# classes seen in the test split.)
label_encoder = LabelEncoder()
label_encoder.fit(np.ravel(y_train, order = "c"))
encoded_y_train = label_encoder.transform(np.ravel(y_train, order = "c"))
encoded_y_test = label_encoder.transform(np.ravel(y_test, order = "c"))

In [25]:
# Step 2: one-hot encode both splits.
# num_classes is taken from the fitted encoder so the two arrays always have
# the same number of columns, even if one split happens to lack a class.
n_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes = n_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes = n_classes)

In [26]:
from tensorflow.keras.optimizers import *
import tensorflow.keras as tf

In [28]:
init_mode = 'HeUniform'

# Take the input width from the training data instead of hard-coding it, so
# the model always matches the number of predictor columns.
n_features = X_train.shape[1]

# One hidden ReLU layer followed by a 2-unit softmax output.
model = Sequential()
model.add(Dense(units = 16, activation = 'relu', input_dim = n_features, kernel_initializer = init_mode))
# model.add(Dense(units = 4, activation = 'relu'))
model.add(Dense(units = 2, activation = 'softmax', kernel_initializer = init_mode))

In [29]:
# Compile the model

# NOTE: `tf` here is the alias given to tensorflow.keras in the import cell above.
opt = tf.optimizers.Adam(learning_rate = 0.001)

# The output layer is a 2-unit softmax trained on one-hot targets, so the
# matching loss is categorical cross-entropy (the same loss create_model uses).
# For a two-class softmax this should give the same loss value as the
# per-output binary cross-entropy used before, so earlier numbers stay comparable.
model.compile(optimizer = opt,
              loss = 'categorical_crossentropy',
              metrics = ['AUC', 'accuracy'],)

In [30]:
# Fit the model to the training data
# No validation data is passed, so the per-epoch log reports training metrics
# only; verbose = 2 prints one summary line per epoch.
model.fit(
    X_train,
    y_train_categorical,
    batch_size = 32,
    epochs = 10,
    shuffle = True,
    verbose = 2
)

Epoch 1/10
117188/117188 - 241s - loss: 0.6246 - auc: 0.7090 - accuracy: 0.6617
Epoch 2/10
117188/117188 - 238s - loss: 0.6218 - auc: 0.7131 - accuracy: 0.6639
Epoch 3/10
117188/117188 - 238s - loss: 0.6207 - auc: 0.7146 - accuracy: 0.6647
Epoch 4/10
117188/117188 - 237s - loss: 0.6203 - auc: 0.7152 - accuracy: 0.6650
Epoch 5/10
117188/117188 - 237s - loss: 0.6200 - auc: 0.7157 - accuracy: 0.6651
Epoch 6/10
117188/117188 - 238s - loss: 0.6198 - auc: 0.7159 - accuracy: 0.6653
Epoch 7/10
117188/117188 - 237s - loss: 0.6196 - auc: 0.7162 - accuracy: 0.6653
Epoch 8/10
117188/117188 - 237s - loss: 0.6195 - auc: 0.7163 - accuracy: 0.6655
Epoch 9/10
117188/117188 - 237s - loss: 0.6194 - auc: 0.7165 - accuracy: 0.6655
Epoch 10/10
117188/117188 - 237s - loss: 0.6193 - auc: 0.7166 - accuracy: 0.6654


<keras.callbacks.History at 0x7fbce05dd210>

In [31]:
# Score the trained model on the held-out split. The returned list starts with
# the loss, followed by the metrics in the order they were given to compile().
model.evaluate(X_test, y_test_categorical)



[0.6209902167320251, 0.7143959999084473, 0.6645703911781311]

In [None]:
# Save the model
# The path is relative, so the file "nn_v2.h5" lands in the current working directory.
model.save("nn_v2.h5")