Preprocessing

In [5]:
# Dependencies and Setup
import pandas as pd
import tensorflow as tf
import os 
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [6]:
# File to Load
flight_data_toload = Path("Resources/Combined_Flights_2022.csv")

flight_data = pd.read_csv(flight_data_toload)
flight_data = flight_data.loc[:1000000]
flight_data


Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,...,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
0,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",GJT,DEN,False,False,1133,1123.0,0.0,-10.0,...,1140.0,1220.0,8.0,1245,-17.0,0.0,-2.0,1200-1259,1,0
1,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",HRL,IAH,False,False,732,728.0,0.0,-4.0,...,744.0,839.0,9.0,849,-1.0,0.0,-1.0,0800-0859,2,0
2,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",DRO,DEN,False,False,1529,1514.0,0.0,-15.0,...,1535.0,1622.0,14.0,1639,-3.0,0.0,-1.0,1600-1659,2,0
3,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",IAH,GPT,False,False,1435,1430.0,0.0,-5.0,...,1446.0,1543.0,4.0,1605,-18.0,0.0,-2.0,1600-1659,2,0
4,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",DRO,DEN,False,False,1135,1135.0,0.0,0.0,...,1154.0,1243.0,8.0,1245,6.0,0.0,0.0,1200-1259,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999996,2022-02-23,Republic Airlines,DTW,ORD,False,False,1756,1756.0,0.0,0.0,...,1808.0,1809.0,18.0,1829,-2.0,0.0,-1.0,1800-1859,1,0
999997,2022-02-23,Republic Airlines,ORD,RDU,False,False,715,707.0,0.0,-8.0,...,731.0,1001.0,4.0,1018,-13.0,0.0,-1.0,1000-1059,3,0
999998,2022-02-23,Republic Airlines,SRQ,EWR,False,False,630,621.0,0.0,-9.0,...,636.0,856.0,8.0,924,-20.0,0.0,-2.0,0900-0959,5,0
999999,2022-02-23,Republic Airlines,EWR,CLE,False,False,1829,1818.0,0.0,-11.0,...,1853.0,2014.0,4.0,2015,3.0,0.0,0.0,2000-2059,2,0


In [7]:
#flight_data.info()

In [8]:
# Drop the non-beneficial ID columns
flight_data = flight_data.drop(columns=['FlightDate','Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek','Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners', 'DOT_ID_Marketing_Airline', 
               'IATA_Code_Marketing_Airline', 'Flight_Number_Marketing_Airline', 'Operating_Airline', 'DOT_ID_Operating_Airline', 'IATA_Code_Operating_Airline',
               'Tail_Number', 'Flight_Number_Operating_Airline','OriginAirportID', 'OriginAirportSeqID', 'OriginCityMarketID', 'OriginCityName', 'OriginState', 'OriginStateFips',
               'OriginStateName','OriginWac', 'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'DestCityName','DestState', 'DestStateFips', 'DestStateName',
               'DestWac','ArrDel15', 'ArrivalDelayGroups','DivAirportLandings','DepTimeBlk','TaxiOut','WheelsOff','WheelsOn','TaxiIn','ArrTimeBlk',
               'DistanceGroup','DepartureDelayGroups','Diverted','CRSElapsedTime','ActualElapsedTime'])

flight_data

Unnamed: 0,Airline,Origin,Dest,Cancelled,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,ArrTime,ArrDelayMinutes,AirTime,Distance,DepDel15,CRSArrTime,ArrDelay
0,"Commutair Aka Champlain Enterprises, Inc.",GJT,DEN,False,1133,1123.0,0.0,-10.0,1228.0,0.0,40.0,212.0,0.0,1245,-17.0
1,"Commutair Aka Champlain Enterprises, Inc.",HRL,IAH,False,732,728.0,0.0,-4.0,848.0,0.0,55.0,295.0,0.0,849,-1.0
2,"Commutair Aka Champlain Enterprises, Inc.",DRO,DEN,False,1529,1514.0,0.0,-15.0,1636.0,0.0,47.0,251.0,0.0,1639,-3.0
3,"Commutair Aka Champlain Enterprises, Inc.",IAH,GPT,False,1435,1430.0,0.0,-5.0,1547.0,0.0,57.0,376.0,0.0,1605,-18.0
4,"Commutair Aka Champlain Enterprises, Inc.",DRO,DEN,False,1135,1135.0,0.0,0.0,1251.0,6.0,49.0,251.0,0.0,1245,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999996,Republic Airlines,DTW,ORD,False,1756,1756.0,0.0,0.0,1827.0,0.0,61.0,235.0,0.0,1829,-2.0
999997,Republic Airlines,ORD,RDU,False,715,707.0,0.0,-8.0,1005.0,0.0,90.0,646.0,0.0,1018,-13.0
999998,Republic Airlines,SRQ,EWR,False,630,621.0,0.0,-9.0,904.0,0.0,140.0,1034.0,0.0,924,-20.0
999999,Republic Airlines,EWR,CLE,False,1829,1818.0,0.0,-11.0,2018.0,3.0,81.0,404.0,0.0,2015,3.0


In [9]:
# Determine the number of unique values in each column.
#flight_data.nunique()

In [10]:
# Look at Flight data value counts for binning
flight_data["Dest"].value_counts()

ATL    48602
DFW    37895
ORD    36129
DEN    35295
CLT    32966
       ...  
ILG       21
ADK       17
OWB       17
OGD       12
PPG        4
Name: Dest, Length: 367, dtype: int64

In [7]:
# Choose a cutoff value and create a list of airline codes to be replaced
flightdata_types_to_replace = ["BKG","OGD","PPG","PIR","ATY",'GST','HYA','ADK','OWD','ILG','WYS',
'HGR','SMX','STC','OWB','PIH','EKO','BGM','BRW','RKS','SCC','SPN','TWF','BIH','OGS','DLG','RIW',
'AKN','IAG','PSM','ALS','PUB','INL','CYS','CNY','VEL','MCW','MKG','HOB']

# Replace in dataframe
for app in flightdata_types_to_replace:
    flight_data['Dest'] = flight_data['Dest'].replace(app,"Other")

# Check to make sure binning was successful
flight_data['Dest'].value_counts()

ATL    48602
DFW    37895
ORD    36129
DEN    35295
CLT    32966
       ...  
DBQ       59
DRT       59
DIK       51
PQI       51
OTH       41
Name: Dest, Length: 336, dtype: int64

In [11]:
# Check the value counts >1
value_counts = flight_data["Dest"].value_counts()

value_counts_filtered = value_counts[value_counts>1]

value_counts_filtered

ATL    48602
DFW    37895
ORD    36129
DEN    35295
CLT    32966
       ...  
ILG       21
ADK       17
OWB       17
OGD       12
PPG        4
Name: Dest, Length: 367, dtype: int64

In [12]:
# Choose a cutoff value and create a list of classifications to be replaced
#flightdata_types_to_replace = list(flight_data.Dest.value_counts()[flight_data.Dest.value_counts() < 126099].index)

# Replace in dataframe
for cls in list(flight_data.Dest.value_counts()[flight_data.Dest.value_counts() < 126099].index):
    flight_data['Dest'] = flight_data['Dest'].replace(cls,"Other")

# Check to make sure binning was successful
flight_data['Dest'].value_counts()

Other    1000001
Name: Dest, dtype: int64

In [13]:
# Convert categorical data to numeric with `pd.get_dummies`
flight_data = pd.get_dummies(flight_data)
flight_data

Unnamed: 0,Cancelled,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,ArrTime,ArrDelayMinutes,AirTime,Distance,DepDel15,...,Origin_VEL,Origin_VLD,Origin_VPS,Origin_WRG,Origin_XNA,Origin_XWA,Origin_YAK,Origin_YKM,Origin_YUM,Dest_Other
0,False,1133,1123.0,0.0,-10.0,1228.0,0.0,40.0,212.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,False,732,728.0,0.0,-4.0,848.0,0.0,55.0,295.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,False,1529,1514.0,0.0,-15.0,1636.0,0.0,47.0,251.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,False,1435,1430.0,0.0,-5.0,1547.0,0.0,57.0,376.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,False,1135,1135.0,0.0,0.0,1251.0,6.0,49.0,251.0,0.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999996,False,1756,1756.0,0.0,0.0,1827.0,0.0,61.0,235.0,0.0,...,0,0,0,0,0,0,0,0,0,1
999997,False,715,707.0,0.0,-8.0,1005.0,0.0,90.0,646.0,0.0,...,0,0,0,0,0,0,0,0,0,1
999998,False,630,621.0,0.0,-9.0,904.0,0.0,140.0,1034.0,0.0,...,0,0,0,0,0,0,0,0,0,1
999999,False,1829,1818.0,0.0,-11.0,2018.0,3.0,81.0,404.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [14]:
## Split our preprocessed data into our features and target arrays
# X = flight_data.drop(columns = ["DepDel15"])
# y = flight_data["DepDel15"]


# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(flight_data.drop(columns = ["DepDel15"]),flight_data["DepDel15"], random_state=1)


In [18]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

Compile, Train and Evaluate the Model

In [19]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  60
hidden_nodes_layer2 = 30
hidden_nodes_layer3 = 20
#hidden_nodes_layer2 = 30
#hidden_nodes_layer2 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 60)                24060     
                                                                 
 dense_1 (Dense)             (None, 30)                1830      
                                                                 
 dense_2 (Dense)             (None, 20)                620       
                                                                 
 dense_3 (Dense)             (None, 1)                 21        
                                                                 
Total params: 26531 (103.64 KB)
Trainable params: 26531 (103.64 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [21]:
# Normalize the input data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
# Define the checkpoint path and filenames
os.makedirs("checkpoints_optimized_v3/",exist_ok=True)
checkpoint_path = "checkpoints_optimized_v3/weights.{epoch:02d}.hdf5"

In [23]:
# Create a callback that saves the model's weights 
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_path,
    verbose = 1,
    save_weights_only = True,
    save_freq = 4000)

# Train the model
fit_model = nn.fit(X_train,y_train,epochs=60,callbacks=[cp_callback])

# Define the model with early sttopping
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor ='val_loss', 
    patience =5,
    restore_best_weights= True)


# Train the model with early stopping
fit_model = nn.fit(X_train, y_train, epochs=100, callbacks=[cp_callback, early_stopping_callback], validation_data=(X_test, y_test))


Epoch 1/60
 3995/23438 [====>.........................] - ETA: 52s - loss: nan - accuracy: 0.7636
Epoch 1: saving model to checkpoints_optimized_v3\weights.01.hdf5
Epoch 1: saving model to checkpoints_optimized_v3\weights.01.hdf5
Epoch 1: saving model to checkpoints_optimized_v3\weights.01.hdf5
Epoch 1: saving model to checkpoints_optimized_v3\weights.01.hdf5
Epoch 1: saving model to checkpoints_optimized_v3\weights.01.hdf5
Epoch 2/60
  558/23438 [..............................] - ETA: 54s - loss: nan - accuracy: 0.7644
Epoch 2: saving model to checkpoints_optimized_v3\weights.02.hdf5
 4543/23438 [====>.........................] - ETA: 45s - loss: nan - accuracy: 0.7648
Epoch 2: saving model to checkpoints_optimized_v3\weights.02.hdf5
Epoch 2: saving model to checkpoints_optimized_v3\weights.02.hdf5
Epoch 2: saving model to checkpoints_optimized_v3\weights.02.hdf5
Epoch 2: saving model to checkpoints_optimized_v3\weights.02.hdf5
Epoch 2: saving model to checkpoints_optimized_v3\weights

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test, y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

7813/7813 - 10s - loss: nan - accuracy: 0.7639 - 10s/epoch - 1ms/step
Loss: nan, Accuracy: 0.763884961605072


In [None]:
# Predict the probabilities of being delayed for the test data
y_pred_prob = nn.predict(X_test)

# Convert probabilities to binary predictions (0 or 1) based on a threshold (e.g., 0.5)
threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int)

# Calculate the total number of flights, delayed flights, and canceled flights
total_flights = len(y_test)
delayed_flights = sum(y_pred)
canceled_flights = sum(y_test)

print(f"Total Flights: {total_flights}")
print(f"Delayed Flights: {delayed_flights}")
print(f"Canceled Flights: {canceled_flights}")

In [None]:
# Export our model to HDF5 file
nn.save_weights('Flights.h5')