# Import Library

In [20]:
from feature_engineering import apply_feature_engineering, add_dummies
from data_loading import load_data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# Data Loading

In [2]:
# Call the load_data to get the data as a pandas dataframe
df = load_data()
df.head()

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,travelDuration,isBasicEconomy,isRefundable,isNonStop,totalFare,seatsRemaining,totalTravelDistance,segmentsDepartureTimeRaw,segmentsAirlineCode,segmentsCabinCode
0,2022-04-16,2022-04-17,ATL,BOS,PT2H29M,False,False,True,248.6,9,947.0,2022-04-17T12:57:00.000-04:00,DL,coach
1,2022-04-16,2022-04-17,ATL,BOS,PT2H30M,False,False,True,248.6,4,947.0,2022-04-17T06:30:00.000-04:00,DL,coach
2,2022-04-16,2022-04-17,ATL,BOS,PT2H30M,False,False,True,248.6,9,947.0,2022-04-17T11:35:00.000-04:00,DL,coach
3,2022-04-16,2022-04-17,ATL,BOS,PT2H32M,False,False,True,248.6,8,947.0,2022-04-17T13:59:00.000-04:00,DL,coach
4,2022-04-16,2022-04-17,ATL,BOS,PT2H34M,False,False,True,248.6,9,947.0,2022-04-17T09:59:00.000-04:00,DL,coach


In [3]:
# The data is too large to use in entirety, set a sample of 800,000 rows
sample_size = 800000

# Get the first 800,000 rows
df_sample = df.iloc[:sample_size]

# Feature Engineering

In [4]:
# Call the apply_feature_engineering function from feature_engineering to get the data ready for ML Modeling
df_sample = apply_feature_engineering(df_sample)

Starting feature engineering...
Converting date columns...
Date conversion done. Time elapsed: 0.52s
Extracting travel duration...
Travel duration extraction done. Time elapsed: 1.94s
Imputing missing travel distances...
Imputation done. Time elapsed: 1.96s
Processing departure times...
Departure time processing done. Time elapsed: 154.28s
Extracting departure hour and float...
Departure time extraction done. Time elapsed: 154.31s
Processing airline codes...
Airline code processing done. Time elapsed: 155.30s
Processing cabin codes...
Cabin class processing done. Time elapsed: 157.07s
Binning seatsRemaining...
Seats binning done. Time elapsed: 157.08s
Calculating days to departure...
Day of week processing done. Time elapsed: 157.14s
Processing holiday features...
Holiday features processing done. Time elapsed: 157.17s
Dropping columns...
Dropping columns done. Time elapsed: 157.25s
Renaming columns...
Renaming done. Total time elapsed: 157.25s
Adding dummies...
Dummies added. Total ti

In [5]:
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 46 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   travelDuration            800000 non-null  int64  
 1   isRefundable              800000 non-null  bool   
 2   isNonStop                 800000 non-null  bool   
 3   totalFare                 800000 non-null  float64
 4   seatsRemaining            800000 non-null  int64  
 5   travelDistance            800000 non-null  int64  
 6   departureTimeHour         800000 non-null  int32  
 7   departureTimeFloat        800000 non-null  float64
 8   daysToDeparture           800000 non-null  int64  
 9   departureDayOfWeek        800000 non-null  int32  
 10  isWeekend                 800000 non-null  bool   
 11  isHoliday                 800000 non-null  bool   
 12  nearHoliday               800000 non-null  bool   
 13  startingAirport_BOS       800000 non-null  b

# RNN

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [27]:
# Our X variables in these models will be all columns that are not price
X = df_sample.drop(columns= ['totalFare'], axis= 1)

# OUr y variable is of course price which is called 'totalFare'
y = df_sample['totalFare']

# Split the data into train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


X_train shape: (640000, 45)
X_test shape: (160000, 45)


In [28]:
scaler_x = MinMaxScaler()
scale_columns = ['travelDuration', 'travelDistance', 'departureTimeFloat']

X_train[scale_columns] = scaler_x.fit_transform(X_train[scale_columns])
X_test[scale_columns] = scaler_x.transform(X_test[scale_columns])


In [29]:
scaler_y = MinMaxScaler()
y_train = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
y_test = scaler_y.transform(y_test.values.reshape(-1, 1))

In [30]:
rnn_model = Sequential([
    SimpleRNN(50, activation='tanh', return_sequences=False, input_shape=(X_train.shape[1], 1)), 
    Dense(1)
])

rnn_model.compile(optimizer='adam', loss='mse', metrics=['mae', 'mse'])

rnn_model.summary()

In [10]:
import numpy as np

print("NaN in X_train:", np.isnan(X_train).sum())
print("NaN in X_test:", np.isnan(X_test).sum())
print("NaN in y_train:", np.isnan(y_train).sum())
print("NaN in y_test:", np.isnan(y_test).sum())

print("Inf in X_train:", np.isinf(X_train).sum())
print("Inf in X_test:", np.isinf(X_test).sum())
print("Inf in y_train:", np.isinf(y_train).sum())
print("Inf in y_test:", np.isinf(y_test).sum())


NaN in X_train: travelDuration                  0
isRefundable                    0
isNonStop                       0
seatsRemaining                  0
totalTravelDistance         38471
travelDistance                  0
departureTimeHour               0
departureTimeFloat              0
daysToDeparture                 0
departureDayOfWeek              0
isWeekend                       0
isHoliday                       0
nearHoliday                     0
startingAirport_BOS             0
startingAirport_CLT             0
startingAirport_DEN             0
startingAirport_DFW             0
startingAirport_DTW             0
startingAirport_EWR             0
startingAirport_IAD             0
startingAirport_JFK             0
startingAirport_LAX             0
startingAirport_LGA             0
startingAirport_MIA             0
startingAirport_OAK             0
startingAirport_ORD             0
startingAirport_PHL             0
startingAirport_SFO             0
destinationAirport_BOS          

In [31]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = rnn_model.fit(
    X_train, y_train,
    epochs=50, batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping]
)


Epoch 1/50
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 2ms/step - loss: 0.0034 - mae: 0.0358 - mse: 0.0034 - val_loss: 0.0027 - val_mae: 0.0291 - val_mse: 0.0027
Epoch 2/50
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 2ms/step - loss: 0.0025 - mae: 0.0298 - mse: 0.0025 - val_loss: 0.0024 - val_mae: 0.0279 - val_mse: 0.0024
Epoch 3/50
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 2ms/step - loss: 0.0024 - mae: 0.0291 - mse: 0.0024 - val_loss: 0.0023 - val_mae: 0.0272 - val_mse: 0.0023
Epoch 4/50
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 2ms/step - loss: 0.0024 - mae: 0.0290 - mse: 0.0024 - val_loss: 0.0023 - val_mae: 0.0281 - val_mse: 0.0023
Epoch 5/50
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 2ms/step - loss: 0.0024 - mae: 0.0290 - mse: 0.0024 - val_loss: 0.0024 - val_mae: 0.0303 - val_mse: 0.0024
Epoch 6/50
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━

In [32]:
y_pred = rnn_model.predict(X_test)
y_pred_original = scaler_y.inverse_transform(y_pred)

[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 865us/step


In [34]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Convert y_test back to the original scale
y_test_original = scaler_y.inverse_transform(y_test)

# Compute error metrics
mae = mean_absolute_error(y_test_original, y_pred_original)
rmse = mean_squared_error(y_test_original, y_pred_original)  # RMSE
r2 = r2_score(y_test_original, y_pred_original)

# Print results
print(f"Mean Absolute Error (MAE): ${mae:.2f}")
print(f"Root Mean Squared Error (RMSE): ${rmse:.2f}")
print(f"R² Score: {r2:.4f}")


Mean Absolute Error (MAE): $101.17
Root Mean Squared Error (RMSE): $31860.17
R² Score: 0.4219


In [35]:
lstm_model = Sequential([
    LSTM(50, activation='tanh', return_sequences=False, input_shape=(X_train.shape[1], 1)),
    Dense(1)  # No activation (for regression)
])

lstm_model.compile(optimizer='adam', loss='mse', metrics=['mae', 'mse'])

lstm_model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = rnn_model.fit(
    X_train, y_train,
    epochs=50, batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping]
)

In [36]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = lstm_model.fit(
    X_train, y_train,
    epochs=50, batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping]
)

Epoch 1/50
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 6ms/step - loss: 0.0031 - mae: 0.0345 - mse: 0.0031 - val_loss: 0.0025 - val_mae: 0.0296 - val_mse: 0.0025
Epoch 2/50
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 5ms/step - loss: 0.0025 - mae: 0.0292 - mse: 0.0025 - val_loss: 0.0024 - val_mae: 0.0311 - val_mse: 0.0024
Epoch 3/50
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 5ms/step - loss: 0.0023 - mae: 0.0280 - mse: 0.0023 - val_loss: 0.0023 - val_mae: 0.0297 - val_mse: 0.0023
Epoch 4/50
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 5ms/step - loss: 0.0021 - mae: 0.0272 - mse: 0.0021 - val_loss: 0.0021 - val_mae: 0.0259 - val_mse: 0.0021
Epoch 5/50
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 6ms/step - loss: 0.0020 - mae: 0.0265 - mse: 0.0020 - val_loss: 0.0020 - val_mae: 0.0264 - val_mse: 0.0020
Epoch 6/50
[1m20000/20000[0m [32m━━━━━━━━━━━━━━

In [39]:
y_pred = lstm_model.predict(X_test)
y_pred_original = scaler_y.inverse_transform(y_pred)

[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step


In [40]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Convert y_test back to the original scale
y_test_original = scaler_y.inverse_transform(y_test)

# Compute error metrics
mae = mean_absolute_error(y_test_original, y_pred_original)
rmse = mean_squared_error(y_test_original, y_pred_original)  # RMSE
r2 = r2_score(y_test_original, y_pred_original)

# Print results
print(f"Mean Absolute Error (MAE): ${mae:.2f}")
print(f"Root Mean Squared Error (RMSE): ${rmse:.2f}")
print(f"R² Score: {r2:.4f}")

Mean Absolute Error (MAE): $88.80
Root Mean Squared Error (RMSE): $24140.35
R² Score: 0.5620
