In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
# Load dataset
df = pd.read_parquet('../data/cleaned_data_snappy.parquet')

In [3]:
# Downsample: Take 1 million rows for faster training
df_sample = df.sample(n=1000000, random_state=42)

In [4]:
# Log-transform target variable if right-skewed
df_sample['totalFare'] = np.log1p(df_sample['totalFare'])

In [5]:
# Sort by `daysToDeparture` instead of `flightDate`
df_sample = df_sample.sort_values(by=['daysToDeparture'], ascending=False)

In [6]:
# Ensure `durationToDistanceRatio` exists
if 'durationToDistanceRatio' not in df_sample.columns:
    df_sample['durationToDistanceRatio'] = df_sample['totalAirtime'] / df_sample['totalTravelDistance']


In [7]:
# Replace infinite values and drop NaNs
df_sample['durationToDistanceRatio'].replace([np.inf, -np.inf], np.nan, inplace=True)
df_sample.dropna(subset=['durationToDistanceRatio'], inplace=True)

In [8]:
# Define features based on EDA insights
features = ['daysToDeparture', 'pricePerMile', 'isHoliday', 'preHolidayFlight', 
            'postHolidayFlight', 'totalLayoverTime', 'durationToDistanceRatio']

In [9]:
# Create lag features based on `daysToDeparture`
df_sample['fareLag_1'] = df_sample['totalFare'].shift(1)
df_sample['fareLag_7'] = df_sample['totalFare'].shift(7)
features += ['fareLag_1', 'fareLag_7']

In [10]:
# Drop NaNs created by lag features
df_sample.dropna(inplace=True)

In [11]:
# Define features (X) and target (y)
X = df_sample[features]
y = df_sample['totalFare']

In [12]:
# Normalize numerical data using Min-Max Scaling (needed for LSTM)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [13]:
# Convert to numpy arrays for LSTM processing
X_array = np.array(X_scaled)
y_array = np.array(y)

In [14]:
# Reshape for LSTM (samples, time steps, features)
X_lstm = np.reshape(X_array, (X_array.shape[0], 1, X_array.shape[1]))  # 1 time step


In [15]:
# Train-Test Split (80% Train, 10% Validation, 10% Test)
X_train, X_temp, y_train, y_temp = train_test_split(X_lstm, y_array, test_size=0.2, shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, shuffle=False)


In [16]:
# Build LSTM Model
model = Sequential([
    LSTM(50, return_sequences=True, stateful=False, input_shape=(1, X_lstm.shape[2])),
    Dropout(0.2),
    BatchNormalization(),

    LSTM(50, return_sequences=False, stateful=False),
    Dropout(0.2),

    Dense(25, activation='relu'),
    Dense(1)  # Output Layer
])


  super().__init__(**kwargs)


In [17]:
# Compile Model
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])


In [18]:
# Train Model with Early Stopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


In [None]:
# Reduce batch size for memory efficiency (32 instead of 64)
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), 
                    epochs=20, batch_size=32, callbacks=[early_stop])


Epoch 1/20
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 4ms/step - loss: 0.8393 - mae: 0.4218 - val_loss: 0.0142 - val_mae: 0.0889
Epoch 2/20
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 3ms/step - loss: 0.0247 - mae: 0.1218 - val_loss: 0.0213 - val_mae: 0.1169
Epoch 3/20
[1m15269/25000[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m25s[0m 3ms/step - loss: 0.0216 - mae: 0.1136

In [None]:
# Evaluate Model on Test Data
y_pred = model.predict(X_test)
y_pred_real = np.expm1(y_pred)  # Convert back from log scale
y_test_real = np.expm1(y_test)  # Convert back from log scale

In [None]:
# Compute Evaluation Metrics
mae = mean_absolute_error(y_test_real, y_pred_real)
r2 = r2_score(y_test_real, y_pred_real)

In [None]:
print(f"Test MAE: {mae:.4f}")
print(f"Test R² Score: {r2:.4f}")