In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
### **LOAD & PREPROCESS DATA (2M Rows for Speed)**
df = pd.read_parquet('../data/cleaned_data_snappy.parquet')


In [3]:
# Sample 2 million rows for faster training while maintaining statistical distribution
df_sample = df.sample(n=2000000, random_state=42)


In [4]:
# Log-transform target variable to stabilize variance
df_sample['totalFare'] = np.log1p(df_sample['totalFare'])


In [5]:
# Sort by `daysToDeparture` instead of `flightDate` (preserves time ordering)
df_sample = df_sample.sort_values(by=['daysToDeparture'], ascending=False)


In [6]:
# Ensure `durationToDistanceRatio` exists
df_sample['durationToDistanceRatio'] = df_sample['totalAirtime'] / df_sample['totalTravelDistance']
df_sample['durationToDistanceRatio'].replace([np.inf, -np.inf], np.nan, inplace=True)
df_sample.dropna(subset=['durationToDistanceRatio'], inplace=True)


In [7]:
# Define key features
features = ['daysToDeparture', 'pricePerMile', 'isHoliday', 'preHolidayFlight', 
            'postHolidayFlight', 'totalLayoverTime', 'durationToDistanceRatio']


In [8]:
# Add Lag Features (LSTM needs sequential data)
df_sample['fareLag_1'] = df_sample['totalFare'].shift(1)
df_sample['fareLag_7'] = df_sample['totalFare'].shift(7)
features += ['fareLag_1', 'fareLag_7']


In [9]:
# Drop NaNs caused by shifting
df_sample.dropna(inplace=True)


In [10]:
# Define features (X) and target (y)
X = df_sample[features]
y = df_sample['totalFare']


In [11]:
# Normalize features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)


In [12]:
# Convert to numpy arrays
X_array = np.array(X_scaled)
y_array = np.array(y)

In [13]:
# **SEQUENCE CREATION (Sequence Length = 7, Batch Size = 64)**
sequence_length = 7

In [14]:
def create_tf_dataset(X, y, seq_length, batch_size=64):
    """Creates a tf.data.Dataset for efficient sequence batching."""
    def generator():
        for i in range(len(X) - seq_length):
            yield X[i:i+seq_length], y[i+seq_length]

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(seq_length, X.shape[1]), dtype=tf.float32),
            tf.TensorSpec(shape=(), dtype=tf.float32)
        )
    )
    
    return dataset.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)  # Speed up training


In [15]:
# **Train-Test Split (80% Train, 10% Validation, 10% Test)**
train_size = int(len(X_array) * 0.8)
X_train, X_test = X_array[:train_size], X_array[train_size:]
y_train, y_test = y_array[:train_size], y_array[train_size:]


In [16]:
# Further split test set into validation and test (50/50 split)
val_size = int(len(X_test) * 0.5)
X_val, X_test = X_test[:val_size], X_test[val_size:]
y_val, y_test = y_test[:val_size], y_test[val_size:]


In [17]:
# Create tf.data.Dataset
train_dataset = create_tf_dataset(X_train, y_train, sequence_length, batch_size=64)
val_dataset = create_tf_dataset(X_val, y_val, sequence_length, batch_size=64)
test_dataset = create_tf_dataset(X_test, y_test, sequence_length, batch_size=64)


In [18]:
### **GRID SEARCH (LIMITED TO 2 VALUES PER PARAMETER)**
best_mae = float('inf')
best_params = None


In [19]:
param_grid = {
    'lstm_units': [32, 50],  
    'dropout_rate': [0.2, 0.3],  
    'batch_size': [32, 64]  
}

In [20]:
for lstm_units in param_grid['lstm_units']:
    for dropout_rate in param_grid['dropout_rate']:
        for batch_size in param_grid['batch_size']:
            # Build LSTM Model
            model = Sequential([
                LSTM(lstm_units, return_sequences=True, input_shape=(sequence_length, X_train.shape[1])),
                Dropout(dropout_rate),
                BatchNormalization(),

                LSTM(lstm_units, return_sequences=False),
                Dropout(dropout_rate),

                Dense(16, activation='relu'),
                Dense(1)
            ])

            model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

            # Train Model with Early Stopping
            early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
            history = model.fit(train_dataset, validation_data=val_dataset, epochs=10, 
                                batch_size=batch_size, callbacks=[early_stop], verbose=0)

            # Evaluate on Validation Data
            y_val_pred = np.expm1(model.predict(val_dataset).squeeze())
            y_val_real = np.expm1(y_val[-len(y_val_pred):])

            mae = mean_absolute_error(y_val_real, y_val_pred)

            if mae < best_mae:
                best_mae = mae
                best_params = {'lstm_units': lstm_units, 'dropout_rate': dropout_rate, 'batch_size': batch_size}

print(f"Best Hyperparameters: {best_params}")

  super().__init__(**kwargs)
  self.gen.throw(typ, value, traceback)


[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step


  super().__init__(**kwargs)
  self.gen.throw(typ, value, traceback)


[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step


  super().__init__(**kwargs)
  self.gen.throw(typ, value, traceback)


[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step


  super().__init__(**kwargs)
  self.gen.throw(typ, value, traceback)


[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step


  super().__init__(**kwargs)
  self.gen.throw(typ, value, traceback)


[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step


  super().__init__(**kwargs)
  self.gen.throw(typ, value, traceback)


[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step


  super().__init__(**kwargs)
  self.gen.throw(typ, value, traceback)


[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 8ms/step


  super().__init__(**kwargs)
  self.gen.throw(typ, value, traceback)


[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step
Best Hyperparameters: {'lstm_units': 32, 'dropout_rate': 0.3, 'batch_size': 64}


In [21]:
### **FINAL MODEL TRAINING WITH BEST PARAMETERS**
final_model = Sequential([
    LSTM(best_params['lstm_units'], return_sequences=True, input_shape=(sequence_length, X_train.shape[1])),
    Dropout(best_params['dropout_rate']),
    BatchNormalization(),

    LSTM(best_params['lstm_units'], return_sequences=False),
    Dropout(best_params['dropout_rate']),

    Dense(16, activation='relu'),
    Dense(1)
])

final_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])


  super().__init__(**kwargs)


In [22]:
# **Final Training (15 Epochs, Early Stop, Reduce LR)**
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)

history = final_model.fit(train_dataset, validation_data=val_dataset, epochs=15, 
                          batch_size=best_params['batch_size'], callbacks=[early_stop, reduce_lr], verbose=1)


Epoch 1/15
  24994/Unknown [1m164s[0m 6ms/step - loss: 0.6681 - mae: 0.4846

  self.gen.throw(typ, value, traceback)


[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 7ms/step - loss: 0.6680 - mae: 0.4846 - val_loss: 0.2037 - val_mae: 0.3449 - learning_rate: 0.0010
Epoch 2/15
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 7ms/step - loss: 0.1794 - mae: 0.3120 - val_loss: 0.2037 - val_mae: 0.3450 - learning_rate: 0.0010
Epoch 3/15
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 7ms/step - loss: 0.1790 - mae: 0.3117 - val_loss: 0.2034 - val_mae: 0.3434 - learning_rate: 0.0010
Epoch 4/15
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 7ms/step - loss: 0.1779 - mae: 0.3110 - val_loss: 0.2036 - val_mae: 0.3445 - learning_rate: 0.0010
Epoch 5/15
[1m24995/25000[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 6ms/step - loss: 0.1772 - mae: 0.3104
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 7ms/step - loss:

In [23]:
### **MODEL EVALUATION ON TEST DATA**
y_pred = np.expm1(final_model.predict(test_dataset).squeeze())
y_test_real = np.expm1(y_test[-len(y_pred):])


[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 19ms/step


In [24]:
# **Compute Final Metrics**
final_mae = mean_absolute_error(y_test_real, y_pred)
final_r2 = r2_score(y_test_real, y_pred)

print(f"Final Test MAE: {final_mae:.4f}")
print(f"Final Test R² Score: {final_r2:.4f}")

Final Test MAE: 120.3741
Final Test R² Score: -0.0457


In [None]:
# Save the trained LSTM model in the 'models' folder
import os

models_dir = "models"
os.makedirs(models_dir, exist_ok=True)  # Ensure the directory exists

model_filename = os.path.join(models_dir, "lstm_model.h5")
final_model.save(model_filename)
print(f"LSTM model saved as {model_filename}")
