In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
import tensorflow as tf
from tensorflow.keras import layers, callbacks, optimizers, models

2025-11-06 15:05:52.832165: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Directory holding the three pre-split CSV files. The default is the original
# author's local folder; set the ENERGY_PROJECT_DATA_DIR environment variable
# to run the notebook on another machine without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "ENERGY_PROJECT_DATA_DIR",
        "/Users/mac/Documents/MySchoolDocs/PERSONAL/JOB APPLICATIONS/ENERGY_PROJECT/data/final_datasets",
    )
)

train = pd.read_csv(DATA_DIR / "train_df.csv")
# NOTE(review): `eval` shadows the Python builtin of the same name. The name is
# kept because later cells read it; consider renaming it (e.g. `eval_df`)
# everywhere in one pass.
eval = pd.read_csv(DATA_DIR / "eval_df.csv")
holdout = pd.read_csv(DATA_DIR / "holdout_df.csv")

In [4]:
# Columns removed from the model inputs, and the column used as the label.
NON_FEATURE_COLS = ["Date", "Strain_Index", "target_strain_index"]
TARGET_COL = "target_strain_index"


def _split_features_target(frame):
    """Return (features, target) for one split: every column except
    NON_FEATURE_COLS, and the TARGET_COL series."""
    return frame.drop(columns=NON_FEATURE_COLS), frame[TARGET_COL]


X_train, y_train = _split_features_target(train)

# Naming caveat: the `eval` split becomes X_test / y_test and the `holdout`
# split becomes X_valid / y_valid.
X_test, y_test = _split_features_target(eval)

X_valid, y_valid = _split_features_target(holdout)

In [5]:
#Sliding windows

def make_sequences(X, y, lookback=12):
    """Turn a row-ordered feature matrix into overlapping look-back windows.

    Window ``i`` holds rows ``i .. i + lookback - 1`` of ``X`` and is paired
    with ``y[i + lookback]``, so ``len(X) - lookback`` samples are produced.

    Parameters
    ----------
    X : array-like with at least 2 dimensions, rows first
        Feature rows in chronological order. DataFrames are accepted and
        converted with ``np.asarray``.
    y : array-like with at least ``len(X)`` entries
        Per-row targets. A pandas Series is accepted and read positionally
        (its index labels are ignored).
    lookback : int, default 12
        Number of consecutive rows in each window; must be >= 1.

    Returns
    -------
    (X_seq, y_seq) : tuple of np.ndarray
        ``X_seq`` has shape ``(n, lookback, *X.shape[1:])`` and ``y_seq`` has
        ``n`` entries, with ``n = max(len(X) - lookback, 0)``. When no window
        fits, correctly shaped empty arrays are returned instead of a
        shapeless ``(0,)`` array.

    Raises
    ------
    ValueError
        If ``lookback < 1``, ``X`` has fewer than 2 dimensions, or ``y`` is
        shorter than ``X``.

    NOTE(review): the label is taken one row *after* the last row of the
    window. If ``y`` is already a one-step-ahead target for its own row, the
    effective horizon is two steps past the window's last row -- confirm this
    alignment is intended.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    if lookback < 1:
        raise ValueError(f"lookback must be >= 1, got {lookback}")
    if X.ndim < 2:
        raise ValueError(f"X must have at least 2 dimensions, got shape {X.shape}")
    if len(y) < len(X):
        raise ValueError(
            f"y has {len(y)} entries but X has {len(X)} rows; they must be aligned"
        )

    n_windows = len(X) - lookback
    if n_windows <= 0:
        # Keep the trailing dimensions so downstream code (e.g. model.predict)
        # still sees a well-formed, if empty, batch.
        empty_X = np.empty((0, lookback) + X.shape[1:], dtype=X.dtype)
        empty_y = np.empty((0,) + y.shape[1:], dtype=y.dtype)
        return empty_X, empty_y

    X_seq = np.stack([X[start:start + lookback] for start in range(n_windows)])
    # Labels for windows 0..n-1 are y[lookback], ..., y[lookback + n - 1].
    y_seq = y[lookback:lookback + n_windows].copy()
    return X_seq, y_seq

In [6]:
# Feature scaling

feature_cols = X_train.columns  # remember the training column order

# The scaler's statistics come from the training split only; the same fitted
# scaler is then applied to all three splits.
scaler = StandardScaler().fit(X_train.values)
X_train_s, X_test_s, X_valid_s = (
    scaler.transform(split.values) for split in (X_train, X_test, X_valid)
)

LOOKBACK = 12  # rows per window; 6 is the other candidate value

# Each split is windowed independently, so every split yields
# len(split) - LOOKBACK samples.
(Xtr_seq, ytr_seq), (Xva_seq, yva_seq), (Xho_seq, yho_seq) = (
    make_sequences(features, target.values, lookback=LOOKBACK)
    for features, target in (
        (X_train_s, y_train),
        (X_test_s, y_test),
        (X_valid_s, y_valid),
    )
)

Xtr_seq.shape, Xva_seq.shape, Xho_seq.shape



((108, 12, 47), (24, 12, 47), (7, 12, 47))

In [7]:
def build_lstm(n_steps, n_feats, units=64, dropout=0.2, dense_units=32, learning_rate=3e-3):
    """Build and compile a single-layer LSTM regressor.

    Architecture: Input(n_steps, n_feats) -> LSTM(units) -> Dense(dense_units,
    relu) -> Dropout -> Dense(1, linear).

    Parameters
    ----------
    n_steps : int
        Number of time steps in each input window.
    n_feats : int
        Number of features per time step.
    units : int, default 64
        Size of the LSTM layer.
    dropout : float, default 0.2
        Dropout rate used both on the LSTM inputs and after the dense layer.
        Recurrent dropout is fixed at 0.0.
    dense_units : int, default 32
        Width of the hidden dense layer (previously hard-coded).
    learning_rate : float, default 3e-3
        Initial Adam learning rate (previously hard-coded).

    Returns
    -------
    A compiled Keras ``Sequential`` model that minimises MAE and also reports
    it as the ``mae`` metric (which makes ``val_mae`` available to callbacks).
    """
    model = models.Sequential([
        layers.Input(shape=(n_steps, n_feats)),
        # return_sequences=False: only the output of the final time step is
        # passed on to the dense head.
        layers.LSTM(units, return_sequences=False, dropout=dropout, recurrent_dropout=0.0),
        layers.Dense(dense_units, activation="relu"),
        layers.Dropout(dropout),
        layers.Dense(1, activation="linear")
    ])
    model.compile(
        optimizer=optimizers.Adam(learning_rate=learning_rate),
        loss="mae",   # robust to outliers; strain is spiky
        metrics=["mae"]
    )
    return model

# Seed Python, NumPy and TensorFlow *before* the model is created so weight
# initialisation, dropout and batch shuffling start from the same state on every
# Restart-and-Run-All. (Bit-exact repeatability on some hardware may additionally
# require tf.config.experimental.enable_op_determinism().)
SEED = 42
tf.keras.utils.set_random_seed(SEED)

model = build_lstm(LOOKBACK, Xtr_seq.shape[-1], units=64, dropout=0.25)

# Stop once val_mae has not improved for 40 epochs and roll back to the best weights.
es = callbacks.EarlyStopping(monitor="val_mae", patience=40, restore_best_weights=True)
# Halve the learning rate after 15 epochs without val_mae improvement, down to 1e-5.
rlr = callbacks.ReduceLROnPlateau(monitor="val_mae", factor=0.5, patience=15, min_lr=1e-5)

In [8]:
# Training options gathered in one place. The windows built from the `eval`
# split (Xva_seq / yva_seq) serve as validation data and therefore drive both
# callbacks, which monitor val_mae.
fit_options = dict(
    validation_data=(Xva_seq, yva_seq),
    epochs=100,
    batch_size=8,
    callbacks=[es, rlr],
    verbose=0,
)

history = model.fit(Xtr_seq, ytr_seq, **fit_options)

In [9]:
# VALID

def mape(y_true, y_pred, eps=1e-6):
    """Mean absolute percentage error, in percent.

    Both inputs are converted to float arrays. The denominator |y_true| is
    floored at ``eps`` so that zero targets do not divide by zero (they
    produce very large terms instead).
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    denominator = np.maximum(np.abs(actual), eps)
    relative_error = np.abs((actual - forecast) / denominator)
    return np.mean(relative_error) * 100

def smape(y_true, y_pred, eps=1e-6):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred| + eps))``.
    ``eps`` is added to the denominator so that a pair of zeros contributes 0
    rather than dividing by zero.

    Inputs are converted to float arrays (as ``mape`` does), so plain lists
    and tuples are accepted as well as NumPy arrays.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return 100 * np.mean(2 * np.abs(y_pred - y_true) /
                         (np.abs(y_true) + np.abs(y_pred) + eps))


def _print_metrics(header, rule, y_true, y_pred):
    """Print the header, its underline, and MAE / RMSE / MAPE / SMAPE for one split."""
    print(header)
    print(rule)
    print(f"MAE  : {mean_absolute_error(y_true, y_pred):.4f}")
    print(f"RMSE : {root_mean_squared_error(y_true, y_pred):.4f}")
    print(f"MAPE : {mape(y_true, y_pred):.2f}%")
    print(f"SMAPE: {smape(y_true, y_pred):.2f}%")


# VALID: windows built from the `eval` split (also used as validation data in fit)
yva_pred = model.predict(Xva_seq, verbose=0).ravel()
_print_metrics("LSTM VALID", "----------", yva_seq, yva_pred)

# HOLDOUT: windows built from the `holdout` split
yho_pred = model.predict(Xho_seq, verbose=0).ravel()
_print_metrics("\nLSTM HOLDOUT", "------------", yho_seq, yho_pred)

LSTM VALID
----------
MAE  : 0.0403
RMSE : 0.0506
MAPE : 50.58%
SMAPE: 43.36%

LSTM HOLDOUT
------------
MAE  : 0.0818
RMSE : 0.0898
MAPE : 148.74%
SMAPE: 75.65%
