In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
columns = ["Date", "Close", "High", "Low", "Open", "Volume"]

# Read one raw price CSV per ticker. The targets on the left and the paths on
# the right are listed in the same order, so each frame lands in its own name.
(
    AAPL_prices,
    AMZN_prices,
    BA_prices,
    CAT_prices,
    CVX_prices,
    GS_prices,
    JNJ_prices,
    JPM_prices,
    MSFT_prices,
    NEE_prices,
    NVDA_prices,
    PFE_prices,
    TSLA_prices,
    XOM_prices,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/yf_prices/AAPL.csv",
        "../data/raw/yf_prices/AMZN.csv",
        "../data/raw/yf_prices/BA.csv",
        "../data/raw/yf_prices/CAT.csv",
        "../data/raw/yf_prices/CVX.csv",
        "../data/raw/yf_prices/GS.csv",
        "../data/raw/yf_prices/JNJ.csv",
        "../data/raw/yf_prices/JPM.csv",
        "../data/raw/yf_prices/MSFT.csv",
        "../data/raw/yf_prices/NEE.csv",
        "../data/raw/yf_prices/NVDA.csv",
        "../data/raw/yf_prices/PFE.csv",
        "../data/raw/yf_prices/TSLA.csv",
        "../data/raw/yf_prices/XOM.csv",
    )
)

In [None]:
def clean_prices_df(df):
    """Return a cleaned copy of a raw price table; the input is left untouched.

    Steps, in order:
      1. Rename a ``"Price"`` column to ``"Date"`` (no-op if it is absent).
      2. Coerce ``"Close"`` to numeric and drop every row where that fails,
         then renumber the index from 0.
         NOTE(review): presumably this removes extra header rows in the raw
         CSV export - confirm against the files.
      3. Coerce ``"High"``, ``"Low"``, ``"Open"`` and ``"Volume"`` to numeric;
         unparseable values become NaN (those rows are kept).
      4. Parse ``"Date"`` to datetime; unparseable values become NaT.

    Raises:
        KeyError: if any of the columns above is missing after the rename.
    """
    cleaned = df.rename(columns={"Price": "Date"})

    # Rows without a numeric close are unusable, so they go first.
    cleaned = cleaned.assign(Close=pd.to_numeric(cleaned["Close"], errors="coerce"))
    cleaned = cleaned.dropna(subset=["Close"]).reset_index(drop=True)

    coerced = {
        name: pd.to_numeric(cleaned[name], errors="coerce")
        for name in ("High", "Low", "Open", "Volume")
    }
    return cleaned.assign(
        **coerced,
        Date=pd.to_datetime(cleaned["Date"], errors="coerce"),
    )

In [None]:
# Rebind every ticker's name to its cleaned frame. The targets on the left and
# the inputs on the right are listed in the same order.
(
    AAPL_prices,
    AMZN_prices,
    BA_prices,
    CAT_prices,
    CVX_prices,
    GS_prices,
    JNJ_prices,
    JPM_prices,
    MSFT_prices,
    NEE_prices,
    NVDA_prices,
    PFE_prices,
    TSLA_prices,
    XOM_prices,
) = (
    clean_prices_df(raw_frame)
    for raw_frame in (
        AAPL_prices,
        AMZN_prices,
        BA_prices,
        CAT_prices,
        CVX_prices,
        GS_prices,
        JNJ_prices,
        JPM_prices,
        MSFT_prices,
        NEE_prices,
        NVDA_prices,
        PFE_prices,
        TSLA_prices,
        XOM_prices,
    )
)

In [None]:
def feature_engineer_prices_df(df):
    """Return a copy of ``df`` with technical-indicator and calendar features.

    Expects the columns ``Date`` (datetime dtype, because the ``.dt`` accessor
    is used), ``Close``, ``High``, ``Low``, ``Open`` and ``Volume``. The input
    frame is not modified.

    Added columns, in order: 1- and 7-row returns, 7/14/30-row moving averages,
    7/14-row return volatility, volume moving average and 1-row change, close
    lags (1, 2, 3, 5, 7, 14, 21), a 14-row simple-average RSI, EMA-12/EMA-26
    with MACD and its 9-span signal line, price-vs-MA30 and
    volume-change-times-return interactions, 20-row Bollinger statistics,
    calendar fields, and high-low / close-open ranges.

    Rows containing a NaN (rolling warm-up, lags) *or an infinite value* in any
    column are dropped and the index is renumbered from 0. Infinite values
    come from the division-based features when a denominator is zero (for
    example the row after a zero ``Volume``, or a zero 20-row standard
    deviation); ``dropna`` alone would leave them in the output.
    """
    df = df.copy()
    close = df["Close"]
    volume = df["Volume"]

    # --- Basic price change features ---
    df["Return_1d"] = close.pct_change()
    df["Return_7d"] = close.pct_change(7)

    # --- Moving averages ---
    for window in (7, 14, 30):
        df[f"MA_{window}"] = close.rolling(window=window).mean()

    # --- Volatility (rolling std of 1-row returns) ---
    for window in (7, 14):
        df[f"Volatility_{window}d"] = df["Return_1d"].rolling(window=window).std()

    # --- Volume features ---
    df["Volume_MA_7"] = volume.rolling(window=7).mean()
    df["Volume_Change_1d"] = volume.pct_change()  # +/-inf when previous volume is 0

    # --- Lag features (previous close prices) ---
    for lag in (1, 2, 3, 5, 7, 14, 21):
        df[f"Close_Lag_{lag}"] = close.shift(lag)

    # --- RSI (Relative Strength Index), simple 14-row averages ---
    delta = close.diff(1)
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)

    avg_gain = gain.rolling(window=14).mean()
    avg_loss = loss.rolling(window=14).mean()

    # avg_loss == 0 gives rs == inf, which maps to RSI == 100 below (finite).
    rs = avg_gain / avg_loss
    df["RSI_14"] = 100 - (100 / (1 + rs))

    # --- MACD (Moving Average Convergence Divergence) ---
    df["EMA_12"] = close.ewm(span=12, adjust=False).mean()
    df["EMA_26"] = close.ewm(span=26, adjust=False).mean()
    df["MACD"] = df["EMA_12"] - df["EMA_26"]
    # Signal line: 9-span EMA of the MACD line.
    df["MACD_Signal"] = df["MACD"].ewm(span=9, adjust=False).mean()

    # --- Interaction features ---
    df["Price_vs_MA30"] = close / df["MA_30"]
    df["Vol_x_Return"] = df["Volume_Change_1d"] * df["Return_1d"]

    # --- Bollinger Bands (20-row mean +/- 2 std) ---
    df["MA_20"] = close.rolling(window=20).mean()
    df["Std_Dev_20"] = close.rolling(window=20).std()
    band_half_width = 2 * df["Std_Dev_20"]
    df["Bollinger_Upper"] = df["MA_20"] + band_half_width
    df["Bollinger_Lower"] = df["MA_20"] - band_half_width
    # Position within the bands: -1 at the lower band, +1 at the upper band.
    df["Price_vs_Bollinger"] = (close - df["MA_20"]) / band_half_width

    # --- Date features ---
    df["DayOfWeek"] = df["Date"].dt.dayofweek
    df["Month"] = df["Date"].dt.month
    df["WeekOfYear"] = df["Date"].dt.isocalendar().week.astype(int)
    df["Year"] = df["Date"].dt.year

    # --- Range features ---
    df["Price_Range_HL"] = df["High"] - df["Low"]
    df["Price_Change_OC"] = close - df["Open"]

    # --- Drop unusable rows ---
    # Treat +/-inf like NaN so divide-by-zero artefacts are removed together
    # with the rolling/lag warm-up rows.
    df = df.replace([np.inf, -np.inf], np.nan).dropna().reset_index(drop=True)

    return df

In [None]:
def prepare_lstm_data(df, target_col="Close", target_days=7, window_size=30):
    """Build windowed LSTM inputs and forward-return targets, split 80/20 in order.

    For every usable row ``t`` the target is the return of ``target_col`` from
    row ``t`` to row ``t + target_days``. The matching input is the window of
    the ``window_size`` scaled prices immediately before row ``t``, and the
    base price is ``target_col`` at row ``t`` itself, so
    ``base_price * (1 + target)`` reproduces the real price at
    ``t + target_days``.

    Args:
        df: frame with a ``"Date"`` column and ``target_col``; not modified.
        target_col: price column to window and to predict returns for.
        target_days: forecast horizon in rows.
        window_size: number of past rows in each input sequence.

    Returns:
        ``(X_train, X_test, y_train, y_test, base_prices_test, dates_test,
        scaler)``. ``X_*`` have shape ``(samples, window_size, 1)``.
        ``dates_test`` holds the ``Date`` of the row each prediction is made
        from, not the date ``target_days`` later. ``scaler`` is the fitted
        ``MinMaxScaler``.

    Raises:
        ValueError: if fewer than ``window_size + 1`` rows remain after
            removing rows without a target.
    """
    df = df.copy()

    # Forward return over the horizon; the last `target_days` rows have none.
    df["Target_Return"] = (
        df[target_col].pct_change(periods=target_days).shift(-target_days)
    )
    # The return above is measured from the current row, so the current price
    # is the base. A lagged price here would make base * (1 + return)
    # something that is not an actual price.
    df["Prediction_Base_Price"] = df[target_col]
    df = df.dropna(subset=["Target_Return", "Prediction_Base_Price", target_col])

    n_samples = len(df) - window_size
    if n_samples < 1:
        raise ValueError(
            f"Need more than window_size={window_size} usable rows to build "
            f"sequences, got {len(df)}."
        )
    split_idx = int(0.8 * n_samples)

    # Fit the scaler only on rows that feed training windows, so test-period
    # price levels do not leak into the scaling. Test values may therefore
    # fall outside [0, 1]. With no training windows at all, fall back to
    # fitting on every row.
    n_fit_rows = split_idx + window_size - 1 if split_idx > 0 else len(df)
    scaler = MinMaxScaler()
    scaler.fit(df[[target_col]].iloc[:n_fit_rows])
    scaled = scaler.transform(df[[target_col]]).ravel()

    # Window i covers rows [i, i + window_size) and is paired with the
    # target/base/date of row i + window_size.
    X = np.array([scaled[i:i + window_size] for i in range(n_samples)])
    X = X.reshape((n_samples, window_size, 1))  # LSTM expects 3D input

    aligned = df.iloc[window_size:]
    y = aligned["Target_Return"].values
    base_prices = aligned["Prediction_Base_Price"].values
    dates = aligned["Date"].values

    # Chronological train/test split (no shuffling).
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    base_prices_test = base_prices[split_idx:]
    dates_test = dates[split_idx:]

    return X_train, X_test, y_train, y_test, base_prices_test, dates_test, scaler


In [None]:
def create_model(units=50, dropout_rate=0.2, learning_rate=0.001, input_shape=(30, 1)):
    """Build and compile a single-layer LSTM regressor.

    The stack is ``LSTM(units) -> Dropout(dropout_rate) -> Dense(1)``,
    compiled with Adam at ``learning_rate`` and a mean-squared-error loss.

    Args:
        units: number of LSTM units.
        dropout_rate: rate passed to the ``Dropout`` layer.
        learning_rate: Adam learning rate.
        input_shape: shape passed to the LSTM layer as ``input_shape``.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential(
        [
            LSTM(units, input_shape=input_shape),
            Dropout(dropout_rate),
            Dense(1),
        ]
    )
    network.compile(
        loss="mean_squared_error",
        optimizer=Adam(learning_rate=learning_rate),
    )
    return network

In [None]:
def cross_validate_lstm(X_train, y_train):
    """Run 5-fold expanding-window cross-validation of the LSTM.

    Folds come from ``TimeSeriesSplit(n_splits=5)``. For each fold a fresh
    model from ``create_model`` is trained for up to 50 epochs, with early
    stopping on the fold's validation loss (patience 5, best weights
    restored). Progress is printed per fold.

    Args:
        X_train: input sequences, indexed on axis 0 by sample.
        y_train: targets aligned with ``X_train``.

    Returns:
        List with the lowest validation loss reached in each fold.
    """
    splitter = TimeSeriesSplit(n_splits=5)
    val_losses = []

    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(X_train)):
        print(f"\nFold {fold+1}")

        fold_inputs, fold_targets = X_train[fit_rows], y_train[fit_rows]
        holdout = (X_train[holdout_rows], y_train[holdout_rows])

        model = create_model(input_shape=(X_train.shape[1], X_train.shape[2]))
        stopper = EarlyStopping(
            monitor="val_loss", patience=5, restore_best_weights=True
        )

        history = model.fit(
            fold_inputs,
            fold_targets,
            validation_data=holdout,
            epochs=50,
            batch_size=32,
            callbacks=[stopper],
            verbose=1,
        )

        # The best epoch's loss is the fold score, matching the restored weights.
        val_loss = min(history.history["val_loss"])
        val_losses.append(val_loss)
        print(f"Fold {fold+1} Validation Loss: {val_loss:.4f}")

    return val_losses

In [None]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def train_lstm_model(X_train, y_train, X_test, y_test, base_prices_test, dates_test, scaler, stock_name, val_fraction=0.1):
    """Train an LSTM on the training windows, then plot and score it on the test set.

    Early stopping is monitored on a validation slice taken from the *end* of
    the training data (the most recent ``val_fraction`` of samples). The test
    set is never used during fitting, so the reported test RMSE is not biased
    by epoch selection on the test data.

    Args:
        X_train, y_train: training sequences and target returns, in
            chronological order.
        X_test, y_test: held-out sequences and target returns; used only for
            prediction and scoring.
        base_prices_test: price each test return is applied to; predicted and
            actual prices are computed as ``base * (1 + return)``.
        dates_test: x-axis values for the plot, aligned with ``X_test``.
        scaler: accepted for interface compatibility; not used here.
        stock_name: label used in log lines and the plot title.
        val_fraction: share of the training samples, taken from the end, that
            is held out for early stopping. If this yields no validation
            samples (or would leave no training samples), training uses all
            data and early stopping monitors the training loss.

    Returns:
        The fitted model.
    """
    print(f"\nTraining LSTM model for {stock_name}...")
    model = create_model(input_shape=(X_train.shape[1], X_train.shape[2]))

    # Chronological hold-out: the newest training samples act as validation.
    n_val = int(len(X_train) * val_fraction)
    if 0 < n_val < len(X_train):
        X_fit, y_fit = X_train[:-n_val], y_train[:-n_val]
        validation_data = (X_train[-n_val:], y_train[-n_val:])
        monitor = "val_loss"
    else:
        X_fit, y_fit = X_train, y_train
        validation_data = None
        monitor = "loss"

    early_stop = EarlyStopping(monitor=monitor, patience=5, restore_best_weights=True)

    model.fit(
        X_fit, y_fit,
        validation_data=validation_data,
        epochs=50,
        batch_size=32,
        callbacks=[early_stop],
        verbose=1
    )

    # Convert predicted and true returns back to price levels.
    y_pred = model.predict(X_test).flatten()
    predicted_prices = base_prices_test * (1 + y_pred)
    actual_prices = base_prices_test * (1 + y_test)

    # Plot predictions
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(dates_test, actual_prices, label="Actual Prices")
    ax.plot(dates_test, predicted_prices, label="Predicted Prices")
    ax.set_xlabel("Date")
    ax.set_ylabel("Price")
    ax.set_title(f"{stock_name} Price Prediction")
    ax.legend()
    ax.tick_params(axis="x", labelrotation=45)
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # avoid accumulating open figures across repeated calls

    # RMSE on predicted percentage changes
    rmse_percentage_change = rmse(y_test, y_pred)
    print(f"RMSE on Percentage Change: {rmse_percentage_change:.4f}")

    # RMSE on predicted prices
    rmse_prices = rmse(actual_prices, predicted_prices)
    print(f"RMSE on Prices: {rmse_prices:.4f}")

    return model

In [None]:
# Apple LSTM model: window the AAPL prices into train/test sets, then fit, plot and score.
(
    X_train,
    X_test,
    y_train,
    y_test,
    base_prices_test,
    dates_test,
    scaler,
) = prepare_lstm_data(AAPL_prices)
model = train_lstm_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    base_prices_test=base_prices_test,
    dates_test=dates_test,
    scaler=scaler,
    stock_name="Apple",
)


In [None]:
# Amazon LSTM model: window the AMZN prices into train/test sets, then fit, plot and score.
(
    X_train,
    X_test,
    y_train,
    y_test,
    base_prices_test,
    dates_test,
    scaler,
) = prepare_lstm_data(AMZN_prices)
model = train_lstm_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    base_prices_test=base_prices_test,
    dates_test=dates_test,
    scaler=scaler,
    stock_name="Amazon",
)



In [None]:
# Boeing LSTM model: window the BA prices into train/test sets, then fit, plot and score.
(
    X_train,
    X_test,
    y_train,
    y_test,
    base_prices_test,
    dates_test,
    scaler,
) = prepare_lstm_data(BA_prices)
model = train_lstm_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    base_prices_test=base_prices_test,
    dates_test=dates_test,
    scaler=scaler,
    stock_name="Boeing",
)



In [None]:
# Caterpillar LSTM model: window the CAT prices into train/test sets, then fit, plot and score.
(
    X_train,
    X_test,
    y_train,
    y_test,
    base_prices_test,
    dates_test,
    scaler,
) = prepare_lstm_data(CAT_prices)
model = train_lstm_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    base_prices_test=base_prices_test,
    dates_test=dates_test,
    scaler=scaler,
    stock_name="Caterpillar",
)



In [None]:
# Chevron LSTM model: window the CVX prices into train/test sets, then fit, plot and score.
(
    X_train,
    X_test,
    y_train,
    y_test,
    base_prices_test,
    dates_test,
    scaler,
) = prepare_lstm_data(CVX_prices)
model = train_lstm_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    base_prices_test=base_prices_test,
    dates_test=dates_test,
    scaler=scaler,
    stock_name="Chevron",
)



In [None]:
# Goldman Sachs LSTM model: window the GS prices into train/test sets, then fit, plot and score.
(
    X_train,
    X_test,
    y_train,
    y_test,
    base_prices_test,
    dates_test,
    scaler,
) = prepare_lstm_data(GS_prices)
model = train_lstm_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    base_prices_test=base_prices_test,
    dates_test=dates_test,
    scaler=scaler,
    stock_name="Goldman Sachs",
)



In [None]:
# Johnson & Johnson LSTM model: window the JNJ prices into train/test sets, then fit, plot and score.
(
    X_train,
    X_test,
    y_train,
    y_test,
    base_prices_test,
    dates_test,
    scaler,
) = prepare_lstm_data(JNJ_prices)
model = train_lstm_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    base_prices_test=base_prices_test,
    dates_test=dates_test,
    scaler=scaler,
    stock_name="Johnson & Johnson",
)



In [None]:
# JPMorgan Chase LSTM model: window the JPM prices into train/test sets, then fit, plot and score.
(
    X_train,
    X_test,
    y_train,
    y_test,
    base_prices_test,
    dates_test,
    scaler,
) = prepare_lstm_data(JPM_prices)
model = train_lstm_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    base_prices_test=base_prices_test,
    dates_test=dates_test,
    scaler=scaler,
    stock_name="JPMorgan Chase",
)



In [None]:
# Microsoft LSTM model: window the MSFT prices into train/test sets, then fit, plot and score.
(
    X_train,
    X_test,
    y_train,
    y_test,
    base_prices_test,
    dates_test,
    scaler,
) = prepare_lstm_data(MSFT_prices)
model = train_lstm_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    base_prices_test=base_prices_test,
    dates_test=dates_test,
    scaler=scaler,
    stock_name="Microsoft",
)



In [None]:
# NextEra Energy LSTM model: window the NEE prices into train/test sets, then fit, plot and score.
(
    X_train,
    X_test,
    y_train,
    y_test,
    base_prices_test,
    dates_test,
    scaler,
) = prepare_lstm_data(NEE_prices)
model = train_lstm_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    base_prices_test=base_prices_test,
    dates_test=dates_test,
    scaler=scaler,
    stock_name="NextEra Energy",
)



In [None]:
# Nvidia LSTM model: window the NVDA prices into train/test sets, then fit, plot and score.
(
    X_train,
    X_test,
    y_train,
    y_test,
    base_prices_test,
    dates_test,
    scaler,
) = prepare_lstm_data(NVDA_prices)
model = train_lstm_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    base_prices_test=base_prices_test,
    dates_test=dates_test,
    scaler=scaler,
    stock_name="Nvidia",
)



In [None]:
# Pfizer LSTM model: window the PFE prices into train/test sets, then fit, plot and score.
(
    X_train,
    X_test,
    y_train,
    y_test,
    base_prices_test,
    dates_test,
    scaler,
) = prepare_lstm_data(PFE_prices)
model = train_lstm_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    base_prices_test=base_prices_test,
    dates_test=dates_test,
    scaler=scaler,
    stock_name="Pfizer",
)



In [None]:
# Tesla LSTM model: window the TSLA prices into train/test sets, then fit, plot and score.
(
    X_train,
    X_test,
    y_train,
    y_test,
    base_prices_test,
    dates_test,
    scaler,
) = prepare_lstm_data(TSLA_prices)
model = train_lstm_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    base_prices_test=base_prices_test,
    dates_test=dates_test,
    scaler=scaler,
    stock_name="Tesla",
)



In [None]:
# Exxon Mobil LSTM model: window the XOM prices into train/test sets, then fit, plot and score.
(
    X_train,
    X_test,
    y_train,
    y_test,
    base_prices_test,
    dates_test,
    scaler,
) = prepare_lstm_data(XOM_prices)
model = train_lstm_model(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    base_prices_test=base_prices_test,
    dates_test=dates_test,
    scaler=scaler,
    stock_name="Exxon Mobil",
)

