# LSTM + GRU Hybrid Model for Stock Prediction

In [3]:
# Load Phase-1 artifacts inside Phase-2 notebook
from pathlib import Path
import joblib
import numpy as np

# Locate the project root: the nearest directory (cwd or an ancestor) that
# contains a "data" folder.
root = Path.cwd()
while not (root / "data").exists():
    if root == root.parent:
        # Reached the filesystem root without finding "data". Fail here with a
        # clear message instead of building a bogus artifact path below.
        raise RuntimeError("Project root not found: missing /data folder")
    root = root.parent

art_dir = root / "data" / "processed"
artifacts_path = art_dir / "phase1_preprocessing_artifacts.joblib"
if not artifacts_path.is_file():
    raise FileNotFoundError(f"Phase-1 artifacts not found: {artifacts_path}")

# NOTE(security): joblib.load unpickles arbitrary Python objects, so only load
# artifact files that were produced by this project.
artifacts = joblib.load(artifacts_path)

# NOTE: the names `scaler`, `pca` and `feature_cols` are assigned again by later
# cells of this notebook (feature selection and the PCA diagnostic), so these
# bindings only hold until those cells run.
scaler = artifacts["scaler"]
pca = artifacts["pca"]
feature_cols = artifacts["feature_cols"]

In [7]:
# Show the concrete classes of the loaded scaler / PCA objects, then the
# stored list of feature column names.
for loaded_obj in (scaler, pca):
    print(type(loaded_obj))
print(feature_cols)

<class 'sklearn.preprocessing._data.StandardScaler'>
<class 'sklearn.decomposition._pca.PCA'>
['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'adj_close', 'logret_1d', 'ret_1d', 'hl_range', 'oc_change', 'ret_mean_5', 'ret_vol_5', 'ret_mean_10', 'ret_vol_10', 'ret_mean_20', 'ret_vol_20', 'ret_mean_60', 'ret_vol_60', 'sma_5', 'sma_10', 'sma_20', 'sma_60', 'sma_spread_5', 'sma_spread_20', 'sma_spread_60', 'ema_12', 'ema_26', 'macd', 'macd_signal', 'macd_hist', 'rsi_14', 'bb_mid_20', 'bb_std_20', 'bb_upper_20', 'bb_lower_20', 'bb_width_20', 'vol_change', 'vol_z_20', 'obv', 'dow', 'month', 'is_month_end', 'logret_1d_lag1', 'logret_1d_lag2', 'logret_1d_lag3', 'logret_1d_lag4', 'logret_1d_lag5', 'logret_1d_lag6', 'logret_1d_lag7', 'logret_1d_lag8', 'logret_1d_lag9', 'logret_1d_lag10', 'logret_1d_lag11', 'logret_1d_lag12', 'logret_1d_lag13', 'logret_1d_lag14', 'logret_1d_lag15', 'logret_1d_lag16', 'logret_1d_lag17', 'logret_1d_lag18', 'logret_1d_lag19', 'logret_1d_lag20', 'logret_1d_la

In [8]:
# Display the keys stored in the loaded Phase-1 artifacts dict (cell output).
artifacts.keys()

dict_keys(['scaler', 'pca', 'feature_cols', 'target_col', 'top_exog', 'pca_feature_cols', 'drop_cols'])

In [1]:
#cell 1: imports + project paths
from pathlib import Path
import numpy as np
import pandas as pd

# Project root = the closest directory (cwd itself or one of its ancestors)
# that contains a "data" folder.
cwd = Path.cwd().resolve()
project_root = next(
    (candidate for candidate in (cwd, *cwd.parents) if (candidate / "data").exists()),
    None,
)
if project_root is None:
    raise RuntimeError("Project root not found: missing /data folder")

# Derived data locations used by the rest of the notebook.
DATA_DIR = project_root / "data"
PROC_DIR = DATA_DIR / "processed"
panel_path = PROC_DIR / "modeling_panel_targets.parquet"

for label, location in (("project_root:", project_root), ("panel_path:", panel_path)):
    print(label, location)


project_root: /media/workhorse/Windows/MyWorkPlace/Projects/StockPredictor
panel_path: /media/workhorse/Windows/MyWorkPlace/Projects/StockPredictor/data/processed/modeling_panel_targets.parquet


In [2]:
#cell 2: load + filter AAPL + clean column names
TICKER = "AAPL"
# Column-name prefixes of the other panel tickers whose columns are removed below.
OTHER_TICKER_PREFIXES = ("MSFT_", "GOOG_", "AMZN_")

panel = pd.read_parquet(panel_path)
panel["date"] = pd.to_datetime(panel["date"])

# Filter to the single ticker first, then sort only those rows by date.
# Sorting the whole multi-ticker panel before filtering is unnecessary work.
df = panel.loc[panel["ticker"] == TICKER].copy()
if df.empty:
    raise ValueError(f"No rows for ticker {TICKER!r} in {panel_path}")
df = df.sort_values("date").reset_index(drop=True)

#fix repeated substring in exog lag columns
df.columns = df.columns.str.replace("_logret_laglogret_lag", "_logret_lag", regex=False)

#drop all columns whose name starts with one of the other tickers' prefixes
cols_to_drop = [col for col in df.columns if col.startswith(OTHER_TICKER_PREFIXES)]
df = df.drop(columns=cols_to_drop)

print(f"{TICKER} df shape:", df.shape)
df.head(3)


AAPL df shape: (3285, 286)


Unnamed: 0,date,ticker,Open,High,Low,Close,Adj Close,Volume,adj_close,logret_1d,...,^VIX_logret_lag19,AAPL_logret_lag20,CL=F_logret_lag20,GC=F_logret_lag20,QQQ_logret_lag20,SPY_logret_lag20,UUP_logret_lag20,XLK_logret_lag20,^TNX_logret_lag20,^VIX_logret_lag20
0,2013-01-02,AAPL,19.779285,19.821428,19.343929,19.608213,16.612209,560518000.0,16.612209,,...,,,,,,,,,,
1,2013-01-03,AAPL,19.567142,19.631071,19.321428,19.360714,16.402523,352965200.0,16.402523,-0.012703,...,,,,,,,,,,
2,2013-01-04,AAPL,19.1775,19.236786,18.779642,18.821428,15.945646,594333600.0,15.945646,-0.028249,...,,,,,,,,,,


In [3]:
#print all columns with NaN values and counts in descending
nan_counts = df.isna().sum()  # per-column NaN count, computed once
print("Columns with NaN values:")
print(nan_counts.loc[nan_counts.gt(0)].sort_values(ascending=False))

Columns with NaN values:
logret_1d_lag60     61
logret_1d_lag59     60
ret_mean_60         60
ret_vol_60          60
logret_1d_lag58     59
                    ..
AAPL_logret_lag1     2
vol_change           1
ret_1d               1
target_logret_1d     1
logret_1d            1
Length: 266, dtype: int64


In [4]:
#print the column names (heading on one line, the full list on the next)
print("Column names:", df.columns.tolist(), sep="\n")

Column names:
['date', 'ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'adj_close', 'logret_1d', 'ret_1d', 'target_logret_1d', 'hl_range', 'oc_change', 'ret_mean_5', 'ret_vol_5', 'ret_mean_10', 'ret_vol_10', 'ret_mean_20', 'ret_vol_20', 'ret_mean_60', 'ret_vol_60', 'sma_5', 'sma_10', 'sma_20', 'sma_60', 'sma_spread_5', 'sma_spread_20', 'sma_spread_60', 'ema_12', 'ema_26', 'macd', 'macd_signal', 'macd_hist', 'rsi_14', 'bb_mid_20', 'bb_std_20', 'bb_upper_20', 'bb_lower_20', 'bb_width_20', 'vol_change', 'vol_z_20', 'obv', 'dow', 'month', 'is_month_end', 'logret_1d_lag1', 'logret_1d_lag2', 'logret_1d_lag3', 'logret_1d_lag4', 'logret_1d_lag5', 'logret_1d_lag6', 'logret_1d_lag7', 'logret_1d_lag8', 'logret_1d_lag9', 'logret_1d_lag10', 'logret_1d_lag11', 'logret_1d_lag12', 'logret_1d_lag13', 'logret_1d_lag14', 'logret_1d_lag15', 'logret_1d_lag16', 'logret_1d_lag17', 'logret_1d_lag18', 'logret_1d_lag19', 'logret_1d_lag20', 'logret_1d_lag21', 'logret_1d_lag22', 'logret_1d_lag23'

In [5]:
#cell 3: define X/y and time split
TARGET = "target_logret_1d"
TRAIN_FRAC = 0.80  # share of rows, taken from the start of model_df, used for training

# Columns that must never be used as predictors.
drop_cols = {
    "date", "ticker",
    TARGET,
}

#keep numeric predictors only
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [c for c in num_cols if c not in drop_cols]

#drop rows with missing target or missing features
model_df = df[["date", TARGET] + feature_cols].dropna().reset_index(drop=True)

# The split below is positional, so it is only a chronological split when the
# rows are already ordered by date. Verify that instead of assuming it.
if not model_df["date"].is_monotonic_increasing:
    raise ValueError("model_df must be sorted by date before the time split")

n = len(model_df)
split_idx = int(n * TRAIN_FRAC)
if split_idx == 0 or split_idx == n:
    # Too few complete rows: one side of the split would be empty.
    raise ValueError(
        f"Cannot split {n} rows with TRAIN_FRAC={TRAIN_FRAC}: train or test would be empty"
    )

train_df = model_df.iloc[:split_idx].copy()
test_df  = model_df.iloc[split_idx:].copy()

X_train = train_df[feature_cols].copy()
y_train = train_df[TARGET].copy()

X_test  = test_df[feature_cols].copy()
y_test  = test_df[TARGET].copy()

print("model_df shape:", model_df.shape)
print("train rows:", len(train_df), "test rows:", len(test_df))
print("train date range:", train_df["date"].min(), "to", train_df["date"].max())
print("test  date range:", test_df["date"].min(), "to", test_df["date"].max())


model_df shape: (3097, 285)
train rows: 2477 test rows: 620
train date range: 2013-04-02 00:00:00 to 2023-06-01 00:00:00
test  date range: 2023-06-02 00:00:00 to 2026-01-22 00:00:00


In [6]:
#cell 4: feature ranking diagnostics (train only)
from sklearn.feature_selection import mutual_info_regression

TOP_K = 25  # number of top-ranked features shown per criterion

# Pearson correlation of every training feature with the target. corrwith
# computes one coefficient per column, instead of building the full
# feature-by-feature correlation matrix just to read a single column of it.
corr = X_train.corrwith(y_train).rename(TARGET)
corr_rank = corr.abs().sort_values(ascending=False).head(TOP_K)

# Mutual information between each feature and the target; random_state is
# fixed so the ranking is reproducible across runs.
mi = mutual_info_regression(X_train.values, y_train.values, random_state=42)
mi_rank = pd.Series(mi, index=feature_cols).sort_values(ascending=False).head(TOP_K)

print(f"Top {TOP_K} by |corr|")
display(corr_rank.to_frame("abs_corr"))

print(f"Top {TOP_K} by mutual information")
display(mi_rank.to_frame("mi"))


Top 25 by |corr|


Unnamed: 0,abs_corr
XLK_logret_lag7,0.108424
QQQ_logret_lag7,0.107897
SPY_logret_lag7,0.107107
XLK_logret_lag6,0.103955
logret_1d_lag7,0.097293
AAPL_logret_lag7,0.097293
SPY_logret_lag6,0.097103
QQQ_logret_lag6,0.096027
SPY_logret_lag8,0.090016
QQQ_logret_lag14,0.086478


Top 25 by mutual information


Unnamed: 0,mi
ret_vol_10,0.05856
hl_range,0.055978
ret_1d,0.055113
logret_1d,0.053923
sma_5,0.051344
macd,0.047551
High,0.038625
rsi_14,0.035834
^VIX_logret_lag4,0.034488
Close,0.032971


In [7]:
#cell 5: PCA explained variance diagnostic
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise with statistics fitted on the training split, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train.values)
X_train_s, X_test_s = (scaler.transform(split.values) for split in (X_train, X_test))

# PCA is fitted on the scaled training split; the test split is only projected.
pca = PCA(svd_solver="full", n_components=0.80)
X_train_p = pca.fit_transform(X_train_s)
X_test_p = pca.transform(X_test_s)

n_kept = pca.n_components_
explained_total = pca.explained_variance_ratio_.sum()
print("PCA components for 80% variance:", n_kept)
print("Explained variance ratio sum:", explained_total)


PCA components for 80% variance: 91
Explained variance ratio sum: 0.8022804105397852


In [8]:
#build and print one label per retained PCA component (pca_1 ... pca_N)
pca_columns = ["pca_" + str(number) for number in range(1, pca.n_components_ + 1)]
print("PCA feature columns:", pca_columns)

PCA feature columns: ['pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7', 'pca_8', 'pca_9', 'pca_10', 'pca_11', 'pca_12', 'pca_13', 'pca_14', 'pca_15', 'pca_16', 'pca_17', 'pca_18', 'pca_19', 'pca_20', 'pca_21', 'pca_22', 'pca_23', 'pca_24', 'pca_25', 'pca_26', 'pca_27', 'pca_28', 'pca_29', 'pca_30', 'pca_31', 'pca_32', 'pca_33', 'pca_34', 'pca_35', 'pca_36', 'pca_37', 'pca_38', 'pca_39', 'pca_40', 'pca_41', 'pca_42', 'pca_43', 'pca_44', 'pca_45', 'pca_46', 'pca_47', 'pca_48', 'pca_49', 'pca_50', 'pca_51', 'pca_52', 'pca_53', 'pca_54', 'pca_55', 'pca_56', 'pca_57', 'pca_58', 'pca_59', 'pca_60', 'pca_61', 'pca_62', 'pca_63', 'pca_64', 'pca_65', 'pca_66', 'pca_67', 'pca_68', 'pca_69', 'pca_70', 'pca_71', 'pca_72', 'pca_73', 'pca_74', 'pca_75', 'pca_76', 'pca_77', 'pca_78', 'pca_79', 'pca_80', 'pca_81', 'pca_82', 'pca_83', 'pca_84', 'pca_85', 'pca_86', 'pca_87', 'pca_88', 'pca_89', 'pca_90', 'pca_91']


In [9]:
#cell: PCA loadings (which original features drive each component)
# One row per principal component, one column per original feature.
component_labels = ["pca_" + str(number) for number in range(1, pca.n_components_ + 1)]
loadings = pd.DataFrame(data=pca.components_, index=component_labels, columns=feature_cols)

def top_loadings(component_name, k=10, loadings_df=None):
    """Return the k largest absolute loadings of one principal component.

    Parameters
    ----------
    component_name : label
        Row label of the component in the loadings frame, e.g. "pca_1".
    k : int, default 10
        Number of entries to return.
    loadings_df : pandas.DataFrame, optional
        Frame with one row per component and one column per original
        feature. When omitted, the notebook-level `loadings` frame is used,
        so existing two-argument calls behave as before.

    Returns
    -------
    pandas.Series
        Absolute loading values sorted in descending order, indexed by the
        original feature names.

    Raises
    ------
    KeyError
        If `component_name` is not a row label of the loadings frame.
    """
    if loadings_df is None:
        # Fall back to the global frame built in this notebook.
        loadings_df = loadings
    return loadings_df.loc[component_name].abs().sort_values(ascending=False).head(k)

# Show the ten strongest (absolute) loadings for the first two components.
for heading, component in (
    ("Top 10 original features for pca_1", "pca_1"),
    ("Top 10 original features for pca_2", "pca_2"),
):
    print(heading)
    display(top_loadings(component, 10).to_frame("abs_loading"))


Top 10 original features for pca_1


Unnamed: 0,abs_loading
bb_upper_20,0.241004
ema_12,0.240628
sma_10,0.240588
ema_26,0.240551
sma_20,0.240532
bb_mid_20,0.240532
sma_5,0.240531
High,0.240523
Open,0.240417
adj_close,0.240411


Top 10 original features for pca_2


Unnamed: 0,abs_loading
sma_spread_20,0.274255
ret_mean_20,0.270796
rsi_14,0.265498
sma_spread_60,0.26144
ret_mean_10,0.243044
macd,0.224651
ret_mean_5,0.197022
macd_signal,0.185618
ret_mean_60,0.183355
sma_spread_5,0.165674


In [11]:
# Cell 14: Comprehensive Dataset & PCA Information for LLM Understanding
# Prints a plain-text report built from objects created by earlier cells:
# df, model_df, train_df/test_df, X_train, y_train/y_test, feature_cols,
# corr_rank, mi_rank, scaler, pca, X_train_p/X_test_p, top_loadings, PROC_DIR.
RULE = "=" * 80


def _banner(title, leading_blank=True):
    """Print `title` framed by two 80-character '=' rules.

    When `leading_blank` is true, an empty line is printed before the first rule.
    """
    print(("\n" if leading_blank else "") + RULE)
    print(title)
    print(RULE)


def _print_target_stats(label, y):
    """Print mean, std, min, max and median of the series `y` under a `label` heading."""
    print(f"\n{label} Target Statistics:")
    print(f"  - Mean: {y.mean():.6f}")
    print(f"  - Std: {y.std():.6f}")
    print(f"  - Min: {y.min():.6f}")
    print(f"  - Max: {y.max():.6f}")
    print(f"  - Median: {y.median():.6f}")


_banner("COMPLETE DATASET AND PCA INFORMATION", leading_blank=False)

# ============================================================================
# 1. DATASET OVERVIEW (INCLUDES DATE INDEX)
# ============================================================================
_banner("1. DATASET OVERVIEW")
print("Dataset: AAPL Stock Price Prediction")
print(f"Target Variable: {TARGET}")
print("Index Column: date (preserved as a datetime column, used for temporal ordering)")
print(f"Total Rows (after dropna): {len(model_df):,}")
print(f"Total Original Features (excluding date and target): {len(feature_cols)}")
print("\nTrain/Test Split (80/20 temporal split):")
print(f"  - Training rows: {len(train_df):,} ({len(train_df)/len(model_df)*100:.1f}%)")
print(f"  - Test rows: {len(test_df):,} ({len(test_df)/len(model_df)*100:.1f}%)")
print(f"  - Train date range: {train_df['date'].min().date()} to {train_df['date'].max().date()}")
print(f"  - Test date range: {test_df['date'].min().date()} to {test_df['date'].max().date()}")

# ============================================================================
# 2. INDEX (DATE) DETAILS
# ============================================================================
_banner("2. INDEX (DATE) DETAILS")
print("Date column retained (not dropped) to preserve temporal index.")
print(f"  - Unique dates: {model_df['date'].nunique():,}")
print(f"  - Min date: {model_df['date'].min().date()}")
print(f"  - Max date: {model_df['date'].max().date()}")
# Approximate dominant frequency: the most common day gap between consecutive rows.
# With fewer than two rows the diff is empty after dropna, which yields "n/a".
day_diffs = model_df['date'].diff().dt.days.dropna()
if day_diffs.empty:
    print("  - Dominant gap between rows (days): n/a")
else:
    dominant_freq = day_diffs.mode().iloc[0]
    print(f"  - Dominant gap between rows (days): {dominant_freq}")

# ============================================================================
# 3. FEATURE INFORMATION
# ============================================================================
_banner("3. FEATURE INFORMATION")
print(f"Total number of numeric features (excludes date and target): {len(feature_cols)}")
all_cols_with_index = ["date (index)"] + feature_cols + [TARGET]
print(f"\nAll columns including index and target ({len(all_cols_with_index)} columns):")
for i, col in enumerate(all_cols_with_index, 1):
    print(f"  {i:3d}. {col}")

print(f"\nFeature names only ({len(feature_cols)} features):")
for i, col in enumerate(feature_cols, 1):
    print(f"  {i:3d}. {col}")

print("\n--- Training Data Feature Statistics (numeric features only) ---")
train_stats = X_train.describe().T
print("Feature value ranges (min, max, mean, std):")
print(train_stats[['min', 'max', 'mean', 'std']].to_string())

# ============================================================================
# 4. TARGET VARIABLE INFORMATION
# ============================================================================
_banner("4. TARGET VARIABLE INFORMATION")
print(f"Target: {TARGET}")
print("Description: Next day log return of AAPL stock")
_print_target_stats("Training", y_train)
_print_target_stats("Test", y_test)

# ============================================================================
# 5. FEATURE IMPORTANCE & CORRELATION ANALYSIS
# ============================================================================
_banner("5. FEATURE IMPORTANCE & CORRELATION ANALYSIS")
print("Top 25 Features by Absolute Correlation with Target:")
print(corr_rank.to_frame('abs_corr').to_string())

print("\nTop 25 Features by Mutual Information with Target:")
print(mi_rank.to_frame('mutual_info').to_string())

# ============================================================================
# 6. PCA TRANSFORMATION DETAILS
# ============================================================================
_banner("6. PCA TRANSFORMATION DETAILS")
print("PCA Configuration:")
# Read the configuration from the fitted estimator, so the report cannot
# drift from the values actually passed to PCA(...).
requested_components = pca.n_components
if isinstance(requested_components, float) and 0 < requested_components < 1:
    print(f"  - Variance retention threshold: {requested_components * 100:g}%")
else:
    print(f"  - Requested n_components: {requested_components}")
print(f"  - SVD solver: {pca.svd_solver}")
print(f"  - Number of components selected: {pca.n_components_}")
print(f"  - Original feature dimension: {len(feature_cols)} (date excluded)")
print(f"  - Reduced dimension: {pca.n_components_}")
print(f"  - Dimensionality reduction: {(1 - pca.n_components_/len(feature_cols))*100:.1f}%")
print(f"  - Cumulative explained variance: {pca.explained_variance_ratio_.sum():.4f}")

print("\nExplained Variance per Component:")
# One running sum instead of re-summing a growing slice on every iteration.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
for i, (var, cumsum) in enumerate(zip(pca.explained_variance_ratio_, cumulative_ratio), 1):
    print(f"  pca_{i}: {var:.4f} (cumulative: {cumsum:.4f})")

print("\nPCA Transformed Data Shapes:")
print(f"  - X_train_p: {X_train_p.shape}")
print(f"  - X_test_p: {X_test_p.shape}")

# ============================================================================
# 7. PCA LOADINGS - FEATURE COMPOSITION OF PRINCIPAL COMPONENTS
# ============================================================================
_banner("7. PCA LOADINGS - FEATURE COMPOSITION OF PRINCIPAL COMPONENTS")
print("This shows which original features (excluding date) contribute most to each PCA component\n")

# Show the first 10 components (or fewer, if fewer were retained).
for comp_number, comp_ratio in enumerate(pca.explained_variance_ratio_[:10], 1):
    comp_name = f"pca_{comp_number}"
    print(f"\n--- {comp_name.upper()} (Explains {comp_ratio:.4f} variance) ---")
    top_10 = top_loadings(comp_name, 10)
    print(top_10.to_frame('abs_loading').to_string())

# ============================================================================
# 8. STANDARDIZATION (PRE-PCA) DETAILS
# ============================================================================
_banner("8. STANDARDIZATION (PRE-PCA) DETAILS")
print("StandardScaler applied before PCA (numeric features only; date excluded):")
print("  - Method: z-score normalization (mean=0, std=1)")
print("  - Fitted on: Training data only")
print("  - Applied to: Both training and test data")
print("\nScaler statistics (from training data):")
scaler_means = pd.Series(scaler.mean_, index=feature_cols)
scaler_stds = pd.Series(scaler.scale_, index=feature_cols)
scaler_stats = pd.DataFrame({'mean': scaler_means, 'std': scaler_stds})
print(scaler_stats.to_string())

# ============================================================================
# 9. BASELINE MODEL PERFORMANCE (FROM PHASE 1)
# ============================================================================
_banner("9. BASELINE MODEL PERFORMANCE (FROM PHASE 1)")
metrics_path = PROC_DIR / "phase1_metrics.csv"
if metrics_path.exists():
    metrics_df = pd.read_csv(metrics_path)
    print("Baseline model results to compare against:")
    print(metrics_df.to_string(index=False))
else:
    print("Phase 1 metrics file not found")

# ============================================================================
# 10. DATA QUALITY SUMMARY
# ============================================================================
_banner("10. DATA QUALITY SUMMARY")
print(f"Missing values in original df: {df.isna().sum().sum():,}")
print(f"Missing values after dropna: {model_df.isna().sum().sum()}")
print(f"Rows dropped due to NaN: {len(df) - len(model_df):,}")
print(f"Percentage of data retained: {len(model_df)/len(df)*100:.2f}%")

_banner("END OF DATASET & PCA INFORMATION")

COMPLETE DATASET AND PCA INFORMATION

1. DATASET OVERVIEW
Dataset: AAPL Stock Price Prediction
Target Variable: target_logret_1d
Index Column: date (preserved as a datetime column, used for temporal ordering)
Total Rows (after dropna): 3,097
Total Original Features (excluding date and target): 283

Train/Test Split (80/20 temporal split):
  - Training rows: 2,477 (80.0%)
  - Test rows: 620 (20.0%)
  - Train date range: 2013-04-02 to 2023-06-01
  - Test date range: 2023-06-02 to 2026-01-22

2. INDEX (DATE) DETAILS
Date column retained (not dropped) to preserve temporal index.
  - Unique dates: 3,097
  - Min date: 2013-04-02
  - Max date: 2026-01-22
  - Dominant gap between rows (days): 1.0

3. FEATURE INFORMATION
Total number of numeric features (excludes date and target): 283

All columns including index and target (285 columns):
    1. date (index)
    2. Open
    3. High
    4. Low
    5. Close
    6. Adj Close
    7. Volume
    8. adj_close
    9. logret_1d
   10. ret_1d
   11. hl_r