In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    precision_recall_curve,
    auc,
    confusion_matrix
)


from data_preprocessing.merge import df


## CLASSIFIERS

In [None]:
# Model input columns, assembled group by group.
# The concatenation order below is the column order of every feature matrix built later.
feature_columns = (
    # Price/technical
    ['momentum_3m', 'momentum_6m', 'momentum_12m', 'volatility_12m']

    # Basic YTD fundamentals (optional: a tree can split on raw scale)
    + ['revty', 'saley', 'capxy', 'oibdpy', 'rdipay',
       'xsgay', 'txpdy', 'epsfxy', 'cshfdy', 'xoptepsy']

    # Engineered ratios
    + ['EBIT_margin', 'R&D_intensity', 'SGA_intensity', 'Tax_rate', 'Capex_to_Revenue']

    # QoQ growth rates
    + ['revty_QoQ_growth', 'oibdpy_QoQ_growth', 'rdipay_QoQ_growth', 'xsgay_QoQ_growth']

    # Months
    + ['January', 'February', 'March', 'April', 'May', 'June',
       'July', 'August', 'September', 'October', 'November', 'December']

    # Indicators
    + ['EMA', 'Volatility', 'RSI', 'MACD']
)

In [None]:
# Treat +/-inf as missing, then drop every row that lacks any model feature or the label 'y'.
# replace() is deliberately called WITHOUT inplace=True: `df` is the very object imported
# from data_preprocessing.merge, and mutating it in place would silently alter that module's
# frame for every other importer. Only the notebook-level name `df` is rebound here.
# The trailing .copy() keeps `df` an independent frame for the column assignments made later.
df = (
    df.replace([np.inf, -np.inf], np.nan)
      .dropna(subset=feature_columns + ['y'])
      .copy()
)

In [None]:
# Feature matrix and label vector for the tabular (tree-based) classifiers.
X, y = df[feature_columns], df['y']

# Positional hold-out: the first 80% of rows are used for training / cross-validation,
# the remaining rows form the final out-of-sample test set.
# NOTE(review): this matches "last 20% of months" only if `df` is ordered
# chronologically at this point -- TODO confirm the sort order of the imported frame.
n_obs = len(df)
cutpoint = int(n_obs * 0.8)

X_train, X_test = X.iloc[:cutpoint], X.iloc[cutpoint:]
y_train, y_test = y.iloc[:cutpoint], y.iloc[cutpoint:]

### Random Forest Classifier

In [None]:
from models.rf_classifier import build_rf_pipeline

# Random-forest pipeline and its hyper-parameter grid come from the project helper.
pipe, param_grid = build_rf_pipeline()

# Five ordered folds: each validation fold sits after the rows used to train on it.
tscv = TimeSeriesSplit(n_splits=5)

# Exhaustive search over the grid, ranked by ROC AUC, using all available cores.
grid = GridSearchCV(pipe, param_grid, scoring='roc_auc', cv=tscv, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
# Best configuration as exposed by the fitted search object.
best_model = grid.best_estimator_


In [None]:
from utils import report

# Report on the random-forest model selected above (output is defined by utils.report).
# NOTE(review): y_train is passed next to X_test / y_test -- confirm against
# report's signature that this is the intended argument.
report(
    X_test,
    y_train,
    y_test,
    best_model,
    feature_columns,
)

### XGB Classifier

In [None]:
from models.xgb_classifier import build_xgb_pipeline

# XGBoost pipeline and its hyper-parameter grid come from the project helper.
pipe, param_grid = build_xgb_pipeline()

# Same search protocol as the random forest: 5 ordered folds, ranked by ROC AUC.
grid = GridSearchCV(
    pipe,
    param_grid,
    cv=TimeSeriesSplit(n_splits=5),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

print("XGBoost Best params:", grid.best_params_)
# Rebinds best_model: from here on it refers to the XGBoost winner, not the random forest.
best_model = grid.best_estimator_

In [None]:
# Report on the XGBoost model selected above (output is defined by utils.report).
# NOTE(review): y_train is passed next to X_test / y_test -- confirm against
# report's signature that this is the intended argument.
report(
    X_test,
    y_train,
    y_test,
    best_model,
    feature_columns,
)

## DEEP LEARNING

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, precision_recall_curve, auc, accuracy_score
)

# Compute device, in order of preference: Apple MPS, then CUDA, then CPU.
# The CUDA check is only evaluated when MPS is unavailable.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

### "EnhancedLSTM"

In [None]:
from utils import build_sequences, split_data, make_dataloaders, make_loss

# Work on a copy so the unscaled `df` stays available to other cells.
df_scaled = df.copy()

# Standardize every feature column in a single pass.
# NOTE(review): the scaler is fit on ALL rows of `df`, i.e. before any
# train/validation/test split is made further down, so statistics from later rows
# leak into the training inputs -- confirm this look-ahead is acceptable, or fit on
# training rows only.
scaler = StandardScaler()
df_scaled[feature_columns] = scaler.fit_transform(df[feature_columns])

# Grouping column handed to build_sequences: PERMNO when the frame has it, otherwise CUSIP.
if 'PERMNO' in df_scaled.columns:
    group_key = 'PERMNO'
else:
    group_key = 'CUSIP'

In [None]:
# Sequence length (24-month window) and mini-batch size for this experiment.
WINDOW, BATCH_SIZE = 24, 128

# Build the (X, y) sequence data from the scaled frame via the project helper.
# Note: this rebinds X and y, which previously held the tabular features/labels.
X, y = build_sequences(
    df=df_scaled,
    feature_columns=feature_columns,
    label_column='y',
    group_key=group_key,
    window=WINDOW,
)

# Partition into splits, then wrap them in train / validation / test loaders.
# NOTE(review): train_frac + val_frac = 1.0 while three loaders are produced --
# confirm how split_data carves out the test portion.
splits = split_data(X, y, train_frac=0.8, val_frac=0.2)
dl_train, dl_val, dl_test = make_dataloaders(splits, BATCH_SIZE)

# Expose the validation pair, and build the loss from the labels of the first split.
_, (X_val, y_val), _ = splits
criterion = make_loss(splits[0][1])

In [None]:
from models.nn_models import EnhancedLSTM
from utils import train, evaluate

# Model, optimizer and plateau scheduler for the EnhancedLSTM run.
model = EnhancedLSTM(in_dim=len(feature_columns)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
# mode='max': the learning rate is lowered when the monitored value stops increasing.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=4)
clip_grad = 1.0

# Training settings. epochs, clip_grad and patience are also read by later training cells.
# best_auc and trials are defined here but are not passed to train().
best_auc = 0.0
patience = 12
trials = 0
epochs = 100

train(
    epochs, model, scheduler, clip_grad,
    optimizer, patience, criterion, dl_val,
    dl_train=dl_train,
    device=device,
    early_stop=True,
)


In [None]:
# Run utils.evaluate on the dl_test loader for the model trained in the preceding cell.
evaluate(
    model=model,
    dl_test=dl_test,
    device=device,
)

### "SmallLSTM"

In [None]:
# Shorter window and smaller batches; these settings feed every model trained below.
WINDOW, BATCH_SIZE = 6, 32

# Rebuild the (X, y) sequence data from the scaled frame with the new window.
X, y = build_sequences(
    df=df_scaled,
    feature_columns=feature_columns,
    label_column='y',
    group_key=group_key,
    window=WINDOW,
)

# Partition into splits, then wrap them in train / validation / test loaders.
# NOTE(review): train_frac + val_frac = 1.0 while three loaders are produced --
# confirm how split_data carves out the test portion.
splits = split_data(X, y, train_frac=0.8, val_frac=0.2)
dl_train, dl_val, dl_test = make_dataloaders(splits, BATCH_SIZE)

# Expose the validation pair, and build the loss from the labels of the first split.
_, (X_val, y_val), _ = splits
criterion = make_loss(splits[0][1])

In [None]:
from models.nn_models import SmallLSTM


model = SmallLSTM(len(feature_columns)).to(device)

# Class-weighted BCE: pos_weight = (#negative labels) / (#positive labels).
# This replaces the criterion built by make_loss in the previous cell.
# NOTE(review): y_train is the tabular training split created for the tree models,
# not the sequence labels held in `splits` -- confirm that is the intended source.
pos_w = (y_train == 0).sum() / (y_train == 1).sum()
criterion = nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor(pos_w, device=device, dtype=torch.float32)
)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# A ReduceLROnPlateau instance adjusts only the optimizer it was constructed with.
# Reusing the scheduler left over from an earlier model would keep lowering that
# model's (now unused) optimizer and never touch this one, so build a fresh scheduler
# here with the same settings the earlier run used.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=4)

# epochs, clip_grad and patience are carried over from an earlier training cell.
train(epochs, model, scheduler, clip_grad,
      optimizer, patience, criterion, dl_val,
      dl_train=dl_train, device=device, early_stop=True)

In [None]:
# Run utils.evaluate on the dl_test loader for the model trained in the preceding cell.
evaluate(
    model=model,
    dl_test=dl_test,
    device=device,
)

### "LargeLSTM"

In [None]:
from models.nn_models import LargeLSTM


model = LargeLSTM(len(feature_columns)).to(device)

# Class-weighted BCE: pos_weight = (#negative labels) / (#positive labels).
# NOTE(review): y_train is the tabular training split created for the tree models,
# not the sequence labels held in `splits` -- confirm that is the intended source.
pos_w = (y_train == 0).sum() / (y_train == 1).sum()
criterion = nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor(pos_w, device=device, dtype=torch.float32)
)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# A ReduceLROnPlateau instance adjusts only the optimizer it was constructed with.
# Reusing the scheduler left over from an earlier model would keep lowering that
# model's (now unused) optimizer and never touch this one, so build a fresh scheduler
# here with the same settings the first LSTM run used.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=4)

# epochs, clip_grad and patience are carried over from an earlier training cell;
# the data loaders are the ones built in the most recent sequence-building cell.
train(epochs, model, scheduler, clip_grad,
      optimizer, patience, criterion, dl_val,
      dl_train=dl_train, device=device, early_stop=True)

In [None]:
# Run utils.evaluate on the dl_test loader for the model trained in the preceding cell.
evaluate(
    model=model,
    dl_test=dl_test,
    device=device,
)

### "StockTransformer"

In [None]:
from models.nn_models import StockTransformer


# The transformer is sized with the WINDOW currently in effect (set in the most
# recent sequence-building cell).
model = StockTransformer(len(feature_columns), window=WINDOW).to(device)

# Class-weighted BCE: pos_weight = (#negative labels) / (#positive labels) of y_train.
pos_w = (y_train == 0).sum() / (y_train == 1).sum()
criterion = nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor(pos_w, device=device, dtype=torch.float32)
)

# AdamW with weight decay; the plateau scheduler halves the LR (factor=0.5) after
# 3 epochs without an increase in the monitored value.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=3,
)

# Shorter early-stopping patience for this run.
# best_val_auc and trials are defined here but are not passed to train().
best_val_auc = 0.0
patience = 5
trials = 0

train(
    epochs, model, scheduler, clip_grad,
    optimizer, patience, criterion, dl_val,
    dl_train=dl_train,
    device=device,
    early_stop=True,
)

In [None]:
# Run utils.evaluate on the dl_test loader for the model trained in the preceding cell.
evaluate(
    model=model,
    dl_test=dl_test,
    device=device,
)

### "InceptionTime"

In [None]:
from models.nn_models import InceptionModule, InceptionTime

# InceptionTime configuration: 3 blocks, 32 output and bottleneck channels,
# kernel sizes 3/5/7, residual connections enabled, dropout 0.2.
inception_config = None  # placeholder removed below; configuration is passed inline
del inception_config
model = InceptionTime(
    in_dim=len(feature_columns),
    num_blocks=3,
    out_channels=32,
    kernel_sizes=[3, 5, 7],
    bottleneck_channels=32,
    use_residual=True,
    dropout=0.2,
).to(device)

# Class-weighted BCE: pos_weight = (#negative labels) / (#positive labels) of y_train.
pos_w = (y_train == 0).sum() / (y_train == 1).sum()
criterion = nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor(pos_w, device=device, dtype=torch.float32)
)

# AdamW with weight decay; the plateau scheduler halves the LR (factor=0.5) after
# 3 epochs without an increase in the monitored value.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=3,
)

# best_val_auc and trials are defined here but are not passed to train().
best_val_auc = 0.0
patience = 5
trials = 0

train(
    epochs, model, scheduler, clip_grad,
    optimizer, patience, criterion, dl_val,
    dl_train=dl_train,
    device=device,
    early_stop=True,
)


In [None]:
# Run utils.evaluate on the dl_test loader for the InceptionTime model trained above.
# inception=True presumably selects the input handling this architecture needs -- TODO confirm.
evaluate(
    model=model,
    dl_test=dl_test,
    device=device,
    inception=True,
)