In [4]:
import pandas as pd
import numpy as np
from datetime import timedelta

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    precision_recall_curve,
    auc,
    confusion_matrix
)



In [5]:
# Load Monthly CRSP

CRSP_PATH = '../data/monthly_crsp.csv'
df_crsp = pd.read_csv(
    CRSP_PATH ,
    parse_dates=['MthCalDt'],
    usecols=['PERMNO','CUSIP','MthCalDt','MthRet']
)


# Load Compustat Fundamentals

COMP_PATH = '../data/CompFirmCharac.csv'

df_comp = pd.read_csv(
    COMP_PATH,
    parse_dates=['datadate'], dayfirst=True,
)


  df_comp = pd.read_csv(
  df_comp = pd.read_csv(


### CLEAN CRSP


In [6]:
# 2.1: Keep only rows where MthRet is available and cast to float
df_crsp = df_crsp.dropna(subset=['MthRet']).copy()
df_crsp['MthRet'] = df_crsp['MthRet'].astype(float)

# 2.2: Sort by CUSIP, date so that shift is correct
df_crsp['date'] = pd.to_datetime(df_crsp['MthCalDt'].astype(str), format='mixed')
df_crsp = df_crsp.sort_values(['CUSIP','date']).reset_index(drop=True)

# 2.3: Create next‐month return target (binary)
df_crsp['Ret_t1'] = df_crsp.groupby('CUSIP')['MthRet'].shift(-1)
# df_crsp['y'] = df_crsp.groupby('CUSIP')['MthRet'].shift(-1)
df_crsp['y'] = (df_crsp['Ret_t1'] > 0).astype(int)
df_crsp = df_crsp.dropna(subset=['y']).copy()

In [7]:
import calendar

# Add technical indicators and months

def compute_close(y_series):
    close = (1 + y_series.fillna(0)).cumprod()
    close.iloc[0] = 1.0
    return close

df_crsp = df_crsp.sort_values(["PERMNO", "date"])
df_crsp["close"] = df_crsp.groupby("PERMNO")["MthRet"].apply(compute_close).reset_index(level=0, drop=True)

def calculate_rsi(series, window=14):
    delta = series.diff()
    up = delta.clip(lower=0)
    down = -delta.clip(upper=0)
    ma_up = up.rolling(window).mean()
    ma_down = down.rolling(window).mean()
    rs = ma_up / ma_down
    return 100 - (100 / (1 + rs))

# Group-wise calculations
def add_technical_indicators(group):
    group = group.copy()
    group['EMA_Close'] = group['close'].ewm(span=14, adjust=False).mean()
    group['EMA'] = (group['close'] - group['EMA_Close'])/ group['close']

    rolling_std = group['close'].rolling(window=14).std()
    group['Volatility'] = rolling_std / group['close']

    group['RSI'] = calculate_rsi(group['close'])
    
    # MACD
    ema12 = group['close'].ewm(span=12, adjust=False).mean()
    ema26 = group['close'].ewm(span=26, adjust=False).mean()
    group['MACD_diff'] = ema12 - ema26
    group['MACD_Signal'] = group['MACD_diff'].ewm(span=9, adjust=False).mean()
    group['MACD'] = (group['MACD_diff'] - group['MACD_Signal']) / group['close']


    return group

# Apply technical indicators to each stock (PERMNO)
df_crsp = df_crsp.groupby("PERMNO", group_keys=False).apply(add_technical_indicators, include_groups=False)

df_crsp.drop(columns=["close", "EMA_Close", "MACD_diff", "MACD_Signal", 'y_forward'], inplace=True, errors='ignore')

# Add months
df_crsp['month'] = df_crsp['date'].dt.month.map(lambda x: calendar.month_name[x])

# Create dummy variables with month names as column names
month_dummies = pd.get_dummies(df_crsp['month']).astype(int)

# Concatenate dummies with original dataframe
df_crsp = pd.concat([df_crsp, month_dummies], axis=1)

# drop the intermediate 'month' column 
df_crsp = df_crsp.drop(columns=['month'])

# Get the current columns
cols = list(df_crsp.columns)

# Move 'y' to the end if it exists
if 'y' in cols:
    cols.remove('y')
    cols.append('y')

# Reorder the DataFrame columns
df_crsp = df_crsp[cols]

df_crsp = df_crsp.dropna()
print(df_crsp)

            CUSIP   MthCalDt    MthRet       date    Ret_t1       EMA  \
2970518  68391610 1987-03-31 -0.384615 1987-03-31 -0.062500 -4.416586   
2970519  68391610 1987-04-30 -0.062500 1987-04-30 -0.066667 -4.140666   
1746075  39040610 1987-03-31  0.037486 1987-03-31 -0.039216  0.028843   
1746076  39040610 1987-04-30 -0.039216 1987-04-30 -0.071429 -0.009357   
1746077  39040610 1987-05-29 -0.071429 1987-05-29  0.052687 -0.075400   
...           ...        ...       ...        ...       ...       ...   
3917811  88160R10 2024-07-31  0.172781 2024-07-31 -0.077390  0.094132   
3917812  88160R10 2024-08-30 -0.077390 2024-08-30  0.221942  0.015727   
3917813  88160R10 2024-09-30  0.221942 2024-09-30 -0.045025  0.168567   
3917814  88160R10 2024-10-31 -0.045025 2024-10-31  0.381469  0.112118   
3917815  88160R10 2024-11-29  0.381469 2024-11-29  0.170008  0.309653   

         Volatility        RSI      MACD  April  ...  February  January  July  \
2970518    6.008675  31.218276 -1.072760  

In [8]:
# 2.4: Generate price‐history features (momentum + volatility)
#
#   - 3M momentum: cumulative return over past 3 months (t-3 → t-1)
#   - 6M momentum: cumulative return over past 6 months
#   - 12M momentum: cumulative return over past 12 months
#   - 12M rolling volatility: std of monthly returns over past 12 months
#
def compute_momentum_and_vol(df):
    df = df.sort_values('date')
    # Rolling log(1+return), because cumulative product of (1 + ret) = exp(sum(log(1+ret)))
    df['log1p_ret'] = np.log1p(df['MthRet'])
    df['log1p_ret_shift1'] = df.groupby('CUSIP')['log1p_ret'].shift(1)
    df['cum12_1_log'] = df.groupby('CUSIP')['log1p_ret_shift1'].rolling(window=11).sum().reset_index(0,drop=True)
    df['mom_12_1'] = np.expm1(df['cum12_1_log'])
    df['cum3m_log'] = df.groupby('CUSIP')['log1p_ret'].rolling(window=3, min_periods=3).sum().reset_index(0,drop=True)
    df['cum6m_log'] = df.groupby('CUSIP')['log1p_ret'].rolling(window=6, min_periods=6).sum().reset_index(0,drop=True)
    df['cum12m_log'] = df.groupby('CUSIP')['log1p_ret'].rolling(window=12, min_periods=12).sum().reset_index(0,drop=True)
    df['momentum_3m'] = np.expm1(df['cum3m_log'])    # exp(sum)-1 => (1+r1)*(1+r2)*(1+r3) - 1
    df['momentum_6m'] = np.expm1(df['cum6m_log'])
    df['momentum_12m'] = np.expm1(df['cum12m_log'])
    df['volatility_12m'] = df.groupby('CUSIP')['MthRet'].rolling(window=12, min_periods=12).std().reset_index(0,drop=True)
    # Drop intermediate log columns
    return df.drop(columns=['log1p_ret','cum3m_log','cum6m_log','cum12m_log'])

df_crsp = compute_momentum_and_vol(df_crsp)

# 2.5: Trim CUSIP to 8 characters (for merging) and drop NA
df_crsp['cusip'] = df_crsp['CUSIP'].astype(str).str[:8]
df_crsp = df_crsp.dropna(subset=['cusip']).copy()

### CLEAN COMPFIRM

In [9]:
# 3.1: Keep only Industrial & Consolidated
df_comp = df_comp[
    (df_comp['consol'] == 'C')
].copy()

# 3.2: Trim & parse keys/dates
df_comp['cusip'] = df_comp['cusip'].astype(str).str[:8]
df_comp['datadate'] = pd.to_datetime(df_comp['datadate'])
df_comp = df_comp.dropna(subset=['cusip','datadate']).copy()

# 3.3: Build “effective_date” = datadate + 45 calendar days,
#      so that we only use Q data ~45 days after quarter‐end.
df_comp['effective_date'] = df_comp['datadate'] + pd.Timedelta(days=45)
df_comp = df_comp.set_index('effective_date').sort_index()

# 3.4: Select a larger fundamental set (YTD flows + per‐share metrics)
fundamental_cols = [
    'revty',    # Revenue YTD
    'saley',    # Sales YTD
    'capxy',    # CapEx YTD
    'oibdpy',   # EBITDA YTD
    'rdipay',   # R&D expense YTD
    'xsgay',    # SG&A expense YTD
    'txpdy',    # Tax provision YTD
    'epsfxy',   # Diluted EPS ex‐extra YTD
    'cshfdy',   # Diluted shares YTD (millions)
    'xoptepsy'  # Option expense per share YTD
]

df_comp_small = df_comp[['cusip'] + fundamental_cols].copy()

# 3.5: For each “cusip + quarter,” drop exact duplicates
df_comp_small = df_comp_small.reset_index().drop_duplicates(
    subset=['cusip','effective_date']
).set_index('effective_date').sort_index()

### MERGE

In [10]:
# 4.1: Set df_crsp index to “date”
df_crsp = df_crsp.set_index('date').sort_index()

# 4.2: Merge (for every month, get the most recent Compustat row ≤ that month’s date)
df_merged = pd.merge_asof(
    left = df_crsp.reset_index(),
    right = df_comp_small.reset_index(),
    left_on = 'date',
    right_on = 'effective_date',
    by = 'cusip',
    direction = 'backward',
    allow_exact_matches=True
).set_index('date')

# 4.3: Drop rows where any of our fundamentals are NA, since we can’t compute ratios otherwise
df_merged = df_merged.dropna(subset=fundamental_cols + ['y']).copy()


In [11]:
df = df_merged.copy()

# 5.1: Engineer simple ratios (safe‐guard divisions by zero)
df['EBIT_margin']      = df['oibdpy'] / df['saley'].replace({0: np.nan})
df['R&D_intensity']    = df['rdipay'] / df['saley'].replace({0: np.nan})
df['SGA_intensity']    = df['xsgay'] / df['saley'].replace({0: np.nan})
df['Tax_rate']         = df['txpdy']  / df['oibdpy'].replace({0: np.nan})
df['Capex_to_Revenue'] = df['capxy']  / df['revty'].replace({0: np.nan})

# 5.2: Compute QoQ growth rates on YTD fundamentals
for col in ['revty','oibdpy','rdipay','xsgay']:
    df[col + '_QoQ_growth'] = df.groupby('cusip')[col].pct_change()  # (This QY / Last QY) - 1

# 5.3: Optionally drop some raw dollar‐amount YTD columns if you want just ratios
#      (otherwise let the model pick scale vs. ratio)
# df = df.drop(columns=['revty','saley','capxy','oibdpy','rdipay','xsgay','txpdy','epsfxy','cshfdy','xoptepsy'])

### TRAINING

In [12]:
feature_columns = [
    # Price/technical:
    'momentum_3m', 'momentum_6m', 'momentum_12m', 'volatility_12m',

    # Basic YTD fundamentals (optional—tree can split on scale):
    'revty', 'saley', 'capxy', 'oibdpy', 'rdipay', 'xsgay', 'txpdy', 'epsfxy', 'cshfdy', 'xoptepsy',

    # Engineered ratios:
    'EBIT_margin', 'R&D_intensity', 'SGA_intensity', 'Tax_rate', 'Capex_to_Revenue',

    # QoQ growth rates:
    'revty_QoQ_growth', 'oibdpy_QoQ_growth', 'rdipay_QoQ_growth', 'xsgay_QoQ_growth',

    # Months
    'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December',

    # Indicators
    'EMA', 'Volatility', 'RSI', 'MACD'
]

# Drop any rows where engineered features are NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df = df.dropna(subset=feature_columns + ['y']).copy()


X = df[feature_columns]
y = df['y']

In [13]:
# Instead of a fixed 80/20 cutoff, build an expanding‐window cross‐validation
# but keep a final out‐of‐sample test set (last 20% of months).
n_obs = len(df)
cutpoint = int(n_obs * 0.8)

X_train = X.iloc[:cutpoint]
y_train = y.iloc[:cutpoint]

X_test  = X.iloc[cutpoint:]
y_test  = y.iloc[cutpoint:]

In [14]:
# from sklearn.ensemble import RandomForestRegressor

# pipe = Pipeline([
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler()),
#     ('reg', RandomForestRegressor(random_state=42))
# ])

# param_grid = {
#     'reg__n_estimators': [100, 200],
#     'reg__max_depth': [5, 7, 9],
#     'reg__max_features': ['sqrt', 'log2']
# }

pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',   StandardScaler()),   # scale ratio features so splits are easier
    ('clf',      RandomForestClassifier(random_state=42, class_weight='balanced'))
])

param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [5, 7, 9],
    'clf__max_features': ['sqrt', 'log2']
}


tscv = TimeSeriesSplit(n_splits=5)

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=tscv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
best_model = grid.best_estimator_


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'clf__max_depth': 7, 'clf__max_features': 'log2', 'clf__n_estimators': 100}


In [15]:
# 9.1: Prediction & Classification Report
y_pred = best_model.predict(X_test)
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred))

# 9.2: ROC AUC + Precision‐Recall AUC
y_proba = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)

precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)

print(f"Test ROC AUC:  {roc_auc:.4f}")
print(f"Test PR AUC:   {pr_auc:.4f}")

# 9.3: Display a confusion matrix if you like
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix (low values = better balance):\n", cm)

# ───────────
# 10. FEATURE IMPORTANCE AND NEXT STEPS
# ───────────

importances = best_model.named_steps['clf'].feature_importances_
feat_imp = pd.Series(importances, index=feature_columns).sort_values(ascending=False)
print("\nTop 10 Feature Importances:")
print(feat_imp.head(10))

# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# y_pred = best_model.predict(X_test)

# print("\nRegression Metrics on Test Set:")
# print(f"MAE:  {mean_absolute_error(y_test, y_pred):.4f}")
# print(f"MSE:  {mean_squared_error(y_test, y_pred):.4f}")
# print(f"R²:   {r2_score(y_test, y_pred):.4f}")



Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.40      0.46      0.43       344
           1       0.61      0.55      0.58       531

    accuracy                           0.52       875
   macro avg       0.51      0.51      0.50       875
weighted avg       0.53      0.52      0.52       875

Test ROC AUC:  0.5200
Test PR AUC:   0.6401
Confusion Matrix (low values = better balance):
 [[159 185]
 [238 293]]

Top 10 Feature Importances:
Capex_to_Revenue    0.046911
Volatility          0.046534
momentum_12m        0.046269
momentum_3m         0.042559
RSI                 0.042087
MACD                0.041529
momentum_6m         0.038611
volatility_12m      0.038029
epsfxy              0.037996
SGA_intensity       0.036842
dtype: float64


TRAINING 2

In [16]:
feature_columns = [
    # Price/technical:
    'momentum_3m', 'momentum_6m', 'mom_12_1', 'volatility_12m',

    # Basic YTD fundamentals (optional—tree can split on scale):
    'revty', 'saley', 'capxy', 'oibdpy', 'rdipay', 'xsgay', 'txpdy', 'epsfxy', 'cshfdy', 'xoptepsy',

    # Engineered ratios:
    'EBIT_margin', 'R&D_intensity', 'SGA_intensity', 'Tax_rate', 'Capex_to_Revenue',

    # QoQ growth rates:
    'revty_QoQ_growth', 'oibdpy_QoQ_growth', 'rdipay_QoQ_growth', 'xsgay_QoQ_growth',

    # Months
    'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December',

    # Indicators
    'EMA', 'Volatility', 'RSI', 'MACD'
]

# Drop any rows where engineered features are NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df = df.dropna(subset=feature_columns + ['y']).copy()


X = df[feature_columns]
y = df['y']

In [17]:
# Instead of a fixed 80/20 cutoff, build an expanding‐window cross‐validation
# but keep a final out‐of‐sample test set (last 20% of months).
n_obs = len(df)
cutpoint = int(n_obs * 0.8)

X_train = X.iloc[:cutpoint]
y_train = y.iloc[:cutpoint]

X_test  = X.iloc[cutpoint:]
y_test  = y.iloc[cutpoint:]

In [19]:
from xgboost import XGBClassifier

pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',   StandardScaler()),  
    ('clf',      XGBClassifier(
         objective='binary:logistic',
         eval_metric='auc',
         use_label_encoder=False,
         n_estimators=200,
         max_depth=5,
         learning_rate=0.05,
         random_state=42
    ))
])

param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 5, 7],
    'clf__learning_rate': [0.01, 0.05]
}

grid = GridSearchCV(pipe, param_grid, cv=TimeSeriesSplit(n_splits=5), scoring='roc_auc', n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)
print("XGBoost Best params:", grid.best_params_)
best_model = grid.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


XGBoost Best params: {'clf__learning_rate': 0.01, 'clf__max_depth': 3, 'clf__n_estimators': 100}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [20]:
# 9.1: Prediction & Classification Report
y_pred = best_model.predict(X_test)
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred))

# 9.2: ROC AUC + Precision‐Recall AUC
y_proba = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)

precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)

print(f"Test ROC AUC:  {roc_auc:.4f}")
print(f"Test PR AUC:   {pr_auc:.4f}")

# 9.3: Display a confusion matrix if you like
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix (low values = better balance):\n", cm)

# ───────────
# 10. FEATURE IMPORTANCE AND NEXT STEPS
# ───────────

importances = best_model.named_steps['clf'].feature_importances_
feat_imp = pd.Series(importances, index=feature_columns).sort_values(ascending=False)
print("\nTop 10 Feature Importances:")
print(feat_imp.head(10))


Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.43      0.19      0.27       344
           1       0.61      0.83      0.71       531

    accuracy                           0.58       875
   macro avg       0.52      0.51      0.49       875
weighted avg       0.54      0.58      0.53       875

Test ROC AUC:  0.5138
Test PR AUC:   0.6361
Confusion Matrix (low values = better balance):
 [[ 66 278]
 [ 88 443]]

Top 10 Feature Importances:
xoptepsy             0.069767
May                  0.067024
October              0.051338
April                0.049300
June                 0.048363
momentum_6m          0.047852
MACD                 0.044416
January              0.042819
oibdpy_QoQ_growth    0.042389
March                0.042241
dtype: float32


DEEP LEARNING PART 

In [24]:
# ───────────
# 11. Enhanced LSTM (PyTorch)
# ───────────

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, precision_recall_curve, auc, accuracy_score
)

# 11.1: Scale features
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[feature_columns] = scaler.fit_transform(df[feature_columns])

# 11.2: Group key
group_key = 'PERMNO' if 'PERMNO' in df_scaled.columns else 'CUSIP'

# 11.3: Build sequences with 24-month window
WINDOW = 24
Xs, ys = [], []
for key_val, grp in df_scaled.groupby(group_key):
    grp = grp.sort_index()
    arr = grp[feature_columns].values
    labels = grp['y'].values
    for i in range(WINDOW, len(arr)):
        Xs.append(arr[i-WINDOW:i])
        ys.append(labels[i])

X = np.stack(Xs).astype(np.float32)
y = np.array(ys).astype(np.float32)

# 11.4: Chronological split
n = len(y)
cut = int(n * 0.8)
X_train_all, X_test = X[:cut], X[cut:]
y_train_all, y_test = y[:cut], y[cut:]
val_cut = int(len(X_train_all) * 0.8)
X_train, X_val = X_train_all[:val_cut], X_train_all[val_cut:]
y_train, y_val = y_train_all[:val_cut], y_train_all[val_cut:]

# 11.5: DataLoaders
batch_size = 128
to_tensor = lambda a: torch.from_numpy(a)
train_ds = TensorDataset(to_tensor(X_train), to_tensor(y_train))
val_ds   = TensorDataset(to_tensor(X_val),   to_tensor(y_val))
test_ds  = TensorDataset(to_tensor(X_test),  to_tensor(y_test))
dl_train = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
dl_val   = DataLoader(val_ds,   batch_size=batch_size)
dl_test  = DataLoader(test_ds,  batch_size=batch_size)

# 11.6: Class weighting & label smoothing
pos_weight = torch.tensor((y_train == 0).sum() / (y_train == 1).sum())
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# 11.7: Enhanced LSTM model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class EnhancedLSTM(nn.Module):
    def __init__(self, in_dim, hid_dim=256, num_layers=2, dropout=0.3):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=in_dim,
            hidden_size=hid_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True
        )
        self.norm = nn.LayerNorm(hid_dim*2)
        self.head = nn.Sequential(
            nn.Linear(hid_dim*2, hid_dim//2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hid_dim//2, 1)
        )

    def forward(self, x):
        out, _ = self.lstm(x)
        last = out[:, -1, :]          # (B, hid_dim*2)
        normed = self.norm(last)
        return self.head(normed).squeeze(1)

model = EnhancedLSTM(in_dim=len(feature_columns)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=4)
clip_grad = 1.0

# 11.8: Training loop
best_auc = 0.0
patience, trials = 12, 0

for epoch in range(1, 101):
    model.train()
    losses = []
    for xb, yb in dl_train:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
        optimizer.step()
        losses.append(loss.item())
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in dl_val:
            xb = xb.to(device)
            probs = torch.sigmoid(model(xb)).cpu().numpy()
            preds.extend(probs)
            trues.extend(yb.numpy())
    auc_val = roc_auc_score(trues, preds)
    scheduler.step(auc_val)
    print(f"Epoch {epoch:02d} | Loss {np.mean(losses):.4f} | Val AUC {auc_val:.4f}")
    if auc_val > best_auc:
        best_auc, trials = auc_val, 0
        torch.save(model.state_dict(), 'best_lstm.pt')
    else:
        trials += 1
        if trials >= patience:
            print("Early stopping.")
            break

# 11.9: Test evaluation + full report
model.load_state_dict(torch.load('best_lstm.pt'))
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in dl_test:
        xb = xb.to(device)
        all_preds.extend(torch.sigmoid(model(xb)).cpu().numpy())
        all_labels.extend(yb.numpy())

# Binarize at 0.5
yhat = np.array(all_preds) > 0.5

print("\nClassification Report on Test Set:")
print(classification_report(all_labels, yhat, digits=4))
print("Confusion Matrix:")
print(confusion_matrix(all_labels, yhat))

roc = roc_auc_score(all_labels, all_preds)
precision, recall, _ = precision_recall_curve(all_labels, all_preds)
pr = auc(recall, precision)
acc = accuracy_score(all_labels, yhat)

print(f"\nTest ROC AUC:  {roc:.4f}")
print(f"Test PR AUC:   {pr:.4f}")
print(f"Test Accuracy: {acc:.4f}")


Epoch 01 | Loss 0.6839 | Val AUC 0.4226
Epoch 02 | Loss 0.6642 | Val AUC 0.4065
Epoch 03 | Loss 0.6548 | Val AUC 0.4071
Epoch 04 | Loss 0.6459 | Val AUC 0.4131
Epoch 05 | Loss 0.6326 | Val AUC 0.4214
Epoch 06 | Loss 0.6315 | Val AUC 0.4208
Epoch 07 | Loss 0.6214 | Val AUC 0.4202
Epoch 08 | Loss 0.6313 | Val AUC 0.4238
Epoch 09 | Loss 0.6321 | Val AUC 0.4220
Epoch 10 | Loss 0.6258 | Val AUC 0.4250
Epoch 11 | Loss 0.6130 | Val AUC 0.4268
Epoch 12 | Loss 0.6126 | Val AUC 0.4286
Epoch 13 | Loss 0.6111 | Val AUC 0.4292
Epoch 14 | Loss 0.6230 | Val AUC 0.4310
Epoch 15 | Loss 0.6214 | Val AUC 0.4298
Epoch 16 | Loss 0.6225 | Val AUC 0.4310
Epoch 17 | Loss 0.6152 | Val AUC 0.4286
Epoch 18 | Loss 0.6114 | Val AUC 0.4286
Epoch 19 | Loss 0.6050 | Val AUC 0.4268
Epoch 20 | Loss 0.6235 | Val AUC 0.4268
Epoch 21 | Loss 0.6089 | Val AUC 0.4268
Epoch 22 | Loss 0.6029 | Val AUC 0.4268
Epoch 23 | Loss 0.6147 | Val AUC 0.4268
Epoch 24 | Loss 0.6182 | Val AUC 0.4256
Epoch 25 | Loss 0.6091 | Val AUC 0.4256


In [26]:
# ───────────
# 11. LSTM on M1 (MPS) — lightweight version
# ───────────

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, precision_recall_curve, auc, accuracy_score
)

# 1) Scale features
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[feature_columns] = scaler.fit_transform(df[feature_columns])

# 2) Choose key & build 6-month sequences
group_key = 'PERMNO' if 'PERMNO' in df_scaled.columns else 'CUSIP'
WINDOW = 6

Xs, ys = [], []
for key_val, grp in df_scaled.groupby(group_key):
    arr = grp.sort_index()[feature_columns].values
    lbl = grp['y'].values
    for i in range(WINDOW, len(arr)):
        Xs.append(arr[i-WINDOW:i])
        ys.append(lbl[i])

X = np.stack(Xs).astype(np.float32)  # (n_samples, 6, n_features)
y = np.array(ys).astype(np.float32)

# 3) Chronological split 80/20 train/test, then 80/20 train/val
n = len(y)
cp = int(n * 0.8)
X_train_all, X_test = X[:cp], X[cp:]
y_train_all, y_test = y[:cp], y[cp:]
val_cut = int(len(X_train_all) * 0.8)
X_train, X_val = X_train_all[:val_cut], X_train_all[val_cut:]
y_train, y_val = y_train_all[:val_cut], y_train_all[val_cut:]

# 4) DataLoaders (batch_size=32)
bs = 32
train_ds = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_ds   = TensorDataset(torch.from_numpy(X_val),   torch.from_numpy(y_val))
test_ds  = TensorDataset(torch.from_numpy(X_test),  torch.from_numpy(y_test))
dl_train = DataLoader(train_ds, batch_size=bs, shuffle=True)
dl_val   = DataLoader(val_ds,   batch_size=bs)
dl_test  = DataLoader(test_ds,  batch_size=bs)

# 5) Device: MPS if available, else CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# 6) Small LSTM model
class SmallLSTM(nn.Module):
    def __init__(self, in_dim, hid_dim=32):
        super().__init__()
        self.lstm = nn.LSTM(in_dim, hid_dim, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hid_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )
    def forward(self, x):
        out, _ = self.lstm(x)
        return self.fc(out[:, -1, :]).squeeze(1)

model = SmallLSTM(len(feature_columns)).to(device)

# 7) Loss + optimizer (class-weighted)
pos_w = (y_train == 0).sum() / (y_train == 1).sum()
criterion = nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor(pos_w, device=device, dtype=torch.float32)
)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 8) Train 15 epochs
for epoch in range(1, 16):
    model.train()
    total_loss = 0.0
    for xb, yb in dl_train:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    avg_loss = total_loss / len(dl_train.dataset)

    # Validation AUC
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in dl_val:
            xb = xb.to(device)
            probs = torch.sigmoid(model(xb)).cpu().numpy()
            preds.extend(probs)
            trues.extend(yb.numpy())
    val_auc = roc_auc_score(trues, preds)
    print(f"Epoch {epoch:02d} | Train Loss: {avg_loss:.4f} | Val AUC: {val_auc:.4f}")

# 9) Final evaluation
model.eval()
test_preds, test_trues = [], []
with torch.no_grad():
    for xb, yb in dl_test:
        xb = xb.to(device)
        test_preds.extend(torch.sigmoid(model(xb)).cpu().numpy())
        test_trues.extend(yb.numpy())

y_hat = (np.array(test_preds) > 0.5).astype(int)

print("\nClassification Report on Test Set:")
print(classification_report(test_trues, y_hat, digits=4))
print("Confusion Matrix (low values = better balance):")
print(confusion_matrix(test_trues, y_hat))

roc = roc_auc_score(test_trues, test_preds)
precision, recall, _ = precision_recall_curve(test_trues, test_preds)
pr_auc = auc(recall, precision)
acc = accuracy_score(test_trues, y_hat)

print(f"\nTest ROC AUC:  {roc:.4f}")
print(f"Test PR AUC:   {pr_auc:.4f}")
print(f"Test Accuracy: {acc:.4f}")


Epoch 01 | Train Loss: 0.5936 | Val AUC: 0.4991
Epoch 02 | Train Loss: 0.5894 | Val AUC: 0.5505
Epoch 03 | Train Loss: 0.5867 | Val AUC: 0.5399
Epoch 04 | Train Loss: 0.5827 | Val AUC: 0.5468
Epoch 05 | Train Loss: 0.5786 | Val AUC: 0.5569
Epoch 06 | Train Loss: 0.5749 | Val AUC: 0.5527
Epoch 07 | Train Loss: 0.5690 | Val AUC: 0.5544
Epoch 08 | Train Loss: 0.5611 | Val AUC: 0.5608
Epoch 09 | Train Loss: 0.5533 | Val AUC: 0.5710
Epoch 10 | Train Loss: 0.5424 | Val AUC: 0.5704
Epoch 11 | Train Loss: 0.5309 | Val AUC: 0.5644
Epoch 12 | Train Loss: 0.5195 | Val AUC: 0.5739
Epoch 13 | Train Loss: 0.5091 | Val AUC: 0.5845
Epoch 14 | Train Loss: 0.5000 | Val AUC: 0.5791
Epoch 15 | Train Loss: 0.4846 | Val AUC: 0.5761

Classification Report on Test Set:
              precision    recall  f1-score   support

         0.0     0.5271    0.5120    0.5194       209
         1.0     0.6151    0.6293    0.6221       259

    accuracy                         0.5769       468
   macro avg     0.5711   

In [43]:
# 11. LSTM on M1 (MPS) — large version

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, precision_recall_curve, auc, accuracy_score
)

# 1) Scale features
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[feature_columns] = scaler.fit_transform(df[feature_columns])

# 2) Build 6-month sequences
group_key = 'PERMNO' if 'PERMNO' in df_scaled.columns else 'CUSIP'
WINDOW = 6

Xs, ys = [], []
for _, grp in df_scaled.groupby(group_key):
    arr = grp.sort_index()[feature_columns].values
    lbl = grp['y'].values
    for i in range(WINDOW, len(arr)):
        Xs.append(arr[i-WINDOW:i])
        ys.append(lbl[i])

X = np.stack(Xs).astype(np.float32)
y = np.array(ys).astype(np.float32)

# 3) Chronological split 80/20 train/test, then 80/20 train/val
n = len(y)
cp = int(n * 0.8)
X_train_all, X_test = X[:cp], X[cp:]
y_train_all, y_test = y[:cp], y[cp:]
val_cut = int(len(X_train_all) * 0.8)
X_train, X_val = X_train_all[:val_cut], X_train_all[val_cut:]
y_train, y_val = y_train_all[:val_cut], y_train_all[val_cut:]

# 4) DataLoaders (batch_size=32)
bs = 32
train_ds = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_ds   = TensorDataset(torch.from_numpy(X_val),   torch.from_numpy(y_val))
test_ds  = TensorDataset(torch.from_numpy(X_test),  torch.from_numpy(y_test))
dl_train = DataLoader(train_ds, batch_size=bs, shuffle=True)
dl_val   = DataLoader(val_ds,   batch_size=bs)
dl_test  = DataLoader(test_ds,  batch_size=bs)

# 5) Device: MPS if available, else CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# 6) Large LSTM model
class LargeLSTM(nn.Module):
    def __init__(self, in_dim, hid_dim=128, num_layers=2, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=in_dim,
            hidden_size=hid_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True
        )
        self.fc = nn.Sequential(
            nn.Linear(hid_dim*2, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        out, _ = self.lstm(x)
        last = out[:, -1, :]
        return self.fc(last).squeeze(1)

model = LargeLSTM(len(feature_columns)).to(device)

# 7) Loss + optimizer (class-weighted)
pos_w = (y_train == 0).sum() / (y_train == 1).sum()
criterion = nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor(pos_w, device=device, dtype=torch.float32)
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 8) Train 15 epochs
for epoch in range(1, 16):
    model.train()
    total_loss = 0.0
    for xb, yb in dl_train:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    avg_loss = total_loss / len(dl_train.dataset)

    # Validation AUC
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in dl_val:
            xb = xb.to(device)
            probs = torch.sigmoid(model(xb)).cpu().numpy()
            preds.extend(probs)
            trues.extend(yb.numpy())
    val_auc = roc_auc_score(trues, preds)
    print(f"Epoch {epoch:02d} | Train Loss: {avg_loss:.4f} | Val AUC: {val_auc:.4f}")

# 9) Final evaluation
model.eval()
test_preds, test_trues = [], []
with torch.no_grad():
    for xb, yb in dl_test:
        xb = xb.to(device)
        test_preds.extend(torch.sigmoid(model(xb)).cpu().numpy())
        test_trues.extend(yb.numpy())

y_hat = (np.array(test_preds) > 0.5).astype(int)

print("\nClassification Report on Test Set:")
print(classification_report(test_trues, y_hat, digits=4))
print("Confusion Matrix (low values = better balance):")
print(confusion_matrix(test_trues, y_hat))

roc = roc_auc_score(test_trues, test_preds)
precision, recall, _ = precision_recall_curve(test_trues, test_preds)
pr_auc = auc(recall, precision)
acc = accuracy_score(test_trues, y_hat)

print(f"\nTest ROC AUC:  {roc:.4f}")
print(f"Test PR AUC:   {pr_auc:.4f}")
print(f"Test Accuracy: {acc:.4f}")


Epoch 01 | Train Loss: 0.5921 | Val AUC: 0.5333
Epoch 02 | Train Loss: 0.5862 | Val AUC: 0.5131
Epoch 03 | Train Loss: 0.5786 | Val AUC: 0.5611
Epoch 04 | Train Loss: 0.5694 | Val AUC: 0.5676
Epoch 05 | Train Loss: 0.5599 | Val AUC: 0.6160
Epoch 06 | Train Loss: 0.5518 | Val AUC: 0.5645
Epoch 07 | Train Loss: 0.5297 | Val AUC: 0.6170
Epoch 08 | Train Loss: 0.5077 | Val AUC: 0.5730
Epoch 09 | Train Loss: 0.4834 | Val AUC: 0.5968
Epoch 10 | Train Loss: 0.4509 | Val AUC: 0.5877
Epoch 11 | Train Loss: 0.4129 | Val AUC: 0.5959
Epoch 12 | Train Loss: 0.3653 | Val AUC: 0.5782
Epoch 13 | Train Loss: 0.3355 | Val AUC: 0.5860
Epoch 14 | Train Loss: 0.2971 | Val AUC: 0.5452
Epoch 15 | Train Loss: 0.2622 | Val AUC: 0.5521

Classification Report on Test Set:
              precision    recall  f1-score   support

         0.0     0.5200    0.4976    0.5086       209
         1.0     0.6082    0.6293    0.6186       259

    accuracy                         0.5705       468
   macro avg     0.5641   

In [42]:
# ───────────
# 11. Transformer on M1 (MPS)
# ───────────

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, precision_recall_curve, auc, accuracy_score
)

# 1) Scale features
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[feature_columns] = scaler.fit_transform(df[feature_columns])

# 2) Build 6-month sequences
group_key = 'PERMNO' if 'PERMNO' in df_scaled.columns else 'CUSIP'
WINDOW = 6
Xs, ys = [], []
for _, grp in df_scaled.groupby(group_key):
    arr = grp.sort_index()[feature_columns].values
    lbl = grp['y'].values
    for i in range(WINDOW, len(arr)):
        Xs.append(arr[i-WINDOW:i])
        ys.append(lbl[i])
X = np.stack(Xs).astype(np.float32)  # (n_samples, WINDOW, n_features)
y = np.array(ys).astype(np.float32)

# 3) Chronological split 80/20 train/test, then 80/20 train/val
n = len(y)
cp = int(n * 0.8)
X_train_all, X_test = X[:cp], X[cp:]
y_train_all, y_test = y[:cp], y[cp:]
val_cut = int(len(X_train_all) * 0.8)
X_train, X_val = X_train_all[:val_cut], X_train_all[val_cut:]
y_train, y_val = y_train_all[:val_cut], y_train_all[val_cut:]

# 4) DataLoaders
bs = 32
train_ds = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_ds   = TensorDataset(torch.from_numpy(X_val),   torch.from_numpy(y_val))
test_ds  = TensorDataset(torch.from_numpy(X_test),  torch.from_numpy(y_test))
dl_train = DataLoader(train_ds, batch_size=bs, shuffle=True)
dl_val   = DataLoader(val_ds,   batch_size=bs)
dl_test  = DataLoader(test_ds,  batch_size=bs)

# 5) Device selection (MPS or CPU)
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# 6) Transformer-based classifier
class StockTransformer(nn.Module):
    def __init__(self, in_dim, d_model=128, nhead=4, num_layers=2, dim_ff=256, dropout=0.1):
        super().__init__()
        self.input_proj = nn.Linear(in_dim, d_model)
        self.pos_embed = nn.Parameter(torch.randn(1, WINDOW, d_model))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.classifier = nn.Sequential(
            nn.Linear(d_model, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        # x: (batch, seq_len, in_dim)
        x = self.input_proj(x) + self.pos_embed[:, :x.size(1), :]
        out = self.transformer(x)            # (batch, seq_len, d_model)
        return self.classifier(out[:, -1, :]).squeeze(1)

model = StockTransformer(len(feature_columns)).to(device)

# 7) Loss & optimizer (with class weighting)
pos_w = (y_train == 0).sum() / (y_train == 1).sum()
criterion = nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor(pos_w, device=device, dtype=torch.float32)
)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, factor=0.5)

# 8) Train with early stopping on Val AUC
best_val_auc = 0.0
patience, trials = 5, 0

for epoch in range(1, 51):
    model.train()
    total_loss = 0.0
    for xb, yb in dl_train:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    avg_loss = total_loss / len(dl_train.dataset)

    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in dl_val:
            xb = xb.to(device)
            probs = torch.sigmoid(model(xb)).cpu().numpy()
            preds.extend(probs)
            trues.extend(yb.numpy())
    val_auc = roc_auc_score(trues, preds)
    scheduler.step(val_auc)
    print(f"Epoch {epoch:02d} | Train Loss: {avg_loss:.4f} | Val AUC: {val_auc:.4f}")

    if val_auc > best_val_auc:
        best_val_auc, trials = val_auc, 0
        torch.save(model.state_dict(), 'best_transformer.pth')
    else:
        trials += 1
        if trials >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

# 9) Test set evaluation
model.load_state_dict(torch.load('best_transformer.pth'))
model.eval()
test_preds, test_trues = [], []
with torch.no_grad():
    for xb, yb in dl_test:
        xb = xb.to(device)
        test_preds.extend(torch.sigmoid(model(xb)).cpu().numpy())
        test_trues.extend(yb.numpy())

y_hat = (np.array(test_preds) > 0.5).astype(int)

print("\nClassification Report on Test Set:")
print(classification_report(test_trues, y_hat, digits=4))
print("Confusion Matrix (low values = better balance):")
print(confusion_matrix(test_trues, y_hat))

roc = roc_auc_score(test_trues, test_preds)
prec, rec, _ = precision_recall_curve(test_trues, test_preds)
print(f"\nTest ROC AUC:  {roc:.4f}")
print(f"Test PR AUC:   {auc(rec, prec):.4f}")
print(f"Test Accuracy: {accuracy_score(test_trues, y_hat):.4f}")


Epoch 01 | Train Loss: 0.5960 | Val AUC: 0.5742
Epoch 02 | Train Loss: 0.5952 | Val AUC: 0.5517
Epoch 03 | Train Loss: 0.5922 | Val AUC: 0.5405
Epoch 04 | Train Loss: 0.5898 | Val AUC: 0.5289
Epoch 05 | Train Loss: 0.5879 | Val AUC: 0.5586
Epoch 06 | Train Loss: 0.5839 | Val AUC: 0.5559
Early stopping at epoch 6

Classification Report on Test Set:
              precision    recall  f1-score   support

         0.0     0.5257    0.4402    0.4792       209
         1.0     0.6007    0.6795    0.6377       259

    accuracy                         0.5726       468
   macro avg     0.5632    0.5599    0.5584       468
weighted avg     0.5672    0.5726    0.5669       468

Confusion Matrix (low values = better balance):
[[ 92 117]
 [ 83 176]]

Test ROC AUC:  0.6066
Test PR AUC:   0.6635
Test Accuracy: 0.5726


In [45]:
# ───────────
# 11. InceptionTime on M1 (MPS)
# ───────────

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, precision_recall_curve, auc, accuracy_score
)

# 1) Scale features
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[feature_columns] = scaler.fit_transform(df[feature_columns])

# 2) Build 6-month sequences
group_key = 'PERMNO' if 'PERMNO' in df_scaled.columns else 'CUSIP'
WINDOW = 6
Xs, ys = [], []
for _, grp in df_scaled.groupby(group_key):
    arr = grp.sort_index()[feature_columns].values
    lbl = grp['y'].values
    for i in range(WINDOW, len(arr)):
        Xs.append(arr[i-WINDOW:i])
        ys.append(lbl[i])
X = np.stack(Xs).astype(np.float32)   # (n_samples, WINDOW, n_features)
y = np.array(ys).astype(np.float32)

# 3) Chronological split 80/20 train/test, then 80/20 train/val
n = len(y)
cp = int(n * 0.8)
X_train_all, X_test = X[:cp], X[cp:]
y_train_all, y_test = y[:cp], y[cp:]
val_cut = int(len(X_train_all) * 0.8)
X_train, X_val = X_train_all[:val_cut], X_train_all[val_cut:]
y_train, y_val = y_train_all[:val_cut], y_train_all[val_cut:]

# 4) DataLoaders
bs = 32
train_ds = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_ds   = TensorDataset(torch.from_numpy(X_val),   torch.from_numpy(y_val))
test_ds  = TensorDataset(torch.from_numpy(X_test),  torch.from_numpy(y_test))
dl_train = DataLoader(train_ds, batch_size=bs, shuffle=True)
dl_val   = DataLoader(val_ds,   batch_size=bs)
dl_test  = DataLoader(test_ds,  batch_size=bs)

# 5) Device
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# 6) InceptionTime Modules
class InceptionModule(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_sizes=[10,20,40], 
                 bottleneck_channels=32, use_residual=False, dropout=0.2):
        super().__init__()
        self.use_residual = use_residual
        # 1x1 Bottleneck
        self.bottleneck = nn.Conv1d(in_channels, bottleneck_channels, kernel_size=1) \
                          if in_channels > 1 else nn.Identity()
        # Convolutions with different kernel sizes
        self.conv_branches = nn.ModuleList([
            nn.Conv1d(bottleneck_channels, out_channels, kernel_size=k, padding=k//2)
            for k in kernel_sizes
        ])
        # MaxPool branch
        self.maxpool_branch = nn.Sequential(
            nn.MaxPool1d(kernel_size=3, stride=1, padding=1),
            nn.Conv1d(in_channels, out_channels, kernel_size=1)
        )
        self.batchnorm = nn.BatchNorm1d(out_channels * (len(kernel_sizes) + 1))
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        if self.use_residual:
            self.residual = nn.Sequential(
                nn.Conv1d(in_channels, out_channels * (len(kernel_sizes) + 1), kernel_size=1),
                nn.BatchNorm1d(out_channels * (len(kernel_sizes) + 1))
            )

    def forward(self, x):
        # x: (batch, seq_len, features) -> for conv: (batch, features, seq_len)
        x_in = x.transpose(1, 2)
        if hasattr(self, 'bottleneck') and not isinstance(self.bottleneck, nn.Identity):
            x_b = self.bottleneck(x_in)
        else:
            x_b = x_in
        branches = [conv(x_b) for conv in self.conv_branches]
        branches.append(self.maxpool_branch(x_in))
        x_cat = torch.cat(branches, dim=1)
        x_cat = self.batchnorm(x_cat)
        if self.use_residual:
            x_res = self.residual(x_in)
            x_cat = x_cat + x_res
        x_cat = self.activation(x_cat)
        x_cat = self.dropout(x_cat)
        return x_cat.transpose(1, 2)

class InceptionTime(nn.Module):
    def __init__(self, in_dim, num_blocks=3, out_channels=32, 
                 kernel_sizes=[10,20,40], bottleneck_channels=32, 
                 use_residual=True, dropout=0.2):
        super().__init__()
        blocks = []
        channels = in_dim
        for _ in range(num_blocks):
            blocks.append(InceptionModule(
                in_channels=channels,
                out_channels=out_channels,
                kernel_sizes=kernel_sizes,
                bottleneck_channels=bottleneck_channels,
                use_residual=use_residual,
                dropout=dropout
            ))
            channels = out_channels * (len(kernel_sizes) + 1)
        self.network = nn.Sequential(*blocks)
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.classifier = nn.Linear(channels, 1)

    def forward(self, x):
        # x: (batch, seq_len, features)
        out = self.network(x)
        # out: (batch, seq_len, channels)
        out = out.transpose(1, 2)            # -> (batch, channels, seq_len)
        pooled = self.global_pool(out).squeeze(2)
        return self.classifier(pooled).squeeze(1)

model = InceptionTime(
    in_dim=len(feature_columns),
    num_blocks=3,
    out_channels=32,
    kernel_sizes=[3,5,7],
    bottleneck_channels=32,
    use_residual=True,
    dropout=0.2
).to(device)

# 7) Loss & optimizer (class-weighted)
pos_w = (y_train == 0).sum() / (y_train == 1).sum()
criterion = nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor(pos_w, device=device, dtype=torch.float32)
)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, factor=0.5)

# 8) Train with early stopping on Val AUC
best_val_auc = 0.0
patience, trials = 5, 0

for epoch in range(1, 51):
    model.train()
    total_loss = 0.0
    for xb, yb in dl_train:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    avg_loss = total_loss / len(dl_train.dataset)

    # validation
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in dl_val:
            xb = xb.to(device)
            probs = torch.sigmoid(model(xb)).cpu().numpy()
            preds.extend(probs)
            trues.extend(yb.numpy())
    val_auc = roc_auc_score(trues, preds)
    scheduler.step(val_auc)

    print(f"Epoch {epoch:02d} | Train Loss: {avg_loss:.4f} | Val AUC: {val_auc:.4f}")

    if val_auc > best_val_auc:
        best_val_auc, trials = val_auc, 0
        torch.save(model.state_dict(), 'best_inception.pth')
    else:
        trials += 1
        if trials >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

# 9) Test set evaluation
model.load_state_dict(torch.load('best_inception.pth'))
model.eval()
test_preds, test_trues = [], []
with torch.no_grad():
    for xb, yb in dl_test:
        xb = xb.to(device)
        test_preds.extend(torch.sigmoid(model(xb)).cpu().numpy())
        test_trues.extend(yb.numpy())

y_hat = (np.array(test_preds) > 0.5).astype(int)

print("\nClassification Report on Test Set:")
print(classification_report(test_trues, y_hat, digits=4))
print("Confusion Matrix (low values = better balance):")
print(confusion_matrix(test_trues, y_hat))

roc = roc_auc_score(test_trues, test_preds)
prec, rec, _ = precision_recall_curve(test_trues, test_preds)
print(f"\nTest ROC AUC:  {roc:.4f}")
print(f"Test PR AUC:   {auc(rec, prec):.4f}")
print(f"Test Accuracy: {accuracy_score(test_trues, y_hat):.4f}")


Epoch 01 | Train Loss: 0.6048 | Val AUC: 0.5436
Epoch 02 | Train Loss: 0.5881 | Val AUC: 0.5549
Epoch 03 | Train Loss: 0.5838 | Val AUC: 0.5428
Epoch 04 | Train Loss: 0.5742 | Val AUC: 0.5495
Epoch 05 | Train Loss: 0.5639 | Val AUC: 0.5139
Epoch 06 | Train Loss: 0.5574 | Val AUC: 0.5351
Epoch 07 | Train Loss: 0.5401 | Val AUC: 0.5579
Epoch 08 | Train Loss: 0.5344 | Val AUC: 0.5609
Epoch 09 | Train Loss: 0.5138 | Val AUC: 0.5491
Epoch 10 | Train Loss: 0.5203 | Val AUC: 0.5471
Epoch 11 | Train Loss: 0.5092 | Val AUC: 0.5371
Epoch 12 | Train Loss: 0.5030 | Val AUC: 0.5414
Epoch 13 | Train Loss: 0.4758 | Val AUC: 0.5353
Early stopping at epoch 13

Classification Report on Test Set:
              precision    recall  f1-score   support

         0.0     0.5568    0.4689    0.5091       209
         1.0     0.6199    0.6988    0.6570       259

    accuracy                         0.5962       468
   macro avg     0.5883    0.5839    0.5830       468
weighted avg     0.5917    0.5962    0.59