In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
import datetime
from tqdm import tqdm
import time

In [6]:
firm = pd.read_csv("../data/raw/mma_sample_v2.csv")
firm.head(2)

Unnamed: 0,date,ret_eom,permno,shrcd,exchcd,mspread,year,month,size_port,rf,...,betadown_252d,bidaskhl_21d,corr_1260d,betabab_1260d,rmax5_rvol_21d,age,qmj,qmj_prof,qmj_growth,qmj_safety
0,20000131,20000131,10078,11,3,0.017178,2000,1,B,0.0041,...,2.219037,0.012635,0.504688,1.58154,1.373224,180,1.700939,1.711756,1.47641,1.087626
1,20000131,20000131,10104,11,3,0.01972,2000,1,B,0.0041,...,1.43733,0.016634,0.473872,1.941648,2.058353,180,0.82893,1.71767,-0.603531,0.608215


In [2]:
# === Step 1: Load and merge data ===
# Load firm-level data
firm = pd.read_csv("../data/raw/mma_sample_v2.csv", parse_dates=["date", "ret_eom"])

# Convert ret_eom (return date) to yyyymm for alignment
firm["yyyymm"] = firm["ret_eom"].dt.to_period("M").dt.to_timestamp()
firm["year"] = firm["yyyymm"].dt.year
firm["month"] = firm["yyyymm"].dt.month

# Load macroeconomic variables and surprise predictions
macro = pd.read_csv("../data/macro/macro_monthly.csv")
macro["yyyymm"] = pd.to_datetime(macro["yyyymm"].astype(str), format="%Y%m")

surprise = pd.read_csv("../results/surprise_predictions.csv", parse_dates=["date"])
surprise["yyyymm"] = surprise["date"].dt.to_period("M").dt.to_timestamp()
surprise = surprise.rename(columns={"predicted_surprise": "surprise_pred"})

# Load list of stock characteristic features
factor_list = pd.read_csv("../data/raw/factor_char_list.csv")["variable"].tolist()

In [3]:
# Filter firm data to include only relevant features
firm = firm[["permno", "yyyymm", "stock_exret", "year", "month"] + factor_list]

# Drop observations with missing target value
firm = firm[firm["stock_exret"].notna()].copy()

# Merge predicted surprise
firm = firm.merge(surprise, on=["permno", "yyyymm"], how="left")

# Merge macro variables
df_full = firm.merge(macro, on="yyyymm", how="left")

In [18]:
df_full.head(2)

Unnamed: 0,permno,yyyymm,stock_exret,age,aliq_at,aliq_mat,ami_126d,at_be,at_gr1,at_me,...,date,pred_surprise,dp,ep,bm,ntis,svar,dfy,tms,tbl
0,10078,2000-01-01,0.010428,180,1.1216,0.182049,5.1e-05,1.749945,0.474393,0.069654,...,NaT,,-4.423938,-3.346471,0.154654,0.025359,0.005206,0.0055,0.0134,0.0532
1,10104,2000-01-01,-0.112577,180,0.9079,0.117486,8.3e-05,1.798574,0.156012,0.04124,...,NaT,,-4.423938,-3.346471,0.154654,0.025359,0.005206,0.0055,0.0134,0.0532


In [4]:
# === Step 2: Create interaction terms in batches (excluding surprise_pred and date) ===
macro_vars = ['dp', 'ep', 'ntis', 'bm', 'svar', 'dfy', 'tms', 'tbl']
interaction_terms = []
batch_size = 20  # adjust as needed depending on memory capacity

for i in range(0, len(factor_list), batch_size):
    batch_vars = factor_list[i:i + batch_size]
    interaction_data = {}

    for macro_var in macro_vars:
        for firm_var in batch_vars:
            col_name = f"{firm_var}_{macro_var}"
            interaction_data[col_name] = df_full[firm_var] * df_full[macro_var]
            interaction_terms.append(col_name)

    interaction_df = pd.DataFrame(interaction_data)
    df_full = pd.concat([df_full.reset_index(drop=True), interaction_df.reset_index(drop=True)], axis=1)
    del interaction_df, interaction_data  # free memory after each batch

In [5]:
# Add surprise_pred as a direct feature (no interaction)
if "surprise_pred" not in df_full.columns:
    df_full["surprise_pred"] = np.nan

# Final feature list for modeling
all_features = factor_list + interaction_terms + ["surprise_pred"]

In [6]:
# === Step 3: Monthly standardization with median imputation ===
# Ensure all expected features exist before standardization
for col in all_features:
    if col not in df_full.columns:
        df_full[col] = np.nan

data = pd.DataFrame()
monthly = df_full.groupby("yyyymm")
for date, group in monthly:
    group = group.copy()
    for var in all_features:
        median_val = group[var].median(skipna=True)
        group[var] = group[var].fillna(median_val)
        group[var] = group[var].rank(method="dense") - 1
        vmax = group[var].max()
        group[var] = (group[var] / vmax) * 2 - 1 if vmax > 0 else 0
    data = pd.concat([data, group], ignore_index=True)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

In [7]:
# === Chunked Standardization Function ===
def chunk_standardize(df_train, df_val, df_test, columns, chunk_size=100):
    for i in range(0, len(columns), chunk_size):
        subset = columns[i:i+chunk_size]
        scaler = StandardScaler().fit(df_train[subset])
        df_train.loc[:, subset] = scaler.transform(df_train[subset])
        df_val.loc[:, subset] = scaler.transform(df_val[subset])
        df_test.loc[:, subset] = scaler.transform(df_test[subset])
    return df_train, df_val, df_test

## OLS

In [9]:
# === Step 4: Expanding window OLS-only training and prediction ===
data['date'] = data['yyyymm']
start_round = 0
starting = pd.to_datetime("20000101", format="%Y%m%d")
counter = start_round

print("Starting expanding window prediction (OLS only)...")

# Count total rounds based on full expansion logic
end_date = pd.to_datetime("20240101", format="%Y%m%d")
total_rounds = 0
while (starting + pd.DateOffset(years=11 + total_rounds)) <= end_date:
    total_rounds += 1

pbar = tqdm(total=total_rounds - start_round, desc="OLS Rounds")

while (counter < total_rounds):
    round_start_time = time.time()

    cutoff = [
        starting,
        starting + pd.DateOffset(years=8 + counter),
        starting + pd.DateOffset(years=10 + counter),
        starting + pd.DateOffset(years=11 + counter),
    ]

    print(f"\n=== Round {counter + 1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")

    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
    validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])]

    print(f"Train size: {train.shape}, Val size: {validate.shape}, Test size: {test.shape}")

    if test.empty or train.empty or validate.empty:
        print(f"Skipping round {counter + 1} due to empty set.")
        counter += 1
        pbar.update(1)
        continue

    print("Standardizing features (chunked)...")
    train, validate, test = chunk_standardize(train, validate, test, all_features, chunk_size=100)
    print("Standardization complete.")

    X_train, Y_train = train[all_features].values, train['stock_exret'].values
    X_test, Y_test = test[all_features].values, test['stock_exret'].values

    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean

    reg_pred = test[["permno", "yyyymm", "stock_exret"]].copy()

    print("Training OLS...")
    reg = LinearRegression(fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    reg_pred["ols"] = reg.predict(X_test) + Y_mean

    r2_ols = 1 - np.sum((reg_pred["ols"] - Y_test) ** 2) / np.sum((Y_test - Y_test.mean()) ** 2)

    # Append predictions and R² per round
    reg_pred.to_csv("../results/ols_predictions_all.csv", mode="a", header=(counter==start_round), index=False)
    pd.DataFrame([{
        "round": counter + 1,
        "start_date": cutoff[0],
        "end_date": cutoff[3],
        "r2_ols": r2_ols
    }]).to_csv("../results/ols_r2_all.csv", mode="a", header=(counter==start_round), index=False)

    print(f"Round {counter + 1} completed. R²: {r2_ols:.4f}, Time: {round(time.time() - round_start_time, 2)}s")
    counter += 1
    pbar.update(1)

print("OLS-only prediction complete.")

Starting expanding window prediction (OLS only)...



OLS Rounds:  50%|███████████████████████████████████                                   | 7/14 [20:23<20:23, 174.77s/it]


=== Round 1: 2000-01 to 2011-01 ===





Train size: (93496, 1339), Val size: (20897, 1339), Test size: (10563, 1339)
Standardizing features (chunked)...
Standardization complete.
Training OLS...



[A Rounds:   7%|█████                                                                  | 1/14 [00:37<08:08, 37.59s/it]

Round 1 completed. R²: -0.0529, Time: 37.59s

=== Round 2: 2000-01 to 2012-01 ===
Train size: (103876, 1339), Val size: (21080, 1339), Test size: (10458, 1339)
Standardizing features (chunked)...
Standardization complete.
Training OLS...



[A Rounds:  14%|██████████▏                                                            | 2/14 [01:20<08:08, 40.68s/it]

Round 2 completed. R²: -0.0272, Time: 42.83s

=== Round 3: 2000-01 to 2013-01 ===
Train size: (114393, 1339), Val size: (21021, 1339), Test size: (10556, 1339)
Standardizing features (chunked)...
Standardization complete.
Training OLS...



[A Rounds:  21%|███████████████▏                                                       | 3/14 [02:06<07:56, 43.33s/it]

Round 3 completed. R²: -0.0610, Time: 46.49s

=== Round 4: 2000-01 to 2014-01 ===
Train size: (124956, 1339), Val size: (21014, 1339), Test size: (10617, 1339)
Standardizing features (chunked)...
Standardization complete.
Training OLS...



[A Rounds:  29%|████████████████████▎                                                  | 4/14 [02:58<07:46, 46.63s/it]

Round 4 completed. R²: -0.1533, Time: 51.67s

=== Round 5: 2000-01 to 2015-01 ===
Train size: (135414, 1339), Val size: (21173, 1339), Test size: (10974, 1339)
Standardizing features (chunked)...
Standardization complete.
Training OLS...



[A Rounds:  36%|█████████████████████████▎                                             | 5/14 [03:50<07:17, 48.65s/it]

Round 5 completed. R²: -0.0384, Time: 52.23s

=== Round 6: 2000-01 to 2016-01 ===
Train size: (145970, 1339), Val size: (21591, 1339), Test size: (11459, 1339)
Standardizing features (chunked)...
Standardization complete.
Training OLS...



[A Rounds:  43%|██████████████████████████████▍                                        | 6/14 [04:51<07:01, 52.73s/it]

Round 6 completed. R²: -0.1779, Time: 60.61s

=== Round 7: 2000-01 to 2017-01 ===
Train size: (156587, 1339), Val size: (22433, 1339), Test size: (11273, 1339)
Standardizing features (chunked)...
Standardization complete.
Training OLS...



[A Rounds:  50%|███████████████████████████████████▌                                   | 7/14 [05:54<06:33, 56.23s/it]

Round 7 completed. R²: -0.1139, Time: 63.39s

=== Round 8: 2000-01 to 2018-01 ===
Train size: (167561, 1339), Val size: (22732, 1339), Test size: (11054, 1339)
Standardizing features (chunked)...
Standardization complete.
Training OLS...



[A Rounds:  57%|████████████████████████████████████████▌                              | 8/14 [07:02<05:59, 59.88s/it]

Round 8 completed. R²: -0.0372, Time: 67.67s

=== Round 9: 2000-01 to 2019-01 ===
Train size: (179020, 1339), Val size: (22327, 1339), Test size: (11061, 1339)
Standardizing features (chunked)...
Standardization complete.
Training OLS...



[A Rounds:  64%|█████████████████████████████████████████████▋                         | 9/14 [08:16<05:20, 64.13s/it]

Round 9 completed. R²: -0.0364, Time: 73.43s

=== Round 10: 2000-01 to 2020-01 ===
Train size: (190293, 1339), Val size: (22115, 1339), Test size: (11207, 1339)
Standardizing features (chunked)...
Standardization complete.
Training OLS...



[A Rounds:  71%|██████████████████████████████████████████████████                    | 10/14 [09:36<04:35, 68.99s/it]

Round 10 completed. R²: -0.0880, Time: 79.85s

=== Round 11: 2000-01 to 2021-01 ===
Train size: (201347, 1339), Val size: (22268, 1339), Test size: (11729, 1339)
Standardizing features (chunked)...
Standardization complete.
Training OLS...



[A Rounds:  79%|███████████████████████████████████████████████████████               | 11/14 [10:56<03:37, 72.65s/it]

Round 11 completed. R²: -0.0246, Time: 80.88s

=== Round 12: 2000-01 to 2022-01 ===
Train size: (212408, 1339), Val size: (22936, 1339), Test size: (12510, 1339)
Standardizing features (chunked)...
Standardization complete.
Training OLS...



[A Rounds:  86%|████████████████████████████████████████████████████████████          | 12/14 [12:29<02:37, 78.70s/it]

Round 12 completed. R²: -0.0011, Time: 92.47s

=== Round 13: 2000-01 to 2023-01 ===
Train size: (223615, 1339), Val size: (24239, 1339), Test size: (13192, 1339)
Standardizing features (chunked)...
Standardization complete.
Training OLS...



[A Rounds:  93%|█████████████████████████████████████████████████████████████████     | 13/14 [14:04<01:23, 83.66s/it]

Round 13 completed. R²: -0.0274, Time: 95.03s

=== Round 14: 2000-01 to 2024-01 ===
Train size: (235344, 1339), Val size: (25702, 1339), Test size: (12327, 1339)
Standardizing features (chunked)...
Standardization complete.
Training OLS...



[A Rounds: 100%|██████████████████████████████████████████████████████████████████████| 14/14 [15:44<00:00, 88.48s/it]

Round 14 completed. R²: -0.0808, Time: 99.55s
OLS-only prediction complete.


## Lasso

In [10]:
# === Step 4: Expanding window Lasso-only training and prediction ===
data['date'] = data['yyyymm']
start_round = 0
starting = pd.to_datetime("20000101", format="%Y%m%d")
counter = start_round

print("Starting expanding window prediction (Lasso only)...")

# Determine total number of rounds
end_date = pd.to_datetime("20240101", format="%Y%m%d")
total_rounds = 0
temp_start = starting
while (temp_start + pd.DateOffset(years=11 + total_rounds)) <= end_date:
    total_rounds += 1

pbar = tqdm(total=total_rounds - start_round, desc="Lasso Rounds")

while (counter < total_rounds):
    round_start_time = time.time()

    cutoff = [
        starting,
        starting + pd.DateOffset(years=8 + counter),
        starting + pd.DateOffset(years=10 + counter),
        starting + pd.DateOffset(years=11 + counter),
    ]

    print(f"\n=== Round {counter + 1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")

    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
    validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])]

    print(f"Train size: {train.shape}, Val size: {validate.shape}, Test size: {test.shape}")

    if test.empty or train.empty or validate.empty:
        print(f"Skipping round {counter + 1} due to empty set.")
        counter += 1
        pbar.update(1)
        continue

    print("Standardizing features (chunked)...")
    train, validate, test = chunk_standardize(train, validate, test, all_features, chunk_size=100)
    print("Standardization complete.")

    X_train, Y_train = train[all_features].values, train['stock_exret'].values
    X_val, Y_val = validate[all_features].values, validate['stock_exret'].values
    X_test, Y_test = test[all_features].values, test['stock_exret'].values

    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean

    reg_pred = test[["permno", "yyyymm", "stock_exret"]].copy()

    print("Tuning Lasso alpha...")
    lambdas = np.arange(-4, 4.1, 0.1)
    val_mse = np.zeros(len(lambdas))
    for ind, i in enumerate(tqdm(lambdas, desc="Alpha Grid")):
        reg = Lasso(alpha=10**i, max_iter=1000000, fit_intercept=False)
        reg.fit(X_train, Y_train_dm)
        val_mse[ind] = mean_squared_error(Y_val, reg.predict(X_val) + Y_mean)
    best_lambda = lambdas[np.argmin(val_mse)]

    print(f"Best lambda: 10^{best_lambda:.2f}")
    reg = Lasso(alpha=10**best_lambda, max_iter=1000000, fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    reg_pred["lasso"] = reg.predict(X_test) + Y_mean

    r2_lasso = 1 - np.sum((reg_pred["lasso"] - Y_test) ** 2) / np.sum((Y_test - Y_test.mean()) ** 2)

    # Save prediction and r2 per round
    reg_pred.to_csv("../results/lasso_predictions_all.csv", mode="a", header=(counter==start_round), index=False)
    pd.DataFrame([{
        "round": counter + 1,
        "start_date": cutoff[0],
        "end_date": cutoff[3],
        "r2_lasso": r2_lasso
    }]).to_csv("../results/lasso_r2_all.csv", mode="a", header=(counter==start_round), index=False)

    print(f"Round {counter + 1} completed. R²: {r2_lasso:.4f}, Time: {round(time.time() - round_start_time, 2)}s")
    counter += 1
    pbar.update(1)

print("Lasso-only prediction complete.")

Starting expanding window prediction (Lasso only)...


OLS Rounds: 100%|█████████████████████████████████████████████████████████████████████| 14/14 [28:20<00:00, 121.49s/it]


=== Round 1: 2000-01 to 2011-01 ===





Train size: (93496, 1339), Val size: (20897, 1339), Test size: (10563, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning Lasso alpha...



[Aha Grid:   0%|                                                                               | 0/81 [00:00<?, ?it/s]
[Aha Grid:   1%|▊                                                                    | 1/81 [00:56<1:15:38, 56.74s/it]
[Aha Grid:   2%|█▋                                                                   | 2/81 [01:40<1:04:27, 48.96s/it]
[Aha Grid:   4%|██▋                                                                    | 3/81 [02:11<52:52, 40.67s/it]
[Aha Grid:   5%|███▌                                                                   | 4/81 [02:37<44:53, 34.98s/it]
[Aha Grid:   6%|████▍                                                                  | 5/81 [02:57<37:34, 29.67s/it]
[Aha Grid:   7%|█████▎                                                                 | 6/81 [03:16<32:24, 25.93s/it]
[Aha Grid:   9%|██████▏                                                                | 7/81 [03:32<28:04, 22.76s/it]
[Aha Grid:  10%|███████               

Best lambda: 10^-2.40


Lasso Rounds:   7%|████▋                                                             | 1/14 [06:36<1:25:52, 396.34s/it]

Round 1 completed. R²: -0.0316, Time: 396.34s

=== Round 2: 2000-01 to 2012-01 ===
Train size: (103876, 1339), Val size: (21080, 1339), Test size: (10458, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning Lasso alpha...



[Aha Grid:   0%|                                                                               | 0/81 [00:00<?, ?it/s]
[Aha Grid:   1%|▊                                                                    | 1/81 [01:11<1:34:59, 71.24s/it]
[Aha Grid:   2%|█▋                                                                   | 2/81 [02:12<1:26:18, 65.55s/it]
[Aha Grid:   4%|██▌                                                                  | 3/81 [02:49<1:08:07, 52.41s/it]
[Aha Grid:   5%|███▌                                                                   | 4/81 [03:20<56:34, 44.08s/it]
[Aha Grid:   6%|████▍                                                                  | 5/81 [03:44<46:37, 36.80s/it]
[Aha Grid:   7%|█████▎                                                                 | 6/81 [04:03<38:30, 30.81s/it]
[Aha Grid:   9%|██████▏                                                                | 7/81 [04:22<33:12, 26.92s/it]
[Aha Grid:  10%|███████               

Best lambda: 10^-2.20


Lasso Rounds:  14%|█████████▍                                                        | 2/14 [14:24<1:27:40, 438.35s/it]

Round 2 completed. R²: -0.0002, Time: 467.76s

=== Round 3: 2000-01 to 2013-01 ===
Train size: (114393, 1339), Val size: (21021, 1339), Test size: (10556, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning Lasso alpha...



[Aha Grid:   0%|                                                                               | 0/81 [00:00<?, ?it/s]
[Aha Grid:   1%|▊                                                                    | 1/81 [01:09<1:32:30, 69.38s/it]
[Aha Grid:   2%|█▋                                                                   | 2/81 [02:10<1:25:15, 64.75s/it]
[Aha Grid:   4%|██▌                                                                  | 3/81 [02:50<1:09:13, 53.26s/it]
[Aha Grid:   5%|███▍                                                                 | 4/81 [03:34<1:03:32, 49.52s/it]
[Aha Grid:   6%|████▍                                                                  | 5/81 [04:02<53:00, 41.84s/it]
[Aha Grid:   7%|█████▎                                                                 | 6/81 [04:26<44:36, 35.68s/it]
[Aha Grid:   9%|██████▏                                                                | 7/81 [04:44<37:02, 30.04s/it]
[Aha Grid:  10%|███████               

Best lambda: 10^-2.20


Lasso Rounds:  21%|██████████████▏                                                   | 3/14 [22:46<1:25:41, 467.37s/it]

Round 3 completed. R²: -0.0198, Time: 501.91s

=== Round 4: 2000-01 to 2014-01 ===
Train size: (124956, 1339), Val size: (21014, 1339), Test size: (10617, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning Lasso alpha...



[Aha Grid:   0%|                                                                               | 0/81 [00:00<?, ?it/s]
[Aha Grid:   1%|▊                                                                    | 1/81 [01:13<1:37:26, 73.08s/it]
[Aha Grid:   2%|█▋                                                                   | 2/81 [02:12<1:25:45, 65.14s/it]
[Aha Grid:   4%|██▌                                                                  | 3/81 [03:21<1:26:47, 66.77s/it]
[Aha Grid:   5%|███▍                                                                 | 4/81 [04:10<1:16:44, 59.80s/it]
[Aha Grid:   6%|████▎                                                                | 5/81 [04:36<1:00:09, 47.50s/it]
[Aha Grid:   7%|█████▎                                                                 | 6/81 [05:00<49:19, 39.46s/it]
[Aha Grid:   9%|██████▏                                                                | 7/81 [05:19<40:47, 33.08s/it]
[Aha Grid:  10%|███████               

Best lambda: 10^-2.20


Lasso Rounds:  29%|██████████████████▊                                               | 4/14 [31:57<1:23:26, 500.65s/it]

Round 4 completed. R²: -0.1136, Time: 551.67s

=== Round 5: 2000-01 to 2015-01 ===
Train size: (135414, 1339), Val size: (21173, 1339), Test size: (10974, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning Lasso alpha...



[Aha Grid:   0%|                                                                               | 0/81 [00:00<?, ?it/s]
[Aha Grid:   1%|▊                                                                    | 1/81 [01:21<1:48:26, 81.34s/it]
[Aha Grid:   2%|█▋                                                                   | 2/81 [02:16<1:27:02, 66.10s/it]
[Aha Grid:   4%|██▌                                                                  | 3/81 [03:01<1:12:59, 56.15s/it]
[Aha Grid:   5%|███▍                                                                 | 4/81 [03:39<1:02:59, 49.09s/it]
[Aha Grid:   6%|████▍                                                                  | 5/81 [04:14<55:46, 44.04s/it]
[Aha Grid:   7%|█████▎                                                                 | 6/81 [04:39<46:51, 37.48s/it]
[Aha Grid:   9%|██████▏                                                                | 7/81 [04:59<39:10, 31.76s/it]
[Aha Grid:  10%|███████               

Best lambda: 10^-2.30


Lasso Rounds:  36%|███████████████████████▌                                          | 5/14 [40:59<1:17:18, 515.36s/it]

Round 5 completed. R²: -0.0052, Time: 541.42s

=== Round 6: 2000-01 to 2016-01 ===
Train size: (145970, 1339), Val size: (21591, 1339), Test size: (11459, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning Lasso alpha...



[Aha Grid:   0%|                                                                               | 0/81 [00:00<?, ?it/s]
[Aha Grid:   1%|▊                                                                   | 1/81 [01:56<2:35:54, 116.93s/it]
[Aha Grid:   2%|█▋                                                                   | 2/81 [03:03<1:55:16, 87.56s/it]
[Aha Grid:   4%|██▌                                                                  | 3/81 [03:38<1:22:05, 63.14s/it]
[Aha Grid:   5%|███▍                                                                 | 4/81 [04:06<1:03:33, 49.53s/it]
[Aha Grid:   6%|████▍                                                                  | 5/81 [04:32<52:04, 41.11s/it]
[Aha Grid:   7%|█████▎                                                                 | 6/81 [04:55<43:21, 34.68s/it]
[Aha Grid:   9%|██████▏                                                                | 7/81 [05:12<36:01, 29.21s/it]
[Aha Grid:  10%|███████               

Best lambda: 10^-2.30


Lasso Rounds:  43%|████████████████████████████▎                                     | 6/14 [50:25<1:11:01, 532.63s/it]

Round 6 completed. R²: -0.0057, Time: 566.16s

=== Round 7: 2000-01 to 2017-01 ===
Train size: (156587, 1339), Val size: (22433, 1339), Test size: (11273, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning Lasso alpha...



[Aha Grid:   0%|                                                                               | 0/81 [00:00<?, ?it/s]
[Aha Grid:   1%|▊                                                                   | 1/81 [01:48<2:24:17, 108.22s/it]
[Aha Grid:   2%|█▋                                                                   | 2/81 [02:55<1:50:56, 84.26s/it]
[Aha Grid:   4%|██▌                                                                  | 3/81 [03:24<1:16:47, 59.07s/it]
[Aha Grid:   5%|███▍                                                                 | 4/81 [03:52<1:00:06, 46.83s/it]
[Aha Grid:   6%|████▍                                                                  | 5/81 [04:15<48:17, 38.13s/it]
[Aha Grid:   7%|█████▎                                                                 | 6/81 [04:34<39:27, 31.57s/it]
[Aha Grid:   9%|██████▏                                                                | 7/81 [04:53<33:44, 27.36s/it]
[Aha Grid:  10%|███████               

Best lambda: 10^-2.20


Lasso Rounds:  50%|█████████████████████████████████                                 | 7/14 [59:44<1:03:08, 541.27s/it]

Round 7 completed. R²: -0.0054, Time: 559.04s

=== Round 8: 2000-01 to 2018-01 ===
Train size: (167561, 1339), Val size: (22732, 1339), Test size: (11054, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning Lasso alpha...



[Aha Grid:   0%|                                                                               | 0/81 [00:00<?, ?it/s]
[Aha Grid:   1%|▊                                                                    | 1/81 [01:22<1:49:23, 82.05s/it]
[Aha Grid:   2%|█▋                                                                   | 2/81 [02:16<1:26:22, 65.60s/it]
[Aha Grid:   4%|██▌                                                                  | 3/81 [02:47<1:04:48, 49.85s/it]
[Aha Grid:   5%|███▌                                                                   | 4/81 [03:14<52:29, 40.91s/it]
[Aha Grid:   6%|████▍                                                                  | 5/81 [03:37<43:51, 34.63s/it]
[Aha Grid:   7%|█████▎                                                                 | 6/81 [03:58<37:14, 29.79s/it]
[Aha Grid:   9%|██████▏                                                                | 7/81 [04:18<33:02, 26.79s/it]
[Aha Grid:  10%|███████               

Best lambda: 10^-2.40


Lasso Rounds:  57%|█████████████████████████████████████▋                            | 8/14 [1:08:42<54:02, 540.42s/it]

Round 8 completed. R²: -0.0171, Time: 538.6s

=== Round 9: 2000-01 to 2019-01 ===
Train size: (179020, 1339), Val size: (22327, 1339), Test size: (11061, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning Lasso alpha...



[Aha Grid:   0%|                                                                               | 0/81 [00:00<?, ?it/s]
[Aha Grid:   1%|▊                                                                    | 1/81 [01:24<1:53:12, 84.90s/it]
[Aha Grid:   2%|█▋                                                                   | 2/81 [03:00<2:00:07, 91.24s/it]
[Aha Grid:   4%|██▌                                                                  | 3/81 [04:14<1:48:16, 83.29s/it]
[Aha Grid:   5%|███▍                                                                 | 4/81 [04:44<1:19:59, 62.34s/it]
[Aha Grid:   6%|████▎                                                                | 5/81 [05:06<1:00:18, 47.61s/it]
[Aha Grid:   7%|█████▎                                                                 | 6/81 [05:24<47:00, 37.61s/it]
[Aha Grid:   9%|██████▏                                                                | 7/81 [05:46<40:15, 32.64s/it]
[Aha Grid:  10%|███████               

Best lambda: 10^-2.60


Lasso Rounds:  64%|██████████████████████████████████████████▍                       | 9/14 [1:19:36<47:58, 575.66s/it]

Round 9 completed. R²: -0.0276, Time: 653.14s

=== Round 10: 2000-01 to 2020-01 ===
Train size: (190293, 1339), Val size: (22115, 1339), Test size: (11207, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning Lasso alpha...



[Aha Grid:   0%|                                                                               | 0/81 [00:00<?, ?it/s]
[Aha Grid:   1%|▊                                                                   | 1/81 [01:44<2:18:46, 104.08s/it]
[Aha Grid:   2%|█▋                                                                   | 2/81 [02:46<1:44:43, 79.54s/it]
[Aha Grid:   4%|██▌                                                                  | 3/81 [03:23<1:18:19, 60.24s/it]
[Aha Grid:   5%|███▍                                                                 | 4/81 [03:57<1:03:42, 49.64s/it]
[Aha Grid:   6%|████▍                                                                  | 5/81 [04:21<51:14, 40.45s/it]
[Aha Grid:   7%|█████▎                                                                 | 6/81 [04:44<43:14, 34.60s/it]
[Aha Grid:   9%|██████▏                                                                | 7/81 [05:10<39:08, 31.74s/it]
[Aha Grid:  10%|███████               

Best lambda: 10^-2.30


Lasso Rounds:  71%|██████████████████████████████████████████████▍                  | 10/14 [1:30:58<40:34, 608.70s/it]

Round 10 completed. R²: -0.0293, Time: 682.67s

=== Round 11: 2000-01 to 2021-01 ===
Train size: (201347, 1339), Val size: (22268, 1339), Test size: (11729, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning Lasso alpha...



[Aha Grid:   0%|                                                                               | 0/81 [00:00<?, ?it/s]
[Aha Grid:   1%|▊                                                                    | 1/81 [01:39<2:13:00, 99.76s/it]
[Aha Grid:   2%|█▋                                                                   | 2/81 [02:37<1:38:34, 74.87s/it]
[Aha Grid:   4%|██▌                                                                  | 3/81 [03:13<1:14:39, 57.43s/it]
[Aha Grid:   5%|███▍                                                                 | 4/81 [03:47<1:01:24, 47.85s/it]
[Aha Grid:   6%|████▍                                                                  | 5/81 [04:10<49:36, 39.17s/it]
[Aha Grid:   7%|█████▎                                                                 | 6/81 [04:37<43:44, 34.99s/it]
[Aha Grid:   9%|██████▏                                                                | 7/81 [05:07<40:55, 33.18s/it]
[Aha Grid:  10%|███████               

Best lambda: 10^-2.30


Lasso Rounds:  79%|███████████████████████████████████████████████████              | 11/14 [1:44:29<33:31, 670.60s/it]

Round 11 completed. R²: -0.0104, Time: 810.95s

=== Round 12: 2000-01 to 2022-01 ===
Train size: (212408, 1339), Val size: (22936, 1339), Test size: (12510, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning Lasso alpha...



[Aha Grid:   0%|                                                                               | 0/81 [00:00<?, ?it/s]
[Aha Grid:   1%|▊                                                                    | 1/81 [01:28<1:57:37, 88.22s/it]
[Aha Grid:   2%|█▋                                                                   | 2/81 [02:32<1:37:51, 74.32s/it]
[Aha Grid:   4%|██▌                                                                  | 3/81 [03:13<1:16:32, 58.88s/it]
[Aha Grid:   5%|███▍                                                                 | 4/81 [03:47<1:02:55, 49.03s/it]
[Aha Grid:   6%|████▍                                                                  | 5/81 [04:13<51:41, 40.81s/it]
[Aha Grid:   7%|█████▎                                                                 | 6/81 [04:45<47:26, 37.95s/it]
[Aha Grid:   9%|██████▏                                                                | 7/81 [05:16<43:55, 35.62s/it]
[Aha Grid:  10%|███████               

Best lambda: 10^-2.40


Lasso Rounds:  86%|███████████████████████████████████████████████████████▋         | 12/14 [2:00:30<25:17, 758.88s/it]

Round 12 completed. R²: -0.0076, Time: 960.77s

=== Round 13: 2000-01 to 2023-01 ===
Train size: (223615, 1339), Val size: (24239, 1339), Test size: (13192, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning Lasso alpha...



[Aha Grid:   0%|                                                                               | 0/81 [00:00<?, ?it/s]
[Aha Grid:   1%|▊                                                                    | 1/81 [01:27<1:56:36, 87.46s/it]
[Aha Grid:   2%|█▋                                                                   | 2/81 [02:39<1:42:52, 78.13s/it]
[Aha Grid:   4%|██▌                                                                  | 3/81 [03:30<1:25:25, 65.71s/it]
[Aha Grid:   5%|███▍                                                                 | 4/81 [04:18<1:15:49, 59.08s/it]
[Aha Grid:   6%|████▎                                                                | 5/81 [05:03<1:08:12, 53.85s/it]
[Aha Grid:   7%|█████                                                                | 6/81 [05:45<1:02:07, 49.69s/it]
[Aha Grid:   9%|██████▏                                                                | 7/81 [06:22<56:07, 45.51s/it]
[Aha Grid:  10%|███████               

Best lambda: 10^-2.40


Lasso Rounds:  93%|████████████████████████████████████████████████████████████▎    | 13/14 [2:22:32<15:29, 929.44s/it]

Round 13 completed. R²: -0.0299, Time: 1321.91s

=== Round 14: 2000-01 to 2024-01 ===
Train size: (235344, 1339), Val size: (25702, 1339), Test size: (12327, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning Lasso alpha...



[Aha Grid:   0%|                                                                               | 0/81 [00:00<?, ?it/s]
[Aha Grid:   1%|▊                                                                   | 1/81 [02:09<2:53:09, 129.87s/it]
[Aha Grid:   2%|█▋                                                                   | 2/81 [03:27<2:10:33, 99.16s/it]
[Aha Grid:   4%|██▌                                                                  | 3/81 [05:03<2:07:14, 97.88s/it]
[Aha Grid:   5%|███▍                                                                 | 4/81 [05:54<1:41:24, 79.02s/it]
[Aha Grid:   6%|████▎                                                                | 5/81 [06:36<1:23:26, 65.87s/it]
[Aha Grid:   7%|█████                                                                | 6/81 [07:18<1:12:02, 57.63s/it]
[Aha Grid:   9%|█████▉                                                               | 7/81 [07:57<1:03:28, 51.47s/it]
[Aha Grid:  10%|███████               

Best lambda: 10^-3.40


Lasso Rounds: 100%|████████████████████████████████████████████████████████████████| 14/14 [2:49:32<00:00, 1138.05s/it]

Round 14 completed. R²: -0.0104, Time: 1620.04s
Lasso-only prediction complete.


In [23]:
import pandas as pd
import numpy as np

lambda_df = pd.read_csv("../results/best_lambda_by_round.csv")
lambda_raw = lambda_df["Best Lambda (Alpha)"].astype(str)

lambda_df.head(2)

Unnamed: 0,Round,Best Lambda (Alpha)
0,1,10^-2.40
1,2,10^-2.20


### Lasso Feature Selection

In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from collections import Counter
from tqdm import tqdm

# === Step 0: Load and parse lambda values correctly ===
lambda_df = pd.read_csv("../results/best_lambda_by_round.csv", skiprows=1, header=None)

# This will convert "10^-2.4" → 10 ** -2.4 → 0.00398 (numeric alpha)
def parse_lambda(expr):
    try:
        exponent = float(expr.split("^")[1])
        return 10 ** exponent
    except:
        return np.nan

best_lambdas = lambda_df.iloc[:, 1].apply(parse_lambda).dropna().values

# === Step 1: Initialize ===
feature_counter = Counter()
data['date'] = data['yyyymm']
starting = pd.to_datetime("20000101", format="%Y%m%d")

# === Step 2: Loop through all rounds ===
for counter in tqdm(range(len(best_lambdas)), desc="Refitting Lasso"):
    cutoff = [
        starting,
        starting + pd.DateOffset(years=8 + counter),
        starting + pd.DateOffset(years=10 + counter),
        starting + pd.DateOffset(years=11 + counter),
    ]

    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
    validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]

    if train.empty or validate.empty:
        print(f"Skipping round {counter + 1} due to empty set.")
        continue

    X_train = train[all_features].values
    Y_train = train['stock_exret'].values
    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean

    # alpha is already numeric
    alpha = best_lambdas[counter]
    reg = Lasso(alpha=alpha, max_iter=1000000, fit_intercept=False)
    reg.fit(X_train, Y_train_dm)

    nonzero_mask = reg.coef_ != 0
    nonzero_vars = np.array(all_features)[nonzero_mask]
    feature_counter.update(nonzero_vars)

    print(f"Round {counter+1}: alpha={alpha:.5f}, nonzero features={len(nonzero_vars)}")

# === Step 4: Output top feature frequency ===
freq_df = pd.DataFrame.from_dict(feature_counter, orient='index', columns=['count'])
freq_df = freq_df.sort_values('count', ascending=False)
freq_df.to_csv("../results/lasso_top_features.csv")

print(f"Finished! Total selected features: {len(freq_df)}")
print("Top feature frequency saved to ../results/lasso_top_features.csv")

Refitting Lasso:   7%|████▋                                                             | 1/14 [00:06<01:24,  6.47s/it]

Round 1: alpha=0.00398, nonzero features=3


Refitting Lasso:  14%|█████████▍                                                        | 2/14 [00:13<01:19,  6.66s/it]

Round 2: alpha=0.00631, nonzero features=0


Refitting Lasso:  21%|██████████████▏                                                   | 3/14 [00:21<01:19,  7.19s/it]

Round 3: alpha=0.00631, nonzero features=0


Refitting Lasso:  29%|██████████████████▊                                               | 4/14 [00:32<01:30,  9.04s/it]

Round 4: alpha=0.00631, nonzero features=0


Refitting Lasso:  36%|███████████████████████▌                                          | 5/14 [00:45<01:34, 10.47s/it]

Round 5: alpha=0.00501, nonzero features=0


Refitting Lasso:  43%|████████████████████████████▎                                     | 6/14 [01:05<01:48, 13.57s/it]

Round 6: alpha=0.00501, nonzero features=0


Refitting Lasso:  50%|█████████████████████████████████                                 | 7/14 [01:24<01:47, 15.33s/it]

Round 7: alpha=0.00631, nonzero features=0


Refitting Lasso:  57%|█████████████████████████████████████▋                            | 8/14 [01:47<01:46, 17.68s/it]

Round 8: alpha=0.00398, nonzero features=0


Refitting Lasso:  64%|██████████████████████████████████████████▍                       | 9/14 [02:10<01:37, 19.52s/it]

Round 9: alpha=0.00251, nonzero features=1


Refitting Lasso:  71%|██████████████████████████████████████████████▍                  | 10/14 [02:35<01:25, 21.26s/it]

Round 10: alpha=0.00501, nonzero features=0


Refitting Lasso:  79%|███████████████████████████████████████████████████              | 11/14 [03:22<01:26, 28.88s/it]

Round 11: alpha=0.00501, nonzero features=0


Refitting Lasso:  86%|███████████████████████████████████████████████████████▋         | 12/14 [04:01<01:03, 31.94s/it]

Round 12: alpha=0.00398, nonzero features=0


Refitting Lasso:  93%|████████████████████████████████████████████████████████████▎    | 13/14 [04:45<00:35, 35.79s/it]

Round 13: alpha=0.00398, nonzero features=0


Refitting Lasso: 100%|█████████████████████████████████████████████████████████████████| 14/14 [05:33<00:00, 23.79s/it]

Round 14: alpha=0.00040, nonzero features=129





Finished! Total selected features: 131
Top feature frequency saved to ../results/lasso_top_features.csv


## OLS Reduced Feature

In [11]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import time

# === Load reduced features ===
reduced_features = pd.read_csv("../results/lasso_top_features.csv")
reduced_feature_list = reduced_features.iloc[:, 0].tolist()

# === Chunked Standardization Function ===
def chunk_standardize(df_train, df_val, df_test, columns, chunk_size=100):
    for i in range(0, len(columns), chunk_size):
        subset = columns[i:i+chunk_size]
        scaler = StandardScaler().fit(df_train[subset])
        df_train.loc[:, subset] = scaler.transform(df_train[subset])
        df_val.loc[:, subset] = scaler.transform(df_val[subset])
        df_test.loc[:, subset] = scaler.transform(df_test[subset])
    return df_train, df_val, df_test

# === Step 4: Expanding window OLS (Reduced) ===
data['date'] = data['yyyymm']
start_round = 0
starting = pd.to_datetime("20000101", format="%Y%m%d")
counter = start_round

print("Starting expanding window prediction (OLS with reduced features)...")

# Determine total number of rounds
end_date = pd.to_datetime("20240101", format="%Y%m%d")
total_rounds = 0
temp_start = starting
while (temp_start + pd.DateOffset(years=11 + total_rounds)) <= end_date:
    total_rounds += 1

pbar = tqdm(total=total_rounds - start_round, desc="OLS Rounds")

while (counter < total_rounds):
    round_start_time = time.time()

    cutoff = [
        starting,
        starting + pd.DateOffset(years=8 + counter),
        starting + pd.DateOffset(years=10 + counter),
        starting + pd.DateOffset(years=11 + counter),
    ]

    print(f"\n=== Round {counter + 1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")

    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
    validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])]

    print(f"Train size: {train.shape}, Val size: {validate.shape}, Test size: {test.shape}")

    if test.empty or train.empty or validate.empty:
        print(f"Skipping round {counter + 1} due to empty set.")
        counter += 1
        pbar.update(1)
        continue

    print("Standardizing reduced features (chunked)...")
    train, validate, test = chunk_standardize(train, validate, test, reduced_feature_list, chunk_size=100)
    print("Standardization complete.")

    X_train = train[reduced_feature_list].values
    Y_train = train['stock_exret'].values
    X_test = test[reduced_feature_list].values
    Y_test = test['stock_exret'].values

    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean

    reg_pred = test[["permno", "yyyymm", "stock_exret"]].copy()

    print("Training OLS...")
    reg = LinearRegression(fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    reg_pred["ols"] = reg.predict(X_test) + Y_mean

    r2_ols = 1 - np.sum((reg_pred["ols"] - Y_test) ** 2) / np.sum((Y_test - Y_test.mean()) ** 2)

    # Save prediction and r2 per round
    reg_pred.to_csv("../results/ols_predictions_reduced.csv", mode="a", header=(counter==start_round), index=False)
    pd.DataFrame([{
        "round": counter + 1,
        "start_date": cutoff[0],
        "end_date": cutoff[3],
        "r2_ols": r2_ols
    }]).to_csv("../results/ols_r2_reduced.csv", mode="a", header=(counter==start_round), index=False)

    print(f"Round {counter + 1} completed. R²: {r2_ols:.4f}, Time: {round(time.time() - round_start_time, 2)}s")
    counter += 1
    pbar.update(1)

print("OLS (Reduced) prediction complete.")

Starting expanding window prediction (OLS with reduced features)...


Ridge Rounds: 100%|████████████████████████████████████████████████████████████████████| 14/14 [12:14<00:00, 52.48s/it]


=== Round 1: 2000-01 to 2011-01 ===





Train size: (93496, 1339), Val size: (20897, 1339), Test size: (10563, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Training OLS...


OLS Rounds:   7%|█████                                                                  | 1/14 [00:12<02:39, 12.27s/it]

Round 1 completed. R²: -0.0472, Time: 12.27s

=== Round 2: 2000-01 to 2012-01 ===
Train size: (103876, 1339), Val size: (21080, 1339), Test size: (10458, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Training OLS...


OLS Rounds:  14%|██████████▏                                                            | 2/14 [00:20<01:55,  9.60s/it]

Round 2 completed. R²: -0.0149, Time: 7.73s

=== Round 3: 2000-01 to 2013-01 ===
Train size: (114393, 1339), Val size: (21021, 1339), Test size: (10556, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Training OLS...


OLS Rounds:  21%|███████████████▏                                                       | 3/14 [00:29<01:44,  9.53s/it]

Round 3 completed. R²: -0.0339, Time: 9.43s

=== Round 4: 2000-01 to 2014-01 ===
Train size: (124956, 1339), Val size: (21014, 1339), Test size: (10617, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Training OLS...


OLS Rounds:  29%|████████████████████▎                                                  | 4/14 [00:38<01:34,  9.44s/it]

Round 4 completed. R²: -0.1303, Time: 9.32s

=== Round 5: 2000-01 to 2015-01 ===
Train size: (135414, 1339), Val size: (21173, 1339), Test size: (10974, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Training OLS...


OLS Rounds:  36%|█████████████████████████▎                                             | 5/14 [00:48<01:25,  9.47s/it]

Round 5 completed. R²: -0.0257, Time: 9.5s

=== Round 6: 2000-01 to 2016-01 ===
Train size: (145970, 1339), Val size: (21591, 1339), Test size: (11459, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Training OLS...


OLS Rounds:  43%|██████████████████████████████▍                                        | 6/14 [00:59<01:21, 10.21s/it]

Round 6 completed. R²: -0.0170, Time: 11.63s

=== Round 7: 2000-01 to 2017-01 ===
Train size: (156587, 1339), Val size: (22433, 1339), Test size: (11273, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Training OLS...


OLS Rounds:  50%|███████████████████████████████████▌                                   | 7/14 [01:17<01:27, 12.46s/it]

Round 7 completed. R²: -0.0068, Time: 17.08s

=== Round 8: 2000-01 to 2018-01 ===
Train size: (167561, 1339), Val size: (22732, 1339), Test size: (11054, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Training OLS...


OLS Rounds:  57%|████████████████████████████████████████▌                              | 8/14 [01:33<01:22, 13.70s/it]

Round 8 completed. R²: -0.0196, Time: 16.34s

=== Round 9: 2000-01 to 2019-01 ===
Train size: (179020, 1339), Val size: (22327, 1339), Test size: (11061, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Training OLS...


OLS Rounds:  64%|█████████████████████████████████████████████▋                         | 9/14 [01:53<01:19, 15.85s/it]

Round 9 completed. R²: -0.0286, Time: 20.58s

=== Round 10: 2000-01 to 2020-01 ===
Train size: (190293, 1339), Val size: (22115, 1339), Test size: (11207, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Training OLS...


OLS Rounds:  71%|██████████████████████████████████████████████████                    | 10/14 [02:19<01:15, 18.83s/it]

Round 10 completed. R²: -0.0349, Time: 25.5s

=== Round 11: 2000-01 to 2021-01 ===
Train size: (201347, 1339), Val size: (22268, 1339), Test size: (11729, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Training OLS...


OLS Rounds:  79%|███████████████████████████████████████████████████████               | 11/14 [02:46<01:04, 21.35s/it]

Round 11 completed. R²: -0.0167, Time: 27.04s

=== Round 12: 2000-01 to 2022-01 ===
Train size: (212408, 1339), Val size: (22936, 1339), Test size: (12510, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Training OLS...


OLS Rounds:  86%|████████████████████████████████████████████████████████████          | 12/14 [03:14<00:46, 23.34s/it]

Round 12 completed. R²: 0.0027, Time: 27.88s

=== Round 13: 2000-01 to 2023-01 ===
Train size: (223615, 1339), Val size: (24239, 1339), Test size: (13192, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Training OLS...


OLS Rounds:  93%|█████████████████████████████████████████████████████████████████     | 13/14 [03:43<00:25, 25.06s/it]

Round 13 completed. R²: -0.0262, Time: 29.02s

=== Round 14: 2000-01 to 2024-01 ===
Train size: (235344, 1339), Val size: (25702, 1339), Test size: (12327, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Training OLS...


OLS Rounds: 100%|██████████████████████████████████████████████████████████████████████| 14/14 [04:14<00:00, 26.98s/it]

Round 14 completed. R²: -0.0157, Time: 31.42s
OLS (Reduced) prediction complete.


## XGBoost Reduced Feature

In [17]:
# === Step 4: Expanding window XGBoost (Reduced) ===
data['date'] = data['yyyymm']
start_round = 0
starting = pd.to_datetime("20000101", format="%Y%m%d")
counter = start_round

print("Starting expanding window prediction (XGBoost with reduced features)...")

# Determine total number of rounds
end_date = pd.to_datetime("20240101", format="%Y%m%d")
total_rounds = 0
temp_start = starting
while (temp_start + pd.DateOffset(years=11 + total_rounds)) <= end_date:
    total_rounds += 1

pbar = tqdm(total=total_rounds - start_round, desc="XGBoost Rounds")

while (counter < total_rounds):
    round_start_time = time.time()

    cutoff = [
        starting,
        starting + pd.DateOffset(years=8 + counter),
        starting + pd.DateOffset(years=10 + counter),
        starting + pd.DateOffset(years=11 + counter),
    ]

    print(f"\n=== Round {counter + 1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")

    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
    validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])]

    print(f"Train size: {train.shape}, Val size: {validate.shape}, Test size: {test.shape}")

    if test.empty or train.empty or validate.empty:
        print(f"Skipping round {counter + 1} due to empty set.")
        counter += 1
        pbar.update(1)
        continue

    X_train = train[reduced_feature_list].values
    Y_train = train['stock_exret'].values
    X_val = validate[reduced_feature_list].values
    Y_val = validate['stock_exret'].values
    X_test = test[reduced_feature_list].values
    Y_test = test['stock_exret'].values

    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean

    reg_pred = test[["permno", "yyyymm", "stock_exret"]].copy()

    print("Training XGBoost...")
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    model.fit(X_train, Y_train_dm)

    reg_pred["xgb"] = model.predict(X_test) + Y_mean

    r2_xgb = 1 - np.sum((reg_pred["xgb"] - Y_test) ** 2) / np.sum((Y_test - Y_test.mean()) ** 2)

    # Save prediction and r2 per round
    reg_pred.to_csv("../results/xgb_predictions_reduced.csv", mode="a", header=(counter==start_round), index=False)
    pd.DataFrame([{
        "round": counter + 1,
        "start_date": cutoff[0],
        "end_date": cutoff[3],
        "r2_xgb": r2_xgb
    }]).to_csv("../results/xgb_r2_reduced.csv", mode="a", header=(counter==start_round), index=False)

    print(f"Round {counter + 1} completed. R²: {r2_xgb:.4f}, Time: {round(time.time() - round_start_time, 2)}s")
    counter += 1
    pbar.update(1)

print("XGBoost (Reduced) prediction complete.")

Starting expanding window prediction (XGBoost with reduced features)...



XGBoost Rounds:  14%|█████████▌                                                         | 2/14 [01:49<10:58, 54.87s/it]


=== Round 1: 2000-01 to 2011-01 ===





Train size: (93496, 1339), Val size: (20897, 1339), Test size: (10563, 1339)
Training XGBoost...



[Aoost Rounds:   7%|████▊                                                              | 1/14 [00:21<04:36, 21.27s/it]

Round 1 completed. R²: -0.0626, Time: 21.26s

=== Round 2: 2000-01 to 2012-01 ===
Train size: (103876, 1339), Val size: (21080, 1339), Test size: (10458, 1339)
Training XGBoost...



[Aoost Rounds:  14%|█████████▌                                                         | 2/14 [00:42<04:16, 21.39s/it]

Round 2 completed. R²: -0.0455, Time: 21.48s

=== Round 3: 2000-01 to 2013-01 ===
Train size: (114393, 1339), Val size: (21021, 1339), Test size: (10556, 1339)
Training XGBoost...



[Aoost Rounds:  21%|██████████████▎                                                    | 3/14 [01:10<04:28, 24.42s/it]

Round 3 completed. R²: -0.1194, Time: 28.01s

=== Round 4: 2000-01 to 2014-01 ===
Train size: (124956, 1339), Val size: (21014, 1339), Test size: (10617, 1339)
Training XGBoost...



[Aoost Rounds:  29%|███████████████████▏                                               | 4/14 [01:35<04:05, 24.59s/it]

Round 4 completed. R²: -0.1458, Time: 24.85s

=== Round 5: 2000-01 to 2015-01 ===
Train size: (135414, 1339), Val size: (21173, 1339), Test size: (10974, 1339)
Training XGBoost...



[Aoost Rounds:  36%|███████████████████████▉                                           | 5/14 [02:04<03:55, 26.12s/it]

Round 5 completed. R²: -0.0908, Time: 28.84s

=== Round 6: 2000-01 to 2016-01 ===
Train size: (145970, 1339), Val size: (21591, 1339), Test size: (11459, 1339)
Training XGBoost...



[Aoost Rounds:  43%|████████████████████████████▋                                      | 6/14 [02:35<03:43, 27.91s/it]

Round 6 completed. R²: -0.0720, Time: 31.35s

=== Round 7: 2000-01 to 2017-01 ===
Train size: (156587, 1339), Val size: (22433, 1339), Test size: (11273, 1339)
Training XGBoost...



[Aoost Rounds:  50%|█████████████████████████████████▌                                 | 7/14 [03:13<03:38, 31.25s/it]

Round 7 completed. R²: -0.0023, Time: 38.14s

=== Round 8: 2000-01 to 2018-01 ===
Train size: (167561, 1339), Val size: (22732, 1339), Test size: (11054, 1339)
Training XGBoost...



[Aoost Rounds:  57%|██████████████████████████████████████▎                            | 8/14 [03:50<03:18, 33.01s/it]

Round 8 completed. R²: -0.1405, Time: 36.78s

=== Round 9: 2000-01 to 2019-01 ===
Train size: (179020, 1339), Val size: (22327, 1339), Test size: (11061, 1339)
Training XGBoost...



[Aoost Rounds:  64%|███████████████████████████████████████████                        | 9/14 [04:31<02:57, 35.58s/it]

Round 9 completed. R²: -0.0657, Time: 41.23s

=== Round 10: 2000-01 to 2020-01 ===
Train size: (190293, 1339), Val size: (22115, 1339), Test size: (11207, 1339)
Training XGBoost...



[Aoost Rounds:  71%|███████████████████████████████████████████████▏                  | 10/14 [05:17<02:34, 38.58s/it]

Round 10 completed. R²: -0.1127, Time: 45.29s

=== Round 11: 2000-01 to 2021-01 ===
Train size: (201347, 1339), Val size: (22268, 1339), Test size: (11729, 1339)
Training XGBoost...



[Aoost Rounds:  79%|███████████████████████████████████████████████████▊              | 11/14 [06:11<02:09, 43.23s/it]

Round 11 completed. R²: -0.0208, Time: 53.76s

=== Round 12: 2000-01 to 2022-01 ===
Train size: (212408, 1339), Val size: (22936, 1339), Test size: (12510, 1339)
Training XGBoost...



[Aoost Rounds:  86%|████████████████████████████████████████████████████████▌         | 12/14 [07:03<01:32, 46.04s/it]

Round 12 completed. R²: -0.0519, Time: 52.46s

=== Round 13: 2000-01 to 2023-01 ===
Train size: (223615, 1339), Val size: (24239, 1339), Test size: (13192, 1339)
Training XGBoost...



[Aoost Rounds:  93%|█████████████████████████████████████████████████████████████▎    | 13/14 [08:08<00:51, 51.66s/it]

Round 13 completed. R²: -0.0454, Time: 64.58s

=== Round 14: 2000-01 to 2024-01 ===
Train size: (235344, 1339), Val size: (25702, 1339), Test size: (12327, 1339)
Training XGBoost...



[Aoost Rounds: 100%|██████████████████████████████████████████████████████████████████| 14/14 [09:07<00:00, 54.01s/it]

Round 14 completed. R²: -0.0814, Time: 59.43s
XGBoost (Reduced) prediction complete.


## Elastic Net Reduced Feature

In [19]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from tqdm import tqdm
import time

# === Load 131 reduced features ===
reduced_features = pd.read_csv("../results/lasso_top_features.csv")
reduced_feature_list = reduced_features.iloc[:, 0].tolist()

data['date'] = data['yyyymm']
starting = pd.to_datetime("20000101", format="%Y%m%d")
start_round = 0
counter = start_round

print("Starting expanding window prediction (ElasticNet simplified)...")

# Determine total number of rounds
end_date = pd.to_datetime("20240101", format="%Y%m%d")
total_rounds = 0
temp_start = starting
while (temp_start + pd.DateOffset(years=11 + total_rounds)) <= end_date:
    total_rounds += 1

pbar = tqdm(total=total_rounds - start_round, desc="ElasticNet Rounds")

while (counter < total_rounds):
    round_start_time = time.time()

    cutoff = [
        starting,
        starting + pd.DateOffset(years=8 + counter),
        starting + pd.DateOffset(years=10 + counter),
        starting + pd.DateOffset(years=11 + counter),
    ]

    print(f"\n=== Round {counter + 1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")

    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
    validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])]

    print(f"Train size: {train.shape}, Val size: {validate.shape}, Test size: {test.shape}")

    if test.empty or train.empty or validate.empty:
        print(f"Skipping round {counter + 1} due to empty set.")
        counter += 1
        pbar.update(1)
        continue

    print("Standardizing features (chunked)...")
    train, validate, test = chunk_standardize(train, validate, test, reduced_feature_list, chunk_size=100)
    print("Standardization complete.")

    X_train = train[reduced_feature_list].values
    Y_train = train['stock_exret'].values
    X_val = validate[reduced_feature_list].values
    Y_val = validate['stock_exret'].values
    X_test = test[reduced_feature_list].values
    Y_test = test['stock_exret'].values

    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean

    reg_pred = test[["permno", "yyyymm", "stock_exret"]].copy()

    print("Tuning ElasticNet (alpha only)...")
    lambdas = np.arange(-4, 4.1, 0.1)
    val_mse = np.zeros(len(lambdas))
    for ind, i in enumerate(lambdas):
        reg = ElasticNet(alpha=(10**i), l1_ratio=0.5, max_iter=1000000, fit_intercept=False)
        reg.fit(X_train, Y_train_dm)
        val_mse[ind] = mean_squared_error(Y_val, reg.predict(X_val) + Y_mean)

    best_lambda = lambdas[np.argmin(val_mse)]
    reg = ElasticNet(alpha=(10**best_lambda), l1_ratio=0.5, max_iter=1000000, fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    reg_pred["enet_simple"] = reg.predict(X_test) + Y_mean

    r2_enet = 1 - np.sum((reg_pred["enet_simple"] - Y_test) ** 2) / np.sum((Y_test - Y_test.mean()) ** 2)

    # Save predictions and R2
    reg_pred.to_csv("../results/enet_predictions_reduced_simple.csv", mode="a", header=(counter==start_round), index=False)
    pd.DataFrame([{
        "round": counter + 1,
        "start_date": cutoff[0],
        "end_date": cutoff[3],
        "r2_enet_simple": r2_enet
    }]).to_csv("../results/enet_r2_reduced_simple.csv", mode="a", header=(counter==start_round), index=False)

    print(f"Round {counter + 1} completed. R²: {r2_enet:.4f}, Time: {round(time.time() - round_start_time, 2)}s")
    counter += 1
    pbar.update(1)

print("ElasticNet simplified prediction complete.")

Starting expanding window prediction (ElasticNet simplified)...



ElasticNet Rounds:   7%|████▎                                                       | 1/14 [17:52<3:52:27, 1072.91s/it]


=== Round 1: 2000-01 to 2011-01 ===





Train size: (93496, 1339), Val size: (20897, 1339), Test size: (10563, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning ElasticNet (alpha only)...



[AsticNet Rounds:   7%|████▎                                                        | 1/14 [15:20<3:19:27, 920.58s/it]

Round 1 completed. R²: -0.0327, Time: 920.57s

=== Round 2: 2000-01 to 2012-01 ===
Train size: (103876, 1339), Val size: (21080, 1339), Test size: (10458, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning ElasticNet (alpha only)...



[AsticNet Rounds:  14%|████████▋                                                    | 2/14 [30:44<3:04:27, 922.33s/it]

Round 2 completed. R²: -0.0002, Time: 923.55s

=== Round 3: 2000-01 to 2013-01 ===
Train size: (114393, 1339), Val size: (21021, 1339), Test size: (10556, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning ElasticNet (alpha only)...



[AsticNet Rounds:  21%|█████████████                                                | 3/14 [47:52<2:57:58, 970.73s/it]

Round 3 completed. R²: -0.0198, Time: 1028.31s

=== Round 4: 2000-01 to 2014-01 ===
Train size: (124956, 1339), Val size: (21014, 1339), Test size: (10617, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning ElasticNet (alpha only)...



[AsticNet Rounds:  29%|████████████████▊                                          | 4/14 [1:03:42<2:40:26, 962.69s/it]

Round 4 completed. R²: -0.1136, Time: 950.35s

=== Round 5: 2000-01 to 2015-01 ===
Train size: (135414, 1339), Val size: (21173, 1339), Test size: (10974, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning ElasticNet (alpha only)...



[AsticNet Rounds:  36%|█████████████████████                                      | 5/14 [1:20:19<2:26:14, 974.95s/it]

Round 5 completed. R²: -0.0052, Time: 996.69s

=== Round 6: 2000-01 to 2016-01 ===
Train size: (145970, 1339), Val size: (21591, 1339), Test size: (11459, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning ElasticNet (alpha only)...



[AsticNet Rounds:  43%|█████████████████████████▎                                 | 6/14 [1:36:37<2:10:08, 976.04s/it]

Round 6 completed. R²: -0.0057, Time: 978.16s

=== Round 7: 2000-01 to 2017-01 ===
Train size: (156587, 1339), Val size: (22433, 1339), Test size: (11273, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning ElasticNet (alpha only)...



[AsticNet Rounds:  50%|█████████████████████████████▌                             | 7/14 [1:52:39<1:53:19, 971.29s/it]

Round 7 completed. R²: -0.0054, Time: 961.5s

=== Round 8: 2000-01 to 2018-01 ===
Train size: (167561, 1339), Val size: (22732, 1339), Test size: (11054, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning ElasticNet (alpha only)...



[AsticNet Rounds:  57%|█████████████████████████████████▋                         | 8/14 [2:10:18<1:39:55, 999.19s/it]

Round 8 completed. R²: -0.0164, Time: 1058.9s

=== Round 9: 2000-01 to 2019-01 ===
Train size: (179020, 1339), Val size: (22327, 1339), Test size: (11061, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning ElasticNet (alpha only)...



[AsticNet Rounds:  64%|█████████████████████████████████████▎                    | 9/14 [2:29:27<1:27:10, 1046.09s/it]

Round 9 completed. R²: -0.0277, Time: 1149.21s

=== Round 10: 2000-01 to 2020-01 ===
Train size: (190293, 1339), Val size: (22115, 1339), Test size: (11207, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning ElasticNet (alpha only)...



[AsticNet Rounds:  71%|████████████████████████████████████████▋                | 10/14 [2:46:14<1:08:55, 1033.99s/it]

Round 10 completed. R²: -0.0293, Time: 1006.9s

=== Round 11: 2000-01 to 2021-01 ===
Train size: (201347, 1339), Val size: (22268, 1339), Test size: (11729, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning ElasticNet (alpha only)...



[AsticNet Rounds:  79%|██████████████████████████████████████████████▎            | 11/14 [3:06:08<54:08, 1082.90s/it]

Round 11 completed. R²: -0.0104, Time: 1193.79s

=== Round 12: 2000-01 to 2022-01 ===
Train size: (212408, 1339), Val size: (22936, 1339), Test size: (12510, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning ElasticNet (alpha only)...



[AsticNet Rounds:  86%|██████████████████████████████████████████████████▌        | 12/14 [3:27:32<38:08, 1144.34s/it]

Round 12 completed. R²: -0.0076, Time: 1284.86s

=== Round 13: 2000-01 to 2023-01 ===
Train size: (223615, 1339), Val size: (24239, 1339), Test size: (13192, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning ElasticNet (alpha only)...



[AsticNet Rounds:  93%|██████████████████████████████████████████████████████▊    | 13/14 [3:53:59<21:18, 1278.45s/it]

Round 13 completed. R²: -0.0256, Time: 1587.01s

=== Round 14: 2000-01 to 2024-01 ===
Train size: (235344, 1339), Val size: (25702, 1339), Test size: (12327, 1339)
Standardizing features (chunked)...
Standardization complete.
Tuning ElasticNet (alpha only)...



[AsticNet Rounds: 100%|███████████████████████████████████████████████████████████| 14/14 [4:22:40<00:00, 1411.85s/it]

Round 14 completed. R²: -0.0092, Time: 1720.09s
ElasticNet simplified prediction complete.


## Ridge

In [10]:
# === 04_predict_return_ridge_reduced.ipynb ===

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import datetime
import time

# === Load reduced features ===
reduced_features = pd.read_csv("../results/lasso_top_features.csv")
reduced_feature_list = reduced_features.iloc[:, 0].tolist()

# === Chunked Standardization Function ===
def chunk_standardize(df_train, df_val, df_test, columns, chunk_size=100):
    for i in range(0, len(columns), chunk_size):
        subset = columns[i:i+chunk_size]
        scaler = StandardScaler().fit(df_train[subset])
        df_train.loc[:, subset] = scaler.transform(df_train[subset])
        df_val.loc[:, subset] = scaler.transform(df_val[subset])
        df_test.loc[:, subset] = scaler.transform(df_test[subset])
    return df_train, df_val, df_test

# === Step 4: Expanding window Ridge (Reduced) ===
data['date'] = data['yyyymm']
start_round = 0
starting = pd.to_datetime("20000101", format="%Y%m%d")
counter = start_round

print("Starting expanding window prediction (Ridge with reduced features)...")

# Determine total number of rounds
end_date = pd.to_datetime("20240101", format="%Y%m%d")
total_rounds = 0
temp_start = starting
while (temp_start + pd.DateOffset(years=11 + total_rounds)) <= end_date:
    total_rounds += 1

pbar = tqdm(total=total_rounds - start_round, desc="Ridge Rounds")

while (counter < total_rounds):
    round_start_time = time.time()

    cutoff = [
        starting,
        starting + pd.DateOffset(years=8 + counter),
        starting + pd.DateOffset(years=10 + counter),
        starting + pd.DateOffset(years=11 + counter),
    ]

    print(f"\n=== Round {counter + 1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")

    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
    validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])]

    print(f"Train size: {train.shape}, Val size: {validate.shape}, Test size: {test.shape}")

    if test.empty or train.empty or validate.empty:
        print(f"Skipping round {counter + 1} due to empty set.")
        counter += 1
        pbar.update(1)
        continue

    print("Standardizing reduced features (chunked)...")
    train, validate, test = chunk_standardize(train, validate, test, reduced_feature_list, chunk_size=100)
    print("Standardization complete.")

    X_train, Y_train = train[reduced_feature_list].values, train['stock_exret'].values
    X_val, Y_val = validate[reduced_feature_list].values, validate['stock_exret'].values
    X_test, Y_test = test[reduced_feature_list].values, test['stock_exret'].values

    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean

    reg_pred = test[["permno", "yyyymm", "stock_exret"]].copy()

    print("Tuning Ridge alpha...")
    lambdas = np.arange(-1, 8.1, 0.1)
    val_mse = np.zeros(len(lambdas))
    for ind, i in enumerate(tqdm(lambdas, desc="Ridge Alpha Grid")):
        reg = Ridge(alpha=(10**i) * 0.5, fit_intercept=False)
        reg.fit(X_train, Y_train_dm)
        val_mse[ind] = mean_squared_error(Y_val, reg.predict(X_val) + Y_mean)
    best_lambda = lambdas[np.argmin(val_mse)]
    best_alpha = (10**best_lambda) * 0.5

    print(f"Best Ridge alpha: {best_alpha:.6f}")
    reg = Ridge(alpha=best_alpha, fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    reg_pred["ridge"] = reg.predict(X_test) + Y_mean

    r2_ridge = 1 - np.sum((reg_pred["ridge"] - Y_test) ** 2) / np.sum((Y_test - Y_test.mean()) ** 2)

    # Save prediction and r2 per round
    reg_pred.to_csv("../results/ridge_predictions_reduced.csv", mode="a", header=(counter==start_round), index=False)
    pd.DataFrame([{
        "round": counter + 1,
        "start_date": cutoff[0],
        "end_date": cutoff[3],
        "r2_ridge": r2_ridge,
        "best_alpha": best_alpha
    }]).to_csv("../results/ridge_r2_reduced.csv", mode="a", header=(counter==start_round), index=False)

    print(f"Round {counter + 1} completed. R²: {r2_ridge:.4f}, Time: {round(time.time() - round_start_time, 2)}s")
    counter += 1
    pbar.update(1)

print("Ridge (Reduced) prediction complete.")

Starting expanding window prediction (Ridge with reduced features)...



Ridge Rounds:   0%|                                                                             | 0/14 [01:28<?, ?it/s]


=== Round 1: 2000-01 to 2011-01 ===





Train size: (93496, 1339), Val size: (20897, 1339), Test size: (10563, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Tuning Ridge alpha...


Ridge Alpha Grid: 100%|████████████████████████████████████████████████████████████████| 91/91 [00:14<00:00,  6.38it/s]



Best Ridge alpha: 500000.000000
Round 1 completed. R²: -0.0364, Time: 19.07s


[Age Rounds:   7%|████▉                                                                | 1/14 [00:19<04:07, 19.07s/it]


=== Round 2: 2000-01 to 2012-01 ===
Train size: (103876, 1339), Val size: (21080, 1339), Test size: (10458, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Tuning Ridge alpha...


Ridge Alpha Grid: 100%|████████████████████████████████████████████████████████████████| 91/91 [00:13<00:00,  6.86it/s]



Best Ridge alpha: 50000000.000000
Round 2 completed. R²: -0.0004, Time: 19.11s


[Age Rounds:  14%|█████████▊                                                           | 2/14 [00:38<03:49, 19.10s/it]


=== Round 3: 2000-01 to 2013-01 ===
Train size: (114393, 1339), Val size: (21021, 1339), Test size: (10556, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Tuning Ridge alpha...


Ridge Alpha Grid: 100%|████████████████████████████████████████████████████████████████| 91/91 [00:14<00:00,  6.37it/s]



Best Ridge alpha: 7924465.962306
Round 3 completed. R²: -0.0192, Time: 20.22s


[Age Rounds:  21%|██████████████▊                                                      | 3/14 [00:58<03:35, 19.61s/it]


=== Round 4: 2000-01 to 2014-01 ===
Train size: (124956, 1339), Val size: (21014, 1339), Test size: (10617, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Tuning Ridge alpha...


Ridge Alpha Grid: 100%|████████████████████████████████████████████████████████████████| 91/91 [00:16<00:00,  5.64it/s]


Best Ridge alpha: 3971641.173621



[Age Rounds:  29%|███████████████████▋                                                 | 4/14 [01:20<03:26, 20.65s/it]

Round 4 completed. R²: -0.1129, Time: 22.25s

=== Round 5: 2000-01 to 2015-01 ===
Train size: (135414, 1339), Val size: (21173, 1339), Test size: (10974, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Tuning Ridge alpha...


Ridge Alpha Grid: 100%|████████████████████████████████████████████████████████████████| 91/91 [00:18<00:00,  4.83it/s]


Best Ridge alpha: 6294627.058971



[Age Rounds:  36%|████████████████████████▋                                            | 5/14 [01:50<03:34, 23.82s/it]

Round 5 completed. R²: -0.0039, Time: 29.45s

=== Round 6: 2000-01 to 2016-01 ===
Train size: (145970, 1339), Val size: (21591, 1339), Test size: (11459, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Tuning Ridge alpha...


Ridge Alpha Grid: 100%|████████████████████████████████████████████████████████████████| 91/91 [00:21<00:00,  4.25it/s]


Best Ridge alpha: 3971641.173621



[Age Rounds:  43%|█████████████████████████████▌                                       | 6/14 [02:21<03:30, 26.30s/it]

Round 6 completed. R²: -0.0056, Time: 31.1s

=== Round 7: 2000-01 to 2017-01 ===
Train size: (156587, 1339), Val size: (22433, 1339), Test size: (11273, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Tuning Ridge alpha...


Ridge Alpha Grid: 100%|████████████████████████████████████████████████████████████████| 91/91 [00:20<00:00,  4.48it/s]


Best Ridge alpha: 9976311.574844



[Age Rounds:  50%|██████████████████████████████████▌                                  | 7/14 [02:53<03:17, 28.23s/it]

Round 7 completed. R²: -0.0043, Time: 32.2s

=== Round 8: 2000-01 to 2018-01 ===
Train size: (167561, 1339), Val size: (22732, 1339), Test size: (11054, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Tuning Ridge alpha...


Ridge Alpha Grid: 100%|████████████████████████████████████████████████████████████████| 91/91 [00:20<00:00,  4.46it/s]


Best Ridge alpha: 3971641.173621



[Age Rounds:  57%|███████████████████████████████████████▍                             | 8/14 [03:28<03:02, 30.40s/it]

Round 8 completed. R²: -0.0165, Time: 35.04s

=== Round 9: 2000-01 to 2019-01 ===
Train size: (179020, 1339), Val size: (22327, 1339), Test size: (11061, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Tuning Ridge alpha...


Ridge Alpha Grid: 100%|████████████████████████████████████████████████████████████████| 91/91 [00:24<00:00,  3.73it/s]


Best Ridge alpha: 1990535.852767



[Age Rounds:  64%|████████████████████████████████████████████▎                        | 9/14 [04:07<02:45, 33.15s/it]

Round 9 completed. R²: -0.0258, Time: 39.19s

=== Round 10: 2000-01 to 2020-01 ===
Train size: (190293, 1339), Val size: (22115, 1339), Test size: (11207, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Tuning Ridge alpha...


Ridge Alpha Grid: 100%|████████████████████████████████████████████████████████████████| 91/91 [00:26<00:00,  3.41it/s]


Best Ridge alpha: 50000000.000000



[Age Rounds:  71%|████████████████████████████████████████████████▌                   | 10/14 [05:01<02:38, 39.67s/it]

Round 10 completed. R²: -0.0293, Time: 54.26s

=== Round 11: 2000-01 to 2021-01 ===
Train size: (201347, 1339), Val size: (22268, 1339), Test size: (11729, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Tuning Ridge alpha...


Ridge Alpha Grid: 100%|████████████████████████████████████████████████████████████████| 91/91 [00:28<00:00,  3.16it/s]


Best Ridge alpha: 50000000.000000



[Age Rounds:  79%|█████████████████████████████████████████████████████▍              | 11/14 [05:52<02:09, 43.10s/it]

Round 11 completed. R²: -0.0107, Time: 50.88s

=== Round 12: 2000-01 to 2022-01 ===
Train size: (212408, 1339), Val size: (22936, 1339), Test size: (12510, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Tuning Ridge alpha...


Ridge Alpha Grid: 100%|████████████████████████████████████████████████████████████████| 91/91 [00:33<00:00,  2.70it/s]


Best Ridge alpha: 50000000.000000



[Age Rounds:  86%|██████████████████████████████████████████████████████████▎         | 12/14 [06:50<01:34, 47.42s/it]

Round 12 completed. R²: -0.0068, Time: 57.28s

=== Round 13: 2000-01 to 2023-01 ===
Train size: (223615, 1339), Val size: (24239, 1339), Test size: (13192, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Tuning Ridge alpha...


Ridge Alpha Grid: 100%|████████████████████████████████████████████████████████████████| 91/91 [00:30<00:00,  2.97it/s]


Best Ridge alpha: 315478.672240



[Age Rounds:  93%|███████████████████████████████████████████████████████████████▏    | 13/14 [07:52<00:52, 52.02s/it]

Round 13 completed. R²: -0.0246, Time: 62.62s

=== Round 14: 2000-01 to 2024-01 ===
Train size: (235344, 1339), Val size: (25702, 1339), Test size: (12327, 1339)
Standardizing reduced features (chunked)...
Standardization complete.
Tuning Ridge alpha...


Ridge Alpha Grid: 100%|████████████████████████████████████████████████████████████████| 91/91 [00:36<00:00,  2.51it/s]


Best Ridge alpha: 500000.000000



[Age Rounds: 100%|████████████████████████████████████████████████████████████████████| 14/14 [09:04<00:00, 58.10s/it]

Round 14 completed. R²: -0.0038, Time: 72.14s
Ridge (Reduced) prediction complete.


In [7]:
# === Step 4: Expanding window model training and prediction ===
data['date'] = data['yyyymm']
starting = pd.to_datetime("20000101", format="%Y%m%d")
counter = 0
pred_out = pd.DataFrame()
r2_records = []

print("Starting expanding window prediction...")

while (starting + pd.DateOffset(years=11 + counter)) <= pd.to_datetime("20240101", format="%Y%m%d"):
    print(f"\n=== Round {counter + 1}: {starting.strftime('%Y-%m')} to {(starting + pd.DateOffset(years=11 + counter)).strftime('%Y-%m')} ===")

    cutoff = [
        starting,
        starting + pd.DateOffset(years=8 + counter),
        starting + pd.DateOffset(years=10 + counter),
        starting + pd.DateOffset(years=11 + counter),
    ]

    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
    validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])]

    print(f"Train size: {train.shape}, Val size: {validate.shape}, Test size: {test.shape}")

    if test.empty or train.empty or validate.empty:
        print(f"Skipping round {counter + 1} due to empty set.")
        counter += 1
        continue

    print("Standardizing features...")
    scaler = StandardScaler().fit(train[all_features])
    train.loc[:, all_features] = scaler.transform(train[all_features])
    validate.loc[:, all_features] = scaler.transform(validate[all_features])
    test.loc[:, all_features] = scaler.transform(test[all_features])
    print("Standardization complete.")

    X_train, Y_train = train[all_features].values, train['stock_exret'].values
    X_val, Y_val = validate[all_features].values, validate['stock_exret'].values
    X_test, Y_test = test[all_features].values, test['stock_exret'].values

    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean

    reg_pred = test[["permno", "yyyymm", "stock_exret"]].copy()

    # === OLS ===
    print("Training OLS...")
    reg = LinearRegression(fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    reg_pred["ols"] = reg.predict(X_test) + Y_mean
    r2_ols = 1 - np.sum((reg_pred["ols"] - Y_test) ** 2) / np.sum((Y_test - Y_test.mean()) ** 2)

    # === Lasso ===
    print("Tuning Lasso alpha...")
    lambdas = np.arange(-4, 4.1, 0.1)
    val_mse = np.zeros(len(lambdas))
    for ind, i in enumerate(tqdm(lambdas, desc="Lasso alpha")):
        reg = Lasso(alpha=10**i, max_iter=1000000, fit_intercept=False)
        reg.fit(X_train, Y_train_dm)
        val_mse[ind] = mean_squared_error(Y_val, reg.predict(X_val) + Y_mean)
    best_lambda = lambdas[np.argmin(val_mse)]
    reg = Lasso(alpha=10**best_lambda, max_iter=1000000, fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    reg_pred["lasso"] = reg.predict(X_test) + Y_mean
    r2_lasso = 1 - np.sum((reg_pred["lasso"] - Y_test) ** 2) / np.sum((Y_test - Y_test.mean()) ** 2)

    # === Ridge ===
    print("Tuning Ridge alpha...")
    lambdas = np.arange(-1, 8.1, 0.1)
    val_mse = np.zeros(len(lambdas))
    for ind, i in enumerate(tqdm(lambdas, desc="Ridge alpha")):
        reg = Ridge(alpha=(10**i) * 0.5, fit_intercept=False)
        reg.fit(X_train, Y_train_dm)
        val_mse[ind] = mean_squared_error(Y_val, reg.predict(X_val) + Y_mean)
    best_lambda = lambdas[np.argmin(val_mse)]
    reg = Ridge(alpha=(10**best_lambda) * 0.5, fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    reg_pred["ridge"] = reg.predict(X_test) + Y_mean
    r2_ridge = 1 - np.sum((reg_pred["ridge"] - Y_test) ** 2) / np.sum((Y_test - Y_test.mean()) ** 2)

    # === ElasticNet (with early stopping and reduced grid) ===
    print("Tuning ElasticNet alpha (fast mode)...")
    lambdas = np.arange(-3, 2.25, 0.25)  # reduced grid
    val_mse = np.zeros(len(lambdas))
    for ind, i in enumerate(tqdm(lambdas, desc="ElasticNet alpha")):
        reg = ElasticNet(alpha=10**i, max_iter=5000, tol=1e-3, fit_intercept=False)
        reg.fit(X_train, Y_train_dm)
        val_mse[ind] = mean_squared_error(Y_val, reg.predict(X_val) + Y_mean)
    best_lambda = lambdas[np.argmin(val_mse)]
    reg = ElasticNet(alpha=10**best_lambda, max_iter=5000, tol=1e-3, fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    reg_pred["en"] = reg.predict(X_test) + Y_mean
    r2_en = 1 - np.sum((reg_pred["en"] - Y_test) ** 2) / np.sum((Y_test - Y_test.mean()) ** 2)

    # Save R^2 per model per round
    r2_records.append({
        "round": counter + 1,
        "start_date": cutoff[0],
        "end_date": cutoff[3],
        "r2_ols": r2_ols,
        "r2_lasso": r2_lasso,
        "r2_ridge": r2_ridge,
        "r2_en": r2_en
    })

    pred_out = pd.concat([pred_out, reg_pred], ignore_index=True)
    counter += 1

Starting expanding window prediction...

=== Round 1: 2000-01 to 2011-01 ===
Train size: (93496, 1339), Val size: (20897, 1339), Test size: (10563, 1339)
Standardizing features...
Standardization complete.
Training OLS...
Tuning Lasso alpha...


Lasso alpha: 100%|█████████████████████████████████████████████████████████████████████| 81/81 [06:45<00:00,  5.01s/it]


Tuning Ridge alpha...


Ridge alpha: 100%|█████████████████████████████████████████████████████████████████████| 91/91 [04:18<00:00,  2.84s/it]


Tuning ElasticNet alpha (fast mode)...


ElasticNet alpha: 100%|████████████████████████████████████████████████████████████████| 21/21 [05:32<00:00, 15.84s/it]



=== Round 2: 2000-01 to 2012-01 ===
Train size: (103876, 1339), Val size: (21080, 1339), Test size: (10458, 1339)
Standardizing features...
Standardization complete.
Training OLS...
Tuning Lasso alpha...


Lasso alpha: 100%|█████████████████████████████████████████████████████████████████████| 81/81 [07:15<00:00,  5.38s/it]


Tuning Ridge alpha...


Ridge alpha: 100%|█████████████████████████████████████████████████████████████████████| 91/91 [04:42<00:00,  3.11s/it]


Tuning ElasticNet alpha (fast mode)...


ElasticNet alpha: 100%|████████████████████████████████████████████████████████████████| 21/21 [08:05<00:00, 23.11s/it]



=== Round 3: 2000-01 to 2013-01 ===
Train size: (114393, 1339), Val size: (21021, 1339), Test size: (10556, 1339)
Standardizing features...
Standardization complete.
Training OLS...
Tuning Lasso alpha...


Lasso alpha: 100%|█████████████████████████████████████████████████████████████████████| 81/81 [08:20<00:00,  6.17s/it]


Tuning Ridge alpha...


Ridge alpha: 100%|█████████████████████████████████████████████████████████████████████| 91/91 [05:21<00:00,  3.53s/it]


Tuning ElasticNet alpha (fast mode)...


ElasticNet alpha: 100%|████████████████████████████████████████████████████████████████| 21/21 [07:00<00:00, 20.01s/it]



=== Round 4: 2000-01 to 2014-01 ===
Train size: (124956, 1339), Val size: (21014, 1339), Test size: (10617, 1339)
Standardizing features...
Standardization complete.
Training OLS...
Tuning Lasso alpha...


Lasso alpha: 100%|█████████████████████████████████████████████████████████████████████| 81/81 [09:27<00:00,  7.01s/it]


Tuning Ridge alpha...


Ridge alpha: 100%|█████████████████████████████████████████████████████████████████████| 91/91 [05:49<00:00,  3.84s/it]


Tuning ElasticNet alpha (fast mode)...


ElasticNet alpha: 100%|████████████████████████████████████████████████████████████████| 21/21 [07:28<00:00, 21.36s/it]



=== Round 5: 2000-01 to 2015-01 ===
Train size: (135414, 1339), Val size: (21173, 1339), Test size: (10974, 1339)
Standardizing features...
Standardization complete.
Training OLS...
Tuning Lasso alpha...


Lasso alpha: 100%|█████████████████████████████████████████████████████████████████████| 81/81 [09:04<00:00,  6.72s/it]


Tuning Ridge alpha...


Ridge alpha: 100%|█████████████████████████████████████████████████████████████████████| 91/91 [06:39<00:00,  4.39s/it]


Tuning ElasticNet alpha (fast mode)...


ElasticNet alpha: 100%|████████████████████████████████████████████████████████████████| 21/21 [08:11<00:00, 23.41s/it]



=== Round 6: 2000-01 to 2016-01 ===
Train size: (145970, 1339), Val size: (21591, 1339), Test size: (11459, 1339)
Standardizing features...
Standardization complete.
Training OLS...
Tuning Lasso alpha...


Lasso alpha: 100%|█████████████████████████████████████████████████████████████████████| 81/81 [08:35<00:00,  6.36s/it]


Tuning Ridge alpha...


Ridge alpha: 100%|█████████████████████████████████████████████████████████████████████| 91/91 [06:26<00:00,  4.24s/it]


Tuning ElasticNet alpha (fast mode)...


ElasticNet alpha: 100%|████████████████████████████████████████████████████████████████| 21/21 [07:52<00:00, 22.52s/it]



=== Round 7: 2000-01 to 2017-01 ===
Train size: (156587, 1339), Val size: (22433, 1339), Test size: (11273, 1339)
Standardizing features...
Standardization complete.
Training OLS...
Tuning Lasso alpha...


Lasso alpha: 100%|█████████████████████████████████████████████████████████████████████| 81/81 [08:55<00:00,  6.61s/it]


Tuning Ridge alpha...


Ridge alpha: 100%|█████████████████████████████████████████████████████████████████████| 91/91 [07:11<00:00,  4.74s/it]


Tuning ElasticNet alpha (fast mode)...


ElasticNet alpha: 100%|████████████████████████████████████████████████████████████████| 21/21 [08:47<00:00, 25.10s/it]



=== Round 8: 2000-01 to 2018-01 ===
Train size: (167561, 1339), Val size: (22732, 1339), Test size: (11054, 1339)
Standardizing features...
Standardization complete.
Training OLS...
Tuning Lasso alpha...


Lasso alpha: 100%|█████████████████████████████████████████████████████████████████████| 81/81 [08:18<00:00,  6.16s/it]


Tuning Ridge alpha...


Ridge alpha: 100%|█████████████████████████████████████████████████████████████████████| 91/91 [07:29<00:00,  4.94s/it]


Tuning ElasticNet alpha (fast mode)...


ElasticNet alpha: 100%|████████████████████████████████████████████████████████████████| 21/21 [09:04<00:00, 25.94s/it]



=== Round 9: 2000-01 to 2019-01 ===
Train size: (179020, 1339), Val size: (22327, 1339), Test size: (11061, 1339)
Standardizing features...
Standardization complete.
Training OLS...
Tuning Lasso alpha...


Lasso alpha: 100%|█████████████████████████████████████████████████████████████████████| 81/81 [09:57<00:00,  7.38s/it]


Tuning Ridge alpha...


Ridge alpha: 100%|█████████████████████████████████████████████████████████████████████| 91/91 [08:06<00:00,  5.34s/it]


Tuning ElasticNet alpha (fast mode)...


ElasticNet alpha: 100%|████████████████████████████████████████████████████████████████| 21/21 [07:41<00:00, 21.96s/it]



=== Round 10: 2000-01 to 2020-01 ===
Train size: (190293, 1339), Val size: (22115, 1339), Test size: (11207, 1339)
Standardizing features...
Standardization complete.
Training OLS...
Tuning Lasso alpha...


Lasso alpha: 100%|█████████████████████████████████████████████████████████████████████| 81/81 [11:03<00:00,  8.19s/it]


Tuning Ridge alpha...


Ridge alpha: 100%|█████████████████████████████████████████████████████████████████████| 91/91 [08:58<00:00,  5.92s/it]


Tuning ElasticNet alpha (fast mode)...


ElasticNet alpha: 100%|████████████████████████████████████████████████████████████████| 21/21 [07:51<00:00, 22.44s/it]



=== Round 11: 2000-01 to 2021-01 ===
Train size: (201347, 1339), Val size: (22268, 1339), Test size: (11729, 1339)
Standardizing features...


MemoryError: Unable to allocate 1.99 GiB for an array with shape (201347, 1324) and data type float64

In [None]:
# Save predictions and R^2 records
pred_out.to_csv("../data/results/stock_return_predictions.csv", index=False)
pd.DataFrame(r2_records).to_csv("../data/processed/model_r2_each_round.csv", index=False)