In [1]:
# === Autoencoder-Based Stock Return Prediction ===
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import time
import os

In [5]:
# === Load data ===
firm = pd.read_csv("mma_sample_v2.csv", parse_dates=["date", "ret_eom"])

# Convert ret_eom (return date) to yyyymm for alignment
firm["yyyymm"] = firm["ret_eom"].dt.to_period("M").dt.to_timestamp()
firm["year"] = firm["yyyymm"].dt.year
firm["month"] = firm["yyyymm"].dt.month

# === Load macro data and construct interactions ===
macro = pd.read_csv("macro_monthly.csv")
macro["yyyymm"] = pd.to_datetime(macro["yyyymm"].astype(str), format="%Y%m")

# === Construct interaction terms ===
factor_list = pd.read_csv("factor_char_list.csv")["variable"].tolist()

In [7]:
# Filter firm data to include only relevant features
firm = firm[["permno", "yyyymm", "stock_exret", "year", "month"] + factor_list]

# Drop observations with missing target value
firm = firm[firm["stock_exret"].notna()].copy()

# Merge macro variables
df_full = firm.merge(macro, on="yyyymm", how="left")

In [8]:
# === Set seed ===
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [9]:
# === Construct interaction terms ===
macro_vars = ['dp', 'ep', 'ntis', 'svar', 'dfy', 'tms', 'tbl']
for m in macro_vars:
    for f in factor_list:
        col_name = f"{f}_{m}"
        df_full[col_name] = df_full[f] * df_full[m]

interaction_features = [f"{f}_{m}" for m in macro_vars for f in factor_list]
full_features = factor_list + interaction_features

  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] *

In [12]:
# === Fill missing values ===
df_full[full_features] = df_full[full_features].fillna(0)

In [13]:
# === Autoencoder definition ===
class AE(nn.Module):
    def __init__(self, input_dim, hidden_dim=32):
        super(AE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, hidden_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat, z

In [14]:
# === Expand-window prediction ===
start = pd.to_datetime("20000101", format="%Y%m%d")
end = pd.to_datetime("20240101", format="%Y%m%d")
counter = 0
results = []

while start + pd.DateOffset(years=11 + counter) <= end:
    cutoff = [
        start,
        start + pd.DateOffset(years=8 + counter),
        start + pd.DateOffset(years=10 + counter),
        start + pd.DateOffset(years=11 + counter)
    ]

    print(f"\n=== Round {counter+1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")
    train = df_full[(df_full['yyyymm'] >= cutoff[0]) & (df_full['yyyymm'] < cutoff[1])].copy()
    val   = df_full[(df_full['yyyymm'] >= cutoff[1]) & (df_full['yyyymm'] < cutoff[2])].copy()
    test  = df_full[(df_full['yyyymm'] >= cutoff[2]) & (df_full['yyyymm'] < cutoff[3])].copy()

    if train.empty or val.empty or test.empty:
        print("Skipping due to empty split.")
        counter += 1
        continue

    scaler = StandardScaler()
    train[full_features] = scaler.fit_transform(train[full_features])
    val[full_features] = scaler.transform(val[full_features])
    test[full_features] = scaler.transform(test[full_features])

    X_train = torch.tensor(train[full_features].values, dtype=torch.float32)
    X_val = torch.tensor(val[full_features].values, dtype=torch.float32)
    X_test = torch.tensor(test[full_features].values, dtype=torch.float32)

    ae_model = AE(len(full_features))
    optimizer = torch.optim.Adam(ae_model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    ae_model.train()
    for epoch in range(50):
        optimizer.zero_grad()
        x_hat, _ = ae_model(X_train)
        loss = loss_fn(x_hat, X_train)
        loss.backward()
        optimizer.step()

    ae_model.eval()
    with torch.no_grad():
        _, Z_train = ae_model(X_train)
        _, Z_val = ae_model(X_val)
        _, Z_test = ae_model(X_test)

    reg = LinearRegression()
    y_train = train['stock_exret'].values
    y_val = val['stock_exret'].values
    y_test = test['stock_exret'].values
    y_mean = y_train.mean()
    reg.fit(Z_train, y_train - y_mean)
    y_pred = reg.predict(Z_test) + y_mean

    r2 = r2_score(y_test, y_pred)
    pred_df = test[['permno', 'yyyymm', 'stock_exret']].copy()
    pred_df['ae'] = y_pred

    if not os.path.exists("ae_predictions.csv"):
        pred_df.to_csv("ae_predictions.csv", index=False)
    else:
        pred_df.to_csv("ae_predictions.csv", mode="a", header=False, index=False)

    pd.DataFrame([{
        "round": counter + 1,
        "start_date": cutoff[0],
        "end_date": cutoff[3],
        "r2_ae": r2
    }]).to_csv("ae_r2.csv", mode="a", header=(counter == 0), index=False)

    print(f"Round {counter+1} completed. R²: {r2:.4f}")
    counter += 1

print("Autoencoder expanding window prediction complete.")



=== Round 1: 2000-01 to 2011-01 ===
Round 1 completed. R²: -0.0087

=== Round 2: 2000-01 to 2012-01 ===
Round 2 completed. R²: 0.0163

=== Round 3: 2000-01 to 2013-01 ===
Round 3 completed. R²: -0.0049

=== Round 4: 2000-01 to 2014-01 ===
Round 4 completed. R²: -0.1057

=== Round 5: 2000-01 to 2015-01 ===
Round 5 completed. R²: -0.0147

=== Round 6: 2000-01 to 2016-01 ===
Round 6 completed. R²: -20.3585

=== Round 7: 2000-01 to 2017-01 ===
Round 7 completed. R²: -0.3638

=== Round 8: 2000-01 to 2018-01 ===
Round 8 completed. R²: -0.0314

=== Round 9: 2000-01 to 2019-01 ===
Round 9 completed. R²: -0.0770

=== Round 10: 2000-01 to 2020-01 ===
Round 10 completed. R²: -0.0247

=== Round 11: 2000-01 to 2021-01 ===
Round 11 completed. R²: 0.0379

=== Round 12: 2000-01 to 2022-01 ===
Round 12 completed. R²: -0.7364

=== Round 13: 2000-01 to 2023-01 ===
Round 13 completed. R²: -2.3956

=== Round 14: 2000-01 to 2024-01 ===


: 

In [1]:
# === Autoencoder-Based Stock Return Prediction ===
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import time
import os

# === Set seed ===
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# === Load data ===
firm = pd.read_csv("mma_sample_v2.csv", parse_dates=["date", "ret_eom"])
firm["yyyymm"] = firm["ret_eom"].dt.to_period("M").dt.to_timestamp()
firm["year"] = firm["yyyymm"].dt.year
firm["month"] = firm["yyyymm"].dt.month

macro = pd.read_csv("macro_monthly.csv")
macro["yyyymm"] = pd.to_datetime(macro["yyyymm"].astype(str), format="%Y%m")

factor_list = pd.read_csv("factor_char_list.csv")["variable"].tolist()
firm = firm[["permno", "yyyymm", "stock_exret", "year", "month"] + factor_list]
firm = firm[firm["stock_exret"].notna()].copy()
df_full = firm.merge(macro, on="yyyymm", how="left")

# === Construct interaction terms ===
macro_vars = ['dp', 'ep', 'ntis', 'svar', 'dfy', 'tms', 'tbl']
for m in macro_vars:
    for f in factor_list:
        col_name = f"{f}_{m}"
        df_full[col_name] = df_full[f] * df_full[m]

interaction_features = [f"{f}_{m}" for m in macro_vars for f in factor_list]
full_features = factor_list + interaction_features

# === Autoencoder definition ===
class AE(nn.Module):
    def __init__(self, input_dim, hidden_dim=32):
        super(AE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, hidden_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat, z

# === Expand-window prediction ===
start = pd.to_datetime("20000101", format="%Y%m%d")
end = pd.to_datetime("20240101", format="%Y%m%d")
counter = 0
results = []

while start + pd.DateOffset(years=11 + counter) <= end:
    cutoff = [
        start,
        start + pd.DateOffset(years=8 + counter),
        start + pd.DateOffset(years=10 + counter),
        start + pd.DateOffset(years=11 + counter)
    ]

    print(f"\n=== Round {counter+1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")
    train = df_full[(df_full['yyyymm'] >= cutoff[0]) & (df_full['yyyymm'] < cutoff[1])].copy()
    val   = df_full[(df_full['yyyymm'] >= cutoff[1]) & (df_full['yyyymm'] < cutoff[2])].copy()
    test  = df_full[(df_full['yyyymm'] >= cutoff[2]) & (df_full['yyyymm'] < cutoff[3])].copy()

    if train.empty or val.empty or test.empty:
        print("Skipping due to empty split.")
        counter += 1
        continue

    # === Fill missing values using train mean ===
    train_mean = train[full_features].mean()
    train[full_features] = train[full_features].fillna(train_mean)
    val[full_features] = val[full_features].fillna(train_mean)
    test[full_features] = test[full_features].fillna(train_mean)

    scaler = StandardScaler()
    train[full_features] = scaler.fit_transform(train[full_features])
    val[full_features] = scaler.transform(val[full_features])
    test[full_features] = scaler.transform(test[full_features])

    X_train = torch.tensor(train[full_features].values, dtype=torch.float32)
    X_val = torch.tensor(val[full_features].values, dtype=torch.float32)
    X_test = torch.tensor(test[full_features].values, dtype=torch.float32)

    ae_model = AE(len(full_features))
    optimizer = torch.optim.Adam(ae_model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    ae_model.train()
    for epoch in range(50):
        optimizer.zero_grad()
        x_hat, _ = ae_model(X_train)
        loss = loss_fn(x_hat, X_train)
        loss.backward()
        optimizer.step()

    ae_model.eval()
    with torch.no_grad():
        _, Z_train = ae_model(X_train)
        _, Z_val = ae_model(X_val)
        _, Z_test = ae_model(X_test)

    reg = LinearRegression()
    y_train = train['stock_exret'].values
    y_val = val['stock_exret'].values
    y_test = test['stock_exret'].values
    y_mean = y_train.mean()
    reg.fit(Z_train, y_train - y_mean)
    y_pred = reg.predict(Z_test) + y_mean

    r2 = r2_score(y_test, y_pred)
    pred_df = test[['permno', 'yyyymm', 'stock_exret']].copy()
    pred_df['ae'] = y_pred

    if not os.path.exists("ae_predictions_v2.csv"):
        pred_df.to_csv("ae_predictions_v2.csv", index=False)
    else:
        pred_df.to_csv("ae_predictions_v2.csv", mode="a", header=False, index=False)

    pd.DataFrame([{
        "round": counter + 1,
        "start_date": cutoff[0],
        "end_date": cutoff[3],
        "r2_ae": r2
    }]).to_csv("ae_r2_v2.csv", mode="a", header=(counter == 0), index=False)

    print(f"Round {counter+1} completed. R²: {r2:.4f}")
    counter += 1

print("Autoencoder expanding window prediction complete.")

  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] *


=== Round 1: 2000-01 to 2011-01 ===
Round 1 completed. R²: 0.0160

=== Round 2: 2000-01 to 2012-01 ===
Round 2 completed. R²: 0.0158

=== Round 3: 2000-01 to 2013-01 ===
Round 3 completed. R²: -0.0056

=== Round 4: 2000-01 to 2014-01 ===
Round 4 completed. R²: -0.0508

=== Round 5: 2000-01 to 2015-01 ===
Round 5 completed. R²: -0.0628

=== Round 6: 2000-01 to 2016-01 ===
Round 6 completed. R²: -10.4197

=== Round 7: 2000-01 to 2017-01 ===
Round 7 completed. R²: -0.2247

=== Round 8: 2000-01 to 2018-01 ===
Round 8 completed. R²: -1.4882

=== Round 9: 2000-01 to 2019-01 ===
Round 9 completed. R²: -0.5720

=== Round 10: 2000-01 to 2020-01 ===
Round 10 completed. R²: -0.0427

=== Round 11: 2000-01 to 2021-01 ===
Round 11 completed. R²: 0.0518

=== Round 12: 2000-01 to 2022-01 ===
Round 12 completed. R²: -4.5653

=== Round 13: 2000-01 to 2023-01 ===
Round 13 completed. R²: -2.1493

=== Round 14: 2000-01 to 2024-01 ===
Round 14 completed. R²: -0.0117
Autoencoder expanding window prediction 

In [5]:
# === Autoencoder-Based Stock Return Prediction (Stable Version) ===
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import r2_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import time
import os

# === Set seed ===
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# === Load data ===
firm = pd.read_csv("mma_sample_v2.csv", parse_dates=["date", "ret_eom"])
firm["yyyymm"] = firm["ret_eom"].dt.to_period("M").dt.to_timestamp()
firm["year"] = firm["yyyymm"].dt.year
firm["month"] = firm["yyyymm"].dt.month

macro = pd.read_csv("macro_monthly.csv")
macro["yyyymm"] = pd.to_datetime(macro["yyyymm"].astype(str), format="%Y%m")

factor_list = pd.read_csv("factor_char_list.csv")["variable"].tolist()
firm = firm[["permno", "yyyymm", "stock_exret", "year", "month"] + factor_list]
firm = firm[firm["stock_exret"].notna()].copy()
df_full = firm.merge(macro, on="yyyymm", how="left")

# === Construct interaction terms ===
macro_vars = ['dp', 'ep', 'ntis', 'svar', 'dfy', 'tms', 'tbl']
for m in macro_vars:
    for f in factor_list:
        col_name = f"{f}_{m}"
        df_full[col_name] = df_full[f] * df_full[m]

interaction_features = [f"{f}_{m}" for m in macro_vars for f in factor_list]
full_features = factor_list + interaction_features

# === Autoencoder definition (Stable) ===
class AE(nn.Module):
    def __init__(self, input_dim, hidden_dim=16):
        super(AE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, hidden_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat, z

# === Expand-window prediction ===
start = pd.to_datetime("20000101", format="%Y%m%d")
end = pd.to_datetime("20240101", format="%Y%m%d")
counter = 0
results = []

# === Expand-window prediction ===
start = pd.to_datetime("20000101", format="%Y%m%d")
end = pd.to_datetime("20240101", format="%Y%m%d")
counter = 0
results = []

while start + pd.DateOffset(years=11 + counter) <= end:
    cutoff = [
        start,
        start + pd.DateOffset(years=8 + counter),
        start + pd.DateOffset(years=10 + counter),
        start + pd.DateOffset(years=11 + counter)
    ]

    print(f"\n=== Round {counter+1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")
    train = df_full[(df_full['yyyymm'] >= cutoff[0]) & (df_full['yyyymm'] < cutoff[1])].copy()
    val   = df_full[(df_full['yyyymm'] >= cutoff[1]) & (df_full['yyyymm'] < cutoff[2])].copy()
    test  = df_full[(df_full['yyyymm'] >= cutoff[2]) & (df_full['yyyymm'] < cutoff[3])].copy()

    if train.empty or val.empty or test.empty:
        print("Skipping due to empty split.")
        counter += 1
        continue

    train_mean = train[full_features].mean()
    train[full_features] = train[full_features].fillna(train_mean)
    val[full_features] = val[full_features].fillna(train_mean)
    test[full_features] = test[full_features].fillna(train_mean)

    scaler = StandardScaler()
    train[full_features] = scaler.fit_transform(train[full_features])
    val[full_features] = scaler.transform(val[full_features])
    test[full_features] = scaler.transform(test[full_features])

    X_train = torch.tensor(train[full_features].values, dtype=torch.float32)
    X_val = torch.tensor(val[full_features].values, dtype=torch.float32)
    X_test = torch.tensor(test[full_features].values, dtype=torch.float32)

    ae_model = AE(len(full_features))
    optimizer = torch.optim.Adam(ae_model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    best_loss = float('inf')
    patience = 5
    wait = 0
    best_model_state = None

    for epoch in range(50):
        ae_model.train()
        optimizer.zero_grad()
        x_hat, _ = ae_model(X_train)
        loss = loss_fn(x_hat, X_train)
        loss.backward()
        optimizer.step()

        ae_model.eval()
        with torch.no_grad():
            x_val_hat, _ = ae_model(X_val)
            val_loss = loss_fn(x_val_hat, X_val).item()

        if val_loss < best_loss:
            best_loss = val_loss
            wait = 0
            best_model_state = ae_model.state_dict()
        else:
            wait += 1
            if wait >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    ae_model.load_state_dict(best_model_state)

    ae_model.eval()
    with torch.no_grad():
        _, Z_train = ae_model(X_train)
        _, Z_val = ae_model(X_val)
        _, Z_test = ae_model(X_test)

    reg = Ridge(alpha=1.0)
    y_train = train['stock_exret'].values
    y_val = val['stock_exret'].values
    y_test = test['stock_exret'].values
    y_mean = y_train.mean()
    reg.fit(Z_train, y_train - y_mean)
    y_pred = reg.predict(Z_test) + y_mean

    r2 = r2_score(y_test, y_pred)

    # === Fallback using correctly scaled full_features for OLS ===
    if np.abs(r2) > 5:
        print("AE failed. Fallback to OLS prediction (zero fill).")
        fallback_train = train[full_features].copy().fillna(0)
        fallback_test = test[full_features].copy().fillna(0)
        raw_scaler = StandardScaler()
        raw_X_train = raw_scaler.fit_transform(fallback_train)
        raw_X_test = raw_scaler.transform(fallback_test)
        reg = LinearRegression()
        reg.fit(raw_X_train, y_train - y_mean)
        y_pred = reg.predict(raw_X_test) + y_mean
        r2 = r2_score(y_test, y_pred)

    pred_df = test[['permno', 'yyyymm', 'stock_exret']].copy()
    pred_df['ae'] = y_pred

    if not os.path.exists("ae_improved_predictions.csv"):
        pred_df.to_csv("ae_improved_predictions.csv", index=False)
    else:
        pred_df.to_csv("ae_improved_predictions.csv", mode="a", header=False, index=False)

    pd.DataFrame([{
        "round": counter + 1,
        "start_date": cutoff[0],
        "end_date": cutoff[3],
        "r2_ae": r2
    }]).to_csv("ae_improved_r2.csv", mode="a", header=(counter == 0), index=False)

    print(f"Round {counter+1} completed. R²: {r2:.4f}")
    counter += 1

print("Stable Autoencoder prediction complete.")


  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] * df_full[m]
  df_full[col_name] = df_full[f] *


=== Round 1: 2000-01 to 2011-01 ===
Round 1 completed. R²: 0.0096

=== Round 2: 2000-01 to 2012-01 ===
Round 2 completed. R²: 0.0113

=== Round 3: 2000-01 to 2013-01 ===
Round 3 completed. R²: -0.0072

=== Round 4: 2000-01 to 2014-01 ===
Round 4 completed. R²: -0.0469

=== Round 5: 2000-01 to 2015-01 ===
Round 5 completed. R²: -0.0390

=== Round 6: 2000-01 to 2016-01 ===
AE failed. Fallback to OLS prediction (zero fill).
Round 6 completed. R²: -1198.6749

=== Round 7: 2000-01 to 2017-01 ===
Round 7 completed. R²: -0.1189

=== Round 8: 2000-01 to 2018-01 ===
Round 8 completed. R²: -0.1015

=== Round 9: 2000-01 to 2019-01 ===
Round 9 completed. R²: -0.0131

=== Round 10: 2000-01 to 2020-01 ===
Early stopping at epoch 28
Round 10 completed. R²: -0.0350

=== Round 11: 2000-01 to 2021-01 ===


KeyboardInterrupt: 

In [6]:
# === Only run Round 6 ===
start = pd.to_datetime("20000101", format="%Y%m%d")
counter = 5
cutoff = [
    start,
    start + pd.DateOffset(years=8 + counter),
    start + pd.DateOffset(years=10 + counter),
    start + pd.DateOffset(years=11 + counter)
]

print(f"\n=== Round {counter+1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")
train = df_full[(df_full['yyyymm'] >= cutoff[0]) & (df_full['yyyymm'] < cutoff[1])].copy()
val   = df_full[(df_full['yyyymm'] >= cutoff[1]) & (df_full['yyyymm'] < cutoff[2])].copy()
test  = df_full[(df_full['yyyymm'] >= cutoff[2]) & (df_full['yyyymm'] < cutoff[3])].copy()

train_mean = train[full_features].mean()
train[full_features] = train[full_features].fillna(train_mean)
val[full_features] = val[full_features].fillna(train_mean)
test[full_features] = test[full_features].fillna(train_mean)

scaler = StandardScaler()
train[full_features] = scaler.fit_transform(train[full_features])
val[full_features] = scaler.transform(val[full_features])
test[full_features] = scaler.transform(test[full_features])

X_train = torch.tensor(train[full_features].values, dtype=torch.float32)
X_val = torch.tensor(val[full_features].values, dtype=torch.float32)
X_test = torch.tensor(test[full_features].values, dtype=torch.float32)

ae_model = AE(len(full_features))
optimizer = torch.optim.Adam(ae_model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

best_loss = float('inf')
patience = 5
wait = 0
best_model_state = None

for epoch in range(50):
    ae_model.train()
    optimizer.zero_grad()
    x_hat, _ = ae_model(X_train)
    loss = loss_fn(x_hat, X_train)
    loss.backward()
    optimizer.step()

    ae_model.eval()
    with torch.no_grad():
        x_val_hat, _ = ae_model(X_val)
        val_loss = loss_fn(x_val_hat, X_val).item()

    if val_loss < best_loss:
        best_loss = val_loss
        wait = 0
        best_model_state = ae_model.state_dict()
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

ae_model.load_state_dict(best_model_state)

with torch.no_grad():
    _, Z_train = ae_model(X_train)
    _, Z_val = ae_model(X_val)
    _, Z_test = ae_model(X_test)

reg = Ridge(alpha=1.0)
y_train = train['stock_exret'].values
y_val = val['stock_exret'].values
y_test = test['stock_exret'].values
y_mean = y_train.mean()
reg.fit(Z_train, y_train - y_mean)
y_pred = reg.predict(Z_test) + y_mean

r2 = r2_score(y_test, y_pred)

if np.abs(r2) > 5:
    print("AE failed. Fallback to OLS prediction (zero fill + column order check).")
    fallback_train = train[full_features].copy().fillna(0)
    fallback_test = test[full_features].copy().fillna(0)
    assert list(fallback_train.columns) == list(fallback_test.columns), "Fallback feature columns are misaligned!"
    fallback_test = fallback_test[fallback_train.columns]

    raw_scaler = StandardScaler()
    raw_X_train = raw_scaler.fit_transform(fallback_train)
    raw_X_test = raw_scaler.transform(fallback_test)
    reg = LinearRegression()
    reg.fit(raw_X_train, y_train - y_mean)
    y_pred = reg.predict(raw_X_test) + y_mean
    r2 = r2_score(y_test, y_pred)

print(f"Round 6 completed. R²: {r2:.4f}")



=== Round 6: 2000-01 to 2016-01 ===
AE failed. Fallback to OLS prediction (zero fill + column order check).
Round 6 completed. R²: -1198.6749


In [2]:
# === OLS-Only Return Prediction for Round 6 (with Standardization) ===
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# === Load Data ===
firm = pd.read_csv("mma_sample_v2.csv", parse_dates=["date", "ret_eom"])
firm["yyyymm"] = firm["ret_eom"].dt.to_period("M").dt.to_timestamp()
firm["year"] = firm["yyyymm"].dt.year
firm["month"] = firm["yyyymm"].dt.month

macro = pd.read_csv("macro_monthly.csv")
macro["yyyymm"] = pd.to_datetime(macro["yyyymm"].astype(str), format="%Y%m")

factor_list = pd.read_csv("factor_char_list.csv")["variable"].tolist()
firm = firm[["permno", "yyyymm", "stock_exret", "year", "month"] + factor_list]
firm = firm[firm["stock_exret"].notna()].copy()
df_full = firm.merge(macro, on="yyyymm", how="left")

macro_vars = ['dp', 'ep', 'ntis', 'svar', 'dfy', 'tms', 'tbl']
for m in macro_vars:
    for f in factor_list:
        df_full[f"{f}_{m}"] = df_full[f] * df_full[m]

interaction_features = [f"{f}_{m}" for m in macro_vars for f in factor_list]
full_features = factor_list + interaction_features

# === Round 6 Only ===
start = pd.to_datetime("20000101", format="%Y%m%d")
counter = 5
cutoff = [
    start,
    start + pd.DateOffset(years=8 + counter),
    start + pd.DateOffset(years=10 + counter),
    start + pd.DateOffset(years=11 + counter)
]

print(f"\n=== OLS Round {counter+1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")
train = df_full[(df_full['yyyymm'] >= cutoff[0]) & (df_full['yyyymm'] < cutoff[1])].copy()
test  = df_full[(df_full['yyyymm'] >= cutoff[2]) & (df_full['yyyymm'] < cutoff[3])].copy()

# === Standardize with 0-fill ===
train = train[full_features + ['stock_exret']].copy().fillna(0)
test = test[full_features + ['stock_exret']].copy().fillna(0)

scaler = StandardScaler()
X_train = scaler.fit_transform(train[full_features])
X_test = scaler.transform(test[full_features])
y_train = train['stock_exret'].values
y_test = test['stock_exret'].values
y_mean = y_train.mean()

# === Train OLS ===
reg = LinearRegression()
reg.fit(X_train, y_train - y_mean)
y_pred = reg.predict(X_test) + y_mean
r2 = r2_score(y_test, y_pred)

print(f"OLS Round 6 completed. R²: {r2:.4f}")


  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_


=== OLS Round 6: 2000-01 to 2016-01 ===
OLS Round 6 completed. R²: -1755.2448


In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import time

# === Load Data ===
firm = pd.read_csv("mma_sample_v2.csv", parse_dates=["date", "ret_eom"])
firm["yyyymm"] = firm["ret_eom"].dt.to_period("M").dt.to_timestamp()
firm["year"] = firm["yyyymm"].dt.year
firm["month"] = firm["yyyymm"].dt.month

macro = pd.read_csv("macro_monthly.csv")
macro["yyyymm"] = pd.to_datetime(macro["yyyymm"].astype(str), format="%Y%m")

factor_list = pd.read_csv("factor_char_list.csv")["variable"].tolist()
firm = firm[["permno", "yyyymm", "stock_exret", "year", "month"] + factor_list]
firm = firm[firm["stock_exret"].notna()].copy()
df_full = firm.merge(macro, on="yyyymm", how="left")

macro_vars = ['dp', 'ep', 'ntis', 'svar', 'dfy', 'tms', 'tbl']
for m in macro_vars:
    for f in factor_list:
        df_full[f"{f}_{m}"] = df_full[f] * df_full[m]

interaction_features = [f"{f}_{m}" for m in macro_vars for f in factor_list]
full_features = factor_list + interaction_features

# === Load reduced features ===
data = df_full

# Filter valid return rows only
data = data[data['stock_exret'].notna()].copy()



# === Chunked Standardization Function ===
def chunk_standardize(df_train, df_val, df_test, columns, chunk_size=100):
    for i in range(0, len(columns), chunk_size):
        subset = columns[i:i+chunk_size]
        scaler = StandardScaler().fit(df_train[subset])
        df_train.loc[:, subset] = scaler.transform(df_train[subset])
        df_val.loc[:, subset] = scaler.transform(df_val[subset])
        df_test.loc[:, subset] = scaler.transform(df_test[subset])
    return df_train, df_val, df_test

# === Step 4: Only Run Round 6 ===
data['date'] = data['yyyymm']
starting = pd.to_datetime("20000101", format="%Y%m%d")
counter = 5
cutoff = [
    starting,
    starting + pd.DateOffset(years=8 + counter),
    starting + pd.DateOffset(years=10 + counter),
    starting + pd.DateOffset(years=11 + counter)
]

print(f"\n=== OLS Round {counter + 1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")
train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])].copy()
validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])].copy()
test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])].copy()

print(f"Train size: {train.shape}, Val size: {validate.shape}, Test size: {test.shape}")

if test.empty or train.empty or validate.empty:
    print("Skipping round due to empty set.")
else:
    # === Fill NA using train mean only ===
    train_mean = train[full_features].mean()
    train[full_features] = train[full_features].fillna(train_mean)
    validate[full_features] = validate[full_features].fillna(train_mean)
    test[full_features] = test[full_features].fillna(train_mean)

    # === Standardize ===
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train[full_features])
    X_test = scaler.transform(test[full_features])
    Y_train = train['stock_exret'].values
    Y_test = test['stock_exret'].values

    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean

    print("Training OLS...")
    reg = LinearRegression(fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    Y_pred = reg.predict(X_test) + Y_mean

    # === Align index and save result ===
    reg_pred = test[['permno', 'yyyymm', 'stock_exret']].reset_index(drop=True)
    reg_pred['ols'] = Y_pred

    r2_ols = r2_score(Y_test, Y_pred)
    print(f"Round {counter + 1} completed. R²: {r2_ols:.4f}")

  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_full[m]
  df_full[f"{f}_{m}"] = df_full[f] * df_


=== OLS Round 6: 2000-01 to 2016-01 ===
Train size: (145970, 1190), Val size: (21591, 1190), Test size: (11459, 1190)
Training OLS...
Round 6 completed. R²: -1198.6749


In [8]:
print("Test size:", test.shape[0])
print("Test stock_exret:", test['stock_exret'].describe())
print("Any feature all-NA in test:", test[full_features].isna().all().sum())

# 查看 feature 是否出现异常
print("Test feature summary:")
print(test[full_features].describe().T.sort_values('std').head(10))


Test size: 11459
Test stock_exret: count    11459.000000
mean        -0.001565
std          0.085519
min         -0.685217
25%         -0.049313
50%         -0.003013
75%          0.043605
max          1.320683
Name: stock_exret, dtype: float64
Any feature all-NA in test: 0
Test feature summary:
                         count          mean           std           min  \
ami_126d_tbl           11459.0  1.577883e-07  4.131072e-07  3.380000e-10   
ami_126d_svar          11459.0  5.597840e-07  1.055596e-06  1.091251e-09   
zero_trades_252d_tbl   11459.0  2.063780e-06  3.407556e-06  1.062000e-09   
zero_trades_126d_tbl   11459.0  2.041064e-06  4.555807e-06  8.964000e-09   
ami_126d_dfy           11459.0  3.167337e-06  4.731906e-06  1.629000e-08   
bidaskhl_21d_tbl       11459.0  3.268034e-06  4.767749e-06  8.479400e-08   
ami_126d_ntis          11459.0 -2.997870e-06  5.435527e-06 -2.012991e-04   
turnover_126d_tbl      11459.0  5.046290e-06  7.598794e-06  6.677600e-08   
zero_trades_252d_sv

In [10]:
# 查看每列的标准差（std 很小的列说明是 quasi-constant，可能出错）
std_series = pd.DataFrame(X_train, columns=full_features).std()
print("最小std的列：")
print(std_series.sort_values().head(10))

# 打印 OLS 模型权重分布
print("OLS 权重分布：")
print(pd.Series(reg.coef_, index=full_features).sort_values(ascending=False).head(10))
print(pd.Series(reg.coef_, index=full_features).sort_values().head(10))

最小std的列：
debt_gr3             1.000003
op_atl1_svar         1.000003
niq_be_chg1_dp       1.000003
market_equity_tbl    1.000003
age                  1.000003
be_gr1a_tms          1.000003
ni_inc8q_dp          1.000003
at_me_tms            1.000003
at_gr1_tms           1.000003
aliq_at_tms          1.000003
dtype: float64
OLS 权重分布：
saleq_gr1_dp       4.446844
op_atl1_dp         2.636548
noa_at             1.529260
op_atl1_tbl        1.442112
op_atl1            1.265375
noa_at_dp          1.202977
saleq_gr1          1.092707
aliq_at_ep         0.800117
ivol_capm_21d      0.798801
oaccruals_at_dp    0.704107
dtype: float64
saleq_gr1_ep      -5.994116
at_gr1            -2.754758
at_gr1_dp         -2.277423
saleq_gr1_dfy     -1.619693
cop_atl1_dp       -1.235071
saleq_gr1_tbl     -0.982703
cop_atl1          -0.810253
niq_at_chg1_dp    -0.798847
taccruals_at_dp   -0.757939
gp_atl1_dp        -0.735765
dtype: float64


In [11]:
print("Compare permno and return alignment:")
print("reg_pred['permno'] head:\n", reg_pred['permno'].head().tolist())
print("Y_test head:\n", Y_test[:5])
print("Y_pred head:\n", reg.predict(X_test)[:5])
print("reg_pred['ols'] head:\n", reg_pred['ols'].head().tolist())


Compare permno and return alignment:
reg_pred['permno'] head:
 [10104, 10107, 10138, 10145, 10147]
Y_test head:
 [-0.0658217  -0.13024756 -0.08315862 -0.02161725 -0.12811027]
Y_pred head:
 [ 0.09419886  0.10091097 -0.06496647  0.01478041  0.03421965]
reg_pred['ols'] head:
 [0.09853772974738143, 0.10524984321090511, -0.06062760226614539, 0.019119279437695695, 0.03855851813830143]


In [3]:
# === Step 1: Load and merge data ===
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
import datetime
from tqdm import tqdm
import time

# Load firm-level data
firm = pd.read_csv("mma_sample_v2.csv", parse_dates=["date", "ret_eom"])

# Convert ret_eom (return date) to yyyymm for alignment
firm["yyyymm"] = firm["ret_eom"].dt.to_period("M").dt.to_timestamp()
firm["year"] = firm["yyyymm"].dt.year
firm["month"] = firm["yyyymm"].dt.month

# Load macroeconomic variables and surprise predictions
macro = pd.read_csv("macro_monthly.csv")
macro["yyyymm"] = pd.to_datetime(macro["yyyymm"].astype(str), format="%Y%m")

# Load list of stock characteristic features
factor_list = pd.read_csv("factor_char_list.csv")["variable"].tolist()

# Filter firm data to include only relevant features
firm = firm[["permno", "yyyymm", "stock_exret", "year", "month"] + factor_list]

# Drop observations with missing target value
firm = firm[firm["stock_exret"].notna()].copy()

# Merge macro variables
df_full = firm.merge(macro, on="yyyymm", how="left")

# === Step 2: Create interaction terms in batches (excluding surprise_pred and date) ===
macro_vars = ['dp', 'ep', 'ntis', 'bm', 'svar', 'dfy', 'tms', 'tbl']
interaction_terms = []
batch_size = 20  # adjust as needed depending on memory capacity

for i in range(0, len(factor_list), batch_size):
    batch_vars = factor_list[i:i + batch_size]
    interaction_data = {}

    for macro_var in macro_vars:
        for firm_var in batch_vars:
            col_name = f"{firm_var}_{macro_var}"
            interaction_data[col_name] = df_full[firm_var] * df_full[macro_var]
            interaction_terms.append(col_name)

    interaction_df = pd.DataFrame(interaction_data)
    df_full = pd.concat([df_full.reset_index(drop=True), interaction_df.reset_index(drop=True)], axis=1)
    del interaction_df, interaction_data  # free memory after each batch

    all_features = factor_list + interaction_terms

    # === Step 3: Monthly standardization with median imputation ===
# Ensure all expected features exist before standardization
for col in all_features:
    if col not in df_full.columns:
        df_full[col] = np.nan

data = pd.DataFrame()
monthly = df_full.groupby("yyyymm")
for date, group in monthly:
    group = group.copy()
    for var in all_features:
        median_val = group[var].median(skipna=True)
        group[var] = group[var].fillna(median_val)
        group[var] = group[var].rank(method="dense") - 1
        vmax = group[var].max()
        group[var] = (group[var] / vmax) * 2 - 1 if vmax > 0 else 0
    data = pd.concat([data, group], ignore_index=True)

    # === Chunked Standardization Function ===
def chunk_standardize(df_train, df_val, df_test, columns, chunk_size=100):
    for i in range(0, len(columns), chunk_size):
        subset = columns[i:i+chunk_size]
        scaler = StandardScaler().fit(df_train[subset])
        df_train.loc[:, subset] = scaler.transform(df_train[subset])
        df_val.loc[:, subset] = scaler.transform(df_val[subset])
        df_test.loc[:, subset] = scaler.transform(df_test[subset])
    return df_train, df_val, df_test

# === Step 4: Expanding window OLS-only training and prediction ===
data['date'] = data['yyyymm']
start_round = 0
starting = pd.to_datetime("20000101", format="%Y%m%d")
counter = start_round

print("Starting expanding window prediction (OLS only)...")


# === Round 6 only ===
starting = pd.to_datetime("20000101", format="%Y%m%d")
counter = 5  # Round 6 index

cutoff = [
    starting,
    starting + pd.DateOffset(years=8 + counter),
    starting + pd.DateOffset(years=10 + counter),
    starting + pd.DateOffset(years=11 + counter),
]

print(f"\n=== Round {counter + 1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")

train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]
test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])]

print(f"Train size: {train.shape}, Val size: {validate.shape}, Test size: {test.shape}")

if test.empty or train.empty or validate.empty:
    print("Skipping round due to empty set.")
else:
    print("Standardizing features (chunked)...")
    train, validate, test = chunk_standardize(train, validate, test, all_features, chunk_size=100)
    print("Standardization complete.")

    X_train, Y_train = train[all_features].values, train['stock_exret'].values
    X_test, Y_test = test[all_features].values, test['stock_exret'].values

    Y_mean = np.mean(Y_train)
    Y_train_dm = Y_train - Y_mean

    reg_pred = test[["permno", "yyyymm", "stock_exret"]].copy()

    print("Training OLS...")
    reg = LinearRegression(fit_intercept=False)
    reg.fit(X_train, Y_train_dm)
    reg_pred["ols"] = reg.predict(X_test) + Y_mean

    r2_ols = 1 - np.sum((reg_pred["ols"] - Y_test) ** 2) / np.sum((Y_test - Y_test.mean()) ** 2)
    print(f"Round {counter + 1} completed. R²: {r2_ols:.4f}")

  data['date'] = data['yyyymm']


Starting expanding window prediction (OLS only)...

=== Round 6: 2000-01 to 2016-01 ===
Train size: (145970, 1337), Val size: (21591, 1337), Test size: (11459, 1337)
Standardizing features (chunked)...
Standardization complete.
Training OLS...
Round 6 completed. R²: -0.1779


In [13]:
# Step 1: 预测前对 permno 排序，作为 reference
test_sorted = test.sort_values(["permno", "yyyymm"]).reset_index(drop=True)
permno_ref = test_sorted["permno"].values

# Step 2: 你最终保存的 reg_pred（已经加了 Y_pred）
permno_reg_pred = reg_pred["permno"].values

# Step 3: 比较
print("⛏️ permno 顺序是否完全一致：", np.array_equal(permno_ref, permno_reg_pred))

# Step 4: 如果不一致，看看哪里错了（前10个错位）
for i in range(min(10, len(permno_ref))):
    print(f"{i}: ref={permno_ref[i]}, pred={permno_reg_pred[i]}")


⛏️ permno 顺序是否完全一致： True
0: ref=10104, pred=10104
1: ref=10104, pred=10104
2: ref=10104, pred=10104
3: ref=10104, pred=10104
4: ref=10104, pred=10104
5: ref=10104, pred=10104
6: ref=10104, pred=10104
7: ref=10104, pred=10104
8: ref=10104, pred=10104
9: ref=10104, pred=10104


In [14]:
print("Y_pred summary:", pd.Series(Y_pred).describe())


Y_pred summary: count    11459.000000
mean         0.097846
std          2.959347
min        -13.934285
25%          0.009289
50%          0.032888
75%          0.059207
max        155.357275
dtype: float64
