# IPCA Stock Return Prediction
This notebook implements a full expanding window prediction of excess return using IPCA with monthly median imputation and standardization.

In [6]:
# === IPCA Full Expanding Window Prediction ===
import pandas as pd
import numpy as np
from tqdm import tqdm
import datetime
import os

# === Step 1: Load Data ===
firm = pd.read_csv("../data/raw/mma_sample_v2.csv", parse_dates=["date", "ret_eom"])
firm["yyyymm"] = firm["ret_eom"].dt.to_period("M").dt.to_timestamp()
firm = firm[firm["stock_exret"].notna()].copy()

macro = pd.read_csv("../data/macro/macro_monthly.csv")
macro["yyyymm"] = pd.to_datetime(macro["yyyymm"].astype(str), format="%Y%m")

factor_list = pd.read_csv("../data/raw/factor_char_list.csv")["variable"].tolist()
firm = firm[["permno", "yyyymm", "stock_exret"] + factor_list]
df_full = firm.merge(macro, on="yyyymm", how="left")

# === Step 2: Generate Interaction Terms ===
macro_vars = ['dp', 'ep', 'ntis', 'bm', 'svar', 'dfy', 'tms', 'tbl']
interaction_terms = []
for macro_var in macro_vars:
    for firm_var in factor_list:
        name = f"{firm_var}_{macro_var}"
        df_full[name] = df_full[firm_var] * df_full[macro_var]
        interaction_terms.append(name)

full_features = factor_list + interaction_terms

  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_var] * df_full[macro_var]
  df_full[name] = df_full[firm_

In [7]:
# === Updated IPCA_v1 with Out-of-Sample Prediction ===
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

class IPCA_v1:
    def __init__(self, return_column='exret', add_constant=True):
        self.return_column = return_column
        self.add_constant = add_constant
        self.pca = None
        self.Lambda = None
        self.fitted = False

    def fit(self, RZ_train: pd.DataFrame, K=6):
        RZ = RZ_train.dropna()
        Y = RZ[self.return_column].unstack()
        X = RZ.drop(columns=self.return_column)

        if self.add_constant:
            X["const"] = 1.0
        Z = X

        # Latent factor from cross-sectional mean
        G = Z.groupby(level=0).mean()
        self.pca = PCA(n_components=K)
        F = pd.DataFrame(self.pca.fit_transform(G), index=G.index)
        self.Lambda = pd.DataFrame(self.pca.components_.T, index=G.columns)

        # In-sample reconstruction (optional)
        R = Y
        Rhat = F @ self.Lambda.T
        Rhat = Rhat.reindex(columns=R.columns)
        r2_total = 1 - ((R - Rhat) ** 2).sum().sum() / ((R - R.mean()) ** 2).sum().sum()
        r2_pred = 1 - ((R - Rhat) ** 2).sum(axis=1).mean() / ((R - R.mean()) ** 2).sum(axis=1).mean()

        self.fitted = True
        return {
            'rfits': {
                'R2_Total': r2_total,
                'R2_Pred': r2_pred,
                'in_sample_prediction': Rhat.stack().rename("prediction")
            }
        }

    def predict(self, RZ_test: pd.DataFrame):
        if not self.fitted:
            raise ValueError("You must call .fit() before .predict().")

        Z_test = RZ_test.drop(columns=self.return_column)
        if self.add_constant and "const" not in Z_test.columns:
            Z_test["const"] = 1.0

        G_test = Z_test.groupby(level=0).mean()
        F_test = pd.DataFrame(self.pca.transform(G_test), index=G_test.index)
        Rhat_test = F_test @ self.Lambda.T

        # Map back to individual securities
        permnos = RZ_test.index.get_level_values(1).unique()
        Rhat_test_full = pd.DataFrame(index=G_test.index, columns=permnos)
        for t in G_test.index:
            if t in Rhat_test.index:
                Rhat_test_full.loc[t] = Rhat_test.loc[t].values.mean()
        return Rhat_test_full.stack().rename("prediction")

In [5]:
# === Step 3: Expanding Window IPCA Fit ===
data = df_full.copy()
data["date"] = data["yyyymm"]
start = pd.to_datetime("2000-01-01")
end = pd.to_datetime("2024-01-01")
counter = 0

if os.path.exists("ipca_r2_full.csv"):
    os.remove("ipca_r2_full.csv")
if os.path.exists("ipca_pred_full.csv"):
    os.remove("ipca_pred_full.csv")

while start + pd.DateOffset(years=11 + counter) <= end:
    cutoff = [
        start,
        start + pd.DateOffset(years=8 + counter),
        start + pd.DateOffset(years=10 + counter),
        start + pd.DateOffset(years=11 + counter)
    ]

    print(f"\n=== Round {counter+1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")
    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])].copy()
    val = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])].copy()
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])].copy()

    if train.empty or val.empty or test.empty:
        counter += 1
        continue

    trainval = pd.concat([train, val], axis=0)
    train_median = trainval[full_features].median()

    for df in [trainval, test]:
        for var in full_features:
            df[var] = df[var].fillna(train_median[var])
            df[var] = df[var].rank(method="dense") - 1
            vmax = df[var].max()
            df[var] = (df[var] / vmax) * 2 - 1 if vmax > 0 else 0

    # MultiIndex structure for IPCA
    trainval = trainval.set_index(["yyyymm", "permno"])
    test = test.set_index(["yyyymm", "permno"])

    trainval["exret"] = trainval["stock_exret"]
    test["exret"] = test["stock_exret"]

    try:
        model = IPCA_v1(return_column="exret", add_constant=True)
        model.fit(trainval[["exret"] + full_features], K=6)
        pred = model.predict(test[["exret"] + full_features])

        pred = pred.reset_index()
        test_actual = test["exret"].reset_index()
        merged = pred.merge(test_actual, on=["yyyymm", "permno"])

        r2 = 1 - ((merged['exret'] - merged['prediction']) ** 2).sum() / ((merged['exret'] - merged['exret'].mean()) ** 2).sum()
        merged.to_csv("ipca_pred_full.csv", mode="a", header=(counter == 0), index=False)
        pd.DataFrame([{
            "round": counter + 1,
            "start_date": cutoff[0],
            "end_date": cutoff[3],
            "r2": r2
        }]).to_csv("ipca_r2_full.csv", mode="a", header=(counter == 0), index=False)

        print(f"Round {counter+1} completed. R²: {r2:.4f}")

    except Exception as e:
        print(f"IPCA failed on round {counter+1}:", e)

    counter += 1

print("IPCA expanding window prediction complete.")


=== Round 1: 2000-01 to 2011-01 ===
Round 1 completed. R²: -0.4273

=== Round 2: 2000-01 to 2012-01 ===
Round 2 completed. R²: -0.2015

=== Round 3: 2000-01 to 2013-01 ===
Round 3 completed. R²: -0.0747

=== Round 4: 2000-01 to 2014-01 ===
Round 4 completed. R²: -0.4176

=== Round 5: 2000-01 to 2015-01 ===
Round 5 completed. R²: -0.2969

=== Round 6: 2000-01 to 2016-01 ===
Round 6 completed. R²: -0.3674

=== Round 7: 2000-01 to 2017-01 ===
Round 7 completed. R²: -0.2233

=== Round 8: 2000-01 to 2018-01 ===
Round 8 completed. R²: -0.2150

=== Round 9: 2000-01 to 2019-01 ===
Round 9 completed. R²: -0.4360

=== Round 10: 2000-01 to 2020-01 ===
Round 10 completed. R²: -0.0731

=== Round 11: 2000-01 to 2021-01 ===
Round 11 completed. R²: -0.2177

=== Round 12: 2000-01 to 2022-01 ===
Round 12 completed. R²: -0.0740

=== Round 13: 2000-01 to 2023-01 ===


: 

In [4]:
import pandas as pd
import numpy as np
import os

# === Step 1: Prepare expanding window ===
data = df_full.copy()
data["date"] = data["yyyymm"]
start = pd.to_datetime("2000-01-01")
rounds_to_run = [12, 13]  # Round 13 和 14（Python index 从 0 开始）

for counter in rounds_to_run:
    cutoff = [
        start,
        start + pd.DateOffset(years=8 + counter),
        start + pd.DateOffset(years=10 + counter),
        start + pd.DateOffset(years=11 + counter)
    ]

    print(f"\n=== Round {counter+1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")
    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])].copy()
    val = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])].copy()
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])].copy()

    if train.empty or val.empty or test.empty:
        print(f"Skipping Round {counter+1} due to empty data.")
        continue

    trainval = pd.concat([train, val], axis=0)
    train_median = trainval[full_features].median()

    for df in [trainval, test]:
        for var in full_features:
            df[var] = df[var].fillna(train_median[var])
            df[var] = df[var].rank(method="dense") - 1
            vmax = df[var].max()
            df[var] = (df[var] / vmax) * 2 - 1 if vmax > 0 else 0

    trainval = trainval.set_index(["yyyymm", "permno"])
    test = test.set_index(["yyyymm", "permno"])

    trainval["exret"] = trainval["stock_exret"]
    test["exret"] = test["stock_exret"]

    try:
        model = IPCA_v1(return_column="exret", add_constant=True)
        model.fit(trainval[["exret"] + full_features], K=6)
        pred = model.predict(test[["exret"] + full_features])

        pred = pred.reset_index()
        test_actual = test["exret"].reset_index()
        merged = pred.merge(test_actual, on=["yyyymm", "permno"])

        r2 = 1 - ((merged['exret'] - merged['prediction']) ** 2).sum() / ((merged['exret'] - merged['exret'].mean()) ** 2).sum()
        merged.to_csv("../results/ipca_pred_full.csv", mode="a", header=False, index=False)
        pd.DataFrame([{
            "round": counter + 1,
            "start_date": cutoff[0],
            "end_date": cutoff[3],
            "r2": r2
        }]).to_csv("../results/ipca_r2_full.csv", mode="a", header=False, index=False)

        print(f"Round {counter+1} completed. R²: {r2:.4f}")

    except Exception as e:
        print(f"IPCA failed on round {counter+1}:", e)

print("Selected IPCA rounds (13, 14) prediction complete.")


=== Round 13: 2000-01 to 2023-01 ===


MemoryError: Unable to allocate 2.44 GiB for an array with shape (1322, 247854) and data type float64

In [12]:
# === IPCA Round 13–14 (Lasso-Filtered Features Only) ===
import pandas as pd
import numpy as np
import os

# Load lasso-reduced feature list
lasso_features = pd.read_csv("../results/lasso_top_features.csv")
lasso_feature_list = lasso_features.iloc[:, 0].tolist()

feature_list = lasso_feature_list + ["permno", "yyyymm", "stock_exret"]

# === Step 1: Prepare expanding window ===
data = df_full[feature_list].copy()
data["date"] = data["yyyymm"]
start = pd.to_datetime("2000-01-01")
rounds_to_run = [12, 13]  # Round 13 和 14（Python index 从 0 开始）

for counter in rounds_to_run:
    cutoff = [
        start,
        start + pd.DateOffset(years=8 + counter),
        start + pd.DateOffset(years=10 + counter),
        start + pd.DateOffset(years=11 + counter)
    ]

    print(f"\n=== Round {counter+1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")
    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])].copy()
    val = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])].copy()
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])].copy()

    if train.empty or val.empty or test.empty:
        print(f"Skipping Round {counter+1} due to empty data.")
        continue

    trainval = pd.concat([train, val], axis=0)
    train_median = trainval[lasso_feature_list].median()

    for df in [trainval, test]:
        for var in lasso_feature_list:
            if var in df.columns:
                df[var] = df[var].fillna(train_median.get(var, 0))
                df[var] = df[var].rank(method="dense") - 1
                vmax = df[var].max()
                df[var] = (df[var] / vmax) * 2 - 1 if vmax > 0 else 0

    trainval = trainval.set_index(["yyyymm", "permno"])
    test = test.set_index(["yyyymm", "permno"])

    trainval["exret"] = trainval["stock_exret"]
    test["exret"] = test["stock_exret"]

    try:
        model = IPCA_v1(return_column="exret", add_constant=True)
        model.fit(trainval[["exret"] + lasso_feature_list], K=6)
        pred = model.predict(test[["exret"] + lasso_feature_list])

        pred = pred.reset_index()
        test_actual = test["exret"].reset_index()
        merged = pred.merge(test_actual, on=["yyyymm", "permno"])

        r2 = 1 - ((merged['exret'] - merged['prediction']) ** 2).sum() / ((merged['exret'] - merged['exret'].mean()) ** 2).sum()
        merged.to_csv("../results/ipca_pred_full.csv", mode="a", header=False, index=False)
        pd.DataFrame([{
            "round": counter + 1,
            "start_date": cutoff[0],
            "end_date": cutoff[3],
            "r2": r2
        }]).to_csv("../results/ipca_r2_full.csv", mode="a", header=False, index=False)

        print(f"Round {counter+1} completed. R²: {r2:.4f}")

    except Exception as e:
        print(f"IPCA failed on round {counter+1}:", e)

print("Selected IPCA rounds (13, 14) with Lasso-reduced features prediction complete.")


=== Round 13: 2000-01 to 2023-01 ===
Round 13 completed. R²: -0.1907

=== Round 14: 2000-01 to 2024-01 ===
Round 14 completed. R²: -0.0589
Selected IPCA rounds (13, 14) with Lasso-reduced features prediction complete.


In [4]:
# === Step 3: Expanding Window IPCA Fit (Try K = 4 and K = 8, No Constant) ===
data = df_full.copy()
data["date"] = data["yyyymm"]
start = pd.to_datetime("2000-01-01")
end = pd.to_datetime("2024-01-01")
counter = 0

if os.path.exists("ipca_tuned_r2.csv"):
    os.remove("ipca_tuned_r2.csv")
if os.path.exists("ipca_tuned_pred.csv"):
    os.remove("ipca_tuned_pred.csv")

while start + pd.DateOffset(years=11 + counter) <= end:
    cutoff = [
        start,
        start + pd.DateOffset(years=8 + counter),
        start + pd.DateOffset(years=10 + counter),
        start + pd.DateOffset(years=11 + counter)
    ]

    print(f"\n=== Round {counter+1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")
    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])].copy()
    val = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])].copy()
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])].copy()

    if train.empty or val.empty or test.empty:
        counter += 1
        continue

    trainval = pd.concat([train, val], axis=0)
    train_median = trainval[full_features].median()

    for df in [trainval, test]:
        for var in full_features:
            df[var] = df[var].fillna(train_median[var])
            df[var] = df[var].rank(method="dense") - 1
            vmax = df[var].max()
            df[var] = (df[var] / vmax) * 2 - 1 if vmax > 0 else 0

    trainval = trainval.set_index(["yyyymm", "permno"])
    test = test.set_index(["yyyymm", "permno"])
    trainval["exret"] = trainval["stock_exret"]
    test["exret"] = test["stock_exret"]

    best_r2 = -np.inf
    best_k = None
    best_pred = None

    for K_try in [4, 8]:
        print(f"Trying K = {K_try}...")
        try:
            model = IPCA_v1(return_column="exret", add_constant=False)
            model.fit(trainval[["exret"] + full_features], K=K_try)
            pred = model.predict(test[["exret"] + full_features])

            pred = pred.reset_index()
            test_actual = test["exret"].reset_index()
            merged = pred.merge(test_actual, on=["yyyymm", "permno"])

            r2 = 1 - ((merged['exret'] - merged['prediction']) ** 2).sum() / ((merged['exret'] - merged['exret'].mean()) ** 2).sum()

            if r2 > best_r2:
                best_r2 = r2
                best_k = K_try
                best_pred = merged

        except Exception as e:
            print(f"K = {K_try} failed: {e}")
            continue

    if best_pred is not None:
        best_pred.to_csv("ipca_tuned_pred.csv", mode="a", header=(counter == 0), index=False)
        pd.DataFrame([{
            "round": counter + 1,
            "start_date": cutoff[0],
            "end_date": cutoff[3],
            "best_K": best_k,
            "r2": best_r2
        }]).to_csv("ipca_tuned_r2.csv", mode="a", header=(counter == 0), index=False)

        print(f"Round {counter+1} completed. Best K = {best_k}, R²: {best_r2:.4f}")

    counter += 1

print("IPCA K-tuned expanding window prediction complete.")


=== Round 1: 2000-01 to 2011-01 ===
Trying K = 4...
Trying K = 8...
Round 1 completed. Best K = 4, R²: -0.3129

=== Round 2: 2000-01 to 2012-01 ===
Trying K = 4...
Trying K = 8...
Round 2 completed. Best K = 8, R²: -0.1900

=== Round 3: 2000-01 to 2013-01 ===
Trying K = 4...
Trying K = 8...
Round 3 completed. Best K = 4, R²: -0.1085

=== Round 4: 2000-01 to 2014-01 ===
Trying K = 4...
Trying K = 8...
Round 4 completed. Best K = 4, R²: -0.4016

=== Round 5: 2000-01 to 2015-01 ===
Trying K = 4...
Trying K = 8...
Round 5 completed. Best K = 4, R²: -0.1775

=== Round 6: 2000-01 to 2016-01 ===
Trying K = 4...
Trying K = 8...
Round 6 completed. Best K = 4, R²: -0.1767

=== Round 7: 2000-01 to 2017-01 ===
Trying K = 4...
Trying K = 8...
Round 7 completed. Best K = 4, R²: -0.1505

=== Round 8: 2000-01 to 2018-01 ===
Trying K = 4...
Trying K = 8...
Round 8 completed. Best K = 4, R²: -0.1439

=== Round 9: 2000-01 to 2019-01 ===
Trying K = 4...
Trying K = 8...
Round 9 completed. Best K = 4, R²: 

: 

In [15]:
# === IPCA K-tuned Round 13–14 Using Lasso Features Only ===
import pandas as pd
import numpy as np
import os

# Load lasso feature list
lasso_features = pd.read_csv("../results/lasso_top_features.csv")
full_feature = lasso_features.iloc[:, 0].tolist()
feature_list = full_feature + ["permno", "yyyymm", "stock_exret"]

# Prepare data
data = df_full[feature_list].copy()
data["date"] = data["yyyymm"]
start = pd.to_datetime("2000-01-01")
rounds_to_run = [12, 13]  # Round 13, 14

for counter in rounds_to_run:
    cutoff = [
        start,
        start + pd.DateOffset(years=8 + counter),
        start + pd.DateOffset(years=10 + counter),
        start + pd.DateOffset(years=11 + counter)
    ]

    print(f"\n=== Round {counter+1}: {cutoff[0].strftime('%Y-%m')} to {cutoff[3].strftime('%Y-%m')} ===")
    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])].copy()
    val = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])].copy()
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])].copy()

    if train.empty or val.empty or test.empty:
        print(f"Skipping Round {counter+1} due to empty data.")
        continue

    trainval = pd.concat([train, val], axis=0)
    train_median = trainval[full_features].median()

    for df in [trainval, test]:
        for var in full_features:
            df[var] = df[var].fillna(train_median[var])
            df[var] = df[var].rank(method="dense") - 1
            vmax = df[var].max()
            df[var] = (df[var] / vmax) * 2 - 1 if vmax > 0 else 0

    trainval = trainval.set_index(["yyyymm", "permno"])
    test = test.set_index(["yyyymm", "permno"])
    trainval["exret"] = trainval["stock_exret"]
    test["exret"] = test["stock_exret"]

    best_r2 = -np.inf
    best_k = None
    best_pred = None

    for K_try in [4, 8]:
        print(f"Trying K = {K_try}...")
        try:
            model = IPCA_v1(return_column="exret", add_constant=False)
            model.fit(trainval[["exret"] + full_features], K=K_try)
            pred = model.predict(test[["exret"] + full_features])

            pred = pred.reset_index()
            test_actual = test["exret"].reset_index()
            merged = pred.merge(test_actual, on=["yyyymm", "permno"])

            r2 = 1 - ((merged['exret'] - merged['prediction']) ** 2).sum() / ((merged['exret'] - merged['exret'].mean()) ** 2).sum()

            if r2 > best_r2:
                best_r2 = r2
                best_k = K_try
                best_pred = merged

        except Exception as e:
            print(f"K = {K_try} failed: {e}")
            continue

    if best_pred is not None:
        best_pred.to_csv("ipca_tuned_pred.csv", mode="a", header=False, index=False)
        pd.DataFrame([{
            "round": counter + 1,
            "start_date": cutoff[0],
            "end_date": cutoff[3],
            "best_K": best_k,
            "r2": best_r2
        }]).to_csv("ipca_tuned_r2.csv", mode="a", header=False, index=False)

        print(f"Round {counter+1} completed. Best K = {best_k}, R²: {best_r2:.4f}")

print("Selected IPCA K-tuned rounds (13, 14) with Lasso features prediction complete.")


=== Round 13: 2000-01 to 2023-01 ===
Trying K = 4...
Trying K = 8...
Round 13 completed. Best K = 4, R²: -0.1784

=== Round 14: 2000-01 to 2024-01 ===
Trying K = 4...
Trying K = 8...
Round 14 completed. Best K = 4, R²: -0.0220
Selected IPCA K-tuned rounds (13, 14) with Lasso features prediction complete.
