In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [7]:
CRSP_PATH   = "daily_crsp.csv"
COMP_PATH   = "CompFirmCharac_final.csv"
CRSP_ROWS   = 500_000          # None = full file; lower for tests
LAG_TOLERANCE = pd.Timedelta(days=180)   # ≤ 6 months old fundamentals
CUTOFF_DATE   = "2017-01-01"   # train < cutoff, test ≥ cutoff

PREDICTORS = ['capxy', 'chechy', 'epsfxy']

In [13]:
comp = (
    pd.read_csv("CompFirmCharac_final.csv", low_memory=False)
      .assign(datadate=lambda df: pd.to_datetime(df["datadate"], errors="coerce"))
      # 1️⃣ keep only rows with a real CUSIP
      .dropna(subset=["cusip"])
      # 2️⃣ leave CUSIP as string *then* zero-pad
      .assign(cusip=lambda df: df["cusip"].astype(str).str.zfill(8))
      # 3️⃣ select predictors
      .loc[:, ["cusip", "datadate", "capxy", "chechy", "epsfxy"]]
      .dropna()                          # drop rows still missing a predictor
      .sort_values(["cusip", "datadate"])
      .reset_index(drop=True)
)


In [14]:
crsp = (
    pd.read_csv(CRSP_PATH,
                usecols=["CUSIP", "DlyCalDt", "DlyRet"],
                nrows=CRSP_ROWS,
                low_memory=False)
      # 1️⃣ drop rows where CUSIP is missing
      .dropna(subset=["CUSIP"])
      # 2️⃣ convert date
      .assign(DlyCalDt=lambda df: pd.to_datetime(df["DlyCalDt"], errors="coerce"))
      # 3️⃣ standardise CUSIP
      .rename(columns={"CUSIP": "cusip"})
      .assign(cusip=lambda df: df["cusip"].astype(str).str.zfill(8))
      # 4️⃣ keep only valid dates and returns
      .dropna(subset=["DlyCalDt", "DlyRet"])
      .sort_values(["cusip", "DlyCalDt"])
      .reset_index(drop=True)
)


In [12]:
crsp['cusip'].value_counts()

cusip
00621210    6339
41959620    6296
89011010    6293
68389X10    6291
59491810    6290
            ... 
00258R10      30
98160920      28
81257210      20
20260420       4
28485330       3
Name: count, Length: 247, dtype: int64

In [15]:
merged = (
    pd.merge_asof(
        crsp, comp,
        by="cusip",
        left_on="DlyCalDt",
        right_on="datadate",
        direction="backward",
        tolerance=LAG_TOLERANCE)
      .dropna(subset=PREDICTORS)            # keep rows that really got fundamentals
)

print(merged.shape)
merged.head()

ValueError: left keys must be sorted