# Monthly CRSP

In [30]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [31]:
# Load Monthly CRSP

CRSP_PATH = 'data/monthly_crsp.csv'
df_crsp = pd.read_csv(
    CRSP_PATH ,
    parse_dates=['MthCalDt'],
    usecols=['PERMNO','CUSIP','MthCalDt','MthRet']
)

In [32]:
# Keep only good returns
df_crsp = df_crsp.dropna(subset=['MthRet'])
df_crsp['MthRet'] = df_crsp['MthRet'].astype(float)

# Sort so shift is correct
df_crsp = df_crsp.sort_values(['CUSIP','MthCalDt'])

# Create next‐month return target and binary label
df_crsp['Ret_t1'] = df_crsp.groupby('CUSIP')['MthRet'].shift(-1)
df_crsp['y'] = (df_crsp['Ret_t1'] > 0).astype(int)
df_crsp = df_crsp.dropna(subset=['y'])  # drop last obs per series

In [33]:
# Load Compustat Fundamentals

COMP_PATH = 'data/CompFirmCharac.csv'

df_comp = pd.read_csv(
    COMP_PATH,
    parse_dates=['datadate'], dayfirst=True,
)



  df_comp = pd.read_csv(
  df_comp = pd.read_csv(


In [34]:
# Trim CUSIP to 8 chars and filter to industrial/consolidated
df_comp['cusip'] = df_comp['cusip'].astype(str).str[:8]
df_comp = df_comp[
    (df_comp['indfmt'] == 'INDL') &
    (df_comp['consol'] == 'C')
]

# Pick three example fundamentals
fund_cols = ['revty', 'saley', 'capxy']

In [35]:
# Sanity check
missing = [c for c in fund_cols if c not in df_comp.columns]
if missing:
    raise KeyError(f"These Compustat codes are missing: {missing}")

In [36]:
df_comp = df_comp[['cusip','datadate'] + fund_cols].drop_duplicates()

### MERGE FEATURES 

In [None]:
df_crsp['date'] = df_crsp['MthCalDt']
df_crsp['cusip'] = df_crsp['CUSIP']

df_crsp = df_crsp.dropna(subset=['cusip', 'date'])
df_crsp['cusip'] = df_crsp['cusip'].astype(str).str[:8]
df_crsp['date']  = pd.to_datetime(df_crsp['date'])
df_crsp = df_crsp.set_index('date').sort_index()



df_comp = df_comp.dropna(subset=['cusip', 'datadate'])
df_comp['cusip'] = df_comp['cusip'].astype(str).str[:8]
df_comp['datadate'] = pd.to_datetime(df_comp['datadate'])

# Rename and set index
df_comp = df_comp.rename(columns={'datadate':'date'})
df_comp = df_comp.set_index('date').sort_index()


df_merged = pd.merge_asof(
    left=df_crsp,
    right=df_comp[['cusip'] + fund_cols],
    left_index=True,
    right_index=True,
    by='cusip',
    direction='backward',
    allow_exact_matches=True
).reset_index()  # brings 'date' back as a column

df_merged = df_merged.dropna(subset=fund_cols + ['y'])

In [None]:
df_merged.head()

Unnamed: 0,date,PERMNO,CUSIP,MthCalDt,MthRet,Ret_t1,y,cusip,revty,saley,capxy
811541,1983-09-30,57146,76047310,1983-09-30,0.047619,0.113636,1,76047310,7.044,7.044,0.024
811543,1983-09-30,70907,83153810,1983-09-30,0.111111,-0.23,0,83153810,2.295,2.295,0.097
811545,1983-09-30,12408,2360910,1983-09-30,-0.053571,-0.09434,0,2360910,19.596,19.596,0.108
811546,1983-09-30,60898,10256510,1983-09-30,0.034091,0.021978,1,10256510,3.189,3.189,0.023
811559,1983-09-30,14227,3274410,1983-09-30,0.166667,-0.153846,0,3274410,4.568,4.568,0.164


### Training

In [40]:
features = fund_cols
X = df_merged[features]
y = df_merged['y']

# Chronological 80/20 split
cut = int(len(df_merged)*0.8)
X_train, X_test = X.iloc[:cut], X.iloc[cut:]
y_train, y_test = y.iloc[:cut], y.iloc[cut:]


In [41]:
pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale',  StandardScaler()),
    ('clf',    RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        random_state=42,
        n_jobs=-1
    ))
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("Classification Report on Test Set:\n")
print(classification_report(y_test, y_pred))

Classification Report on Test Set:

              precision    recall  f1-score   support

           0       0.56      0.32      0.41    145316
           1       0.51      0.74      0.60    140348

    accuracy                           0.52    285664
   macro avg       0.53      0.53      0.50    285664
weighted avg       0.53      0.52      0.50    285664

