# Monthly CRSP

In [21]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [22]:
# Load Monthly CRSP

CRSP_PATH = 'data/monthly_crsp.csv'
df_crsp = pd.read_csv(
    CRSP_PATH ,
    parse_dates=['MthCalDt'],
    usecols=['PERMNO','CUSIP','MthCalDt','MthRet']
)

In [23]:
# Keep only good returns
df_crsp = df_crsp.dropna(subset=['MthRet'])
df_crsp['MthRet'] = df_crsp['MthRet'].astype(float)

# Sort so shift is correct
df_crsp = df_crsp.sort_values(['CUSIP','MthCalDt'])

# # Create next‐month return target and binary label
# df_crsp['Ret_t1'] = df_crsp.groupby('CUSIP')['MthRet'].shift(-1)
# df_crsp['y'] = (df_crsp['Ret_t1'] > 0).astype(int)
# df_crsp = df_crsp.dropna(subset=['y'])  # drop last obs per series

# Create next‐month return target and binary label
df_crsp['y'] = df_crsp.groupby('CUSIP')['MthRet'].shift(-1)

In [24]:
# Load Compustat Fundamentals

COMP_PATH = 'data/CompFirmCharac.csv'

df_comp = pd.read_csv(
    COMP_PATH,
    parse_dates=['datadate'], dayfirst=True,
)

print(df_comp)

  df_comp = pd.read_csv(
  df_comp = pd.read_csv(


          gvkey   datadate  fyearq  fqtr  fyr indfmt consol popsrc datafmt  \
0          1000 1966-03-31    1966   1.0   12   INDL      C      D     STD   
1          1000 1966-06-30    1966   2.0   12   INDL      C      D     STD   
2          1000 1966-09-30    1966   3.0   12   INDL      C      D     STD   
3          1000 1966-12-31    1966   4.0   12   INDL      C      D     STD   
4          1000 1967-03-31    1967   1.0   12   INDL      C      D     STD   
...         ...        ...     ...   ...  ...    ...    ...    ...     ...   
2052009  356687 2023-09-30    2023   3.0   12   INDL      C      D     STD   
2052010  356687 2023-12-31    2023   4.0   12   INDL      C      D     STD   
2052011  356687 2024-03-31    2024   1.0   12   INDL      C      D     STD   
2052012  356687 2024-06-30    2024   2.0   12   INDL      C      D     STD   
2052013  356687 2024-09-30    2024   3.0   12   INDL      C      D     STD   

           tic  ... xoptepsqpy xoptepsy xoptqpy xopty xrdy   xs

In [25]:
# Trim CUSIP to 8 chars and filter to industrial/consolidated
df_comp['cusip'] = df_comp['cusip'].astype(str).str[:8]
df_comp = df_comp[
    (df_comp['indfmt'] == 'INDL') &
    (df_comp['consol'] == 'C')
]

# Pick three example fundamentals
fund_cols = ['revty', 'saley', 'capxy']  ########################## why choose only 3 columns?

# fund_cols = df_comp.columns

In [26]:
# Sanity check
missing = [c for c in fund_cols if c not in df_comp.columns]
if missing:
    raise KeyError(f"These Compustat codes are missing: {missing}")

In [27]:
df_comp = df_comp[['cusip','datadate'] + fund_cols].drop_duplicates()

### MERGE FEATURES 

In [46]:
df_crsp['date'] = df_crsp['MthCalDt']
df_crsp['cusip'] = df_crsp['CUSIP']

df_crsp = df_crsp.dropna(subset=['cusip', 'date'])
df_crsp['cusip'] = df_crsp['cusip'].astype(str).str[:8]
df_crsp['date']  = pd.to_datetime(df_crsp['date'])
df_crsp = df_crsp.set_index('date').sort_index()



# df_comp = df_comp.dropna(subset=['cusip', 'datadate'])
df_comp['cusip'] = df_comp['cusip'].astype(str).str[:8]
# df_comp['datadate'] = pd.to_datetime(df_comp['datadate'])

# # Rename and set index
# df_comp = df_comp.rename(columns={'datadate':'date'})
# df_comp = df_comp.set_index('date').sort_index()


df_merged = pd.merge_asof(
    left=df_crsp,
    right=df_comp[['cusip'] + fund_cols],
    left_index=True,
    right_index=True,
    by='cusip',
    direction='backward',
    allow_exact_matches=True
).reset_index()  # brings 'date' back as a column

df_merged = df_merged.dropna(subset=fund_cols + ['y'])



df_merged = df_merged.drop('MthCalDt', axis=1)
df_merged = df_merged.drop('cusip', axis=1)
# df_merged = df_merged.drop('column_name', axis=1)




df_merged['month'] = df_merged['date'].dt.month

# One-hot encode the month column
month_dummies = pd.get_dummies(df_merged['month'], prefix='month')
df_merged = pd.concat([df_merged, month_dummies], axis=1)

# Drop the original month column
df_merged.drop(columns=['month'], inplace=True)


cols = [col for col in df_merged.columns if col != 'y'] + ['y']
df_merged = df_merged[cols]

In [52]:
df_merged.head()

# from pathlib import Path

# # Set current directory
# current_dir = Path(__file__).parent

# df_merged.to_csv(current_dir / 'output_data.csv', index=False)

df_merged.to_csv('output_data.csv', index=False)


### Training

In [48]:
features = fund_cols
X = df_merged[features]
y = df_merged['y']

# Chronological 80/20 split
cut = int(len(df_merged)*0.8)
X_train, X_test = X.iloc[:cut], X.iloc[cut:]
y_train, y_test = y.iloc[:cut], y.iloc[cut:]


In [49]:
pipe = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale',  StandardScaler()),
    ('clf',    RandomForestClassifier(
        n_estimators=100,
        max_depth=5,
        random_state=42,
        n_jobs=-1
    ))
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("Classification Report on Test Set:\n")
print(classification_report(y_test, y_pred))

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.