In [None]:
import pandas as pd
import seaborn as sns

from pathlib import Path
from tqdm.notebook import tqdm
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot as plt

# Let wide/long frames render up to 100 columns/rows before pandas truncates the display.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# Reload imported modules automatically before each cell executes, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# --- Paths -------------------------------------------------------------------
# Input files are looked up in a "data" directory under the current working directory.
DATA_PATH = Path.cwd() / "data"

# --- Column names ------------------------------------------------------------
stock_id_col = "PERMNO"   # column used to group rows per stock
target_col = "returns"    # column the model predicts
feature_cols = [
    "marketCapitalization",
    "momentum",
    "beta",
    "idiosyncraticVolatility",
    "bm",
    "ps",
    "pcf",
    "dpr",
    "roe",
    "GProf",
    "capital_ratio",
    "invt_act",
    "debt_ebitda",
    "fcf_ocf",
    "de_ratio",
    "cash_ratio",
    "at_turn",
    "rd_sale",
    "staff_sale",
    "PEG_trailing",
]

# --- Time split --------------------------------------------------------------
split_fraction = 0.7
cutoff_train = 1987   # rows with year < cutoff_train are tagged "train"
cutoff_valid = 1997   # rows with cutoff_train <= year < cutoff_valid are tagged "valid"
start_year = 1997
max_year = 2021       # exclusive upper bound of the evaluation-year range

In [None]:
def impute_na(df, feature_cols, group_col, how="mean"):
    """Fill missing feature values group by group, then drop rows that still contain NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first, so the caller's frame is not modified.
    feature_cols : iterable of str
        Columns whose missing values are filled.
    group_col : str or list of str
        Key(s) handed to ``groupby``; the fill statistic is computed within
        each group separately.
    how : str, default "mean"
        ``"mean"`` fills with the group's mean, ``"median"`` with the group's
        median. Any other value fills with 0.

    Returns
    -------
    pandas.DataFrame
        The imputed copy with every row that still holds a NaN in *any*
        column removed, and the index reset to 0..n-1.
    """

    def _fill_group(values):
        # `values` is one feature column restricted to a single group.
        if how == "mean":
            return values.fillna(values.mean())
        if how == "median":
            return values.fillna(values.median())
        return values.fillna(0)

    imputed = df.copy()
    for column in feature_cols:
        imputed[column] = imputed.groupby(group_col)[column].transform(_fill_group)

    # A group whose values are all missing has a NaN mean/median, so its rows
    # stay NaN after the fill and are removed here, together with rows that
    # have missing values in columns outside `feature_cols`.
    return imputed.dropna().reset_index(drop=True)


def split_X_y(df, feature_cols, target_col):
    """Separate a frame into its feature matrix and target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the feature columns and the target column.
    feature_cols : list of str
        Columns selected as features.
    target_col : str
        Column selected as the target.

    Returns
    -------
    tuple
        ``(df[feature_cols], df[target_col])``.
    """
    return df[feature_cols], df[target_col]

In [None]:
# Load the raw stock data. The column at position 1 is parsed as datetime
# (presumably the `date` column used further down -- confirm against the CSV header).
df_stock = pd.read_csv(
    DATA_PATH / "US Stock Data.csv",
    parse_dates=[1],
)

# Wrap the two helper functions as transformers with their column arguments bound.
imputer = FunctionTransformer(
    impute_na,
    kw_args=dict(feature_cols=feature_cols, group_col=stock_id_col),
)
splitter = FunctionTransformer(
    split_X_y,
    kw_args=dict(feature_cols=feature_cols, target_col=target_col),
)

In [None]:
# Preview the first rows of the loaded frame.
df_stock.head()

In [None]:
# Random forests are stochastic (bootstrap samples and random feature subsets),
# so the seed is pinned to make Restart & Run All reproduce the same scores.
RANDOM_SEED = 42

model = RandomForestRegressor(
    n_estimators=100,
    max_depth=6,
    min_samples_split=100,
    max_features="sqrt",
    random_state=RANDOM_SEED,
)

# Accumulators filled by the evaluation loop.
val_r2 = {}      # evaluation year -> R^2 of the model on that year
pred_list = []   # one list of predictions per evaluation year
true_list = []   # one list of realised targets per evaluation year

In [None]:
# Walk-forward evaluation with an expanding training window.
#
# For every evaluation year the model is refit on all rows strictly before the
# training cutoff and scored on that single year. Both cutoffs advance by one
# year per iteration. They are derived from the loop variable rather than by
# incrementing the config values `cutoff_train` / `cutoff_valid`, so this cell
# can be re-run without first re-running the config cell.

# Year of each observation, computed once instead of per row on every iteration.
obs_year = df_stock["date"].dt.year

# Start from empty accumulators so a re-run does not append duplicate results.
val_r2.clear()
pred_list.clear()
true_list.clear()

for test_year in tqdm(range(cutoff_valid, max_year), desc="walk-forward"):
    # Number of years the window has advanced since the first evaluation year.
    train_end = cutoff_train + (test_year - cutoff_valid)

    train = df_stock[obs_year < train_end]
    validation = df_stock[(obs_year >= train_end) & (obs_year < test_year)]
    test_next = df_stock[obs_year == test_year].reset_index(drop=True)

    # Each subset is imputed from its own per-stock statistics.
    train = imputer.fit_transform(train)
    validation = imputer.fit_transform(validation)
    test_next = imputer.fit_transform(test_next)

    X_train, y_train = splitter.fit_transform(train)
    # The validation split is not used for fitting or scoring in this cell; it
    # is still built so that X_valid / y_valid exist after the loop.
    X_valid, y_valid = splitter.fit_transform(validation)
    X_test_next, y_test_next = splitter.fit_transform(test_next)

    model.fit(X_train, y_train)
    val_r2[test_year] = model.score(X_test_next, y_test_next)
    pred_list.append(list(model.predict(X_test_next)))
    true_list.append(list(y_test_next))

In [None]:
# Flatten the per-year prediction lists into one sequence.
y_pred = []
for yearly_pred in pred_list:
    y_pred.extend(yearly_pred)

# Flatten the per-year realised targets the same way.
y_true = []
for yearly_true in true_list:
    y_true.extend(yearly_true)

# R^2 pooled over all evaluation years, plus a table with one R^2 per year.
overall_r2 = r2_score(y_true, y_pred)
r2_scores = pd.DataFrame.from_dict(
    val_r2,
    orient="index",
    columns=["r2"],
)

In [None]:
# Feature importances of the most recently fitted model, highest first.
# Built in one constructor call so the column is numeric (float) rather than the
# object dtype produced by filling an empty frame one row at a time.
importances = pd.DataFrame(
    {"importance": model.feature_importances_},
    index=X_train.columns,
).sort_values("importance", ascending=False)

In [None]:
# Bar chart of the feature importances, in the order of the `importances` table.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(list(importances.index), importances["importance"].astype(float))
# Feature names are long; rotate them so they do not overlap.
ax.tick_params(axis="x", labelrotation=90)
ax.set_xlabel("feature")
ax.set_ylabel("importance")
ax.set_title("Random forest feature importances")
plt.show()

In [None]:
# Pairwise correlation between the features of the last training window.
feature_corr = X_train.corr()
sns.heatmap(feature_corr)
plt.show()