In [1]:
import getpass
import os
from itertools import combinations_with_replacement
from urllib.parse import quote_plus

from sqlalchemy import create_engine
import pymssql
import pandas as pd
import numpy as np
import plotly.graph_objects as go

from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from scipy.stats import uniform


In [2]:
# Degree-2 polynomial expansion (squares and pairwise products) with no constant column.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Quantile transformer whose output follows a normal distribution.
qt = QuantileTransformer(output_distribution="normal")

# One-hot encode the "industry" column; every other column is passed through unchanged.
industry_encoder = (OneHotEncoder(), ["industry"])
transform = make_column_transformer(industry_encoder, remainder="passthrough")

In [3]:
server = "mssql-82792-0.cloudclusters.net:16272"
username = "user"
database = "ghz"

# Credentials must not be stored in the notebook. Read the password from the
# GHZ_DB_PASSWORD environment variable, or prompt for it when the variable is unset.
password = os.environ.get("GHZ_DB_PASSWORD") or getpass.getpass("Database password: ")

# quote_plus keeps characters such as "@" or "/" in the password from corrupting the URL.
string = f"mssql+pymssql://{username}:{quote_plus(password)}@{server}/{database}"

conn = create_engine(string).connect()

In [4]:
# Pull the January 2000 cross-section: returns, three characteristics, market value and SIC code.
query = """
    select ticker, date, ret, roeq, bm, mom12m, mve, siccd
    from data
    where date='2000-01'
    order by date, ticker
    """
try:
    df = pd.read_sql(query, conn)
finally:
    # Release the connection even when the query fails, so it is never leaked.
    conn.close()

# Keep only rows with every field present.
df = df.dropna()

features = ["roeq", "bm", "mom12m"]
df = df.set_index(["date", "ticker"])

#### Follow third backtest

In [5]:
# Rank stocks by market value within each date (rank 1 = largest mve),
# then keep only those ranked below the 500 largest.
df["size_rnk"] = df.groupby("date")["mve"].rank(ascending=False)
df = df.loc[df["size_rnk"] > 500]

In [6]:
def qt_df(d):
    """Fit the shared ``qt`` transformer on ``d`` and return the transformed frame.

    The result keeps the row index and column labels of ``d``.
    """
    transformed = qt.fit_transform(d)
    return pd.DataFrame(transformed, index=d.index, columns=d.columns)


# Transform the feature columns separately within each date.
by_date = df.groupby("date", group_keys=False)
df[features] = by_date[features].apply(qt_df)

def qt_ser(s):
    """Fit the shared ``qt`` transformer on a single Series and return the result.

    The values are reshaped to one column because the transformer expects 2-D input;
    the returned Series carries the index of ``s``.
    """
    column = s.copy().to_numpy().reshape(-1, 1)
    values = qt.fit_transform(column).flatten()
    return pd.Series(values, index=s.index)


# The target is the return, transformed separately within each date.
df["target"] = df.groupby("date", group_keys=False)["ret"].apply(qt_ser)

In [7]:
# SIC-code ranges, indexed by industry name (the start/end columns are used by the lookup below).
inds = pd.read_csv("files/siccodes12.csv", index_col="industry")

# Distinct industry names in file order.
ind_names = inds.index.unique().tolist()

def industry(sic, ranges=None):
    """Return the industry whose [start, end] SIC range contains ``sic``.

    Parameters
    ----------
    sic : number
        SIC code to classify. NaN matches no range.
    ranges : pandas.DataFrame, optional
        Table indexed by industry name with ``start`` and ``end`` columns.
        Defaults to the notebook-level ``inds`` table.

    Returns
    -------
    The index label of the first matching row, or ``"Other"`` when no range matches.

    Notes
    -----
    The "no match" case is checked explicitly instead of catching every exception,
    so real problems (a missing ``inds`` table, a non-numeric code, a keyboard
    interrupt) are no longer silently turned into ``"Other"``.
    """
    if ranges is None:
        ranges = inds
    in_range = (ranges["start"] <= sic) & (sic <= ranges["end"])
    hits = ranges.index[in_range.to_numpy()]
    return hits[0] if len(hits) else "Other"
    
# Classify each ticker by its last observed SIC code.
siccds = (
    df.groupby("ticker").siccd.last()
    .map(industry)
    .rename("industry")
    .to_frame()
)

# Drop any "industry" column left by an earlier run of this cell; otherwise the
# merge would produce industry_x / industry_y instead of a single column.
df = (
    df.reset_index()
    .drop(columns="industry", errors="ignore")
    .merge(siccds, on="ticker", how="left")
    .set_index(["date", "ticker"])
)

# Guarded so that re-running the cell does not list the feature twice.
if "industry" not in features:
    features.append("industry")

#### Training data

In [8]:
# Design matrix (the columns listed in `features`) and the transformed-return target.
Xtrain, ytrain = df[features], df["target"]

#### Fit GridSearchCV for random forest

In [9]:
# Seed the forest so the cross-validated scores are the same on every run.
model = RandomForestRegressor(random_state=0)

# Encode industry -> polynomial expansion -> forest.
pipe = make_pipeline(transform, poly, model)

# Exhaustive search over tree depth 1..10 with the default cross-validation and scorer.
cv = GridSearchCV(
    pipe,
    param_grid={"randomforestregressor__max_depth": range(1, 11)},
)
_ = cv.fit(Xtrain, ytrain)

#### Plot results for random forest

In [10]:
# One row per grid point; column labels keep only the bare hyperparameter name.
results = (
    pd.DataFrame(cv.cv_results_["params"])
    .rename(columns=lambda name: name.split("__")[-1])
    .assign(score=cv.cv_results_["mean_test_score"])
)

marker_style = dict(size=12, line=dict(width=2, color="DarkSlateGrey"))
hover_text = """
            max depth = %{x}<br>
            mean score = %{y:.1%}<extra></extra>
            """

# Mean test score as a function of tree depth.
trace = go.Scatter(
    x=results["max_depth"],
    y=results["score"],
    mode="lines+markers",
    marker=marker_style,
    hovertemplate=hover_text,
)

fig = go.Figure(data=trace)
fig.update_layout(
    template="plotly_white",
    title="Random Forest",
    xaxis_title="Max Depth",
    xaxis_title_font_size=18,
    yaxis_title="Mean Test Score",
    yaxis_title_font_size=18,
    yaxis_tickformat=".0%",
    font_size=16,
)
fig.show()

#### Fit GridSearchCV for multi-layer perceptron

In [12]:
# Seed the network so weight initialisation, and therefore the scores, are reproducible.
# NOTE(review): the captured output shows at least one fit reaching max_iter=500 without
# converging; consider raising max_iter if those scores matter.
model = MLPRegressor(max_iter=500, random_state=0)

# Encode industry -> polynomial expansion -> network.
pipe = make_pipeline(transform, poly, model)

# Candidate architectures: every non-increasing combination of the listed
# widths, for 1 up to max_layers hidden layers.
max_layers = 3
neurons = [16, 8, 4]
hidden_layer_sizes = [
    sizes
    for n_layers in range(1, max_layers + 1)
    for sizes in combinations_with_replacement(neurons, n_layers)
]

cv = GridSearchCV(
    pipe,
    param_grid={"mlpregressor__hidden_layer_sizes": hidden_layer_sizes},
)
_ = cv.fit(Xtrain, ytrain)


Stochastic Optimizer: Maximum iterations (500) reached and the optimization hasn't converged yet.



#### Plot results for multi-layer perceptron

In [13]:
# One row per architecture; column labels keep only the bare hyperparameter name.
results = pd.DataFrame(cv.cv_results_["params"])
results.columns = [x.split("__")[-1] for x in results.columns]
results["score"] = cv.cv_results_["mean_test_score"]

# Count layers while the sizes are still tuples, then turn them into text for the x axis.
results["number_layers"] = results["hidden_layer_sizes"].map(len)
results["hidden_layer_sizes"] = results["hidden_layer_sizes"].map(repr)

# One trace per layer count, derived from the results themselves so the plot
# stays correct if the searched depths change.
traces = []
for n_layers, group in results.groupby("number_layers"):
    trace = go.Scatter(
        x=group.hidden_layer_sizes,
        y=group.score,
        mode="markers",
        marker=dict(size=12, line=dict(width=2, color="DarkSlateGrey")),
        name=f"{n_layers} layer",
        hovertemplate="""
            hidden layer sizes = %{x}<br>
            mean score = %{y:.1%}<extra></extra>
        """
    )
    traces.append(trace)

fig = go.Figure()
for trace in traces:
    fig.add_trace(trace)
fig.update_layout(
    title="Multi-Layer Perceptron",
    xaxis_title="Hidden Layer Sizes",
    yaxis_title="Mean Test Score",
    yaxis_tickformat=".0%",
    template="plotly_white",
    xaxis_title_font_size=18,
    yaxis_title_font_size=18,
    font_size=16
)
fig.show()

#### Fit RandomizedSearchCV for gradient boosting

In [18]:
# Seed the model so repeated runs give the same scores.
model = GradientBoostingRegressor(random_state=0)

# Encode industry -> polynomial expansion -> boosted trees.
pipe = make_pipeline(transform, poly, model)

# Learning rates are drawn uniformly from [0, 0.2].
u = uniform(scale=0.2)

# random_state fixes which 20 (learning_rate, max_depth) pairs are sampled.
cv = RandomizedSearchCV(
    pipe,
    param_distributions={
        "gradientboostingregressor__learning_rate": u,
        "gradientboostingregressor__max_depth": range(2, 12, 2),
    },
    n_iter=20,
    random_state=0,
)
_ = cv.fit(Xtrain, ytrain)

#### Plot results for gradient boosting

In [19]:
# One row per sampled parameter set; column labels keep only the bare hyperparameter name.
results = pd.DataFrame(cv.cv_results_["params"])
results.columns = [x.split("__")[-1] for x in results.columns]
results["score"] = cv.cv_results_["mean_test_score"]

# One trace per tree depth, plotted against the sampled learning rate.
depths = np.sort(results.max_depth.unique())
traces = []
for depth in depths:
    d = results[results.max_depth==depth]
    trace = go.Scatter(
        x=d.learning_rate,
        y=d.score,
        mode="markers",
        marker=dict(size=12, line=dict(width=2, color="DarkSlateGrey")),
        # Plotly has no %{depth} hover variable, so the depth is interpolated by
        # Python here; the doubled braces keep %{x} / %{y} intact for Plotly.
        hovertemplate=f"""
            max depth = {depth}<br>
            learning rate = %{{x:.2f}}<br>
            mean score = %{{y:.1%}}<extra></extra>
            """,
        name = f"depth={depth}"
    )
    traces.append(trace)

fig = go.Figure()
for trace in traces:
    fig.add_trace(trace)
fig.update_layout(
    title="Gradient Boosting Regressor",
    xaxis_title="Learning Rate",
    yaxis_title="Mean Test Score",
    yaxis_tickformat=".0%",
    template="plotly_white",
    xaxis_title_font_size=18,
    yaxis_title_font_size=18,
    font_size=16
)
fig.show()

#### Fit RandomizedSearchCV for adaptive boosting

In [20]:
# The base tree's max_depth=1 is only a starting value: the search below overrides it.
# random_state makes the boosting reproducible.
model = AdaBoostRegressor(
    estimator=DecisionTreeRegressor(max_depth=1),
    random_state=0,
)

# Encode industry -> polynomial expansion -> boosted trees.
pipe = make_pipeline(transform, poly, model)

# Learning rates are drawn uniformly from [0, 0.2].
u = uniform(scale=0.2)

# random_state fixes which 20 (learning_rate, max_depth) pairs are sampled.
cv = RandomizedSearchCV(
    pipe,
    param_distributions={
        "adaboostregressor__learning_rate": u,
        "adaboostregressor__estimator__max_depth": range(2, 12, 2),
    },
    n_iter=20,
    random_state=0,
)
_ = cv.fit(Xtrain, ytrain)

#### Plot results for adaptive boosting

In [21]:
# One row per sampled parameter set; column labels keep only the bare hyperparameter name.
results = pd.DataFrame(cv.cv_results_["params"])
results.columns = [x.split("__")[-1] for x in results.columns]
results["score"] = cv.cv_results_["mean_test_score"]

# One trace per base-tree depth, plotted against the sampled learning rate.
depths = np.sort(results.max_depth.unique())
traces = []
for depth in depths:
    d = results[results.max_depth==depth]
    trace = go.Scatter(
        x=d.learning_rate,
        y=d.score,
        mode="markers",
        marker=dict(
            size=12,
            line=dict(
                width=2,
                color="DarkSlateGrey"
            )
        ),
        # Plotly has no %{depth} hover variable, so the depth is interpolated by
        # Python here; the doubled braces keep %{x} / %{y} intact for Plotly.
        hovertemplate=f"""
            max depth = {depth}<br>
            learning rate = %{{x:.2f}}<br>
            mean score = %{{y:.1%}}<extra></extra>
            """,
        name = f"depth={depth}"
    )
    traces.append(trace)

fig = go.Figure()
for trace in traces:
    fig.add_trace(trace)
fig.update_layout(
    title="AdaBoost Regressor",
    xaxis_title="Learning Rate",
    yaxis_title="Mean Test Score",
    yaxis_tickformat=".0%",
    template="plotly_white",
    xaxis_title_font_size=18,
    yaxis_title_font_size=18,
    font_size=16
)
fig.show()