In [None]:
import os
from getpass import getpass
from itertools import combinations_with_replacement
from urllib.parse import quote_plus

from sqlalchemy import create_engine
import pymssql
import pandas as pd
import numpy as np
import plotly.graph_objects as go

from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

from scipy.stats import spearmanr
from sklearn.metrics import make_scorer
from scipy.stats import uniform

In [None]:
# Quantile transformer whose output follows a normal distribution; it is
# refitted on each group of rows by qt_df / qt_ser.
qt = QuantileTransformer(output_distribution="normal")

# All degree-2 polynomial terms of the model inputs, without a constant column.
poly = PolynomialFeatures(include_bias=False, degree=2)

# One-hot encode "industry"; every other column passes through untouched.
transform = make_column_transformer(
    (OneHotEncoder(), ["industry"]),
    remainder="passthrough",
)

In [None]:
# Connection settings for the SQL Server database.
server = "mssql-82792-0.cloudclusters.net:16272"
username = "user"
database = "ghz"

# The password is not stored in the notebook: it is read from the
# GHZ_DB_PASSWORD environment variable, falling back to an interactive prompt
# whose input is not echoed.
password = os.environ.get("GHZ_DB_PASSWORD") or getpass("Database password: ")

# Percent-encode the credentials so characters such as "@", ":" or "/" in
# them cannot corrupt the connection URL.
string = (
    "mssql+pymssql://"
    + quote_plus(username) + ":" + quote_plus(password)
    + "@" + server + "/" + database
)

conn = create_engine(string).connect()

In [None]:
# Load every row dated before 2005-01, discard rows containing any missing
# value, then release the database connection.
df = pd.read_sql(
    """
    select ticker, date, ret, roeq, bm, mom12m, mve, siccd
    from data
    where date<'2005-01'
    order by date, ticker
    """,
    con=conn,
).dropna()
conn.close()

# Each row is identified by its (date, ticker) pair.
df = df.set_index(["date", "ticker"])

# Numeric predictor columns; more entries may be appended further down.
features = ["roeq", "bm", "mom12m"]

In [None]:
# Rank rows within each date by `mve`, with rank 1 given to the largest value.
df["size_rnk"] = df.groupby("date")["mve"].rank(ascending=False)

# Keep only rows ranked beyond 500 on their date. The explicit copy makes the
# result an independent frame, so the column assignments performed on `df`
# afterwards do not raise pandas' SettingWithCopyWarning (emitted by pandas
# versions without copy-on-write when writing into a filtered slice).
df = df[df["size_rnk"] > 500].copy()

In [None]:
def qt_df(d):
    """Quantile-transform every column of ``d`` with the shared ``qt``.

    ``qt`` is refitted on ``d`` itself, and the transformed values are
    returned as a new DataFrame with the same index and column labels.
    """
    return pd.DataFrame(
        qt.fit_transform(d),
        index=d.index,
        columns=d.columns,
    )

# Replace the feature columns by their quantile-transformed values, fitting
# the transformer separately on each date's rows.
df[features] = (
    df.groupby("date", group_keys=False)[features]
    .apply(qt_df)
)

def qt_ser(s):
    """Quantile-transform a single Series with the shared ``qt``.

    The values are copied into a one-column 2-D array, ``qt`` is refitted on
    that array, and the result is returned as a new unnamed Series carrying
    the index of ``s``.
    """
    column = s.to_numpy(copy=True).reshape(-1, 1)
    transformed = qt.fit_transform(column)
    return pd.Series(transformed.ravel(), index=s.index)

# The prediction target is `ret` after the same per-date quantile transform.
df["target"] = (
    df.groupby("date", group_keys=False)["ret"]
    .apply(qt_ser)
)

In [None]:
# Industry lookup table, indexed by industry name (the `industry` function
# reads its `start` and `end` columns).
inds = pd.read_csv(
    "files/siccodes12.csv",
    index_col="industry",
)

# Distinct industry names in order of first appearance.
ind_names = inds.index.unique().tolist()

def industry(sic, table=None):
    """Return the name of the industry whose SIC range contains ``sic``.

    Parameters
    ----------
    sic : number
        SIC code to classify.
    table : pandas.DataFrame, optional
        Lookup table indexed by industry name with ``start`` and ``end``
        columns giving inclusive range bounds. Defaults to the module-level
        ``inds`` table.

    Returns
    -------
    The index label of the first row whose range contains ``sic``, or
    ``"Other"`` when no row matches (which includes a NaN ``sic``) or when
    ``sic`` cannot be compared with the bounds.
    """
    if table is None:
        table = inds
    try:
        in_range = (table["start"] <= sic) & (sic <= table["end"])
    except TypeError:
        # A code that is not comparable with the numeric bounds is treated as
        # unclassified rather than aborting the whole mapping.
        return "Other"
    matches = table[in_range].index
    # Explicit emptiness check instead of a bare ``except``, so unrelated
    # errors (and KeyboardInterrupt) are no longer silently swallowed.
    return matches[0] if len(matches) > 0 else "Other"
    
# One industry label per ticker, derived from the last `siccd` value
# observed for that ticker.
siccds = (
    df.groupby("ticker")["siccd"]
    .last()
    .map(industry)
    .to_frame(name="industry")
)

# Attach the label to every row of the panel and register it as a feature.
df = df.reset_index().merge(siccds, how="left", on="ticker")
features.append("industry")

In [None]:
# Split by date: rows before 2004-01 are used for fitting, rows from 2004-01
# onwards for evaluation.
train = df.loc[df["date"] < "2004-01"]
test = df.loc[df["date"] >= "2004-01"]

# Model inputs and target for each part.
Xtrain, ytrain = train[features], train["target"]
Xtest, ytest = test[features], test["target"]

In [None]:
def _spearman_statistic(y_true, y_pred):
    """Spearman rank-correlation statistic between targets and predictions."""
    return spearmanr(y_true, y_pred).statistic


# Scorer wrapping the rank correlation above; higher values are better.
scorer = make_scorer(_spearman_statistic, greater_is_better=True)

In [26]:
# Fit one random forest per candidate maximum depth and record its score on
# the test rows.
depths = [2, 3, 4, 6, 8, 10, 15]
scores = []

for depth in depths:
    # A fixed random_state makes the fitted forest, and therefore the plotted
    # score, identical on every run of the notebook.
    model = RandomForestRegressor(max_depth=depth, random_state=0)
    pipe = make_pipeline(transform, poly, model)
    pipe.fit(Xtrain, ytrain)
    scores.append(scorer(pipe, Xtest, ytest))

In [None]:
# Line chart of the test score obtained at each maximum tree depth.
trace = go.Scatter(
    x=depths,
    y=scores,
    mode="lines+markers",
    marker={"size": 12, "line": {"width": 2, "color": "DarkSlateGrey"}},
    hovertemplate="""
            max depth = %{x}<br>
            test score = %{y:.2%}<extra></extra>
            """,
)

fig = go.Figure(data=trace)

# Titles, percentage tick labels on the score axis, and font sizes.
fig.update_layout(
    title="Random Forest",
    xaxis_title="Max Depth",
    yaxis_title="Test Score",
    yaxis_tickformat=".1%",
    template="plotly_white",
    xaxis_title_font_size=18,
    yaxis_title_font_size=18,
    font_size=16,
)
fig.show()

In [None]:
# Candidate architectures: for 1 up to `max_layers` hidden layers, every
# combination (with repetition) of the layer widths listed in `neurons`.
max_layers = 4
neurons = [8, 4, 2]
hidden_layers = [
    layers
    for n_layers in range(1, max_layers + 1)
    for layers in combinations_with_replacement(neurons, n_layers)
]

# Fit one network per architecture and record its score on the test rows.
scores = []
for layers in hidden_layers:
    # A fixed random_state makes the weight initialisation, and therefore the
    # plotted score, identical on every run of the notebook.
    model = MLPRegressor(
        hidden_layer_sizes=layers,
        max_iter=500,
        random_state=0,
    )
    pipe = make_pipeline(transform, poly, model)
    pipe.fit(Xtrain, ytrain)
    scores.append(scorer(pipe, Xtest, ytest))

In [None]:
# Tabulate each architecture's score with its layer count and a printable
# label of its layer sizes.
number_layers = list(map(len, hidden_layers))
hidden_layer_sizes = list(map(repr, hidden_layers))

d = pd.DataFrame(
    {
        "score": scores,
        "numlayers": number_layers,
        "hiddenlayers": hidden_layer_sizes,
    }
)

# One marker series per layer count so each gets its own legend entry.
traces = []
for i in range(1, max_layers + 1):
    same_count = d[d["numlayers"] == i]
    trace = go.Scatter(
        x=same_count["hiddenlayers"],
        y=same_count["score"],
        mode="markers",
        marker={"size": 12, "line": {"width": 2, "color": "DarkSlateGrey"}},
        hovertemplate="""
            hidden layer sizes = %{x}<br>
            test score = %{y:.2%}<extra></extra>
            """,
        name=f"{i} layers",
    )
    traces.append(trace)

fig = go.Figure(data=traces)

# Titles, percentage tick labels on the score axis, and axis-title font sizes.
fig.update_layout(
    title="Multi-Layer Perceptron",
    xaxis_title="Hidden Layer Sizes",
    yaxis_title="Test Score",
    yaxis_tickformat=".1%",
    template="plotly_white",
    xaxis_title_font_size=18,
    yaxis_title_font_size=18,
)
fig.show()

In [None]:
# Hyperparameter grid: each tree depth paired with each learning rate. The
# two lists are kept aligned element by element because the plotting cell
# reads them back alongside `scores`.
depths = [3]*4 + [6]*4 + [9]*4
learning_rates = [0.01, 0.05, 0.1, 0.2] * 3

# Fit one gradient-boosting model per grid point and record its test score.
scores = []
for depth, lr in zip(depths, learning_rates):
    model = GradientBoostingRegressor(
        max_depth=depth,
        learning_rate=lr,
        # Fixed seed so repeated runs of the notebook give the same scores.
        random_state=0,
    )
    pipe = make_pipeline(transform, poly, model)
    pipe.fit(Xtrain, ytrain)
    scores.append(scorer(pipe, Xtest, ytest))

In [None]:
# Collect the grid results into one table for plotting.
d = pd.DataFrame(
    {"score": scores, "depth": depths, "rate": learning_rates}
)

# One line per tree depth: test score against learning rate.
traces = []
for depth in [3, 6, 9]:
    d2 = d[d["depth"] == depth]
    trace = go.Scatter(
        x=d2["rate"],
        y=d2["score"],
        text=[depth] * 4,
        mode="lines+markers",
        marker={"size": 12, "line": {"width": 2, "color": "DarkSlateGrey"}},
        hovertemplate="""
            max depth = %{text}<br>
            learning rate = %{x:.3f}<br>
            test score = %{y:.2%}<extra></extra>
            """,
        name=f"depth={depth}",
    )
    traces.append(trace)

fig = go.Figure(data=traces)

# Titles, percentage tick labels on the score axis, and font sizes.
fig.update_layout(
    title="Gradient Boosting",
    xaxis_title="Learning Rate",
    yaxis_title="Test Score",
    yaxis_tickformat=".1%",
    template="plotly_white",
    xaxis_title_font_size=18,
    yaxis_title_font_size=18,
    font_size=16,
)
fig.show()

In [None]:
# Hyperparameter grid, restated here so this cell does not depend on which
# earlier cell last assigned `depths` (the random-forest cell uses the same
# name for a different list). The values match the gradient-boosting grid.
depths = [3]*4 + [6]*4 + [9]*4
learning_rates = [0.01, 0.05, 0.1, 0.2] * 3

# Fit one AdaBoost model per grid point, boosting decision trees of the given
# depth, and record its test score.
scores = []
for depth, lr in zip(depths, learning_rates):
    model = AdaBoostRegressor(
        estimator=DecisionTreeRegressor(max_depth=depth),
        learning_rate=lr,
        # Fixed seed so repeated runs of the notebook give the same scores.
        random_state=0,
    )
    pipe = make_pipeline(transform, poly, model)
    pipe.fit(Xtrain, ytrain)
    scores.append(scorer(pipe, Xtest, ytest))

In [None]:
# Collect the grid results into one table for plotting.
d = pd.DataFrame(
    {"score": scores, "depth": depths, "rate": learning_rates}
)

# One line per base-tree depth: test score against learning rate.
traces = []
for depth in [3, 6, 9]:
    d2 = d[d["depth"] == depth]
    trace = go.Scatter(
        x=d2["rate"],
        y=d2["score"],
        text=[depth] * 4,
        mode="lines+markers",
        marker={"size": 12, "line": {"width": 2, "color": "DarkSlateGrey"}},
        hovertemplate="""
            max depth = %{text}<br>
            learning rate = %{x:.3f}<br>
            test score = %{y:.2%}<extra></extra>
            """,
        name=f"depth={depth}",
    )
    traces.append(trace)

fig = go.Figure(data=traces)

# Titles, percentage tick labels on the score axis, and font sizes.
fig.update_layout(
    title="Adaptive Boosting",
    xaxis_title="Learning Rate",
    yaxis_title="Test Score",
    yaxis_tickformat=".1%",
    template="plotly_white",
    xaxis_title_font_size=18,
    yaxis_title_font_size=18,
    font_size=16,
)
fig.show()