In [1]:
import os
from urllib.parse import quote

import numpy as np
import pandas as pd
import pymssql
from sqlalchemy import create_engine

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Estimator: ordinary least squares on a degree-2 polynomial expansion of the
# features. `pipe` chains the expansion and the regression so both are fitted
# and saved together.
model = LinearRegression()
# Shared transformer that maps values to a normal distribution by quantile.
# random_state is pinned because QuantileTransformer draws random numbers
# (subsampling of large inputs), which would otherwise make reruns differ.
qt = QuantileTransformer(output_distribution="normal", random_state=0)
poly = PolynomialFeatures(degree=2)
pipe = make_pipeline(poly, model)

from joblib import dump

In [2]:
# Connection settings come from the environment so that credentials are never
# stored in the notebook. Each falls back to an empty string when the
# corresponding variable is unset.
server = os.environ.get("MSSQL_SERVER", "")
username = os.environ.get("MSSQL_USERNAME", "")
password = os.environ.get("MSSQL_PASSWORD", "")
dfbase = os.environ.get("MSSQL_DATABASE", "")  # name of the database to open

# Percent-encode the user and password: characters such as "@", ":" or "/"
# would otherwise be read as delimiters when the URL is parsed.
string = (
    "mssql+pymssql://"
    + quote(username, safe="")
    + ":"
    + quote(password, safe="")
    + "@"
    + server
    + "/"
    + dfbase
)
conn = create_engine(string).connect()

#### Get data

In [3]:
# Columns pulled from the `data` table for rows with date >= '2000-01',
# ordered by date and then ticker.
query = """
    select ticker, date, ret, bm, mom12m, roeq, mve
    from data
    where date>='2000-01'
    order by date, ticker
    """

# Close the connection whether or not the query succeeds, so a failed read
# does not leave it open.
try:
    df = pd.read_sql(query, conn)
finally:
    conn.close()

# Keep only rows where every selected column is present.
df = df.dropna()

# Columns used as model inputs by the later cells.
features = ["bm", "mom12m", "roeq"]
df = df.set_index(["date", "ticker"])

#### Drop largest 500 stocks each month

In [None]:
# Rank `mve` within each date, largest value first (rank 1 = largest), then
# keep only the rows ranked below the top 500.
df["size_rnk"] = df.groupby("date")["mve"].rank(ascending=False)
df = df.loc[df["size_rnk"] > 500]

#### Transform features each month

In [None]:
def qt_df(d):
    """Quantile-transform every column of one group.

    Refits the shared transformer `qt` on `d` and returns the transformed
    values as a DataFrame carrying the same index and column labels as `d`.
    """
    transformed = qt.fit_transform(d)
    return pd.DataFrame(transformed, index=d.index, columns=d.columns)


# Apply the transform date by date, so `qt` is refitted on each date's rows.
df[features] = (
    df.groupby("date", group_keys=False)[features]
    .apply(qt_df)
)

#### Transform target each month

In [None]:
def qt_ser(s):
    """Quantile-transform a single Series.

    Works on a copy of `s` reshaped into one column (the transformer is given
    2-D input), refits the shared transformer `qt` on it, and returns the
    flattened result as a Series indexed like `s`.
    """
    column = s.copy().to_numpy().reshape(-1, 1)
    transformed = qt.fit_transform(column)
    return pd.Series(transformed.flatten(), index=s.index)


# Build the regression target from `ret`, transformed date by date.
df["target"] = (
    df.groupby("date", group_keys=False)["ret"]
    .apply(qt_ser)
)

#### Train and save

In [None]:
# Fit the polynomial-regression pipeline on the transformed features and
# target.
X = df[features]
y = df["target"]
pipe.fit(X, y)

# Create the output directory first, so the fitted model is not lost to a
# missing-directory error when it is saved.
os.makedirs("files", exist_ok=True)
dump(pipe, "files/linear_model_2023-01-20.joblib")

['files/linear_model_2023-01-20.joblib']