In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, SGDRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
df = pd.read_csv('data/processed_data.csv', index_col=0)

  mask |= (ar1 == a)


In [3]:
X = df.loc[:,['return', 'gain_loss', 'macd', 'macd_diff_signal', \
              'diff_day', 'diff_10day', 'diff_ema12', 'diff_ema26',\
              'Exchange_NGM', 'Exchange_NMS', 'Exchange_NYQ', \
              'Sector_Communication Services', 'Sector_Consumer Cyclical', \
              'Sector_Consumer Defensive', 'Sector_Energy', \
              'Sector_Financial Services', 'Sector_Healthcare', 'Sector_Industrials', \
              'Sector_Real Estate', 'Sector_Technology', 'Sector_Utilities', \
              'cap_grouping_medium', 'cap_grouping_small']]
y = df.loc[:,['next_close']]

In [4]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
#model = DecisionTreeRegressor()
#model.fit(X_train, y_train)

#y_pred = model.predict(X_test)

In [6]:
from sklearn.metrics import r2_score

#score = r2_score(y_pred, y_test)
#print(score)

In [7]:
#model = Lasso(alpha=0.1)
#model.fit(X_train, y_train)

#y_pred = model.predict(X_test)

In [8]:
#score = r2_score(y_pred, y_test)
#print(score)

In [9]:
#model = SGDRegressor()
#model.fit(X_train, y_train.to_numpy().ravel())

#y_pred = model.predict(X_test)

In [10]:
#score = r2_score(y_pred, y_test)
#print(score)

In [11]:
#model = RandomForestRegressor(n_jobs=-1)
#model.fit(X_train, y_train.to_numpy().ravel())

#y_pred = model.predict(X_test)

In [12]:
#score = r2_score(y_pred, y_test)
#print(score)

In [13]:
#Trying to use TimeSeriesSplit instead of train_test_split
tscv = TimeSeriesSplit()

train_size = round(len(X) * 0.8)
X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]
y_train = y.to_numpy().ravel()[:train_size]
y_test = y.to_numpy().ravel()[train_size:]

In [14]:
#Scaling because it may be important in some regressors
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
for i, (train_idx, test_idx) in enumerate(tscv.split(X_train)):
    model = DecisionTreeRegressor()
    model.fit(X.iloc[train_idx], y.to_numpy().ravel()[train_idx])

    y_pred = model.predict(X.iloc[test_idx])
    score = r2_score(y_pred, y.to_numpy().ravel()[test_idx])
    print(f"Fold number: {i+1}")
    print(f"R2 score is: {score}")

Fold number: 1
R2 score is: 0.9972395793513694
Fold number: 2
R2 score is: 0.997961125671099
Fold number: 3
R2 score is: 0.9974280021119938
Fold number: 4
R2 score is: 0.9982680002750909
Fold number: 5
R2 score is: 0.6728916945533914


Decision Tree seemed feasible, but for some reason, it does not fit the last fold very well (arguably the most important).

In [16]:
for i, (train_idx, test_idx) in enumerate(tscv.split(X_train)):
    model = Lasso(alpha=0.1)
    model.fit(X.iloc[train_idx], y.to_numpy().ravel()[train_idx])

    y_pred = model.predict(X.iloc[test_idx])
    score = r2_score(y_pred, y.to_numpy().ravel()[test_idx])
    print(f"Fold number: {i+1}")
    print(f"R2 score is: {score}")

Fold number: 1
R2 score is: 0.9994420837225217
Fold number: 2
R2 score is: 0.9992624240665585
Fold number: 3
R2 score is: 0.9990557312896744
Fold number: 4
R2 score is: 0.9993213598832869
Fold number: 5
R2 score is: 0.9994920161796849


It seems like Lasso is a strong candidate for the final model, it fits well on every fold, and fits the best on the final fold even.

In [17]:
for i, (train_idx, test_idx) in enumerate(tscv.split(X_train)):
    model = SGDRegressor(alpha=0.1)
    model.fit(X.iloc[train_idx], y.to_numpy().ravel()[train_idx])

    y_pred = model.predict(X.iloc[test_idx])
    score = r2_score(y_pred, y.to_numpy().ravel()[test_idx])
    print(f"Fold number: {i+1}")
    print(f"R2 score is: {score}")

Fold number: 1
R2 score is: -0.27580306551099754
Fold number: 2
R2 score is: -0.5803813470061407
Fold number: 3
R2 score is: -1.148517732353508
Fold number: 4
R2 score is: -0.013787860724928436
Fold number: 5
R2 score is: -0.015929998643702703


SGDRegression does not seem to work well at all.

In [18]:
for i, (train_idx, test_idx) in enumerate(tscv.split(X_train)):
    model = RandomForestRegressor(n_jobs=-1)
    model.fit(X.iloc[train_idx], y.to_numpy().ravel()[train_idx])

    y_pred = model.predict(X.iloc[test_idx])
    score = r2_score(y_pred, y.to_numpy().ravel()[test_idx])
    print(f"Fold number: {i+1}")
    print(f"R2 score is: {score}")

Fold number: 1
R2 score is: 0.997986939080824
Fold number: 2
R2 score is: 0.9990818783835307
Fold number: 3
R2 score is: 0.9988817930186307
Fold number: 4
R2 score is: 0.9991873488570192
Fold number: 5
R2 score is: 0.6796619140774616


Between the large training time, and the poor fitting in the last fold, Random Forest would not work.

In [22]:
#Lets do some grid search for Lasso
for alpha in [0.001, 0.01, 0.1]:
    for i, (train_idx, test_idx) in enumerate(tscv.split(X_train)):
        model = Lasso(alpha=alpha)
        model.fit(X.iloc[train_idx], y.to_numpy().ravel()[train_idx])

        y_pred = model.predict(X.iloc[test_idx])
        score = r2_score(y_pred, y.to_numpy().ravel()[test_idx])
        print(f"Fold number: {i+1}")
        print(f"Alpha is: {alpha}")
        print(f"R2 score is: {score}")

  model = cd_fast.enet_coordinate_descent(


Fold number: 1
Alpha is: 0.001
R2 score is: 0.9994425883131448


  model = cd_fast.enet_coordinate_descent(


Fold number: 2
Alpha is: 0.001
R2 score is: 0.9992620521465488


  model = cd_fast.enet_coordinate_descent(


Fold number: 3
Alpha is: 0.001
R2 score is: 0.9990554841616918


  model = cd_fast.enet_coordinate_descent(


Fold number: 4
Alpha is: 0.001
R2 score is: 0.9993211166086611


  model = cd_fast.enet_coordinate_descent(


Fold number: 5
Alpha is: 0.001
R2 score is: 0.9994920362197447


  model = cd_fast.enet_coordinate_descent(


Fold number: 1
Alpha is: 0.01
R2 score is: 0.9994424340286878


  model = cd_fast.enet_coordinate_descent(


Fold number: 2
Alpha is: 0.01
R2 score is: 0.9992624262513455


  model = cd_fast.enet_coordinate_descent(


Fold number: 3
Alpha is: 0.01
R2 score is: 0.9990555405331865


  model = cd_fast.enet_coordinate_descent(


Fold number: 4
Alpha is: 0.01
R2 score is: 0.9993212955424239


  model = cd_fast.enet_coordinate_descent(


Fold number: 5
Alpha is: 0.01
R2 score is: 0.999491970895336
Fold number: 1
Alpha is: 0.1
R2 score is: 0.9994420837225217
Fold number: 2
Alpha is: 0.1
R2 score is: 0.9992624240665585
Fold number: 3
Alpha is: 0.1
R2 score is: 0.9990557312896744
Fold number: 4
Alpha is: 0.1
R2 score is: 0.9993213598832869
Fold number: 5
Alpha is: 0.1
R2 score is: 0.9994920161796849


It seems like although technically the final fold score for alpha=0.001 was better than for 0.1, having an alpha did not raise any warnings about not converging, so a Lasso model with an alpha of 0.1 seems the best.