In [1]:
import math
import torch
from utils import *
import torch.nn.functional as F
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

import joblib

In [2]:
df = fetch_data()
df_combined = combine_features(df, context=5)
X_train, y_train, X_val, y_val, X_test, y_test = split_dataset(df_combined)
X_train.head()

Unnamed: 0,date_id,time_id,symbol_id,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,...,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78
1157744,112,831,33,,,,,,0.649997,1.250937,...,0.083852,-0.186456,-0.247947,0.107769,-0.238652,-0.309708,-0.161242,-0.344682,-0.322328,-0.291973
131284,15,804,10,,,,,,0.237513,0.622777,...,-0.2831,0.008067,-0.055721,-0.109639,0.106814,0.112933,-0.30925,-0.418137,-0.406175,-0.37299
881903,91,729,34,,,,,,-0.892826,-0.437469,...,0.503609,0.482061,-0.309142,-0.100867,-1.433333,-1.712605,-0.882016,-1.518287,-1.419028,-1.389193
977227,100,2,7,,,,,,-0.959374,-0.367786,...,-0.865782,-0.981885,-0.401543,-0.604826,,,-0.350785,-0.316638,-0.324064,-0.323139
1137021,111,410,3,,,,,,0.493863,-0.05212,...,-0.314514,-0.76261,0.838707,-0.185329,0.25868,0.204036,-0.698816,-0.590934,0.271598,0.350628


In [14]:
X_train.shape

(1749789, 83)

In [15]:
X_train_small = X_train[:100000]
y_train_small = y_train[:100000]

### __Random Forest Regressor__

In [None]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('model', RandomForestRegressor(random_state=42))
], verbose=1)

pipeline.fit(X_train_small, y_train_small["responder_0"]);

[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.2s
[Pipeline] ............. (step 2 of 2) Processing model, total=25.4min


In [22]:
pipeline2 = joblib.load("models/pipeline.pkl")

In [23]:
Y_pred = pipeline2.predict(X_test)

mse = mean_squared_error(y_test["responder_0"], Y_pred)
print("Mean Squared Error:", mse)

column_range = df['responder_0'].max() - df['responder_0'].min()
print(f"RANGE: \t{column_range}")
print(f"MAX: \t{df['responder_0'].max()}")
print(f"MIN: \t{df['responder_0'].min()}")

Mean Squared Error: 0.767311558678644
RANGE: 	10.0
MAX: 	5.0
MIN: 	-5.0


### __MLP (Multi Layer Perceptron)__
- `Context`: `5` __time steps__

- `Input`: `79` __features__ $\times$ `5` __time steps__ = `395` __features__
- `Output`: __Responder 6__
- `Model`: `3` __hidden layers__ with `256`, `128`, `64` __neurons__
- `Activation`: __ReLU__
- `Regularization`: __L2__
- `Optimizer`: __Adam__
- `Loss`: __Weighted Zero-Mean R-squared Score ($R^2$)__

$$
  R^2 = 1 - \frac{\sum w_i (y_i - \hat{y}_i)^2}{\sum w_i y_i^2}
$$



In [4]:
df.head()

Unnamed: 0,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,...,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8
0,0,0,1,3.889038,,,,,,0.851033,...,-0.281498,0.738489,-0.069556,1.380875,2.005353,0.186018,1.218368,0.775981,0.346999,0.095504
1,0,0,7,1.370613,,,,,0.676961,...,-0.302441,2.965889,1.190077,-0.523998,3.849921,2.626981,5.0,0.703665,0.216683,0.778639,
2,0,0,9,2.285698,,,,,1.056285,...,-0.096792,-0.864488,-0.280303,-0.326697,0.375781,1.271291,0.099793,2.109352,0.670881,0.772828,
3,0,0,10,0.690606,,,,,1.139366,...,-0.296244,0.408499,0.223992,2.294888,1.097444,1.225872,1.225376,1.114137,0.775199,-1.379516,
4,0,0,14,0.44057,,,,,0.9552,...,3.418133,-0.373387,-0.502764,-0.348021,-3.928148,-1.591366,-5.0,-3.57282,-1.089123,-5.0,


In [8]:
df.describe()

Unnamed: 0,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,...,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
count,1944210.0,1944210.0,1944210.0,1944210.0,0.0,0.0,0.0,0.0,0.0,1.944210e+06,...,1944210.0,1944210.0,1944210.0,1944210.0,1944210.0,1944210.0,1944210.0,1944210.0,1944210.0,1944210.0,,,,
mean,93.84629,424.0,13.76638,1.973281,,,,,,-4.463175e-02,...,-0.09805073,0.008424639,0.01076465,0.002412764,0.01139089,0.0218548,0.003326982,0.001487634,-0.000481798,0.001078154,,,,
std,48.13196,245.0851,11.08778,0.9679003,,,,,,9.471079e-01,...,0.6392438,0.9543021,1.139575,0.8429762,1.274226,1.264991,1.222891,0.8696651,0.890911,0.8726581,,,,
min,0.0,0.0,0.0,0.4405696,,,,,,-1.176608e+01,...,-3.393299,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0,,,,
25%,54.0,212.0,7.0,1.323803,,,,,,-4.756072e-01,...,-0.3195696,-0.2322211,-0.2667868,-0.1210319,-0.4444537,-0.5146699,-0.2569897,-0.3558709,-0.3914038,-0.2892584,,,,
50%,99.0,424.0,12.0,1.763827,,,,,-0.0581818,...,-2.468792e-01,-0.003943805,-0.02333469,-0.001248489,-0.01031224,-0.01836913,-0.00469055,-0.009597129,-0.02376243,-0.000310783,,,,,
75%,136.0,636.0,17.0,2.393846,,,,,0.3493771,...,-1.256560e-01,0.2309678,0.2538152,0.1191232,0.4292141,0.5065366,0.2438874,0.2538152,0.1191232,0.4292141,0.5065366,0.2438874,0.3360999,0.3463979,0.2840819
max,169.0,848.0,38.0,6.011999,,,,,,1.532000e+01,...,43.70195,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,,,,
