In [43]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import colors
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [8]:
# Load the train and test observation tables from CSV.
train_csv = 'alananer/datasets/observations_train.csv'
test_csv = 'alananer/datasets/observations_test.csv'
traindf, testdf = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [10]:
# Convert the 'date' column of both tables to datetime64, in place.
for frame in (traindf, testdf):
    frame['date'] = pd.to_datetime(frame['date'])

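Handling missing data in the training set: each series is reindexed onto the daily date range spanned by SP500, and the resulting gaps in `value` are filled by time-based interpolation.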

In [11]:
# Build `df_train`: every series from `traindf`, reindexed onto a common daily
# calendar and gap-filled, stacked back into one long frame.

# Empty placeholders created by the original cell; nothing in this cell fills them.
week_train = pd.DataFrame()
monthly_train = pd.DataFrame()
daily_train_m = pd.DataFrame()

# The common calendar is the daily range spanned by the SP500 observations.
# It does not depend on the series being processed, so compute it once.
SP500 = traindf[traindf['series_id'] == 'SP500'].set_index('date')
if SP500.empty:
    raise ValueError("traindf contains no 'SP500' rows; cannot build the reference date range")
idx = pd.date_range(SP500.index.min(), SP500.index.max())

# Collect the per-series frames and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration.
series_frames = []
for col in np.unique(traindf['series_id']):
    temp = traindf[traindf['series_id'] == col].set_index('date')
    temp.index = pd.DatetimeIndex(temp.index)
    # Dates in `idx` with no observation for this series become missing rows.
    temp = temp.reindex(idx, fill_value=None)
    # Restore the id on the rows that reindexing introduced.
    temp['series_id'] = col
    # Fill gaps using the spacing of the dates themselves; gaps before the
    # first observation of a series remain NaN.
    temp['value'] = temp['value'].interpolate(method='time')
    series_frames.append(temp)

df_train = pd.concat(series_frames)

In [12]:
from collections import Counter

# All series ids, and the ids that still have at least one missing value.
total = np.unique(df_train['series_id'])
still_missing = df_train['value'].isnull()
res = np.unique(df_train.loc[still_missing, 'series_id'])

# Multiset difference: drop the incomplete series and the SP500 target.
# elements() only yields ids whose remaining count is positive.
remaining = Counter(total)
remaining.subtract(Counter(res))
remaining.subtract(Counter(['SP500']))
use_feat = list(remaining.elements())

In [13]:
def _pivot_by_series(long_frame):
    """Pivot a long frame to one column per series_id on a fresh 0..n-1 index."""
    wide = long_frame.pivot(columns='series_id')
    # Drop the outer column level so only the series ids remain as labels.
    wide.columns = wide.columns.droplevel(0)
    return wide.reset_index(drop=True)


# Features: every series selected in `use_feat`.
feature_rows = df_train['series_id'].isin(use_feat)
X_train = _pivot_by_series(df_train[feature_rows])

# Target: the SP500 series.
target_rows = df_train['series_id'] == 'SP500'
Y_train = _pivot_by_series(df_train[target_rows])

In [15]:
# Sanity check: (rows, columns) of the wide feature matrix.
X_train.shape

(2511, 20)

In [17]:
# Restrict the feature matrix to the three series used by the model.
best_features = ['BAA10Y', 'DEXCHUS', 'DEXUSEU']
X_train_bestfeat = X_train.loc[:, best_features]

In [18]:
# Display the selected feature columns for a visual check.
X_train_bestfeat

series_id,BAA10Y,DEXCHUS,DEXUSEU
0,2.5300,6.596500,1.3474
1,2.5300,6.588400,1.3494
2,2.5300,6.588400,1.3547
3,2.5600,6.585000,1.3612
4,2.5800,6.573100,1.3673
...,...,...,...
2506,1.7825,6.550925,1.1860
2507,1.7800,6.542900,1.1867
2508,1.7600,6.556000,1.1902
2509,1.7600,6.532200,1.1952


In [31]:
# Fixed seed so the forest (bootstrap samples and feature sub-sampling) gives
# the same CV ranking and scores on every Restart & Run All.
RANDOM_STATE = 42

rf = RandomForestRegressor(random_state=RANDOM_STATE)

# Only max_features is searched; the number of trees is fixed at 200.
params = [{'n_estimators': [200], 'max_features': [1, 2, 3]}]

# Time-ordered CV splits: validation rows always come after the training rows.
tscv = TimeSeriesSplit(n_splits=2)

# Two metrics are scored and refit=False, so the search exposes no single
# best_estimator_; the winning parameters are read from cv_results_ instead.
clf = GridSearchCV(
    rf,
    params,
    cv=tscv,
    scoring=['r2', 'neg_root_mean_squared_error'],
    refit=False,
    verbose=0,
)

In [35]:
# Flatten the target frame to a 1-D array, then run the grid search.
y_train_flat = Y_train.values.ravel()
rf_model = clf.fit(X_train_bestfeat, y_train_flat)

In [39]:
# Pick, for each metric, the parameter set with the lowest rank value.
cv_results = rf_model.cv_results_
candidates = cv_results['params']
best_by_r2 = np.argmin(cv_results['rank_test_r2'])
best_by_rmse = np.argmin(cv_results['rank_test_neg_root_mean_squared_error'])
pr2 = candidates[best_by_r2]
prmse = candidates[best_by_rmse]

In [41]:
# Refit the forest on the training set with the best-R2 parameters, then
# predict on the same rows (in-sample predictions).
train_r2_predictions = (
    rf.set_params(**pr2)
    .fit(X_train_bestfeat, Y_train.values.ravel())
    .predict(X_train_bestfeat)
)

In [42]:
# Refit the forest on the training set with the best-RMSE parameters, then
# predict on the same rows (in-sample predictions).
train_rmse_predictions = (
    rf.set_params(**prmse)
    .fit(X_train_bestfeat, Y_train.values.ravel())
    .predict(X_train_bestfeat)
)

In [46]:
# R^2 of the best-R2 model on the rows it was fitted on (in-sample score).
r2_score(y_true=Y_train, y_pred=train_r2_predictions)

0.9995876010315907

In [49]:
# RMSE of the best-RMSE model on the rows it was fitted on (in-sample score).
# The `squared=False` argument was deprecated in scikit-learn 1.4 and later
# removed, so take the square root of the MSE explicitly.
np.sqrt(mean_squared_error(Y_train, train_rmse_predictions))

8.23099228140441