In [628]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import TimeSeriesSplit

In [621]:
# Load the raw observation tables from CSV files in the working directory.
# Later cells read the `date`, `series_id` and `value` columns of both frames.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("observations_train.csv", "obs_test.csv")
)

In [622]:
# Convert the `date` column of both frames to pandas datetimes so that it can
# serve as a DatetimeIndex in the cells below.
train_df['date'] = pd.to_datetime(train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])

# Drop the `Unnamed: 0` column from the test frame. errors='ignore' keeps the
# cell idempotent: re-running it after the column is already gone no longer
# raises a KeyError.
test_df = test_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [625]:
# Number of distinct series identifiers present in the training data.
len(train_df['series_id'].unique())

68

Fill missing data in the training set (reindex each series to daily frequency, then interpolate over time)

In [603]:
# Build a gap-free, daily-frequency copy of the training data.
# Each series is handled on its own: it is reindexed onto every calendar day
# between its first and last observation, and the newly created (NaN) days are
# filled by time-weighted linear interpolation.
#
# The per-series frames are collected in a list and concatenated once:
# `DataFrame.append` was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
filled_series = []
for series_id, observations in train_df.groupby('series_id'):
    observations = observations.set_index('date')
    observations.index = pd.DatetimeIndex(observations.index)
    daily_index = pd.date_range(observations.index.min(), observations.index.max())
    # Days without an observation become rows of NaN here ...
    observations = observations.reindex(daily_index)
    # ... so the identifier has to be restored on the inserted rows.
    observations['series_id'] = series_id
    observations['value'] = observations['value'].interpolate(method='time')
    filled_series.append(observations)

# pd.concat rejects an empty list; fall back to an empty frame in that case.
df_train = pd.concat(filled_series) if filled_series else pd.DataFrame()
df_train

Unnamed: 0,series_id,value
2000-01-03,AAA10Y,1.170000
2000-01-04,AAA10Y,1.200000
2000-01-05,AAA10Y,1.160000
2000-01-06,AAA10Y,1.150000
2000-01-07,AAA10Y,1.170000
2000-01-08,AAA10Y,1.163333
2000-01-09,AAA10Y,1.156667
2000-01-10,AAA10Y,1.150000
2000-01-11,AAA10Y,1.140000
2000-01-12,AAA10Y,1.130000


Fill missing data within the SP500 time frame (reindex every series to the SP500 date range, then interpolate over time)

In [629]:
# Rebuild `df_train` so that every series covers the same daily date range:
# the span between the first and last SP500 observation. Days without an
# observation are filled by time-weighted linear interpolation; days before a
# series' first observation stay NaN, because interpolation only fills forward
# from existing values.
#
# The SP500 frame and its date range do not depend on the loop variable, so
# they are computed once instead of once per series.
SP500 = train_df[train_df['series_id'] == 'SP500'].set_index('date')
if SP500.empty:
    raise ValueError("train_df contains no 'SP500' observations to define the date range")
sp500_days = pd.date_range(SP500.index.min(), SP500.index.max())

# Collect the per-series frames and concatenate once: `DataFrame.append` was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
filled_series = []
for series_id, observations in train_df.groupby('series_id'):
    observations = observations.set_index('date')
    observations.index = pd.DatetimeIndex(observations.index)
    # Align onto the shared SP500 calendar; missing days become NaN rows.
    observations = observations.reindex(sp500_days)
    observations['series_id'] = series_id
    observations['value'] = observations['value'].interpolate(method='time')
    filled_series.append(observations)

df_train = pd.concat(filled_series)

Fill missing data in the test set (reindex each series to daily frequency, then interpolate over time)

In [701]:
# Build a gap-free, daily-frequency copy of the test data.
# Each series is reindexed onto every calendar day between its own first and
# last observation, and the inserted (NaN) days are filled by time-weighted
# linear interpolation.
#
# The per-series frames are collected in a list and concatenated once:
# `DataFrame.append` was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
filled_series = []
for series_id, observations in test_df.groupby('series_id'):
    observations = observations.set_index('date')
    observations.index = pd.DatetimeIndex(observations.index)
    daily_index = pd.date_range(observations.index.min(), observations.index.max())
    # Days without an observation become rows of NaN here ...
    observations = observations.reindex(daily_index)
    # ... so the identifier has to be restored on the inserted rows.
    observations['series_id'] = series_id
    observations['value'] = observations['value'].interpolate(method='time')
    filled_series.append(observations)

# pd.concat rejects an empty list; fall back to an empty frame in that case.
df_test = pd.concat(filled_series) if filled_series else pd.DataFrame()

In [626]:
from collections import Counter

# All series ids present in df_train (np.unique returns them sorted).
total = np.unique(df_train['series_id'])
# Series that still contain at least one missing value in df_train.
res = np.unique(df_train.loc[df_train['value'].isnull(), 'series_id'])
# Ids that may not be used as features: the incomplete series plus SP500.
excluded = Counter(res) + Counter(['SP500'])
use_feat = [series_id for series_id in total if series_id not in excluded]
len(use_feat)

20

In [606]:
# Feature matrix: one column per series in `use_feat`, one row per day.
# pivot() yields a two-level column index (original column, series_id); the
# outer level is dropped so the columns are just the series ids. The date index
# is then replaced by a plain 0..n-1 range.
X_train = (
    df_train[df_train['series_id'].isin(use_feat)]
    .pivot(columns='series_id')
    .droplevel(0, axis=1)
    .reset_index(drop=True)
)

# Target: the SP500 series, shaped the same way (a single-column frame).
Y_train = (
    df_train[df_train['series_id'] == 'SP500']
    .pivot(columns='series_id')
    .droplevel(0, axis=1)
    .reset_index(drop=True)
)

In [608]:
# Fit three baseline regressors on the full feature matrix and compare them.
# NOTE: `.score` is evaluated on the same rows the models were fitted on, so
# the numbers are in-sample R^2 values; they measure how well each model can
# memorise the training data, not how well it generalises (an unrestricted
# decision tree reaches 1.0 by construction).

# sklearn expects a 1-D target; passing the single-column frame directly emits
# a DataConversionWarning for the forest.
y_target = Y_train.values.ravel()

# Fixed seeds make the stochastic models reproducible across kernel restarts.
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
knn = KNeighborsRegressor(n_neighbors=3)
decision_tree = DecisionTreeRegressor(random_state=42)

# fit() returns the estimator itself, so fitting and scoring can be chained.
score_random_forest = random_forest.fit(X_train, y_target).score(X_train, y_target)
score_knn = knn.fit(X_train, y_target).score(X_train, y_target)
score_decision_tree = decision_tree.fit(X_train, y_target).score(X_train, y_target)

results = pd.DataFrame({
    'Model': ['KNN',
              'Random Forest',
              'Decision Tree'],
    'Score': [score_knn, score_random_forest, score_decision_tree]})
# Order from worst to best and show the score as the row label.
result_df = results.sort_values(by='Score').set_index('Score')
result_df

  


Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
0.926384,KNN
0.999611,Random Forest
1.0,Decision Tree


In [609]:
# Rank the candidate features by the impurity-based importance of a random
# forest fitted on all of them. The seed makes the ranking reproducible, and
# the target is flattened to 1-D because sklearn warns on a column vector.
reg = RandomForestRegressor(n_estimators=100, random_state=42)
reg.fit(X_train, Y_train.values.ravel())
# Map each feature (column) name to its importance; the importances sum to 1.
dict(zip(X_train.columns, reg.feature_importances_))

  


{'AAA10Y': 0.0030577252503654483,
 'BAA10Y': 0.1256156084257581,
 'DEXCHUS': 0.15618398381570842,
 'DEXUSEU': 0.6399125609450178,
 'DEXUSUK': 0.00879271261192378,
 'DFII10': 0.0005883158675934175,
 'DFII20': 0.006474792590573117,
 'DFII30': 0.0010893148657149141,
 'DFII5': 0.0030123320743439905,
 'DFII7': 0.0003996415335726453,
 'DLTIIT': 0.0005271265293476373,
 'DPCREDIT': 7.309690628454024e-06,
 'DPRIME': 0.0008383943903802302,
 'EFFR': 0.04291902373141397,
 'INFECTDISEMVTRACKD': 5.8222801108734016e-05,
 'IOER': 0.0009232835202386064,
 'IORR': 0.0008371329604329438,
 'T10YIE': 0.006905113772597543,
 'TEDRATE': 0.0012896157568257462,
 'WLEMUINDXD': 0.0005677888664544732}

In [700]:
# Wide table of the three selected series: one column per series, indexed by
# date. The outer column level produced by pivot() is dropped.
f = (
    df_train[df_train['series_id'].isin(['BAA10Y','DEXCHUS','DEXUSEU'])]
    .pivot(columns='series_id')
    .droplevel(0, axis=1)
)

In [644]:
# Rebuild the feature matrix from the three selected series only. Unlike the
# earlier construction, the index is NOT reset here, so rows stay labelled by
# date.
X_train = (
    df_train[df_train['series_id'].isin(['BAA10Y','DEXCHUS','DEXUSEU'])]
    .pivot(columns='series_id')
    .droplevel(0, axis=1)
)

# Target: the SP500 series as a single-column frame, also indexed by date.
Y_train = (
    df_train[df_train['series_id'] == 'SP500']
    .pivot(columns='series_id')
    .droplevel(0, axis=1)
)

In [699]:
# Fix the column selection and order of the three features used for tuning.
X_train_bestfeat = X_train.loc[:, ['BAA10Y','DEXCHUS','DEXUSEU']]

In [669]:
# Tune `max_features` of a random forest with time-ordered cross-validation,
# then refit the best configuration on all rows.
# (An earlier hold-out experiment, fitting on six of the last seven rows and
# testing on the final one, was replaced by the cross-validation below.)

# Seeded so that the CV ranking and the refitted model are reproducible.
rf = RandomForestRegressor(random_state=42)
params = [{'n_estimators': [100], 'max_features': [1, 2, 3]}]
# TimeSeriesSplit always trains on earlier rows and validates on later ones.
tscv = TimeSeriesSplit(n_splits=100)
# refit=False: with a list of scorers GridSearchCV cannot pick a winner by
# itself, so the best parameters are selected manually below.
clf = GridSearchCV(rf, params, cv=tscv, scoring=['r2'], refit=False, verbose=0)

y_target = Y_train.values.ravel()  # sklearn expects a 1-D target
rf_model = clf.fit(X_train_bestfeat, y_target)

# Rank 1 is the best mean R^2, so the smallest rank marks the winning setting.
best_idx = int(np.argmin(rf_model.cv_results_['rank_test_r2']))
pr2 = rf_model.cv_results_['params'][best_idx]
rf.set_params(**pr2)
rf.fit(X_train_bestfeat, y_target)

rf.predict(X_train_bestfeat)

array([1390.7636    , 1355.903     , 1358.0523    , ..., 2679.8407    ,
       2683.10196667, 2676.10236667])

In [675]:
# R^2 of the tuned forest on the same rows it was fitted on (in-sample score).
rf.score(X=X_train_bestfeat, y=Y_train)

0.9995210448802597

In [617]:
# Make each series' rows of df_train available under the series id, e.g.
# `SP500`. This replaces `exec(name + "= df_series")`, which executed strings
# built from data and raised a SyntaxError for any id that is not a valid
# Python identifier.
#
# `series_frames` holds every series; only ids that are valid identifiers are
# additionally published as module-level variables.
series_frames = {}
for series, df_series in df_train.groupby('series_id'):
    series_frames[series] = df_series
    if str(series).isidentifier():
        globals()[series] = df_series

In [618]:
# TODO: replace the first SP500 date (SP500.index.min()) with a given date.
first_day, last_day = SP500.index.min(), SP500.index.max()
# Length of the SP500 window in whole days.
d = (last_day - first_day).days
last_day

Timestamp('2017-12-29 00:00:00')

In [620]:
# Build Model
model = ARIMA(SP500.value[~np.isnan(SP500.value)], order=(2, 1 ,2))  
fitted = model.fit(disp=-1)  
print(fitted.summary())

# Forecast
fc, se, conf = fitted.forecast(7, alpha=0.05)  # 95% conf

(fc + random_forest.predict(x_test)[0])/2

  start=index[0], end=index[-1], freq=freq)


                             ARIMA Model Results                              
Dep. Variable:                D.value   No. Observations:                 2510
Model:                 ARIMA(2, 1, 2)   Log Likelihood               -9665.223
Method:                       css-mle   S.D. of innovations             11.379
Date:                Sun, 11 Apr 2021   AIC                          19342.446
Time:                        02:13:42   BIC                          19377.414
Sample:                    02-15-2011   HQIC                         19355.138
                         - 12-29-2017                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.5339      0.240      2.227      0.026       0.064       1.004
ar.L1.D.value     1.1007      0.134      8.225      0.000       0.838       1.363
ar.L2.D.value    -0.8030      0.080    -

array([2679.47943921, 2679.69757412, 2679.86050987, 2680.0521874 ,
       2680.3198251 , 2680.64799027, 2680.9817802 ])