In [599]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt
import datetime

In [621]:
train_df = pd.read_csv("observations_train.csv")
test_df = pd.read_csv("obs_test.csv")

In [622]:
train_df['date']=pd.to_datetime(train_df["date"])
test_df['date']=pd.to_datetime(test_df["date"])
test_df = test_df.drop(columns=['Unnamed: 0'])

In [625]:
len(set(train_df['series_id']))

68

Missing data in training set

In [603]:
df_train = pd.DataFrame()
for col in np.unique(train_df['series_id']):
    temp = train_df[train_df['series_id']==col].set_index('date')
    idx = pd.date_range(min(np.unique(temp.index)), max(np.unique(temp.index)))
    temp.index = pd.DatetimeIndex(temp.index)
    temp = temp.reindex(idx, fill_value=None)
    temp['series_id']=col
    temp['value'] = temp['value'].interpolate(method='time')
    df_train = df_train.append(temp)
df_train

Unnamed: 0,series_id,value
2000-01-03,AAA10Y,1.170000
2000-01-04,AAA10Y,1.200000
2000-01-05,AAA10Y,1.160000
2000-01-06,AAA10Y,1.150000
2000-01-07,AAA10Y,1.170000
2000-01-08,AAA10Y,1.163333
2000-01-09,AAA10Y,1.156667
2000-01-10,AAA10Y,1.150000
2000-01-11,AAA10Y,1.140000
2000-01-12,AAA10Y,1.130000


Missing data within the SP500 timeframe

In [604]:
df_train = pd.DataFrame()
for col in np.unique(train_df['series_id']):
    temp = train_df[train_df['series_id']==col].set_index('date')
    SP500 = train_df[train_df['series_id']=='SP500'].set_index('date')
    idx = pd.date_range(min(np.unique(SP500.index)), max(np.unique(SP500.index)))
    temp.index = pd.DatetimeIndex(temp.index)
    temp = temp.reindex(idx, fill_value=None)
    temp['series_id']=col
    temp['value'] = temp['value'].interpolate(method='time')
    df_train = df_train.append(temp)
df_train

Unnamed: 0,series_id,value
2011-02-14,AAA10Y,1.59
2011-02-15,AAA10Y,1.65
2011-02-16,AAA10Y,1.65
2011-02-17,AAA10Y,1.68
2011-02-18,AAA10Y,1.70
2011-02-19,AAA10Y,1.71
2011-02-20,AAA10Y,1.72
2011-02-21,AAA10Y,1.73
2011-02-22,AAA10Y,1.74
2011-02-23,AAA10Y,1.71


In [605]:
from collections import Counter
total = np.unique(df_train['series_id'])
res = np.unique(df_train[df_train['value'].isnull()]['series_id'])
use_feat = list((Counter(total)-Counter(res)-Counter(['SP500'])).elements())

In [606]:
X_train = df_train[df_train['series_id'].isin(use_feat)]
X_train = X_train.pivot(columns='series_id')

X_train.columns = X_train.columns.droplevel(0)
X_train.reset_index(drop=True, inplace=True)
Y_train = df_train[df_train['series_id']=='SP500']
Y_train = Y_train.pivot(columns='series_id')
Y_train.columns = Y_train.columns.droplevel(0)
Y_train.reset_index(drop=True, inplace=True)

In [607]:
X_train

series_id,AAA10Y,BAA10Y,DEXCHUS,DEXUSEU,DEXUSUK,DFII10,DFII20,DFII30,DFII5,DFII7,DLTIIT,DPCREDIT,DPRIME,EFFR,INFECTDISEMVTRACKD,IOER,IORR,T10YIE,TEDRATE,WLEMUINDXD
0,1.590000,2.530000,6.596500,1.347400,1.602300,1.340000,1.910000,2.180000,0.510000,1.060000,1.980000,0.75,3.25,0.150000,0.59,0.25,0.25,2.280000,0.180000,30.44
1,1.650000,2.530000,6.588400,1.349400,1.612800,1.330000,1.910000,2.170000,0.440000,1.000000,1.970000,0.75,3.25,0.160000,0.45,0.25,0.25,2.280000,0.180000,23.19
2,1.650000,2.530000,6.588400,1.354700,1.606300,1.370000,1.950000,2.210000,0.430000,1.010000,1.990000,0.75,3.25,0.150000,0.34,0.25,0.25,2.250000,0.190000,45.74
3,1.680000,2.560000,6.585000,1.361200,1.617200,1.310000,1.900000,2.190000,0.330000,0.920000,1.950000,0.75,3.25,0.150000,0.87,0.25,0.25,2.270000,0.220000,14.88
4,1.700000,2.580000,6.573100,1.367300,1.624500,1.250000,1.860000,2.160000,0.260000,0.870000,1.910000,0.75,3.25,0.150000,0.00,0.25,0.25,2.340000,0.210000,12.48
5,1.710000,2.590000,6.574700,1.367000,1.621600,1.217500,1.835000,2.140000,0.212500,0.825000,1.885000,0.75,3.25,0.150000,0.00,0.25,0.25,2.340000,0.205000,65.76
6,1.720000,2.600000,6.576300,1.366700,1.618700,1.185000,1.810000,2.120000,0.165000,0.780000,1.860000,0.75,3.25,0.150000,0.00,0.25,0.25,2.340000,0.200000,56.51
7,1.730000,2.610000,6.577900,1.366400,1.615800,1.152500,1.785000,2.100000,0.117500,0.735000,1.835000,0.75,3.25,0.150000,0.61,0.25,0.25,2.340000,0.195000,31.41
8,1.740000,2.620000,6.579500,1.366100,1.612900,1.120000,1.760000,2.080000,0.070000,0.690000,1.810000,0.75,3.25,0.150000,0.00,0.25,0.25,2.340000,0.190000,40.86
9,1.710000,2.600000,6.574100,1.375800,1.622100,1.100000,1.750000,2.060000,0.010000,0.650000,1.790000,0.75,3.25,0.150000,0.00,0.25,0.25,2.390000,0.190000,34.44


In [608]:
random_forest = RandomForestRegressor(n_estimators=100)
random_forest.fit(X_train, Y_train)
score_random_forest = random_forest.score(X_train, Y_train)

knn = KNeighborsRegressor(n_neighbors = 3) 
knn.fit(X_train, Y_train)  
score_knn = knn.score(X_train, Y_train)

decision_tree = DecisionTreeRegressor() 
decision_tree.fit(X_train, Y_train)   
score_decision_tree = decision_tree.score(X_train, Y_train)


results = pd.DataFrame({
    'Model': ['KNN', 
              'Random Forest', 
              'Decision Tree'],
    'Score': [score_knn, score_random_forest, score_decision_tree]})
result_df = results.sort_values(by='Score')
result_df = result_df.set_index('Score')
result_df

  


Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
0.926384,KNN
0.999611,Random Forest
1.0,Decision Tree


In [609]:
reg = RandomForestRegressor(n_estimators=100)
reg.fit(X_train, Y_train)
dict(zip(X_train.columns, reg.feature_importances_))

  


{'AAA10Y': 0.0030577252503654483,
 'BAA10Y': 0.1256156084257581,
 'DEXCHUS': 0.15618398381570842,
 'DEXUSEU': 0.6399125609450178,
 'DEXUSUK': 0.00879271261192378,
 'DFII10': 0.0005883158675934175,
 'DFII20': 0.006474792590573117,
 'DFII30': 0.0010893148657149141,
 'DFII5': 0.0030123320743439905,
 'DFII7': 0.0003996415335726453,
 'DLTIIT': 0.0005271265293476373,
 'DPCREDIT': 7.309690628454024e-06,
 'DPRIME': 0.0008383943903802302,
 'EFFR': 0.04291902373141397,
 'INFECTDISEMVTRACKD': 5.8222801108734016e-05,
 'IOER': 0.0009232835202386064,
 'IORR': 0.0008371329604329438,
 'T10YIE': 0.006905113772597543,
 'TEDRATE': 0.0012896157568257462,
 'WLEMUINDXD': 0.0005677888664544732}

In [610]:
X_train = df_train[df_train['series_id'].isin(['BAA10Y','DEXCHUS','DEXUSEU'])]
X_train = X_train.pivot(columns='series_id')
X_train.columns = X_train.columns.droplevel(0)
X_train.reset_index(drop=True, inplace=True)
Y_train = df_train[df_train['series_id']=='SP500']
Y_train = Y_train.pivot(columns='series_id')
Y_train.columns = Y_train.columns.droplevel(0)
Y_train.reset_index(drop=True, inplace=True)

In [611]:
X_train[-7:][0:6]

series_id,BAA10Y,DEXCHUS,DEXUSEU
2504,1.7875,6.566975,1.1846
2505,1.785,6.55895,1.1853
2506,1.7825,6.550925,1.186
2507,1.78,6.5429,1.1867
2508,1.76,6.556,1.1902
2509,1.76,6.5322,1.1952


In [612]:
Y_train[-7:][6:7]

series_id,SP500
2510,2673.61


In [613]:
x_train, x_test, y_train, y_test = X_train[-7:][0:6], X_train[-7:][6:7], Y_train[-7:][0:6], Y_train[-7:][6:7]
random_forest = RandomForestRegressor(n_estimators=100)
random_forest.fit(x_train, y_train)
random_forest.score(x_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


0.7278106026245337

In [614]:
random_forest.predict(x_test)[0]

2684.6897000000026

In [615]:
y_test

series_id,SP500
2510,2673.61


Missing data in test set

In [616]:
df_test = pd.DataFrame()
for col in np.unique(test_df['series_id']):
    temp = test_df[test_df['series_id']==col].set_index('date')
    idx = pd.date_range(min(np.unique(temp.index)), max(np.unique(temp.index)))
    temp.index = pd.DatetimeIndex(temp.index)
    temp = temp.reindex(idx, fill_value=None)
    temp['series_id']=col
    temp['value'] = temp['value'].interpolate(method='time')
    df_test = df_test.append(temp)

In [617]:
for series, df_series in df_train.groupby('series_id'):
    name_df = series
    exec(name_df + "= df_series")

In [618]:
# replace min(SP500.index) with given date
d = (max(SP500.index)-min(SP500.index)).days
max(SP500.index)

Timestamp('2017-12-29 00:00:00')

In [620]:
# Build Model
model = ARIMA(SP500.value[~np.isnan(SP500.value)], order=(2, 1 ,2))  
fitted = model.fit(disp=-1)  
print(fitted.summary())

# Forecast
fc, se, conf = fitted.forecast(7, alpha=0.05)  # 95% conf

(fc + random_forest.predict(x_test)[0])/2

  start=index[0], end=index[-1], freq=freq)


                             ARIMA Model Results                              
Dep. Variable:                D.value   No. Observations:                 2510
Model:                 ARIMA(2, 1, 2)   Log Likelihood               -9665.223
Method:                       css-mle   S.D. of innovations             11.379
Date:                Sun, 11 Apr 2021   AIC                          19342.446
Time:                        02:13:42   BIC                          19377.414
Sample:                    02-15-2011   HQIC                         19355.138
                         - 12-29-2017                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
const             0.5339      0.240      2.227      0.026       0.064       1.004
ar.L1.D.value     1.1007      0.134      8.225      0.000       0.838       1.363
ar.L2.D.value    -0.8030      0.080    -

array([2679.47943921, 2679.69757412, 2679.86050987, 2680.0521874 ,
       2680.3198251 , 2680.64799027, 2680.9817802 ])