In [48]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [9]:
train_df = pd.read_csv("observations_train.csv")
test_df = pd.read_csv("observations_test.csv")

In [10]:
train_df['date']=pd.to_datetime(train_df["date"])
test_df['date']=pd.to_datetime(test_df["date"])
test_df = test_df.drop(columns=['Unnamed: 0'])
test_df

Unnamed: 0,series_id,date,value
0,AAA10Y,2018-01-02,1.06
1,AAA10Y,2018-01-03,1.06
2,AAA10Y,2018-01-04,1.01
3,AAA10Y,2018-01-05,1.03
4,AAA10Y,2018-01-08,1.01
5,AAA10Y,2018-01-09,1.02
6,AAA10Y,2018-01-10,1.03
7,AAA10Y,2018-01-11,0.99
8,AAA10Y,2018-01-12,0.95
9,AAA10Y,2018-01-15,


In [17]:
week_train_m = train_df[(train_df['series_id'].isin(['BUSAPPWNSAUS','BUSAPPWNSAUSYY','CBUSAPPWNSAUS','CBUSAPPWNSAUSYY','MORTGAGE15US','MORTGAGE30US','TLAACBW027NBOG','TLBACBW027NBOG']))]
month_train_m = train_df[(train_df['series_id'].isin(['ASEANTOT','CUUR0000SA0R','EECTOT','FRGSHPUSM649NCIS','GVIPT50002S','GVIPT50030S','GVIPT51000S','GVIPT51100S','GVIPT51120S','GVIPT51200S','GVIPT52000S','GVIPT52100S','GVIPT521A3S','GVIPT52300S','GVIPT54000S','GVIPT54100S','GVIPT54200S','GVIPT54220S']))]
daily_train_m = train_df[(train_df['series_id'].isin(['AAA10Y','BAA10Y','DEXCHUS','DEXJPUS','DEXUSEU','DEXUSUK','DFII10','DFII20','DFII30','DFII5','DFII7','DLTIIT','DPCREDIT','DPRIME','EFFR','EFFRVOL','INFECTDISEMVTRACKD','IOER','IORR','OBFR','OBFRVOL','SOFR','SOFRVOL','T10YIE','TEDRATE','WLEMUINDXD']))]

Missing data in training set

In [105]:
df_train = pd.DataFrame()
for col in np.unique(train_df['series_id']):
    temp = train_df[train_df['series_id']==col].set_index('date')
    idx = pd.date_range(min(np.unique(temp.index)), max(np.unique(temp.index)))
    temp.index = pd.DatetimeIndex(temp.index)
    temp = temp.reindex(idx, fill_value=None)
    temp['series_id']=col
    temp['value'] = temp['value'].interpolate(method='time')
    df_train = df_train.append(temp)
df_train

Unnamed: 0,series_id,value
2000-01-03,AAA10Y,1.170000
2000-01-04,AAA10Y,1.200000
2000-01-05,AAA10Y,1.160000
2000-01-06,AAA10Y,1.150000
2000-01-07,AAA10Y,1.170000
2000-01-08,AAA10Y,1.163333
2000-01-09,AAA10Y,1.156667
2000-01-10,AAA10Y,1.150000
2000-01-11,AAA10Y,1.140000
2000-01-12,AAA10Y,1.130000


Missing data within the SP500 timeframe

In [106]:
df_train = pd.DataFrame()
for col in np.unique(train_df['series_id']):
    temp = train_df[train_df['series_id']==col].set_index('date')
    SP500 = train_df[train_df['series_id']=='SP500'].set_index('date')
    idx = pd.date_range(min(np.unique(SP500.index)), max(np.unique(SP500.index)))
    temp.index = pd.DatetimeIndex(temp.index)
    temp = temp.reindex(idx, fill_value=None)
    temp['series_id']=col
    temp['value'] = temp['value'].interpolate(method='time')
    df_train = df_train.append(temp)
df_train

Unnamed: 0,series_id,value
2011-02-14,AAA10Y,1.59
2011-02-15,AAA10Y,1.65
2011-02-16,AAA10Y,1.65
2011-02-17,AAA10Y,1.68
2011-02-18,AAA10Y,1.70
2011-02-19,AAA10Y,1.71
2011-02-20,AAA10Y,1.72
2011-02-21,AAA10Y,1.73
2011-02-22,AAA10Y,1.74
2011-02-23,AAA10Y,1.71


In [45]:
from collections import Counter
total = np.unique(df_train['series_id'])
res = np.unique(df_train[df_train['value'].isnull()]['series_id'])
use_feat = list((Counter(total)-Counter(res)-Counter(['SP500'])).elements())

In [208]:
X_train = df_train[df_train['series_id'].isin(use_feat)]
X_train = X_train.pivot(columns='series_id')

X_train.columns = X_train.columns.droplevel(0)
X_train.reset_index(drop=True, inplace=True)
Y_train = df_train[df_train['series_id']=='SP500']
Y_train = Y_train.pivot(columns='series_id')
Y_train.columns = Y_train.columns.droplevel(0)
Y_train.reset_index(drop=True, inplace=True)

In [209]:
random_forest = RandomForestRegressor(n_estimators=100)
random_forest.fit(X_train, Y_train)
score_random_forest = random_forest.score(X_train, Y_train)

knn = KNeighborsRegressor(n_neighbors = 3) 
knn.fit(X_train, Y_train)  
score_knn = knn.score(X_train, Y_train)

decision_tree = DecisionTreeRegressor() 
decision_tree.fit(X_train, Y_train)   
score_decision_tree = decision_tree.score(X_train, Y_train)


results = pd.DataFrame({
    'Model': ['KNN', 
              'Random Forest', 
              'Decision Tree'],
    'Score': [score_knn, score_random_forest, score_decision_tree]})
result_df = results.sort_values(by='Score')
result_df = result_df.set_index('Score')
result_df

  


Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
0.926384,KNN
0.999718,Random Forest
1.0,Decision Tree


In [101]:
reg = DecisionTreeRegressor()
reg.fit(X_train, Y_train)
dict(zip(X_train.columns, reg.feature_importances_))

{'AAA10Y': 0.0013830642072373916,
 'BAA10Y': 0.14753971451703496,
 'DEXCHUS': 0.1780990826233674,
 'DEXUSEU': 0.6408516529960299,
 'DEXUSUK': 0.013367655178449067,
 'DFII10': 0.00041861747500974435,
 'DFII20': 0.007977892302555495,
 'DFII30': 6.46198217729462e-05,
 'DFII5': 0.00026797165034252114,
 'DFII7': 0.000530527752083274,
 'DLTIIT': 0.0007531897344254923,
 'DPCREDIT': 9.157753758065049e-09,
 'DPRIME': 0.0,
 'EFFR': 0.00011066935707086915,
 'INFECTDISEMVTRACKD': 5.139504248505676e-05,
 'IOER': 3.3685683587801554e-08,
 'IORR': 2.8539760381489136e-11,
 'T10YIE': 0.00757146417596201,
 'TEDRATE': 0.000544867434905543,
 'WLEMUINDXD': 0.0004675728592912827}

In [210]:
X_train = df_train[df_train['series_id'].isin(['BAA10Y','DEXCHUS','DEXUSEU'])]
X_train = X_train.pivot(columns='series_id')
X_train.columns = X_train.columns.droplevel(0)
X_train.reset_index(drop=True, inplace=True)
Y_train = df_train[df_train['series_id']=='SP500']
Y_train = Y_train.pivot(columns='series_id')
Y_train.columns = Y_train.columns.droplevel(0)
Y_train.reset_index(drop=True, inplace=True)

In [295]:
len(Y_train)*0.003

7.533

In [313]:
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.997)
random_forest = RandomForestRegressor(n_estimators=100)
random_forest.fit(x_train, y_train)
random_forest.score(x_test, y_test)

  This is separate from the ipykernel package so we can avoid doing imports until


0.7724325441721948

In [247]:
random_forest.predict(x_test.iloc[[0]])

array([2163.1866])

In [248]:
y_test

series_id,SP500
1621,2079.650000
1612,2107.400000
1595,2072.256667
1519,2092.430000
2086,2126.150000
766,1545.800000
1516,2102.060000
1811,1939.953333
2372,2457.666667
546,1404.110000


Missing data in test set

In [96]:
df_test = pd.DataFrame()
for col in np.unique(test_df['series_id']):
    temp = test_df[test_df['series_id']==col].set_index('date')
    idx = pd.date_range(min(np.unique(temp.index)), max(np.unique(temp.index)))
    temp.index = pd.DatetimeIndex(temp.index)
    temp = temp.reindex(idx, fill_value=None)
    temp['series_id']=col
    temp['value'] = temp['value'].interpolate(method='time')
    df_test = df_test.append(temp)
df_test

Unnamed: 0,series_id,value
2018-01-02,AAA10Y,1.060000
2018-01-03,AAA10Y,1.060000
2018-01-04,AAA10Y,1.010000
2018-01-05,AAA10Y,1.030000
2018-01-06,AAA10Y,1.023333
2018-01-07,AAA10Y,1.016667
2018-01-08,AAA10Y,1.010000
2018-01-09,AAA10Y,1.020000
2018-01-10,AAA10Y,1.030000
2018-01-11,AAA10Y,0.990000
