In [49]:
## Load helpers

# Python Basic Tools
import pandas as pd
import numpy as np
import seaborn as sn
import os
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Sklearn Basic Tools
import sklearn.metrics
import sklearn.neighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Sklearn Regression and Classification Tools
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVC
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.naive_bayes import GaussianNB

# Usual cross-val tools and Grid searching 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

# Set random seed
np.random.seed(47)

In [52]:
# IMPORT DATA
# Import Factors (no ER)
def daily_data_converter(data, freq=1):
    if freq == 1:
        period = 'M'
    elif freq == 3:
        period = 'Q'
    elif freq == 12:
        period = 'A'
    
    data.Date = pd.to_datetime(data.Date)
    data.set_index('Date', inplace=True)
    
    data = data/100 + 1
    data['mdate'] = pd.Series(data.index).dt.to_period(period).values
    out = data.groupby('mdate').transform(np.product)-1
    out.drop_duplicates(inplace=True, keep='last')
    return out

Original_Factors = pd.read_csv('US Index & ETF Return1.csv')
#Original_Factors.index = pd.to_datetime(Original_Factors.Date) 
#del Original_Factors['Date']
#Original_Factors = Original_Factors[Original_Factors.Date >= '1999-04-01']
#Original_Factors = Original_Factors[Original_Factors.Date <= '2021-04-30']
#Original_Factors.index = Original_Factors.index.date

Original_Factors.set_index('Date', inplace=True)

factor_names = ['Market', 'Momentum', 'Quality', 'Min Vol', 'Value',
                'Aggressive Growth', 'Small']
Original_Factors = Original_Factors[factor_names]

# Get Monthly Return for Factors
Monthly_Factors = daily_data_converter(Original_Factors.reset_index(), freq=1)

# Get Monthly ER

Monthly_ER_Factors = pd.DataFrame({'Market':Monthly_Factors.iloc[:,0]})
cols = ['Market']
for i in range(len(Monthly_Factors.columns)-1):
    Monthly_ER_Factors = pd.concat([Monthly_ER_Factors, 
                                (Monthly_Factors.iloc[:, (i+1)] - Monthly_Factors.iloc[:,0])], axis=1)
    cols.append(Monthly_Factors.columns[i+1])

Monthly_ER_Factors.columns = cols
market = Monthly_ER_Factors['Market']
Monthly_ER_Factors.drop(columns=['Market'], inplace=True)

Monthly_ER_Factors

Unnamed: 0_level_0,Momentum,Quality,Min Vol,Value,Aggressive Growth,Small
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1999-03-31,0.017054,0.001990,-0.006678,-0.004014,0.027291,-0.007609
1999-04-30,-0.035776,-0.014176,0.006061,0.048531,-0.014891,0.050932
1999-05-28,-0.003899,-0.005979,0.015097,0.030625,-0.007785,0.038223
1999-06-30,0.025598,0.003067,-0.037398,-0.023282,0.052022,-0.010272
1999-07-30,-0.002015,-0.001582,0.008144,0.013141,0.013360,0.003812
...,...,...,...,...,...,...
2021-02-26,-0.033412,0.005451,-0.031345,0.042137,-0.028885,0.034782
2021-03-31,-0.056184,0.004209,0.011486,0.028398,-0.026645,-0.033790
2021-04-30,0.017800,-0.004512,-0.012809,-0.035990,0.005753,-0.032344
2021-05-28,-0.023318,0.004296,0.001304,0.022037,-0.019010,-0.004948


In [53]:



Monthly_ER_Factors1= Monthly_ER_Factors.shift(-3).copy() #Shift 3 months


# Set Binary Variables for Classifcation
Monthly_ER_Factors1[Monthly_ER_Factors1 > 0] = 1 # Excess Return above 1 label "1"
Monthly_ER_Factors1[Monthly_ER_Factors1 < 0] = 0# Excess Return less than 1 label "0"
Monthly_ER_Factors1
Monthly_ER_Factors1.to_csv(r'C:\Users\Administrator\Downloads\Strategic Advisers\Monthly_ER_Factors1.csv')


In [54]:
# Import Combined Dataset
## Include Monthly Index Return data, Macro Indicators, Valuation Indicators and Momentum Indicators added
## Already Normorlized
## Already Check Stationarity, then convert non-ststionay to ststionary

df=  pd.read_csv("All Data.csv")
df.index = pd.to_datetime(df.Date) 
df = df[df.index <= '2021-01-01']
df
del df['Date']
df


Unnamed: 0_level_0,Unemployment Rate,New Privately-Owned Housing Units Started: Total Units,10-Year Treasury Constant Maturity Minus Federal Funds Rate,Producer Price Index by Commodity: All Commodities,Producer Price Index - Petroleum,"All Employees, Total Nonfarm",M1 for the United States,PMI Composite Index,Consumer Price Index: Total All Items for the United States,Value-Weighted \nReturn-incl. dividends,...,DJIA All Caps _lag_3,DJIA All Caps _lag_4,DJIA All Caps _lag_5,DJIA All Caps _lag_6,DJIA All Caps _lag_7,DJIA All Caps _lag_8,DJIA All Caps _lag_9,DJIA All Caps _lag_10,DJIA All Caps _lag_11,DJIA All Caps _lag_12
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-05-01,-1.185698,-0.590086,0.949817,0.245481,0.683491,-0.512409,-1.822092,-0.934958,-0.177245,0.809459,...,0.117650,-0.199800,0.335000,0.160714,0.303524,-0.119952,-0.037864,-0.150667,0.234545,-0.104150
2000-06-01,-1.205172,-0.167572,-2.427361,1.094435,0.720068,-0.534268,-1.043800,-1.245555,0.949817,1.614293,...,0.264565,0.117650,-0.199800,0.335000,0.160714,0.303524,-0.119952,-0.037864,-0.150667,0.234545
2000-07-01,-1.225114,-1.011479,-0.206370,-0.448223,-0.659608,-0.469401,-0.578780,0.647806,0.177245,0.707771,...,-0.253474,0.264565,0.117650,-0.199800,0.335000,0.160714,0.303524,-0.119952,-0.037864,-0.150667
2000-08-01,-1.094435,0.836201,-0.891570,-0.757636,0.314881,-0.480068,-1.129813,-1.580168,-0.590086,0.186936,...,-0.147818,-0.253474,0.264565,0.117650,-0.199800,0.335000,0.160714,0.303524,-0.119952,-0.037864
2000-09-01,-1.432184,-0.355208,-0.157914,0.822757,0.480068,-0.437709,-0.849798,-0.090668,0.934958,-0.119420,...,0.202455,-0.147818,-0.253474,0.264565,0.117650,-0.199800,0.335000,0.160714,0.303524,-0.119952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-09-01,0.836201,0.732476,0.265175,0.501571,-0.458786,0.695580,1.333068,0.081102,-0.129024,0.523307,...,0.121227,0.273150,0.628810,-0.503818,-0.437000,-0.003333,0.136190,0.186700,0.094478,0.087050
2020-10-01,0.683491,0.659608,0.578780,0.284972,-0.081102,0.757636,0.757636,1.688816,-0.469401,0.284972,...,0.253773,0.121227,0.273150,0.628810,-0.503818,-0.437000,-0.003333,0.136190,0.186700,0.094478
2020-11-01,0.612928,0.314881,0.490791,0.863554,0.071543,0.796304,1.094435,-0.683491,-0.770397,1.077249,...,0.332143,0.253773,0.121227,0.273150,0.628810,-0.503818,-0.437000,-0.003333,0.136190,0.186700
2020-12-01,0.601468,1.225114,0.375586,1.060376,1.077249,0.744997,0.905843,1.459285,-0.245481,0.949817,...,-0.166333,0.332143,0.253773,0.121227,0.273150,0.628810,-0.503818,-0.437000,-0.003333,0.136190


In [55]:
# Read excess return data
df_er=  pd.read_csv("Monthly_ER_Factors1.csv",parse_dates=['Date']) # Read Excess Return Data
df_er.index = pd.to_datetime(df_er.Date) 

df_er = df_er[df_er.index <= '2021-01-01'] # Subset dataset to unify data time frame
del df_er['Date']
df_er

Unnamed: 0_level_0,Momentum,Quality,Min Vol,Value,Aggressive Growth,Small
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1999-03-31,1.0,1.0,0.0,0.0,1.0,0.0
1999-04-30,0.0,0.0,1.0,1.0,1.0,1.0
1999-05-28,1.0,1.0,0.0,0.0,1.0,0.0
1999-06-30,1.0,0.0,0.0,0.0,1.0,1.0
1999-07-30,1.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...
2020-08-31,0.0,1.0,0.0,1.0,1.0,1.0
2020-09-30,0.0,0.0,0.0,0.0,1.0,1.0
2020-10-30,1.0,0.0,0.0,1.0,1.0,1.0
2020-11-30,0.0,1.0,0.0,1.0,0.0,1.0


In [56]:
# Merge index excess return data with all normorlized indicators
df = df.merge(df_er, left_index=True, right_index=True, how='inner')
df = df.fillna(method='ffill')
df


Unnamed: 0_level_0,Unemployment Rate,New Privately-Owned Housing Units Started: Total Units,10-Year Treasury Constant Maturity Minus Federal Funds Rate,Producer Price Index by Commodity: All Commodities,Producer Price Index - Petroleum,"All Employees, Total Nonfarm",M1 for the United States,PMI Composite Index,Consumer Price Index: Total All Items for the United States,Value-Weighted \nReturn-incl. dividends,...,DJIA All Caps _lag_9,DJIA All Caps _lag_10,DJIA All Caps _lag_11,DJIA All Caps _lag_12,Momentum_y,Quality_y,Min Vol_y,Value_y,Aggressive Growth,Small
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [9]:
df1=df[df.columns[44:55]]
df1

df1

Unnamed: 0_level_0,Market_x,Momentum,Quality,Min Vol,Value,Aggressive Growth (QQQ),Russell Large,Russell Mid,Russell Small,Russell All Caps,DJIA All Caps
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-05-01,-0.082591,-0.266727,-0.058182,-0.037409,-0.057727,-0.489636,-0.106955,-0.103000,-0.250500,-0.116455,-0.147818
2000-06-01,0.116000,0.276955,0.185045,-0.027273,-0.119091,0.565136,0.113818,0.143227,0.394818,0.138545,0.202455
2000-07-01,-0.073600,-0.077100,-0.043000,-0.052350,0.043550,-0.161050,-0.085650,-0.050100,-0.153750,-0.083350,-0.096250
2000-08-01,0.264304,0.343522,0.179913,0.222174,0.314087,0.576391,0.287609,0.400739,0.322435,0.313217,0.307000
2000-09-01,-0.267200,-0.478450,-0.149000,-0.017700,-0.435300,-0.662650,-0.281950,-0.066900,-0.141350,-0.226950,-0.234050
...,...,...,...,...,...,...,...,...,...,...,...
2020-09-01,-0.172762,-0.166810,-0.141571,-0.070333,-0.090381,-0.251667,-0.191286,-0.083571,-0.148000,-0.164571,-0.166333
2020-10-01,-0.114273,-0.184000,-0.119318,-0.154045,-0.045818,-0.125682,-0.149091,0.035727,0.104045,-0.091091,-0.089682
2020-11-01,0.525600,0.503000,0.550750,0.399050,0.776250,0.544000,0.531200,0.655000,0.861900,0.580300,0.582850
2020-12-01,0.173000,0.158864,0.168045,0.105909,0.136818,0.220364,0.182773,0.210318,0.382773,0.201545,0.201000


In [10]:
# Drop Index Return Data that include in X Dataset
       
df.drop(df.columns[44:55],axis=1,inplace=True)
df

Unnamed: 0_level_0,Unemployment Rate,New Privately-Owned Housing Units Started: Total Units,10-Year Treasury Constant Maturity Minus Federal Funds Rate,Producer Price Index by Commodity: All Commodities,Producer Price Index - Petroleum,"All Employees, Total Nonfarm",M1 for the United States,PMI Composite Index,Consumer Price Index: Total All Items for the United States,Value-Weighted \nReturn-incl. dividends,...,Momentum Excess Return,Quality Excess Return,Min Vol Excess Return,Value Excess Return,QQQ Excess Return,Russel Large Excess Return,Russel Mid Excess Return,Russel Small Excess Return,Russell All Caps Excess Return,DJIA All Caps Excess Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-05-01,-1.185698,-0.590086,0.949817,0.245481,0.683491,-0.512409,-1.822092,-0.934958,-0.177245,0.809459,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2000-06-01,-1.205172,-0.167572,-2.427361,1.094435,0.720068,-0.534268,-1.043800,-1.245555,0.949817,1.614293,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
2000-07-01,-1.225114,-1.011479,-0.206370,-0.448223,-0.659608,-0.469401,-0.578780,0.647806,0.177245,0.707771,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2000-08-01,-1.094435,0.836201,-0.891570,-0.757636,0.314881,-0.480068,-1.129813,-1.580168,-0.590086,0.186936,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2000-09-01,-1.432184,-0.355208,-0.157914,0.822757,0.480068,-0.437709,-0.849798,-0.090668,0.934958,-0.119420,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-09-01,0.836201,0.732476,0.265175,0.501571,-0.458786,0.695580,1.333068,0.081102,-0.129024,0.523307,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-10-01,0.683491,0.659608,0.578780,0.284972,-0.081102,0.757636,0.757636,1.688816,-0.469401,0.284972,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
2020-11-01,0.612928,0.314881,0.490791,0.863554,0.071543,0.796304,1.094435,-0.683491,-0.770397,1.077249,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
2020-12-01,0.601468,1.225114,0.375586,1.060376,1.077249,0.744997,0.905843,1.459285,-0.245481,0.949817,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Set Index Excess Return Data as y
df1=df[df.columns[704:715]]
df1

df1

Unnamed: 0_level_0,Market_y,Momentum Excess Return,Quality Excess Return,Min Vol Excess Return,Value Excess Return,QQQ Excess Return,Russel Large Excess Return,Russel Mid Excess Return,Russel Small Excess Return,Russell All Caps Excess Return,DJIA All Caps Excess Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-05-01,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2000-06-01,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
2000-07-01,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2000-08-01,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2000-09-01,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
2020-09-01,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
2020-10-01,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
2020-11-01,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
2020-12-01,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Set Indicators Data as X
df2 =df[df.columns[0:704]]
df2



Unnamed: 0_level_0,Unemployment Rate,New Privately-Owned Housing Units Started: Total Units,10-Year Treasury Constant Maturity Minus Federal Funds Rate,Producer Price Index by Commodity: All Commodities,Producer Price Index - Petroleum,"All Employees, Total Nonfarm",M1 for the United States,PMI Composite Index,Consumer Price Index: Total All Items for the United States,Value-Weighted \nReturn-incl. dividends,...,DJIA All Caps _lag_3,DJIA All Caps _lag_4,DJIA All Caps _lag_5,DJIA All Caps _lag_6,DJIA All Caps _lag_7,DJIA All Caps _lag_8,DJIA All Caps _lag_9,DJIA All Caps _lag_10,DJIA All Caps _lag_11,DJIA All Caps _lag_12
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-05-01,-1.185698,-0.590086,0.949817,0.245481,0.683491,-0.512409,-1.822092,-0.934958,-0.177245,0.809459,...,0.117650,-0.199800,0.335000,0.160714,0.303524,-0.119952,-0.037864,-0.150667,0.234545,-0.104150
2000-06-01,-1.205172,-0.167572,-2.427361,1.094435,0.720068,-0.534268,-1.043800,-1.245555,0.949817,1.614293,...,0.264565,0.117650,-0.199800,0.335000,0.160714,0.303524,-0.119952,-0.037864,-0.150667,0.234545
2000-07-01,-1.225114,-1.011479,-0.206370,-0.448223,-0.659608,-0.469401,-0.578780,0.647806,0.177245,0.707771,...,-0.253474,0.264565,0.117650,-0.199800,0.335000,0.160714,0.303524,-0.119952,-0.037864,-0.150667
2000-08-01,-1.094435,0.836201,-0.891570,-0.757636,0.314881,-0.480068,-1.129813,-1.580168,-0.590086,0.186936,...,-0.147818,-0.253474,0.264565,0.117650,-0.199800,0.335000,0.160714,0.303524,-0.119952,-0.037864
2000-09-01,-1.432184,-0.355208,-0.157914,0.822757,0.480068,-0.437709,-0.849798,-0.090668,0.934958,-0.119420,...,0.202455,-0.147818,-0.253474,0.264565,0.117650,-0.199800,0.335000,0.160714,0.303524,-0.119952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-09-01,0.836201,0.732476,0.265175,0.501571,-0.458786,0.695580,1.333068,0.081102,-0.129024,0.523307,...,0.121227,0.273150,0.628810,-0.503818,-0.437000,-0.003333,0.136190,0.186700,0.094478,0.087050
2020-10-01,0.683491,0.659608,0.578780,0.284972,-0.081102,0.757636,0.757636,1.688816,-0.469401,0.284972,...,0.253773,0.121227,0.273150,0.628810,-0.503818,-0.437000,-0.003333,0.136190,0.186700,0.094478
2020-11-01,0.612928,0.314881,0.490791,0.863554,0.071543,0.796304,1.094435,-0.683491,-0.770397,1.077249,...,0.332143,0.253773,0.121227,0.273150,0.628810,-0.503818,-0.437000,-0.003333,0.136190,0.186700
2020-12-01,0.601468,1.225114,0.375586,1.060376,1.077249,0.744997,0.905843,1.459285,-0.245481,0.949817,...,-0.166333,0.332143,0.253773,0.121227,0.273150,0.628810,-0.503818,-0.437000,-0.003333,0.136190


In [25]:
X = df2
y = df1["Momentum Excess Return"]


# Split the dataframe into random train and test subsets
from sklearn.model_selection import GridSearchCV
param_grid = { 'bootstrap': [True,False],
              'max_depth': [5, 10], 
              'max_features': ['auto', 'log2'], 
              'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15], 
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]}
from sklearn.ensemble import RandomForestClassifier
rfr = RandomForestRegressor()

g_search = GridSearchCV(estimator = rfr, param_grid = param_grid,
                        cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

g_search.fit(X_train, y_train)
print(g_search.best_params_)
print(g_search.best_score_)

g_search.fit(X_test, y_test)
print(g_search.best_score_)



{'bootstrap': True, 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 12}
-0.06582030131149359
0.019486379980681996


In [24]:
X = df2
y = df1["Quality Excess Return"]


# Split the dataframe into random train and test subsets
from sklearn.model_selection import GridSearchCV
param_grid = { 'bootstrap': [True,False],
              'max_depth': [5, 10], 
              'max_features': ['auto', 'log2'], 
              'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15], 
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]}
from sklearn.ensemble import RandomForestClassifier
rfr = RandomForestRegressor()

g_search = GridSearchCV(estimator = rfr, param_grid = param_grid,
                        cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

g_search.fit(X_train, y_train)
print(g_search.best_params_)
print(g_search.best_score_)

g_search.fit(X_test, y_test)
print(g_search.best_score_)


{'bootstrap': True, 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 15}
-0.047273753898663985
0.04840746509771121


In [23]:
X = df2
y = df1["Min Vol Excess Return"]


# Split the dataframe into random train and test subsets
from sklearn.model_selection import GridSearchCV
param_grid = { 'bootstrap': [True,False],
              'max_depth': [5, 10], 
              'max_features': ['auto', 'log2'], 
              'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15], 
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]}
from sklearn.ensemble import RandomForestClassifier
rfr = RandomForestRegressor()

g_search = GridSearchCV(estimator = rfr, param_grid = param_grid,
                        cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

g_search.fit(X_train, y_train)
print(g_search.best_params_)
print(g_search.best_score_)

g_search.fit(X_test, y_test)
print(g_search.best_score_)


{'bootstrap': True, 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 12}
-0.05067926625450495
0.007149173310789496


In [22]:
X = df2
y = df1["Value Excess Return"]


# Split the dataframe into random train and test subsets
from sklearn.model_selection import GridSearchCV
param_grid = { 'bootstrap': [True,False],
              'max_depth': [5, 10], 
              'max_features': ['auto', 'log2'], 
              'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15], 
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]}
from sklearn.ensemble import RandomForestClassifier
rfr = RandomForestRegressor()

g_search = GridSearchCV(estimator = rfr, param_grid = param_grid,
                        cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

g_search.fit(X_train, y_train)
print(g_search.best_params_)
print(g_search.best_score_)

g_search.fit(X_test, y_test)
print(g_search.best_score_)


{'bootstrap': True, 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 7}
-0.06378589507726058
0.07736831208898765


In [21]:
X = df2
y = df1["QQQ Excess Return"]


# Split the dataframe into random train and test subsets
from sklearn.model_selection import GridSearchCV
param_grid = { 'bootstrap': [True,False],
              'max_depth': [5, 10], 
              'max_features': ['auto', 'log2'], 
              'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15], 
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]}
from sklearn.ensemble import RandomForestClassifier
rfr = RandomForestRegressor()

g_search = GridSearchCV(estimator = rfr, param_grid = param_grid,
                        cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

g_search.fit(X_train, y_train)
print(g_search.best_params_)
print(g_search.best_score_)

g_search.fit(X_test, y_test)
print(g_search.best_score_)


{'bootstrap': True, 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 11}
-0.06615141263682522
0.07107649697967895


In [20]:
X = df2
y = df1["Russel Large Excess Return"]


# Split the dataframe into random train and test subsets
from sklearn.model_selection import GridSearchCV
param_grid = { 'bootstrap': [True,False],
              'max_depth': [5, 10], 
              'max_features': ['auto', 'log2'], 
              'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15], 
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]}
from sklearn.ensemble import RandomForestClassifier
rfr = RandomForestRegressor()

g_search = GridSearchCV(estimator = rfr, param_grid = param_grid,
                        cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

g_search.fit(X_train, y_train)
print(g_search.best_params_)
print(g_search.best_score_)

g_search.fit(X_test, y_test)
print(g_search.best_score_)


{'bootstrap': True, 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 15}
-0.0582803463766605
0.03378182243550546


In [19]:
X = df2
y = df1["Russel Mid Excess Return"]


# Split the dataframe into random train and test subsets
from sklearn.model_selection import GridSearchCV
param_grid = { 'bootstrap': [True,False],
              'max_depth': [5, 10], 
              'max_features': ['auto', 'log2'], 
              'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15], 
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]}
from sklearn.ensemble import RandomForestClassifier
rfr = RandomForestRegressor()

g_search = GridSearchCV(estimator = rfr, param_grid = param_grid,
                        cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

g_search.fit(X_train, y_train)
print(g_search.best_params_)
print(g_search.best_score_)

g_search.fit(X_test, y_test)
print(g_search.best_score_)


{'bootstrap': False, 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 7}
-0.026083999536941332
0.07493874926513817


In [18]:
X = df2
y = df1["Russel Small Excess Return"]


# Split the dataframe into random train and test subsets
from sklearn.model_selection import GridSearchCV
param_grid = { 'bootstrap': [True,False],
              'max_depth': [5, 10], 
              'max_features': ['auto', 'log2'], 
              'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15], 
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]}
from sklearn.ensemble import RandomForestClassifier
rfr = RandomForestRegressor()

g_search = GridSearchCV(estimator = rfr, param_grid = param_grid,
                        cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

g_search.fit(X_train, y_train)
print(g_search.best_params_)
print(g_search.best_score_)

g_search.fit(X_test, y_test)
print(g_search.best_score_)


{'bootstrap': True, 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 10}
-0.08781368234465463
0.04514801752937869


In [16]:
X = df2
y = df1["DJIA All Caps Excess Return"]


# Split the dataframe into random train and test subsets
from sklearn.model_selection import GridSearchCV
param_grid = { 'bootstrap': [True,False],
              'max_depth': [5, 10], 
              'max_features': ['auto', 'log2'], 
              'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15], 
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]}
from sklearn.ensemble import RandomForestClassifier
rfr = RandomForestRegressor()

g_search = GridSearchCV(estimator = rfr, param_grid = param_grid,
                        cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

g_search.fit(X_train, y_train)
print(g_search.best_params_)
print(g_search.best_score_)

g_search.fit(X_test, y_test)
print(g_search.best_score_)


{'bootstrap': True, 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 13}
-0.055641916891742395
0.07342875648880283
