In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt

import datetime

plt.style.use('fivethirtyeight')

In [2]:
dataset = pd.read_csv('data/DataForClustering.csv')
dataset.head()


Unnamed: 0,Date,XLF,XLK,XLI,HY,XLY,XLU,XLP,SPY,VBMFX,Target_FR12Mts
0,12/1/1998,19.039398,32.625,24.5625,6.44,26.125,30.234375,27.15625,123.3125,10.27,0.19
1,1/1/1999,19.369415,37.8125,24.296875,6.52,27.46875,29.484375,26.796875,127.65625,10.3,0.09
2,2/1/1999,19.674046,34.0625,24.515625,6.46,27.296875,28.671875,26.515625,123.5625,10.07,0.11
3,3/1/1999,20.257921,36.59375,24.9375,6.57,28.59375,26.6875,26.4375,128.375,10.06,0.17
4,4/1/1999,21.679529,36.8125,28.6875,6.8,29.34375,29.265625,25.515625,133.25,10.05,0.09


In [3]:
dataset['Class'] = np.where(dataset['Target_FR12Mts'] < 0, 1, 0)
dataset.head()

Unnamed: 0,Date,XLF,XLK,XLI,HY,XLY,XLU,XLP,SPY,VBMFX,Target_FR12Mts,Class
0,12/1/1998,19.039398,32.625,24.5625,6.44,26.125,30.234375,27.15625,123.3125,10.27,0.19,0
1,1/1/1999,19.369415,37.8125,24.296875,6.52,27.46875,29.484375,26.796875,127.65625,10.3,0.09,0
2,2/1/1999,19.674046,34.0625,24.515625,6.46,27.296875,28.671875,26.515625,123.5625,10.07,0.11,0
3,3/1/1999,20.257921,36.59375,24.9375,6.57,28.59375,26.6875,26.4375,128.375,10.06,0.17,0
4,4/1/1999,21.679529,36.8125,28.6875,6.8,29.34375,29.265625,25.515625,133.25,10.05,0.09,0


In [4]:
X = dataset.drop(['Date','XLI','XLK','Class'], 1)
y = dataset['Class']

In [5]:


from sklearn import metrics
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [7]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   58.4s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [8]:
print(rf_random.best_params_)

{'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 50, 'bootstrap': True}


In [9]:
print(rf_random.best_score_)

0.9940476190476191


In [10]:
y_pred = rf_random.predict(X_test)

In [11]:
from sklearn.metrics import confusion_matrix
y_pred = rf_random.predict(X_test)
import numpy as np
cm = confusion_matrix(y_test, y_pred)
print(cm)


[[50  0]
 [ 0 23]]


In [12]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred,
                            ))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        23

    accuracy                           1.00        73
   macro avg       1.00      1.00      1.00        73
weighted avg       1.00      1.00      1.00        73

