# __Predicting Outcomes of Call Option Contracts with Multi-Class Classification__

## Notebook is presently setup for: *21Q4 through 22Q1, SPY, Weekly Call Contracts*

# SECTION 1: Preparation

##  1.1.) Loading the python packages

In [1]:

from sklearn import datasets

# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
import random

# Load libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot
from pandas import read_csv, set_option
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, GridSearchCV
from evolutionary_search import EvolutionaryAlgorithmSearchCV

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier

#importing classification_report, confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix

#importing accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

#from sklearn.metrics import balanced_accuracy_score
#from sklearn.metrics import confusion_matrix
#from imblearn.metrics import classification_report_imbalanced

#Libraries for Saving the Model
from pickle import dump
from pickle import load

import warnings
warnings.filterwarnings('ignore')



## 1.2.) Loading the Data

In [2]:
#Import training data (an entirely separate chunk of time from validation data, as we are not doing randomized train/test/split approach)

df = pd.read_csv(
    Path("../Resources/train_2qs.csv")
)

#NOTE:
#use train.csv for Q1 22 only
#use train_2qs.csv for Q4 21 + Q1 22 

In [3]:
df.head()

Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%
0,10/4/2021 9:30,430.0 2021-10-08,432.95,2021-10-08,4.27,0.6131,0.03556,0.19356,-0.43339,0.03538,0.21263,10,5.93,430.0,0.007,-2.95,6.11,3.035413,1.0,0.3
1,10/4/2021 9:30,431.0 2021-10-08,432.95,2021-10-08,4.27,0.57914,0.0373,0.19781,-0.43196,0.03344,0.20709,1,5.21,431.0,0.004,-1.95,5.31,1.919386,1.0,0.3
2,10/4/2021 9:30,432.0 2021-10-08,432.95,2021-10-08,4.27,0.54144,0.03858,0.20139,-0.43035,0.03157,0.20271,0,5.61,432.0,0.002,-0.95,4.63,-17.468806,1.0,0.3
3,10/4/2021 9:30,433.0 2021-10-08,432.95,2021-10-08,4.27,0.50267,0.03978,0.20224,-0.42316,0.02886,0.19849,1,4.01,433.0,0.0,0.05,3.95,-1.496259,1.0,0.3
4,10/4/2021 9:30,434.0 2021-10-08,432.95,2021-10-08,4.27,0.46199,0.04063,0.20081,-0.4104,0.02677,0.19261,13,3.42,434.0,0.002,1.05,3.22,-5.847953,1.0,0.3


# SECTION 2: Exploratory Data Analysis

## 2.1.) Descriptive Statistics

In [4]:
df.shape

(14626, 20)

In [5]:
#Check for any null values and remove the null values
print('Null Values =', df.isnull().values.any())

#Drop NaNs
df.dropna()

Null Values = False


Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%
0,10/4/2021 9:30,430.0 2021-10-08,432.95,2021-10-08,4.27,0.61310,0.03556,0.19356,-0.43339,0.03538,0.21263,10,5.93,430.0,0.007,-2.95,6.11,3.035413,1.0,0.3
1,10/4/2021 9:30,431.0 2021-10-08,432.95,2021-10-08,4.27,0.57914,0.03730,0.19781,-0.43196,0.03344,0.20709,1,5.21,431.0,0.004,-1.95,5.31,1.919386,1.0,0.3
2,10/4/2021 9:30,432.0 2021-10-08,432.95,2021-10-08,4.27,0.54144,0.03858,0.20139,-0.43035,0.03157,0.20271,0,5.61,432.0,0.002,-0.95,4.63,-17.468806,1.0,0.3
3,10/4/2021 9:30,433.0 2021-10-08,432.95,2021-10-08,4.27,0.50267,0.03978,0.20224,-0.42316,0.02886,0.19849,1,4.01,433.0,0.000,0.05,3.95,-1.496259,1.0,0.3
4,10/4/2021 9:30,434.0 2021-10-08,432.95,2021-10-08,4.27,0.46199,0.04063,0.20081,-0.41040,0.02677,0.19261,13,3.42,434.0,0.002,1.05,3.22,-5.847953,1.0,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14621,2022-03-23 16:00:00,455.0 2022-03-25,443.84,3/25/2022,2.00,0.04696,0.01493,0.03746,-0.09216,0.00106,0.17010,12879,0.13,455.0,0.025,11.16,0.13,0.000000,1.0,0.8
14622,2022-03-23 16:00:00,456.0 2022-03-25,443.84,3/25/2022,2.00,0.03309,0.01144,0.02826,-0.06611,0.00075,0.16963,10707,0.09,456.0,0.027,12.16,0.09,0.000000,1.0,0.8
14623,2022-03-23 16:00:00,457.0 2022-03-25,443.84,3/25/2022,2.00,0.02264,0.00839,0.02085,-0.04539,0.00083,0.16879,4434,0.05,457.0,0.030,13.16,0.05,0.000000,1.0,0.8
14624,2022-03-23 16:00:00,458.0 2022-03-25,443.84,3/25/2022,2.00,0.01819,0.00688,0.01736,-0.03784,0.00030,0.17380,4001,0.04,458.0,0.032,14.16,0.04,0.000000,1.0,0.8


In [6]:
display(df.head())
df.tail()

Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%
0,10/4/2021 9:30,430.0 2021-10-08,432.95,2021-10-08,4.27,0.6131,0.03556,0.19356,-0.43339,0.03538,0.21263,10,5.93,430.0,0.007,-2.95,6.11,3.035413,1.0,0.3
1,10/4/2021 9:30,431.0 2021-10-08,432.95,2021-10-08,4.27,0.57914,0.0373,0.19781,-0.43196,0.03344,0.20709,1,5.21,431.0,0.004,-1.95,5.31,1.919386,1.0,0.3
2,10/4/2021 9:30,432.0 2021-10-08,432.95,2021-10-08,4.27,0.54144,0.03858,0.20139,-0.43035,0.03157,0.20271,0,5.61,432.0,0.002,-0.95,4.63,-17.468806,1.0,0.3
3,10/4/2021 9:30,433.0 2021-10-08,432.95,2021-10-08,4.27,0.50267,0.03978,0.20224,-0.42316,0.02886,0.19849,1,4.01,433.0,0.0,0.05,3.95,-1.496259,1.0,0.3
4,10/4/2021 9:30,434.0 2021-10-08,432.95,2021-10-08,4.27,0.46199,0.04063,0.20081,-0.4104,0.02677,0.19261,13,3.42,434.0,0.002,1.05,3.22,-5.847953,1.0,0.3


Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%
14621,2022-03-23 16:00:00,455.0 2022-03-25,443.84,3/25/2022,2.0,0.04696,0.01493,0.03746,-0.09216,0.00106,0.1701,12879,0.13,455.0,0.025,11.16,0.13,0.0,1.0,0.8
14622,2022-03-23 16:00:00,456.0 2022-03-25,443.84,3/25/2022,2.0,0.03309,0.01144,0.02826,-0.06611,0.00075,0.16963,10707,0.09,456.0,0.027,12.16,0.09,0.0,1.0,0.8
14623,2022-03-23 16:00:00,457.0 2022-03-25,443.84,3/25/2022,2.0,0.02264,0.00839,0.02085,-0.04539,0.00083,0.16879,4434,0.05,457.0,0.03,13.16,0.05,0.0,1.0,0.8
14624,2022-03-23 16:00:00,458.0 2022-03-25,443.84,3/25/2022,2.0,0.01819,0.00688,0.01736,-0.03784,0.0003,0.1738,4001,0.04,458.0,0.032,14.16,0.04,0.0,1.0,0.8
14625,2022-03-23 16:00:00,459.0 2022-03-25,443.84,3/25/2022,2.0,0.01514,0.00542,0.01463,-0.03065,0.00091,0.17742,2672,0.03,459.0,0.034,15.16,0.03,0.0,1.0,0.8


In [7]:
df.describe()

Unnamed: 0,SPY PRICE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%
count,14626.0,14626.0,14626.0,14626.0,14626.0,14626.0,14626.0,14626.0,14626.0,14626.0,14626.0,14626.0,14626.0,14626.0,14626.0,14626.0,14626.0
mean,447.516106,3.146906,0.297005,0.03117,0.117027,-0.313209,0.012526,0.203294,5260.667236,2.419528,452.998017,0.016665,5.481912,2.816305,17.155754,0.943183,0.586736
std,15.533692,0.825703,0.241732,0.01984,0.061411,0.231404,0.010118,0.089424,8666.938674,2.882071,16.0435,0.012103,7.28922,3.839334,107.475367,0.836263,0.186143
min,415.92,2.0,0.00193,0.0009,0.00272,-1.06316,-0.00041,0.05779,0.0,0.01,418.0,0.0,-18.4,0.01,-97.701149,0.0,0.3
25%,434.145,2.21,0.075698,0.017593,0.063615,-0.46781,0.00354,0.130263,614.25,0.24,441.0,0.007,0.42,0.09,-57.268199,0.0,0.4
50%,447.33,3.15,0.25126,0.02775,0.13102,-0.28228,0.01074,0.182185,2220.0,1.3,450.0,0.015,5.55,0.97,-2.597403,1.0,0.5
75%,461.95,4.06,0.4744,0.04109,0.16751,-0.11026,0.019,0.26988,5938.75,3.65,468.0,0.025,10.76,4.27,55.207373,2.0,0.8
max,473.22,4.27,0.95707,0.15775,0.22042,-0.00376,0.05213,0.49516,126743.0,19.37,491.0,0.071,29.62,18.47,912.5,2.0,0.9


In [8]:
df.dtypes

QUOTE_READTIME          object
CONTRACT                object
SPY PRICE              float64
EXPIRE_DATE             object
DTE                    float64
C_DELTA                float64
C_GAMMA                float64
C_VEGA                 float64
C_THETA                float64
C_RHO                  float64
C_IV                   float64
C_VOLUME                 int64
C_LAST                 float64
STRIKE                 float64
STRIKE_DISTANCE_PCT    float64
STRIKE DISTANCE        float64
PRICECLOSE             float64
ROI %                  float64
y                      float64
INFLATION%             float64
dtype: object

## 2.2.) Feature Analysis and Exploration

### Plotting features according to contract length

In [9]:
#
#contract_outcome = df.groupby('STRIKE_DISTANCE_PCT')['y'].value_counts(normalize=True).loc[:,1]
#sns.set(rc={'figure.figsize':(12,5)})
#sns.barplot(x=contract_outcome.index, y=contract_outcome.values, color='#5975A4', saturation=1)

### Eliminate Uncorrelated Features

In [10]:
#Calculate correlation of each feature with 'y'

correlation = df.corr()
correlation_df = abs(correlation['y'])

In [11]:
correlation_df.sort_values(ascending=False)

y                      1.000000
ROI %                  0.787194
PRICECLOSE             0.492342
STRIKE                 0.219902
INFLATION%             0.189284
C_DELTA                0.183924
STRIKE DISTANCE        0.169937
C_RHO                  0.165782
C_VEGA                 0.163887
SPY PRICE              0.147350
STRIKE_DISTANCE_PCT    0.141856
C_LAST                 0.136744
C_GAMMA                0.119466
C_THETA                0.113537
C_IV                   0.072511
C_VOLUME               0.032511
DTE                    0.020364
Name: y, dtype: float64

In [12]:
#Drop variables with less than 1.5% correlation with contract outcome ('y')

drop_list_corr = sorted(list(correlation_df[correlation_df < 0.015].index))
print(drop_list_corr)

[]


In [13]:
#Dropping because C_IV (sometimes?) convolutes outcome for some buggish reason yet tbd

#df.drop(labels=["C_IV"], axis=1, inplace=True)

### Drop Columns Not Needed

In [14]:
df.drop(columns=["ROI %"], inplace=True) 

In [15]:
df.drop(columns=["PRICECLOSE"], inplace=True)

In [16]:
df.drop(columns=["EXPIRE_DATE"], inplace=True)

In [17]:
df.drop(columns=["CONTRACT"], inplace=True)

In [18]:
df.drop(columns=["QUOTE_READTIME"], inplace=True)

In [19]:
#df.drop(columns=["INFLATION%"], inplace=True)

In [20]:
df.drop(labels=["STRIKE DISTANCE"], axis=1, inplace=True)

# SECTION 3: Evaluate Algorithms and Models

## 3.1.) Train Test Split and Evaluation Metrics

In [21]:
#Import validation data (an entirely separate chunk of time from training data, as we are not doing randomized train/test/split approach)

test = pd.read_csv(
    Path("../Resources/2qs_test_ready.csv")
)
test

#NOTE:
#use test_ready.csv for 2022_Q1 only
#use 2qs_test_ready.csv for 2021_Q4 + 2022_Q1

Unnamed: 0,SPY PRICE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,y,INFLATION%
0,466.25,4.08,0.03813,0.01323,0.04436,-0.04721,0.00247,0.11638,137,0.09,477.5,0.024,11.25,0.0,0.9
1,466.25,4.08,0.03425,0.01197,0.04012,-0.04335,0.00231,0.11865,1218,0.08,478.0,0.025,11.75,0.0,0.9
2,466.25,4.08,0.02334,0.00877,0.02989,-0.03026,0.00188,0.11759,392,0.05,479.0,0.027,12.75,0.0,0.9
3,465.93,4.06,0.66737,0.03894,0.19384,-0.36992,0.04046,0.17491,3299,6.00,462.0,0.008,-3.93,0.0,0.9
4,465.93,4.06,0.64929,0.04032,0.19768,-0.37281,0.03856,0.17200,3272,5.59,462.5,0.007,-3.43,0.0,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653,473.41,2.06,0.00497,0.00210,0.00546,-0.00950,-0.00016,0.16082,38,0.02,491.0,0.037,17.59,0.0,0.8
3654,471.69,2.04,0.34400,0.06538,0.15074,-0.40846,0.01179,0.13661,9952,1.28,474.0,0.005,2.31,0.0,0.8
3655,471.69,2.04,0.27673,0.06096,0.13782,-0.35256,0.00969,0.13330,18824,0.94,475.0,0.007,3.31,0.0,0.8
3656,471.69,2.04,0.21498,0.05423,0.11988,-0.29226,0.00813,0.13059,22826,0.65,476.0,0.009,4.31,0.0,0.8


In [22]:
#test.drop(columns=["INFLATION%"], inplace=True)
#test

In [23]:
test.drop(labels=["STRIKE DISTANCE"], axis=1, inplace=True)

In [24]:
#Separate predicted variable from features for training dataset

y_train = df["y"]

X_train = df.drop(columns='y')


In [25]:
y_train.value_counts()

0.0    5553
2.0    4722
1.0    4351
Name: y, dtype: int64

In [26]:
#Separate predicted variable from features for validation dataset

y_validation = test["y"]

X_validation = test.drop(columns='y')


In [27]:
X_validation

Unnamed: 0,SPY PRICE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,INFLATION%
0,466.25,4.08,0.03813,0.01323,0.04436,-0.04721,0.00247,0.11638,137,0.09,477.5,0.024,0.9
1,466.25,4.08,0.03425,0.01197,0.04012,-0.04335,0.00231,0.11865,1218,0.08,478.0,0.025,0.9
2,466.25,4.08,0.02334,0.00877,0.02989,-0.03026,0.00188,0.11759,392,0.05,479.0,0.027,0.9
3,465.93,4.06,0.66737,0.03894,0.19384,-0.36992,0.04046,0.17491,3299,6.00,462.0,0.008,0.9
4,465.93,4.06,0.64929,0.04032,0.19768,-0.37281,0.03856,0.17200,3272,5.59,462.5,0.007,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3653,473.41,2.06,0.00497,0.00210,0.00546,-0.00950,-0.00016,0.16082,38,0.02,491.0,0.037,0.8
3654,471.69,2.04,0.34400,0.06538,0.15074,-0.40846,0.01179,0.13661,9952,1.28,474.0,0.005,0.8
3655,471.69,2.04,0.27673,0.06096,0.13782,-0.35256,0.00969,0.13330,18824,0.94,475.0,0.007,0.8
3656,471.69,2.04,0.21498,0.05423,0.11988,-0.29226,0.00813,0.13059,22826,0.65,476.0,0.009,0.8


In [28]:
X_train

Unnamed: 0,SPY PRICE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,INFLATION%
0,432.95,4.27,0.61310,0.03556,0.19356,-0.43339,0.03538,0.21263,10,5.93,430.0,0.007,0.3
1,432.95,4.27,0.57914,0.03730,0.19781,-0.43196,0.03344,0.20709,1,5.21,431.0,0.004,0.3
2,432.95,4.27,0.54144,0.03858,0.20139,-0.43035,0.03157,0.20271,0,5.61,432.0,0.002,0.3
3,432.95,4.27,0.50267,0.03978,0.20224,-0.42316,0.02886,0.19849,1,4.01,433.0,0.000,0.3
4,432.95,4.27,0.46199,0.04063,0.20081,-0.41040,0.02677,0.19261,13,3.42,434.0,0.002,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14621,443.84,2.00,0.04696,0.01493,0.03746,-0.09216,0.00106,0.17010,12879,0.13,455.0,0.025,0.8
14622,443.84,2.00,0.03309,0.01144,0.02826,-0.06611,0.00075,0.16963,10707,0.09,456.0,0.027,0.8
14623,443.84,2.00,0.02264,0.00839,0.02085,-0.04539,0.00083,0.16879,4434,0.05,457.0,0.030,0.8
14624,443.84,2.00,0.01819,0.00688,0.01736,-0.03784,0.00030,0.17380,4001,0.04,458.0,0.032,0.8


In [29]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Scale both the training and testing data from the features dataset
X_train_scaled = X_scaler.transform(X_train)
X_validation_scaled = X_scaler.transform(X_validation)

## 3.2.) Quick Check of Models and Algorithms

In [None]:
#Spot check Classification algorithms

models = []

#Boosting methods

models.append(('XGB', XGBClassifier()))
models.append(('AB', AdaBoostClassifier()))
models.append(('GBM', GradientBoostingClassifier()))

#Bagging methods

models.append(('RF', RandomForestClassifier()))
models.append(('ET', ExtraTreesClassifier()))

#SVC 
models.append(('SVC', SVC()))

#K Nearest Neighbors
models.append(('KNN', KNeighborsClassifier()))

In [None]:
#Test options for classification

num_folds = 10
seed = 7

In [None]:
results = []

names = []

scoring = 'accuracy'

for name, model in models:
    kfold = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model, X_train_scaled, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    findings_summary = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(findings_summary)

In [None]:
#Compare the model outcomes

fig = pyplot.figure()
fig.suptitle('Model Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(names)
fig.set_size_inches(8,4)
pyplot.show()

## 3.3.) Selecting Random Forest (or whichver tree method best), Calculating its Baseline with Cross Validation

In [None]:
#Estimate accuracy on training set
#Test options for classification

num_folds = 10
seed = 7

rf_model = RandomForestClassifier()

cv = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)
scores = cross_val_score(rf_model, X_train_scaled, y_train, scoring='accuracy', cv=cv)
avg_score = np.mean(scores)
avg_score

In [62]:
#List Hyperparameters that we want to tune.
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]

#Convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

#Create new KNN object
knn_2 = KNeighborsClassifier()

#Use GridSearch
clf = GridSearchCV(knn_2, hyperparameters, cv=10)

#Fit the model
best_model = clf.fit(X_train_scaled, y_train)

#Print The value of best Hyperparameters
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])

KeyboardInterrupt: 

In [None]:
#Visualization of the Confusion Matrix

df_matrix = pd.DataFrame(confusion_matrix(y_validation, predictions), columns=np.unique(y_validation), index = np.unique(y_validation))
df_matrix.index.name = 'Actual'
df_matrix.columns.name = 'Predicted'
sns.heatmap(df_matrix, cmap="Blues", annot=True, annot_kws={"size": 16}) 

# SECTION 4: Model Tuning 

## 4.1.1.) Hyperparameter Tuning for Random Forest

### APPROACH (A) 
## __*(CAUTION TAKES A LONG TIME!)*__

In [None]:


# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {}

paramgrid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,}
              # 'min_samples_split': min_samples_split,
               #'min_samples_leaf': min_samples_leaf,
              # 'bootstrap': bootstrap}

random.seed(1)

cv = EvolutionaryAlgorithmSearchCV(estimator=RandomForestClassifier(),
                                   params=paramgrid,
                                   scoring="accuracy",
                                   cv=StratifiedKFold(n_splits=4),
                                   verbose=1,
                                   population_size=50,
                                   gene_mutation_prob=0.10,
                                   gene_crossover_prob=0.5,
                                   tournament_size=3,
                                   generations_number=5,
                                   n_jobs=4)
cv.fit(X_train_scaled, y_train)

### APPROACH (B)
## __*(CAUTION TAKES A LONG TIME!)*__

In [30]:
# Grid Search: (select model) Tuning

num_folds = 10
seed = 7

scoring = 'accuracy'

n_estimators = [20, 100, 180, 1000]
max_features = [1, 2, 3, 4]

param_grid = dict(n_estimators=n_estimators, max_features=max_features)

rf_model = RandomForestClassifier()

kfold = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)

grid = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train_scaled, y_train)

#Print Results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
ranks = grid_result.cv_results_['rank_test_score']
for mean, stdev, param, rank in zip(means, stds, params, ranks):
    print("#%d %f (%f) with: %r" % (rank, mean, stdev, param))

Best: 0.955832 using {'max_features': 4, 'n_estimators': 180}
#16 0.925612 (0.006572) with: {'max_features': 1, 'n_estimators': 20}
#15 0.934638 (0.005966) with: {'max_features': 1, 'n_estimators': 100}
#14 0.936415 (0.005442) with: {'max_features': 1, 'n_estimators': 180}
#13 0.937783 (0.005866) with: {'max_features': 1, 'n_estimators': 1000}
#12 0.942910 (0.004592) with: {'max_features': 2, 'n_estimators': 20}
#9 0.948859 (0.005488) with: {'max_features': 2, 'n_estimators': 100}
#10 0.948585 (0.004023) with: {'max_features': 2, 'n_estimators': 180}
#8 0.949816 (0.004829) with: {'max_features': 2, 'n_estimators': 1000}
#11 0.948175 (0.005273) with: {'max_features': 3, 'n_estimators': 20}
#6 0.952961 (0.004447) with: {'max_features': 3, 'n_estimators': 100}
#4 0.953850 (0.003999) with: {'max_features': 3, 'n_estimators': 180}
#3 0.954533 (0.004566) with: {'max_features': 3, 'n_estimators': 1000}
#7 0.949816 (0.007001) with: {'max_features': 4, 'n_estimators': 20}
#5 0.953713 (0.005844)

## 4.1.2.) Implementing Tuned Hyperparameters for Random Forest

In [58]:
#Prepare the model with parameters decided in previous cell

rf_model = RandomForestClassifier(n_estimators=180, max_features=4, max_depth=30, min_samples_split=5, min_samples_leaf=1, bootstrap=False)
rf_model.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=False, max_depth=30, max_features=4,
                       min_samples_split=5, n_estimators=180)

In [59]:
#Score predictions of training set

training_predictions = rf_model.predict(X_train_scaled)

print(accuracy_score(y_train, training_predictions))

print('Micro Precision: {:.2f}'.format(precision_score(y_train, training_predictions, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(y_train, training_predictions, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(y_train, training_predictions, average='micro')))

print('Macro Precision: {:.2f}'.format(precision_score(y_train, training_predictions, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(y_train, training_predictions, average='macro')))
print('Macro F1-score: {:.2f}\n'.format(f1_score(y_train, training_predictions, average='macro')))

print('Weighted Precision: {:.2f}'.format(precision_score(y_train, training_predictions, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_train, training_predictions, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_train, training_predictions, average='weighted')))


print(classification_report(y_train, training_predictions, target_names=["Strong Sell", "Sell", "Pass"])) #, "Buy", "Strong Buy", "Very High Return"]))

1.0
Micro Precision: 1.00
Micro Recall: 1.00
Micro F1-score: 1.00

Macro Precision: 1.00
Macro Recall: 1.00
Macro F1-score: 1.00

Weighted Precision: 1.00
Weighted Recall: 1.00
Weighted F1-score: 1.00
              precision    recall  f1-score   support

 Strong Sell       1.00      1.00      1.00      5553
        Sell       1.00      1.00      1.00      4351
        Pass       1.00      1.00      1.00      4722

    accuracy                           1.00     14626
   macro avg       1.00      1.00      1.00     14626
weighted avg       1.00      1.00      1.00     14626



In [60]:
#Score predictions of validation set

predictions = rf_model.predict(X_validation_scaled)

print(accuracy_score(y_validation, predictions))

print('Micro Precision: {:.2f}'.format(precision_score(y_validation, predictions, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(y_validation, predictions, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(y_validation, predictions, average='micro')))

print('Macro Precision: {:.2f}'.format(precision_score(y_validation, predictions, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(y_validation, predictions, average='macro')))
print('Macro F1-score: {:.2f}\n'.format(f1_score(y_validation, predictions, average='macro')))

print('Weighted Precision: {:.2f}'.format(precision_score(y_validation, predictions, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_validation, predictions, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_validation, predictions, average='weighted')))


print(classification_report(y_validation, predictions, target_names=["Sell", "Pass", "Buy"])) #"Buy", "Strong Buy", "Very High Return"]))

print(confusion_matrix(y_validation, predictions))

0.6413340623291416
Micro Precision: 0.64
Micro Recall: 0.64
Micro F1-score: 0.64

Macro Precision: 0.63
Macro Recall: 0.56
Macro F1-score: 0.55

Weighted Precision: 0.64
Weighted Recall: 0.64
Weighted F1-score: 0.60
              precision    recall  f1-score   support

        Sell       0.62      0.95      0.75      1643
        Pass       0.76      0.51      0.61      1182
         Buy       0.51      0.21      0.30       833

    accuracy                           0.64      3658
   macro avg       0.63      0.56      0.55      3658
weighted avg       0.64      0.64      0.60      3658

[[1567   75    1]
 [ 410  604  168]
 [ 541  117  175]]


In [None]:
#Visualization of the Confusion Matrix

df_matrix = pd.DataFrame(confusion_matrix(y_validation, predictions), columns=np.unique(y_validation), index = np.unique(y_validation))
df_matrix.index.name = 'Actual'
df_matrix.columns.name = 'Predicted'
sns.heatmap(df_matrix, cmap="Blues", annot=True, annot_kws={"size": 16}) 

## 4.1.3.) Determining Feature Importance 

In [None]:
#Feature importances

importances = rf_model.feature_importances_

In [None]:
important_features = zip(X.columns, rf_model.feature_importances_)

In [None]:
importances_df = pd.DataFrame(important_features)

In [None]:
importances_df = importances_df.rename(columns={0: 'Feature', 1: 'Importance'})

In [None]:
importances_df = importances_df.set_index('Feature')

In [None]:
importances_df = importances_df.sort_values(by='Importance', ascending=False)

In [None]:
importances_df

In [None]:
#Plot the top 10 most important features

importances_df[0:10].plot(
    kind='barh',
    color='green',
    title='Feature Importance',
    legend=True)

## 4.2.1.) Hyperparameter Tuning for XGBoost: Bayesian Optimization with HYPEROPT

## __*(CAUTION TAKES A COUPLE OF MIN!)*__

The available hyperopt optimization algorithms are -

hp.choice(label, options) — Returns one of the options, which should be a list or tuple.

hp.randint(label, upper) — Returns a random integer between the range [0, upper).

hp.uniform(label, low, high) — Returns a value uniformly between low and high.

hp.quniform(label, low, high, q) — Returns a value round(uniform(low, high) / q) * q, i.e it rounds the decimal values and returns an integer.

hp.normal(label, mean, std) — Returns a real value that’s normally-distributed with mean and standard deviation sigma.

In [None]:
#Initialize domain space for range of values 
 
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1, 9),
        'reg_alpha' : hp.quniform('reg_alpha', 40, 180, 1),
        'reg_lambda' : hp.uniform('reg_lambda', 0, 1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5, 1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
    }


In [None]:
#Define "objective" function to yield the lowest output value, the “loss”.

def objective(space):
    xgb_model = xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [(X_train_scaled, y_train), (X_validation_scaled, y_validation)]
    
    xgb_model.fit(X_train_scaled, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = xgb_model.predict(X_validation_scaled)
    accuracy = accuracy_score(y_validation, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
#Establish Hyperopt Trials() object
trials = Trials()

#Getting the best hyperparameters with "fmin" function
best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

In [None]:
#Informs of the best hyperparameters from above search

print("The best hyperparameters are : ","\n")
print(best_hyperparams)

## 4.2.2.) Implementing Tuned Hyperparameters for XGBoost

In [None]:
#Establish XGB model instance
xgb_model = xgb.XGBClassifier(colsample_bytree=0.5000433132663876, gamma=8.469311149071062, max_depth=15, min_child_weight=10.0, reg_alpha=0, reg_lambda=0.5056047598954592) #20 30

#Fit training dataset to model
xgb_model.fit(X_train_scaled, y_train)

In [None]:
#Predicting on training set

training_predictions = xgb_model.predict(X_train_scaled)

print(accuracy_score(y_train, training_predictions))

print('Weighted Precision: {:.2f}'.format(precision_score(y_train, training_predictions, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_train, training_predictions, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_train, training_predictions, average='weighted')))

print(classification_report(y_train, training_predictions, target_names=["Strong Sell", "Sell", "Pass"])) #, "Buy", "Strong Buy", "Very High Return"]))

In [None]:
#Predicting on validation set

predictions = xgb_model.predict(X_validation_scaled)

print(accuracy_score(y_validation, predictions))

print('Weighted Precision: {:.2f}'.format(precision_score(y_validation, predictions, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_validation, predictions, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_validation, predictions, average='weighted')))

print(classification_report(y_validation, predictions, target_names=["Sell", "Pass", "Buy"])) #, "Buy", "Strong Buy", "Very High Return"]))

print(confusion_matrix(y_validation, predictions))

In [None]:
#Visualization of the Confusion Matrix

df_matrix = pd.DataFrame(confusion_matrix(y_validation, predictions), columns=np.unique(y_validation), index = np.unique(y_validation))
df_matrix.index.name = 'Actual'
df_matrix.columns.name = 'Predicted'
sns.heatmap(df_matrix, cmap="Blues", annot=True, annot_kws={"size": 16}) 

In [None]:
#Get probabilities of predictions

pred_proba = xgb_model.predict_proba(X_validation_scaled)
pred_proba

In [None]:
proba_df = pd.DataFrame(pred_proba.round(2))

In [None]:
#Formatting probabilities DataFrame

proba_df["Actual"] = y_validation.reset_index(drop=True)
proba_df.columns = ["'Sell' Probability", "'Pass' Probability", "'Buy' Probability"] #"'Pass' Probability", "'Buy' Probability", "'Strong Buy' Probability", "'Very High Return' Prob.", "Actual"]
proba_df

In [None]:
#Labeling caterogires

proba_df.loc[proba_df["Actual"] == 0, "Actual"] = "Sell"
proba_df.loc[proba_df["Actual"] == 1, "Actual"] = "Pass"
proba_df.loc[proba_df["Actual"] == 2, "Actual"] = "Buy"

proba_df.head(50)

In [None]:
#Get ROC AUC score

roc_auc_score(y_validation, pred_proba, multi_class="ovr")

## 4.2.3.) Determining Feature Importance for XGBoost Model

In [None]:
#Feature importances

importances = xgb_model.feature_importances_

In [None]:
important_features = zip(X.columns, xgb_model.feature_importances_)

In [None]:
importances_df = pd.DataFrame(important_features)

In [None]:
importances_df = importances_df.rename(columns={0: 'Feature', 1: 'Importance'})

In [None]:
importances_df = importances_df.set_index('Feature')

In [None]:
importances_df = importances_df.sort_values(by='Importance', ascending=False)

In [None]:
importances_df

In [None]:
#Plot the top 10 most important features

importances_df[0:10].plot(
    kind='barh',
    color='green',
    title='Feature Importance',
    legend=True)

## 4.3.1.) Hyperparameter Tuning for KNN

In [None]:
#List Hyperparameters that we want to tune.
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]

#Convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

#Create new KNN object
knn_2 = KNeighborsClassifier()

#Use GridSearch
clf = GridSearchCV(knn_2, hyperparameters, cv=10)

#Fit the model
best_model = clf.fit(X_train_scaled, y_train)

#Print The value of best Hyperparameters
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])

## 4.3.2.) Implementing Tuned Hyperparameters for KNN

In [None]:
#Estimate accuracy on validation set

kn_model = KNeighborsClassifier(n_neighbors=16, leaf_size=1, p=1)

kn_model.fit(X_train_scaled, y_train)

predictions = kn_model.predict(X_validation_scaled)

print(accuracy_score(y_validation, predictions))

print('Weighted Precision: {:.2f}'.format(precision_score(y_validation, predictions, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_validation, predictions, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_validation, predictions, average='weighted')))

print(classification_report(y_validation, predictions, target_names=["Sell", "Pass", "Buy"])) #, "Buy", "Strong Buy", "Very High Return"]))

print(confusion_matrix(y_validation, predictions))

In [None]:
#Visualization of the Confusion Matrix

df_matrix = pd.DataFrame(confusion_matrix(y_validation, predictions), columns=np.unique(y_validation), index = np.unique(y_validation))
df_matrix.index.name = 'Actual'
df_matrix.columns.name = 'Predicted'
sns.heatmap(df_matrix, cmap="Blues", annot=True, annot_kws={"size": 16}) 

# APPENDIX

## Extra Models to Test

In [None]:
#et_model = ExtraTreesClassifier()
#et_model.fit(X_train_scaled, y_train)

In [None]:
#ab_model = AdaBoostClassifier()
#ab_model.fit(X_train_scaled, y_train)

In [None]:
#gb_model = GradientBoostingClassifier()
#ab_model.fit(X_train_scaled, y_train)