# __Predicting Outcomes of Call Option Contracts__

# SECTION 1: Preparation

##  1.1.) Loading the python packages

In [1]:

from sklearn import datasets

# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
import random

# Load libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot
from pandas import read_csv, set_option
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, GridSearchCV
from evolutionary_search import EvolutionaryAlgorithmSearchCV

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier

#importing classification_report, confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix

#importing accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

#from sklearn.metrics import balanced_accuracy_score
#from sklearn.metrics import confusion_matrix
#from imblearn.metrics import classification_report_imbalanced

#Libraries for Saving the Model
from pickle import dump
from pickle import load

import warnings
warnings.filterwarnings('ignore')



## 1.2.) Loading the Data

In [2]:
#NOTE: This iris dataset is for debugging purposes only

#iris = datasets.load_iris()

#X = iris.data

#y = iris.target

#validation_size = 0.2

#seed = 7

In [3]:
#Import call options info

df = pd.read_csv(
    Path("../Resources/jfprices.csv")
)


In [4]:
df.head()

Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,...,PRICECLOSE,ROI %,y,VIX PRICE,AVG SCORE,MED SCORE,Sentiment Indicator Average,SEN IND AVG NUM,Sentiment Indicator Median,SEN IND MED NUM
0,2022-01-03 09:30:00,474.0 2022-01-07,476.43,1/7/2022,4.27,0.62598,0.05055,0.21136,-0.28773,0.03788,...,0.55,-87.089202,0.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0
1,2022-01-03 09:30:00,491.0 2022-01-07,476.43,1/7/2022,4.27,0.01171,0.00474,0.01717,-0.01484,0.00056,...,0.01,0.0,2.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0
2,2022-01-03 09:30:00,490.0 2022-01-07,476.43,1/7/2022,4.27,0.01569,0.00635,0.02257,-0.01973,0.00132,...,0.02,0.0,2.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0
3,2022-01-03 09:30:00,489.0 2022-01-07,476.43,1/7/2022,4.27,0.02331,0.00888,0.03151,-0.02878,0.00105,...,0.01,-66.666667,0.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0
4,2022-01-03 09:30:00,488.0 2022-01-07,476.43,1/7/2022,4.27,0.02852,0.01072,0.03713,-0.03303,0.00202,...,0.01,-75.0,0.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0


# SECTION 2: Exploratory Data Analysis

## 2.1.) Descriptive Statistics

In [5]:
df.shape

(10414, 27)

In [6]:
#Check for any null values and remove the null values
print('Null Values =', df.isnull().values.any())

#Drop NaNs
df.dropna()

Null Values = True


Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,...,PRICECLOSE,ROI %,y,VIX PRICE,AVG SCORE,MED SCORE,Sentiment Indicator Average,SEN IND AVG NUM,Sentiment Indicator Median,SEN IND MED NUM
0,2022-01-03 09:30:00,474.0 2022-01-07,476.43,1/7/2022,4.27,0.62598,0.05055,0.21136,-0.28773,0.03788,...,0.55,-87.089202,0.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0
1,2022-01-03 09:30:00,491.0 2022-01-07,476.43,1/7/2022,4.27,0.01171,0.00474,0.01717,-0.01484,0.00056,...,0.01,0.000000,2.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0
2,2022-01-03 09:30:00,490.0 2022-01-07,476.43,1/7/2022,4.27,0.01569,0.00635,0.02257,-0.01973,0.00132,...,0.02,0.000000,2.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0
3,2022-01-03 09:30:00,489.0 2022-01-07,476.43,1/7/2022,4.27,0.02331,0.00888,0.03151,-0.02878,0.00105,...,0.01,-66.666667,0.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0
4,2022-01-03 09:30:00,488.0 2022-01-07,476.43,1/7/2022,4.27,0.02852,0.01072,0.03713,-0.03303,0.00202,...,0.01,-75.000000,0.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10409,2022-02-25 16:00:00,440.0 2022-02-25,437.66,2/25/2022,0.00,0.01714,0.02027,0.01301,-0.01440,0.00002,...,0.12,1100.000000,5.0,27.61,0.445772,0.5574,Bullish,1,Bullish,1
10410,2022-02-25 16:00:00,436.0 2022-02-25,437.66,2/25/2022,0.00,0.73812,0.29126,0.03481,-0.09513,0.00052,...,0.27,-83.435583,0.0,27.61,0.445772,0.5574,Bullish,1,Bullish,1
10411,2022-02-25 16:00:00,437.0 2022-02-25,437.66,2/25/2022,0.00,0.53086,0.28405,0.04657,-0.29000,0.00073,...,0.22,-76.595745,0.0,27.61,0.445772,0.5574,Bullish,1,Bullish,1
10412,2022-02-25 16:00:00,438.0 2022-02-25,437.66,2/25/2022,0.00,0.25513,0.22841,0.04662,-0.27959,0.00117,...,0.18,-30.769231,1.0,27.61,0.445772,0.5574,Bullish,1,Bullish,1


In [7]:
display(df.head())
df.tail()

Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,...,PRICECLOSE,ROI %,y,VIX PRICE,AVG SCORE,MED SCORE,Sentiment Indicator Average,SEN IND AVG NUM,Sentiment Indicator Median,SEN IND MED NUM
0,2022-01-03 09:30:00,474.0 2022-01-07,476.43,1/7/2022,4.27,0.62598,0.05055,0.21136,-0.28773,0.03788,...,0.55,-87.089202,0.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0
1,2022-01-03 09:30:00,491.0 2022-01-07,476.43,1/7/2022,4.27,0.01171,0.00474,0.01717,-0.01484,0.00056,...,0.01,0.0,2.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0
2,2022-01-03 09:30:00,490.0 2022-01-07,476.43,1/7/2022,4.27,0.01569,0.00635,0.02257,-0.01973,0.00132,...,0.02,0.0,2.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0
3,2022-01-03 09:30:00,489.0 2022-01-07,476.43,1/7/2022,4.27,0.02331,0.00888,0.03151,-0.02878,0.00105,...,0.01,-66.666667,0.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0
4,2022-01-03 09:30:00,488.0 2022-01-07,476.43,1/7/2022,4.27,0.02852,0.01072,0.03713,-0.03303,0.00202,...,0.01,-75.0,0.0,17.68,0.196649,0.0258,Neutral,0,Neutral,0


Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,...,PRICECLOSE,ROI %,y,VIX PRICE,AVG SCORE,MED SCORE,Sentiment Indicator Average,SEN IND AVG NUM,Sentiment Indicator Median,SEN IND MED NUM
10409,2022-02-25 16:00:00,440.0 2022-02-25,437.66,2/25/2022,0.0,0.01714,0.02027,0.01301,-0.0144,2e-05,...,0.12,1100.0,5.0,27.61,0.445772,0.5574,Bullish,1,Bullish,1
10410,2022-02-25 16:00:00,436.0 2022-02-25,437.66,2/25/2022,0.0,0.73812,0.29126,0.03481,-0.09513,0.00052,...,0.27,-83.435583,0.0,27.61,0.445772,0.5574,Bullish,1,Bullish,1
10411,2022-02-25 16:00:00,437.0 2022-02-25,437.66,2/25/2022,0.0,0.53086,0.28405,0.04657,-0.29,0.00073,...,0.22,-76.595745,0.0,27.61,0.445772,0.5574,Bullish,1,Bullish,1
10412,2022-02-25 16:00:00,438.0 2022-02-25,437.66,2/25/2022,0.0,0.25513,0.22841,0.04662,-0.27959,0.00117,...,0.18,-30.769231,1.0,27.61,0.445772,0.5574,Bullish,1,Bullish,1
10413,2022-02-25 16:00:00,435.0 2022-02-25,437.66,2/25/2022,0.0,0.80172,0.2118,0.03048,-0.08037,0.00036,...,0.33,-87.452471,0.0,27.61,0.445772,0.5574,Bullish,1,Bullish,1


In [8]:
df.describe()

Unnamed: 0,SPY PRICE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,...,STRIKE DISTANCE,INFLATION%,PRICECLOSE,ROI %,y,VIX PRICE,AVG SCORE,MED SCORE,SEN IND AVG NUM,SEN IND MED NUM
count,10414.0,10414.0,10414.0,10414.0,10414.0,10414.0,10414.0,10399.0,10414.0,10414.0,...,10414.0,10414.0,10414.0,10414.0,10414.0,10414.0,10414.0,10414.0,10414.0,10414.0
mean,450.280797,2.137048,0.292804,0.029345,0.095248,-0.360877,0.009928,0.242156,8054.671404,2.481052,...,6.174263,0.583244,3.475443,1133.806973,2.878817,24.292258,0.029897,0.004,-0.024198,-0.024198
std,15.003126,1.415136,0.266005,0.024182,0.066847,0.327025,0.009815,0.094053,14007.969857,3.127882,...,8.942593,0.110474,4.314829,5424.267314,1.999737,4.773683,0.126856,0.112251,0.268358,0.268358
min,410.87,0.0,0.00105,0.0,0.0,-2.59018,-0.00048,-0.00039,0.0,0.01,...,-18.4,0.5,0.01,-97.701149,0.0,16.52,-0.218369,-0.3182,-1.0,-1.0
25%,438.38,1.06,0.03873,0.01235,0.0278,-0.524552,0.00098,0.176555,671.0,0.1,...,-0.11,0.5,0.12,-40.599068,1.0,20.31,-0.044775,0.0,0.0,0.0
50%,449.79,2.15,0.23205,0.02658,0.09778,-0.318365,0.007295,0.21904,2785.0,1.125,...,5.76,0.5,1.36,33.333333,3.0,23.54,0.030762,0.0,0.0,0.0
75%,462.94,3.21,0.494405,0.04016,0.15362,-0.066318,0.01658,0.294385,8470.0,3.89,...,11.79,0.6,5.9,201.883009,5.0,28.07,0.11278,0.0,0.0,0.0
max,479.65,4.27,1.0,0.53614,0.22191,0.0,0.04032,0.73616,144660.0,19.37,...,40.84,0.8,18.47,108100.0,5.0,37.98,0.445772,0.5574,1.0,1.0


In [9]:
df.dtypes

QUOTE_READTIME                  object
CONTRACT                        object
SPY PRICE                      float64
EXPIRE_DATE                     object
DTE                            float64
C_DELTA                        float64
C_GAMMA                        float64
C_VEGA                         float64
C_THETA                        float64
C_RHO                          float64
C_IV                           float64
C_VOLUME                         int64
C_LAST                         float64
STRIKE                           int64
STRIKE_DISTANCE_PCT            float64
STRIKE DISTANCE                float64
INFLATION%                     float64
PRICECLOSE                     float64
ROI %                          float64
y                              float64
VIX PRICE                      float64
AVG SCORE                      float64
MED SCORE                      float64
Sentiment Indicator Average     object
SEN IND AVG NUM                  int64
Sentiment Indicator Media

## 2.2.) Feature Analysis and Exploration

### Plotting features according to contract length

In [10]:
#
#contract_outcome = df.groupby('STRIKE_DISTANCE_PCT')['y'].value_counts(normalize=True).loc[:,1]
#sns.set(rc={'figure.figsize':(12,5)})
#sns.barplot(x=contract_outcome.index, y=contract_outcome.values, color='#5975A4', saturation=1)

### Eliminate Uncorrelated Features

In [11]:
#Calculate correlation of each feature with 'y'

correlation = df.corr()
correlation_df = abs(correlation['y'])

In [12]:
correlation_df.sort_values(ascending=False)

y                      1.000000
DTE                    0.440692
PRICECLOSE             0.311122
C_VEGA                 0.306445
C_RHO                  0.237214
ROI %                  0.230027
SEN IND AVG NUM        0.167595
C_LAST                 0.162357
C_DELTA                0.142154
C_THETA                0.130967
STRIKE DISTANCE        0.119132
STRIKE_DISTANCE_PCT    0.116141
C_VOLUME               0.112772
SPY PRICE              0.099494
AVG SCORE              0.097522
MED SCORE              0.074263
SEN IND MED NUM        0.068456
INFLATION%             0.065834
C_GAMMA                0.047975
VIX PRICE              0.045752
STRIKE                 0.026900
C_IV                   0.003510
Name: y, dtype: float64

In [13]:
#Drop variables with less than 3% correlation with contract outcome ('y')

drop_list_corr = sorted(list(correlation_df[correlation_df < 0.03].index))
print(drop_list_corr)

['C_IV', 'STRIKE']


In [14]:
#Dropping because C_IV convolutes outcome for some buggish reason

#df.drop(labels=["C_IV"], axis=1, inplace=True)

### Drop Columns Not Needed for Machine Learning

In [15]:
df.drop(columns=["QUOTE_READTIME"], inplace=True)

In [16]:
df.drop(columns=["ROI %"], inplace=True)

In [17]:
df.drop(columns=["PRICECLOSE"], inplace=True)

In [18]:
df.drop(columns=["EXPIRE_DATE"], inplace=True)

In [19]:
df.drop(columns=["CONTRACT"], inplace=True)

In [20]:
df.drop(columns=["Sentiment Indicator Average"], inplace=True)

In [21]:
df.drop(columns=["Sentiment Indicator Median"], inplace=True)

# SECTION 3: Evaluate Algorithms and Models

## 3.1.) Train Test Split and Evaluation Metrics

In [22]:
#Import call options info for March as testing data

test = pd.read_csv(
    Path("../Resources/march_test.csv")
)


In [23]:
test.drop(columns=["Unnamed: 0"], inplace=True)
test

Unnamed: 0,SPY PRICE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,INFLATION%,y,VIX PRICE,AVG SCORE,MED SCORE,SEN IND AVG NUM,SEN IND MED NUM
0,435.49,3.27,0.46294,0.02910,0.18055,-0.72613,0.02137,0.29926,10,4.63,437,0.003,1.51,0.6,2.0,31.00,-0.072222,0.0,0,0
1,435.49,3.27,0.43284,0.02923,0.17971,-0.70858,0.02040,0.29494,0,4.75,438,0.006,2.51,0.6,1.0,31.00,-0.072222,0.0,0,0
2,435.49,3.27,0.64946,0.02456,0.16861,-0.74482,0.03047,0.33156,0,9.98,430,0.013,-5.49,0.6,2.0,31.00,-0.072222,0.0,0,0
3,435.49,3.27,0.62567,0.02541,0.17242,-0.75270,0.02865,0.32734,0,9.23,431,0.010,-4.49,0.6,2.0,31.00,-0.072222,0.0,0,0
4,435.49,3.27,0.60079,0.02627,0.17575,-0.75961,0.02818,0.32261,0,8.57,432,0.008,-3.49,0.6,2.0,31.00,-0.072222,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4783,452.55,0.00,0.00641,0.00891,0.00322,-0.00479,0.00030,0.12766,12333,0.01,456,0.008,3.45,0.8,5.0,20.84,0.129276,0.0,0,0
4784,452.55,0.00,0.00626,0.00637,0.00281,-0.00530,0.00038,0.14620,6479,0.01,457,0.010,4.45,0.8,5.0,20.84,0.129276,0.0,0,0
4785,452.55,0.00,1.00000,0.00000,0.00000,0.00000,0.00000,,18221,3.26,449,0.008,-3.55,0.8,0.0,20.84,0.129276,0.0,0,0
4786,452.55,0.00,1.00000,0.00000,0.00000,0.00000,0.00000,,89369,2.40,450,0.006,-2.55,0.8,0.0,20.84,0.129276,0.0,0,0


In [24]:
#Split into training and testing groups

#Split out validation dataset for the end
y_train = df["y"]

X_train = df.drop(columns='y')


In [25]:
y_train.value_counts()

5.0    4043
0.0    2057
1.0    1336
3.0    1213
2.0    1135
4.0     630
Name: y, dtype: int64

In [26]:
#Split into training and testing groups

#Split out validation dataset for the end
y_validation = test["y"]

X_validation = test.drop(columns='y')


In [27]:

#validation_size = 0.2

#seed = 7

#X_train, y_train = train_test_split(X, y, stratify=y, random_state=seed)

In [28]:
X_validation

Unnamed: 0,SPY PRICE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,INFLATION%,VIX PRICE,AVG SCORE,MED SCORE,SEN IND AVG NUM,SEN IND MED NUM
0,435.49,3.27,0.46294,0.02910,0.18055,-0.72613,0.02137,0.29926,10,4.63,437,0.003,1.51,0.6,31.00,-0.072222,0.0,0,0
1,435.49,3.27,0.43284,0.02923,0.17971,-0.70858,0.02040,0.29494,0,4.75,438,0.006,2.51,0.6,31.00,-0.072222,0.0,0,0
2,435.49,3.27,0.64946,0.02456,0.16861,-0.74482,0.03047,0.33156,0,9.98,430,0.013,-5.49,0.6,31.00,-0.072222,0.0,0,0
3,435.49,3.27,0.62567,0.02541,0.17242,-0.75270,0.02865,0.32734,0,9.23,431,0.010,-4.49,0.6,31.00,-0.072222,0.0,0,0
4,435.49,3.27,0.60079,0.02627,0.17575,-0.75961,0.02818,0.32261,0,8.57,432,0.008,-3.49,0.6,31.00,-0.072222,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4783,452.55,0.00,0.00641,0.00891,0.00322,-0.00479,0.00030,0.12766,12333,0.01,456,0.008,3.45,0.8,20.84,0.129276,0.0,0,0
4784,452.55,0.00,0.00626,0.00637,0.00281,-0.00530,0.00038,0.14620,6479,0.01,457,0.010,4.45,0.8,20.84,0.129276,0.0,0,0
4785,452.55,0.00,1.00000,0.00000,0.00000,0.00000,0.00000,,18221,3.26,449,0.008,-3.55,0.8,20.84,0.129276,0.0,0,0
4786,452.55,0.00,1.00000,0.00000,0.00000,0.00000,0.00000,,89369,2.40,450,0.006,-2.55,0.8,20.84,0.129276,0.0,0,0


In [29]:
X_train

Unnamed: 0,SPY PRICE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,INFLATION%,VIX PRICE,AVG SCORE,MED SCORE,SEN IND AVG NUM,SEN IND MED NUM
0,476.43,4.27,0.62598,0.05055,0.21136,-0.28773,0.03788,0.13515,1,4.26,474,0.005,-2.43,0.8,17.68,0.196649,0.0258,0,0
1,476.43,4.27,0.01171,0.00474,0.01717,-0.01484,0.00056,0.11400,0,0.01,491,0.031,14.57,0.8,17.68,0.196649,0.0258,0,0
2,476.43,4.27,0.01569,0.00635,0.02257,-0.01973,0.00132,0.11196,0,0.02,490,0.028,13.57,0.8,17.68,0.196649,0.0258,0,0
3,476.43,4.27,0.02331,0.00888,0.03151,-0.02878,0.00105,0.11290,0,0.03,489,0.026,12.57,0.8,17.68,0.196649,0.0258,0,0
4,476.43,4.27,0.02852,0.01072,0.03713,-0.03303,0.00202,0.10768,0,0.04,488,0.024,11.57,0.8,17.68,0.196649,0.0258,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10409,437.66,0.00,0.01714,0.02027,0.01301,-0.01440,0.00002,0.17109,96493,0.01,440,0.005,2.34,0.6,27.61,0.445772,0.5574,1,1
10410,437.66,0.00,0.73812,0.29126,0.03481,-0.09513,0.00052,0.28965,90480,1.63,436,0.004,-1.66,0.6,27.61,0.445772,0.5574,1,1
10411,437.66,0.00,0.53086,0.28405,0.04657,-0.29000,0.00073,0.24958,133936,0.94,437,0.002,-0.66,0.6,27.61,0.445772,0.5574,1,1
10412,437.66,0.00,0.25513,0.22841,0.04662,-0.27959,0.00117,0.19267,102729,0.26,438,0.001,0.34,0.6,27.61,0.445772,0.5574,1,1


In [30]:
# Create the StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler = scaler.fit(X_train)

# Scale both the training and testing data from the features dataset
X_train_scaled = X_scaler.transform(X_train)
X_validation_scaled = X_scaler.transform(X_validation)

## 3.2.) Quick Check of Models and Algorithms

In [31]:
#Spot check Classification algorithms

models = []

#Boosting methods

models.append(('XGB', XGBClassifier()))
models.append(('AB', AdaBoostClassifier()))
models.append(('GBM', GradientBoostingClassifier()))

#Bagging methods

models.append(('RF', RandomForestClassifier()))
models.append(('ET', ExtraTreesClassifier()))

#SVC 
models.append(('SVC', SVC()))

In [32]:
#Test options for classification

num_folds = 10
seed = 7

In [33]:
results = []

names = []

scoring = 'accuracy'

for name, model in models:
    kfold = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model, X_train_scaled, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    findings_summary = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(findings_summary)

XGB: 0.937586 (0.007183)
AB: nan (nan)
GBM: nan (nan)
RF: nan (nan)
ET: nan (nan)
SVC: nan (nan)


In [None]:
#Compare the model outcomes

fig = pyplot.figure()
fig.suptitle('Model Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(names)
fig.set_size_inches(8,4)
pyplot.show()

## 3.3.) Selecting Random Forest (or whichver tree method best), Calculating its Baseline 

In [None]:
#Estimate accuracy on training set

rf_model = RandomForestClassifier()

cv = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)
scores = cross_val_score(rf_model, X_train_scaled, y_train, scoring='accuracy', cv=cv)
avg_score = np.mean(scores)
avg_score

In [None]:
#Estimate accuracy on validation set

rf_model = RandomForestClassifier()

rf_model.fit(X_train_scaled, y_train)

predictions = rf_model.predict(X_validation_scaled)

print(accuracy_score(y_validation, predictions))

print('Weighted Precision: {:.2f}'.format(precision_score(y_validation, predictions, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_validation, predictions, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_validation, predictions, average='weighted')))

print(classification_report(y_validation, predictions, target_names=["Strong Sell", "Sell", "Pass", "Buy", "Strong Buy", "Very High Return"]))

print(confusion_matrix(y_validation, predictions))

In [None]:
#Visualization of the Confusion Matrix

df_matrix = pd.DataFrame(confusion_matrix(y_validation, predictions), columns=np.unique(y_validation), index = np.unique(y_validation))
df_matrix.index.name = 'Actual'
df_matrix.columns.name = 'Predicted'
sns.heatmap(df_matrix, cmap="Blues", annot=True, annot_kws={"size": 16}) 

# SECTION 4: Model Tuning 

## 4.1.) Hyperparameter Tuning for Random Forest

### APPROACH (A) 
## __*(CAUTION TAKES A LONG TIME!)*__

In [None]:


# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {}

paramgrid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,}
              # 'min_samples_split': min_samples_split,
               #'min_samples_leaf': min_samples_leaf,
              # 'bootstrap': bootstrap}

random.seed(1)

cv = EvolutionaryAlgorithmSearchCV(estimator=RandomForestClassifier(),
                                   params=paramgrid,
                                   scoring="accuracy",
                                   cv=StratifiedKFold(n_splits=4),
                                   verbose=1,
                                   population_size=50,
                                   gene_mutation_prob=0.10,
                                   gene_crossover_prob=0.5,
                                   tournament_size=3,
                                   generations_number=5,
                                   n_jobs=4)
cv.fit(X_train_scaled, y_train)

### APPROACH (B)
## __*(CAUTION TAKES A LONG TIME!)*__

In [None]:
# Grid Search: (select model) Tuning

n_estimators = [20, 100, 180, 1000]
max_features = [1, 2, 3, 4]

param_grid = dict(n_estimators=n_estimators, max_features=max_features)

rf_model = RandomForestClassifier()

kfold = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)

grid = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train_scaled, y_train)

#Print Results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
ranks = grid_result.cv_results_['rank_test_score']
for mean, stdev, param, rank in zip(means, stds, params, ranks):
    print("#%d %f (%f) with: %r" % (rank, mean, stdev, param))

### PROCEEDING WITH BEST APPROACH

In [None]:
#Prepare the model with parameters decided in previous cell

rf_model = RandomForestClassifier() #(n_estimators=200, max_features="sqrt", max_depth=100, min_samples_split=5, min_samples_leaf=1, bootstrap=False)
rf_model.fit(X_train_scaled, y_train)

In [None]:
#Score predictions of training set

training_predictions = rf_model.predict(X_train_scaled)

print(accuracy_score(y_train, training_predictions))

print('Micro Precision: {:.2f}'.format(precision_score(y_train, training_predictions, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(y_train, training_predictions, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(y_train, training_predictions, average='micro')))

print('Macro Precision: {:.2f}'.format(precision_score(y_train, training_predictions, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(y_train, training_predictions, average='macro')))
print('Macro F1-score: {:.2f}\n'.format(f1_score(y_train, training_predictions, average='macro')))

print('Weighted Precision: {:.2f}'.format(precision_score(y_train, training_predictions, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_train, training_predictions, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_train, training_predictions, average='weighted')))


print(classification_report(y_train, training_predictions, target_names=["Strong Sell", "Sell", "Pass", "Buy", "Strong Buy", "Very High Return"]))

In [None]:
#Score predictions of validation set

predictions = rf_model.predict(X_validation_scaled)

print(accuracy_score(y_validation, predictions))

print('Micro Precision: {:.2f}'.format(precision_score(y_validation, predictions, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(y_validation, predictions, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(y_validation, predictions, average='micro')))

print('Macro Precision: {:.2f}'.format(precision_score(y_validation, predictions, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(y_validation, predictions, average='macro')))
print('Macro F1-score: {:.2f}\n'.format(f1_score(y_validation, predictions, average='macro')))

print('Weighted Precision: {:.2f}'.format(precision_score(y_validation, predictions, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_validation, predictions, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_validation, predictions, average='weighted')))


print(classification_report(y_validation, predictions, target_names=["Strong Sell", "Sell", "Pass", "Buy", "Strong Buy", "Very High Return"]))

print(confusion_matrix(y_validation, predictions))

In [None]:
#Visualization of the Confusion Matrix

df_matrix = pd.DataFrame(confusion_matrix(y_validation, predictions), columns=np.unique(y_validation), index = np.unique(y_validation))
df_matrix.index.name = 'Actual'
df_matrix.columns.name = 'Predicted'
sns.heatmap(df_matrix, cmap="Blues", annot=True, annot_kws={"size": 16}) 

## 4.2.) Determining Feature Importance

In [None]:
#Feature importances

importances = rf_model.feature_importances_

In [None]:
important_features = zip(X.columns, rf_model.feature_importances_)

In [None]:
importances_df = pd.DataFrame(important_features)

In [None]:
importances_df = importances_df.rename(columns={0: 'Feature', 1: 'Importance'})

In [None]:
importances_df = importances_df.set_index('Feature')

In [None]:
importances_df = importances_df.sort_values(by='Importance', ascending=False)

In [None]:
importances_df

In [None]:
#Plot the top 10 most important features

importances_df[0:10].plot(
    kind='barh',
    color='green',
    title='Feature Importance',
    legend=True)

## 4.2. Bayesian Optimization with HYPEROPT for XGB

## __*(CAUTION TAKES A COUPLE OF MIN!)*__

In [None]:
#Initialize domain space for range of values 
 
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1, 9),
        'reg_alpha' : hp.quniform('reg_alpha', 40, 180, 1),
        'reg_lambda' : hp.uniform('reg_lambda', 0, 1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5, 1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
    }


The available hyperopt optimization algorithms are -

hp.choice(label, options) — Returns one of the options, which should be a list or tuple.

hp.randint(label, upper) — Returns a random integer between the range [0, upper).

hp.uniform(label, low, high) — Returns a value uniformly between low and high.

hp.quniform(label, low, high, q) — Returns a value round(uniform(low, high) / q) * q, i.e it rounds the decimal values and returns an integer.

hp.normal(label, mean, std) — Returns a real value that’s normally-distributed with mean and standard deviation sigma.

In [None]:
def objective(space):
    xgb_model = xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [(X_train_scaled, y_train), (X_validation_scaled, y_validation)]
    
    xgb_model.fit(X_train_scaled, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = xgb_model.predict(X_validation_scaled)
    accuracy = accuracy_score(y_validation, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

In [None]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

In [None]:
#Establish XGB model instance
#Fitting training dataset to model

xgb_model = xgb.XGBClassifier(colsample_bytree=0.7417632345631163, max_depth=8, gamma=1.5654068039190379, min_child_weight=3.0, reg_lambda=0.8585451920481999, reg_alpha=0)
xgb_model.fit(X_train_scaled, y_train)

In [None]:
#Predicting on training set

training_predictions = xgb_model.predict(X_train_scaled)

print(accuracy_score(y_train, training_predictions))

print('Weighted Precision: {:.2f}'.format(precision_score(y_train, training_predictions, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_train, training_predictions, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_train, training_predictions, average='weighted')))

print(classification_report(y_train, training_predictions, target_names=["Strong Sell", "Sell", "Pass", "Buy", "Strong Buy", "Very High Return"]))

In [None]:
#Predicting on validation set

predictions = xgb_model.predict(X_validation_scaled)

print(accuracy_score(y_validation, predictions))

print('Weighted Precision: {:.2f}'.format(precision_score(y_validation, predictions, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_validation, predictions, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_validation, predictions, average='weighted')))

print(classification_report(y_validation, predictions, target_names=["Strong Sell", "Sell", "Pass", "Buy", "Strong Buy", "Very High Return"]))

print(confusion_matrix(y_validation, predictions))

In [None]:
#Get probabilities of predictions

pred_proba = xgb_model.predict_proba(X_validation_scaled)
pred_proba

In [None]:
proba_df = pd.DataFrame(pred_proba.round(2))

In [None]:
#Formatting probabilities DataFrame

proba_df["Actual"] = y_validation.reset_index(drop=True)
proba_df.columns = ["'Strong Sell' Probability", "'Sell' Probability", "'Pass' Probability", "'Buy' Probability", "'Strong Buy' Probability", "'Very High Return' Prob.", "Actual"]
proba_df

In [None]:
#Labeling caterogires

proba_df.loc[proba_df["Actual"] == 0, "Actual"] = "Strong Sell"
proba_df.loc[proba_df["Actual"] == 1, "Actual"] = "Sell"
proba_df.loc[proba_df["Actual"] == 2, "Actual"] = "Pass"
proba_df.loc[proba_df["Actual"] == 3, "Actual"] = "Buy"
proba_df.loc[proba_df["Actual"] == 4, "Actual"] = "Strong Buy"
proba_df.loc[proba_df["Actual"] == 5, "Actual"] = "Very High Return"

proba_df.head(50)

In [None]:
#Get ROC AUC score

roc_auc_score(y_validation, pred_proba, multi_class="ovr")

## 4.2.) Determining Feature Importance for XGBoost Model

In [None]:
#Feature importances

importances = xgb_model.feature_importances_

In [None]:
important_features = zip(X.columns, xgb_model.feature_importances_)

In [None]:
importances_df = pd.DataFrame(important_features)

In [None]:
importances_df = importances_df.rename(columns={0: 'Feature', 1: 'Importance'})

In [None]:
importances_df = importances_df.set_index('Feature')

In [None]:
importances_df = importances_df.sort_values(by='Importance', ascending=False)

In [None]:
importances_df

In [None]:
#Plot the top 10 most important features

importances_df[0:10].plot(
    kind='barh',
    color='green',
    title='Feature Importance',
    legend=True)

In [None]:
#et_model = ExtraTreesClassifier()
#et_model.fit(X_train_scaled, y_train)

In [None]:
#ab_model = AdaBoostClassifier()
#ab_model.fit(X_train_scaled, y_train)

In [None]:
#gb_model = GradientBoostingClassifier()
#ab_model.fit(X_train_scaled, y_train)