# __Predicting Outcomes of Call Option Contracts with Multi-Class Classification__

## Notebook is currently set up for: *21Q4 through 22Q1, SPY, Weekly Call Contracts*

# __SECTION 1: Preparation__

##  1.1.) Loading the Python packages

In [1]:

from sklearn import datasets

# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
import random

# Load libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot
from pandas import read_csv, set_option
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, GridSearchCV
from evolutionary_search import EvolutionaryAlgorithmSearchCV

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier

#importing classification_report, confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix

#importing accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

#Libraries for Saving the Model
from pickle import dump
from pickle import load

import warnings
warnings.filterwarnings('ignore')



## 1.2.) Loading the Data: Training and Validation Datasets 
### *(Separate timeframes for train and validation datasets are used)*

In [2]:
# Training rows come from their own calendar window; validation rows are loaded
# from a separate file instead of being carved out by a random train/test split.
train_csv_path = Path("../Resources/train_2qs.csv")
df = pd.read_csv(train_csv_path)

#NOTE:
#use train.csv for Q1 22 only
#use train_2qs.csv for Q4 21 + Q1 22 

In [3]:
df.head()

Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%
0,10/4/2021 9:30,430.0 2021-10-08,432.95,2021-10-08,4.27,0.6131,0.03556,0.19356,-0.43339,0.03538,0.21263,10,5.93,430.0,0.007,-2.95,6.11,3.035413,2.0,0.3
1,10/4/2021 9:30,431.0 2021-10-08,432.95,2021-10-08,4.27,0.57914,0.0373,0.19781,-0.43196,0.03344,0.20709,1,5.21,431.0,0.004,-1.95,5.31,1.919386,2.0,0.3
2,10/4/2021 9:30,432.0 2021-10-08,432.95,2021-10-08,4.27,0.54144,0.03858,0.20139,-0.43035,0.03157,0.20271,0,5.61,432.0,0.002,-0.95,4.63,-17.468806,1.0,0.3
3,10/4/2021 9:30,433.0 2021-10-08,432.95,2021-10-08,4.27,0.50267,0.03978,0.20224,-0.42316,0.02886,0.19849,1,4.01,433.0,0.0,0.05,3.95,-1.496259,2.0,0.3
4,10/4/2021 9:30,434.0 2021-10-08,432.95,2021-10-08,4.27,0.46199,0.04063,0.20081,-0.4104,0.02677,0.19261,13,3.42,434.0,0.002,1.05,3.22,-5.847953,2.0,0.3


In [4]:
# Validation rows cover a time window that does not overlap the training file,
# so no randomized train/test split is performed anywhere in this notebook.
validation_csv_path = Path("../Resources/2qs_test_ready.csv")
test = pd.read_csv(validation_csv_path)
test

#NOTE:
#use test_ready.csv for 2022_Q1 only
#use 2qs_test_ready.csv for 2021_Q4 + 2022_Q1

Unnamed: 0,SPY PRICE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,y,INFLATION%
0,463.84,2.02,0.00700,0.00341,0.00871,-0.01352,0.00064,146,0.02,479.0,0.033,15.16,2.0,0.9
1,463.84,2.02,0.00293,0.00153,0.00347,-0.00459,0.00024,1420,0.01,480.0,0.035,16.16,2.0,0.9
2,463.84,2.02,0.00295,0.00137,0.00316,-0.00423,0.00015,1778,0.01,481.0,0.037,17.16,2.0,0.9
3,463.84,2.02,0.00264,0.00126,0.00349,-0.00437,0.00030,2,0.01,482.0,0.039,18.16,2.0,0.9
4,463.84,2.02,0.00241,0.00114,0.00308,-0.00497,-0.00041,537,0.01,483.0,0.041,19.16,2.0,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3652,460.67,3.08,0.08263,0.01815,0.07200,-0.13194,0.00401,3500,0.32,471.0,0.022,10.33,5.0,0.8
3653,460.67,3.08,0.06546,0.01533,0.06078,-0.10882,0.00307,3794,0.24,472.0,0.025,11.33,5.0,0.8
3654,460.67,3.08,0.05229,0.01277,0.05087,-0.09034,0.00195,2079,0.18,473.0,0.027,12.33,5.0,0.8
3655,460.67,3.08,0.04256,0.01065,0.04231,-0.07467,0.00214,2847,0.15,474.0,0.029,13.33,5.0,0.8


In [None]:
test.head()

# __SECTION 2: Exploratory Data Analysis and Further Prep__

## 2.1.) Descriptive Statistics

In [None]:
df.shape

In [None]:
#Check for any null values and remove the null values
print('Null Values =', df.isnull().values.any())

#Drop NaNs
# DataFrame.dropna() returns a new frame and leaves the caller untouched, so the
# result has to be assigned back for the rows to actually be removed.
df = df.dropna()

# Show the row/column count that remains after cleaning.
df.shape

In [None]:
display(df.head())
df.tail()

In [None]:
df.describe()

In [None]:
df.dtypes

## 2.2.) Feature Analysis and Exploration

### Plotting features according to contract length

In [None]:
#
#contract_outcome = df.groupby('STRIKE_DISTANCE_PCT')['y'].value_counts(normalize=True).loc[:,1]
#sns.set(rc={'figure.figsize':(12,5)})
#sns.barplot(x=contract_outcome.index, y=contract_outcome.values, color='#5975A4', saturation=1)

### Eliminate Uncorrelated Features

In [None]:
#Calculate correlation of each feature with 'y'

# At this point the frame still holds text columns (timestamps, contract ids),
# which make DataFrame.corr() raise on current pandas. Restricting to numeric
# columns gives the same matrix older pandas produced by silently skipping them.
correlation = df.select_dtypes(include="number").corr()

# Only the strength of the relationship matters here, not its sign.
correlation_df = abs(correlation['y'])

In [None]:
correlation_df.sort_values(ascending=False)

In [None]:
# List the columns whose absolute correlation with the outcome 'y' is below 0.015.
# NOTE(review): in the cells visible here the list is only printed; nothing is
# dropped on the basis of it.
weak_correlation_mask = correlation_df < 0.015
drop_list_corr = sorted(correlation_df.index[weak_correlation_mask])
print(drop_list_corr)

### Drop Columns Not Needed

In [None]:
# Remove the "ROI %" column from the training frame.
df = df.drop(columns=["ROI %"])

In [None]:
# Remove the "PRICECLOSE" column from the training frame.
df = df.drop(columns=["PRICECLOSE"])

In [None]:
# Remove the "EXPIRE_DATE" column from the training frame.
df = df.drop(columns=["EXPIRE_DATE"])

In [None]:
# Remove the "CONTRACT" column from the training frame.
df = df.drop(columns=["CONTRACT"])

In [None]:
# Remove the "QUOTE_READTIME" column from the training frame.
df = df.drop(columns=["QUOTE_READTIME"])

In [None]:
# "N_IV" is absent from the training preview shown earlier in this notebook, so
# an unconditional drop raises KeyError. errors="ignore" removes the column when
# a data file provides it and is a no-op otherwise.
df = df.drop(columns=["N_IV"], errors="ignore")

In [None]:
# "VIX PRICE" is absent from the training preview shown earlier in this notebook,
# so an unconditional drop raises KeyError. errors="ignore" removes the column
# when a data file provides it and is a no-op otherwise.
df = df.drop(columns=["VIX PRICE"], errors="ignore")

In [None]:
# "N_IV" is absent from the validation preview shown earlier in this notebook, so
# an unconditional drop raises KeyError. errors="ignore" removes the column when
# a data file provides it and is a no-op otherwise.
test = test.drop(columns=["N_IV"], errors="ignore")

In [None]:
# "VIX PRICE" is absent from the validation preview shown earlier in this
# notebook, so an unconditional drop raises KeyError. errors="ignore" removes the
# column when a data file provides it and is a no-op otherwise.
test = test.drop(columns=["VIX PRICE"], errors="ignore")

## 2.3.) Prep Training and Validation Datasets

In [None]:
#Separate predicted variable from features for training dataset

y_train = df["y"]

# Keep only the features that the validation frame also provides, in the
# validation frame's column order. The previews earlier in the notebook show
# C_IV in the training file but not in the validation file; fitting the scaler
# on a column the validation frame lacks makes the later transform() fail.
# NOTE(review): "Unnamed: 0" (the CSV row counter) is present in both files and
# therefore still ends up as a feature - confirm whether that is intended.
feature_columns = [col for col in test.columns if col != "y" and col in df.columns]
X_train = df[feature_columns]


In [None]:
y_train.value_counts()

In [None]:
X_train

In [None]:
# Split the validation frame into the feature matrix and the outcome column.
X_validation = test.drop(columns=["y"])
y_validation = test["y"]


In [None]:
X_validation

## 2.4.) Scaling Training and Validation Datasets

In [None]:
# Learn the per-feature mean and standard deviation from the training features
# only, then apply that same transformation to both feature sets.
scaler = StandardScaler()
scaler.fit(X_train)

# fit() hands back the estimator itself, so both names refer to one fitted scaler.
X_scaler = scaler

X_train_scaled, X_validation_scaled = (
    X_scaler.transform(features) for features in (X_train, X_validation)
)

# __SECTION 3: Evaluate Algorithms and Models__

## 3.1.) Quick Check of Models and Algorithms

In [None]:
# Candidate classifiers for the spot check, as (short name, estimator) pairs.
# Every estimator is created with its library defaults.
models = [
    # Boosting methods
    ('XGB', XGBClassifier()),
    ('AB', AdaBoostClassifier()),
    ('GBM', GradientBoostingClassifier()),
    # Bagging methods
    ('RF', RandomForestClassifier()),
    ('ET', ExtraTreesClassifier()),
    # Support vector classifier
    ('SVC', SVC()),
    # K nearest neighbours
    ('KNN', KNeighborsClassifier()),
]

In [None]:
# num_folds: how many folds the K-fold cross validation uses.
# seed: fixed random state so the fold assignment can be replicated.
num_folds, seed = 10, 7

In [None]:
# Cross-validate every candidate on the scaled training set and print
# "name: mean accuracy (standard deviation)" for each one.

scoring = 'accuracy'
results, names = [], []

# The splitter is seeded, so a single instance yields the same folds for each model.
kfold = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)

for name, model in models:
    cv_results = cross_val_score(model, X_train_scaled, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    findings_summary = f"{name}: {cv_results.mean():f} ({cv_results.std():f})"
    print(findings_summary)

In [None]:
# Box plot of the per-fold accuracies, one box per spot-checked model.

fig, ax = pyplot.subplots(figsize=(8, 4))
fig.suptitle('Model Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

## 3.2.) Selecting Random Forest (or whichever tree method performs best) and Calculating Its Baseline with Cross-Validation on the Training Set

In [None]:
#Estimate accuracy on training set
#Test options for classification

num_folds = 10
seed = 7

# Seed the forest as well as the splitter; otherwise the baseline score changes
# from run to run even though the folds are fixed.
rf_model = RandomForestClassifier(random_state=seed)

cv = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)
scores = cross_val_score(rf_model, X_train_scaled, y_train, scoring='accuracy', cv=cv)

# Mean accuracy across the folds is the baseline figure for this section.
avg_score = np.mean(scores)
avg_score

In [None]:
#Visualization of the Confusion Matrix

# `predictions` is not assigned until the tuned-model cells further down, and
# cross_val_score only fits clones, so on a fresh kernel this cell had nothing to
# plot. Fit the baseline forest and score the validation set here.
rf_model.fit(X_train_scaled, y_train)
predictions = rf_model.predict(X_validation_scaled)

# Label the axes with every class seen in either vector; using only the classes
# in y_validation breaks the frame when the model predicts a class that the
# validation window does not contain.
class_labels = np.union1d(y_validation, predictions)
df_matrix = pd.DataFrame(
    confusion_matrix(y_validation, predictions, labels=class_labels),
    columns=class_labels,
    index=class_labels,
)
df_matrix.index.name = 'Actual'
df_matrix.columns.name = 'Predicted'

# fmt="d" prints whole counts instead of the default two-significant-digit format.
sns.heatmap(df_matrix, cmap="Blues", annot=True, fmt="d", annot_kws={"size": 16});

# __SECTION 4: Model Tuning__

## 4.1.1.) Hyperparameter Tuning for Random Forest

### APPROACH (A) 
## __*(CAUTION: TAKES A LONG TIME!)*__

In [None]:


# Evolutionary search over Random Forest hyperparameters.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split.
# 'auto' meant the same as 'sqrt' for classifiers and has been removed from
# current scikit-learn, so listing both only duplicated one candidate (and now
# raises). 'log2' gives the search a genuinely different second option.
max_features = ['sqrt', 'log2']

# Maximum number of levels in tree (None lets trees grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Candidate values handed to the search, keyed by estimator parameter name
paramgrid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Seeds Python's global RNG before the search starts.
random.seed(1)

cv = EvolutionaryAlgorithmSearchCV(estimator=RandomForestClassifier(),
                                   params=paramgrid,
                                   scoring="accuracy",
                                   cv=StratifiedKFold(n_splits=4),
                                   verbose=1,
                                   population_size=50,
                                   gene_mutation_prob=0.10,
                                   gene_crossover_prob=0.5,
                                   tournament_size=3,
                                   generations_number=5,
                                   n_jobs=4)
cv.fit(X_train_scaled, y_train)

### APPROACH (B)
## __*(CAUTION: TAKES A LONG TIME!)*__

In [None]:
# Grid Search: (select model) Tuning

num_folds = 10
seed = 7

scoring = 'accuracy'

n_estimators = [20, 100, 180, 1000]
max_features = [1, 2, 3, 4]

param_grid = dict(n_estimators=n_estimators, max_features=max_features)

# Seed the forest so score differences between grid points come from the
# hyperparameters and not from run-to-run randomness in tree construction.
rf_model = RandomForestClassifier(random_state=seed)

kfold = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)

grid = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train_scaled, y_train)

#Print Results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# One line per grid point: rank, mean accuracy, standard deviation, parameters.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
ranks = grid_result.cv_results_['rank_test_score']
for mean, stdev, param, rank in zip(means, stds, params, ranks):
    print("#%d %f (%f) with: %r" % (rank, mean, stdev, param))

## 4.1.2.) Implementing Tuned Hyperparameters for Random Forest

In [None]:
#Predicting on TRAINING set

#Prepare the model with parameters decided in previous cell

# random_state makes the fitted forest, and therefore every number printed
# below, repeatable between runs.
rf_model = RandomForestClassifier(n_estimators=180, max_features=4, max_depth=30, min_samples_split=5,
                                  min_samples_leaf=1, bootstrap=False, random_state=seed)
rf_model.fit(X_train_scaled, y_train)

#Score predictions of training set

training_predictions = rf_model.predict(X_train_scaled)

print(accuracy_score(y_train, training_predictions))

# Precision / recall / F1 under each averaging scheme, with a blank line after
# the micro and macro groups.
metric_functions = (("Precision", precision_score), ("Recall", recall_score), ("F1-score", f1_score))
for average in ("micro", "macro", "weighted"):
    for metric_label, metric_function in metric_functions:
        value = metric_function(y_train, training_predictions, average=average)
        print('{} {}: {:.2f}'.format(average.capitalize(), metric_label, value))
    if average != "weighted":
        print()

# NOTE(review): this report names the classes "Strong Sell", "Sell", "Pass" while
# the validation reports use "Sell", "Pass", "Buy" - confirm which naming matches
# the label encoding of the data files in use.
print(classification_report(y_train, training_predictions, target_names=["Strong Sell", "Sell", "Pass"])) #, "Buy", "Strong Buy", "Very High Return"]))

In [None]:
#Predicting on VALIDATION set

#Prepare the model with parameters decided in previous cell

# NOTE(review): this cell uses n_estimators=100 and max_features=5, whereas the
# training-set cell uses n_estimators=180 and max_features=4, so the two reports
# describe different forests - confirm which values are the tuned ones.
# random_state makes the fitted forest, and therefore every number printed
# below, repeatable between runs.
rf_model = RandomForestClassifier(n_estimators=100, max_features=5, max_depth=30, min_samples_split=5,
                                  min_samples_leaf=1, bootstrap=False, random_state=seed)
rf_model.fit(X_train_scaled, y_train)

#Score predictions of validation set

predictions = rf_model.predict(X_validation_scaled)

print(accuracy_score(y_validation, predictions))

# Precision / recall / F1 under each averaging scheme, with a blank line after
# the micro and macro groups.
metric_functions = (("Precision", precision_score), ("Recall", recall_score), ("F1-score", f1_score))
for average in ("micro", "macro", "weighted"):
    for metric_label, metric_function in metric_functions:
        value = metric_function(y_validation, predictions, average=average)
        print('{} {}: {:.2f}'.format(average.capitalize(), metric_label, value))
    if average != "weighted":
        print()

print(classification_report(y_validation, predictions, target_names=["Sell", "Pass", "Buy"])) #"Buy", "Strong Buy", "Very High Return"]))

print(confusion_matrix(y_validation, predictions))

In [None]:
#Visualization of the Confusion Matrix

# Label the axes with every class seen in either vector; using only the classes
# in y_validation breaks the frame when the model predicts a class that the
# validation window does not contain.
class_labels = np.union1d(y_validation, predictions)
df_matrix = pd.DataFrame(
    confusion_matrix(y_validation, predictions, labels=class_labels),
    columns=class_labels,
    index=class_labels,
)
df_matrix.index.name = 'Actual'
df_matrix.columns.name = 'Predicted'

# fmt="d" prints whole counts instead of the default two-significant-digit format.
sns.heatmap(df_matrix, cmap="Blues", annot=True, fmt="d", annot_kws={"size": 16});

In [None]:
#Get ROC AUC score

# `pred_proba` is only created later, in the XGBoost section, so this cell either
# failed on a fresh kernel or silently scored XGBoost's probabilities. Score the
# Random Forest's own class probabilities instead.
rf_pred_proba = rf_model.predict_proba(X_validation_scaled)

# labels ties each probability column to the class the forest assigned to it.
roc_auc_score(y_validation, rf_pred_proba, multi_class="ovr", labels=rf_model.classes_)

## 4.1.3.) Determining Feature Importance 

In [None]:
#Feature importances

importances = rf_model.feature_importances_

In [None]:
# Pair each training feature name with its importance. No variable named `X`
# exists in this notebook; the forest was fitted on the columns of X_train.
# A list (rather than a bare zip iterator) can be read more than once, so the
# next cell still works when it is re-run.
important_features = list(zip(X_train.columns, rf_model.feature_importances_))

In [None]:
importances_df = pd.DataFrame(important_features)

In [None]:
importances_df = importances_df.rename(columns={0: 'Feature', 1: 'Importance'})

In [None]:
importances_df = importances_df.set_index('Feature')

In [None]:
importances_df = importances_df.sort_values(by='Importance', ascending=False)

In [None]:
importances_df

In [None]:
# Horizontal bar chart of the ten highest-ranked features (the frame is already
# sorted by importance, descending).
top_ten_features = importances_df.iloc[:10]
top_ten_features.plot(kind='barh', color='green', title='Feature Importance', legend=True)

## 4.2.1.) Hyperparameter Tuning for XGBoost: Bayesian Optimization with HYPEROPT

Hyperopt describes the search space with parameter expressions. The ones relevant here are:

hp.choice(label, options) — Returns one of the options, which should be a list or tuple.

hp.randint(label, upper) — Returns a random integer in the range [0, upper).

hp.uniform(label, low, high) — Returns a value uniformly between low and high.

hp.quniform(label, low, high, q) — Returns a value like round(uniform(low, high) / q) * q, i.e. a uniform draw rounded to the nearest multiple of q. The result is still a float, so cast it with int() wherever an integer is required.

hp.normal(label, mu, sigma) — Returns a real value that is normally distributed with mean mu and standard deviation sigma.

In [None]:
# Search domain handed to hyperopt: sampled distributions for the tuned
# parameters plus two fixed entries (n_estimators and seed).
space = dict(
    max_depth=hp.quniform("max_depth", 3, 18, 1),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=180,
    seed=0,
)


In [None]:
#Define "objective" function to yield the lowest output value, the “loss”.

def objective(space):
    """Train one XGBoost classifier for a sampled configuration and score it.

    Parameters
    ----------
    space : dict
        One sample from the hyperopt search space. Reads 'n_estimators',
        'max_depth', 'gamma', 'reg_alpha', 'min_child_weight' and
        'colsample_bytree'; 'reg_lambda' and 'seed' are used when present.

    Returns
    -------
    dict
        {'loss': -accuracy, 'status': STATUS_OK}, where accuracy is measured on
        the validation set, so minimising the loss maximises accuracy.
    """
    xgb_model = xgb.XGBClassifier(
        n_estimators=space['n_estimators'],
        # quniform samples arrive as floats; the tree depth must be an integer.
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        # reg_lambda is sampled in the search space but was never passed on.
        reg_lambda=space.get('reg_lambda', 1),
        min_child_weight=int(space['min_child_weight']),
        # Sampled from [0.5, 1): int() truncated every draw to 0, which is not a
        # valid column fraction. Pass the float through unchanged.
        colsample_bytree=space['colsample_bytree'],
        random_state=space.get('seed', 0))

    evaluation = [(X_train_scaled, y_train), (X_validation_scaled, y_validation)]

    # NOTE(review): recent XGBoost releases expect eval_metric and
    # early_stopping_rounds on the constructor rather than on fit() - confirm
    # against the installed version.
    # NOTE(review): early stopping and the returned loss both use the validation
    # set, so the tuned score is optimistic for that set.
    xgb_model.fit(X_train_scaled, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)

    pred = xgb_model.predict(X_validation_scaled)
    accuracy = accuracy_score(y_validation, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
# Trials() collects the outcome of every evaluation made during the search.
trials = Trials()

# Run 100 TPE-guided evaluations of `objective` over `space`; fmin returns the
# best parameter assignment it found.
search_settings = dict(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)
best_hyperparams = fmin(**search_settings)

In [None]:
#Informs of the best hyperparameters from above search

print("The best hyperparameters are : ","\n")
print(best_hyperparams)

## 4.2.2.) Implementing Tuned Hyperparameters for XGBoost

In [None]:
# Hyperparameters for the tuned XGBoost model, written out by hand.
# NOTE(review): reg_alpha=23 lies outside the 40-180 range searched above -
# confirm these values come from the intended tuning run.
tuned_xgb_params = {
    "colsample_bytree": 0.83263437029644,
    "gamma": 4.6500330499023175,
    "max_depth": 9,
    "min_child_weight": 9.0,
    "reg_alpha": 23,
    "reg_lambda": 0.4074373399618115,
}
xgb_model = xgb.XGBClassifier(**tuned_xgb_params)

# Train on the scaled training features.
xgb_model.fit(X_train_scaled, y_train)

In [None]:
# Score the tuned XGBoost model on the validation set: accuracy, weighted
# precision / recall / F1, the per-class report and the raw confusion matrix.

predictions = xgb_model.predict(X_validation_scaled)

print(accuracy_score(y_validation, predictions))

weighted_metrics = (
    ('Weighted Precision', precision_score),
    ('Weighted Recall', recall_score),
    ('Weighted F1-score', f1_score),
)
for metric_label, metric_function in weighted_metrics:
    value = metric_function(y_validation, predictions, average='weighted')
    print('{}: {:.2f}'.format(metric_label, value))

print(classification_report(y_validation, predictions, target_names=["Sell", "Pass", "Buy"])) #, "Buy", "Strong Buy", "Very High Return"]))

print(confusion_matrix(y_validation, predictions))

In [None]:
# Score the tuned XGBoost model on the data it was trained on, for comparison
# with the validation figures.

training_predictions = xgb_model.predict(X_train_scaled)

print(accuracy_score(y_train, training_predictions))

weighted_metrics = (
    ('Weighted Precision', precision_score),
    ('Weighted Recall', recall_score),
    ('Weighted F1-score', f1_score),
)
for metric_label, metric_function in weighted_metrics:
    value = metric_function(y_train, training_predictions, average='weighted')
    print('{}: {:.2f}'.format(metric_label, value))

# NOTE(review): this report names the classes "Strong Sell", "Sell", "Pass" while
# the validation reports use "Sell", "Pass", "Buy" - confirm which naming matches
# the label encoding of the data files in use.
print(classification_report(y_train, training_predictions, target_names=["Strong Sell", "Sell", "Pass"])) #, "Buy", "Strong Buy", "Very High Return"]))

In [None]:
#Visualization of the Confusion Matrix

# Label the axes with every class seen in either vector; using only the classes
# in y_validation breaks the frame when the model predicts a class that the
# validation window does not contain.
class_labels = np.union1d(y_validation, predictions)
df_matrix = pd.DataFrame(
    confusion_matrix(y_validation, predictions, labels=class_labels),
    columns=class_labels,
    index=class_labels,
)
df_matrix.index.name = 'Actual'
df_matrix.columns.name = 'Predicted'

# fmt="d" prints whole counts instead of the default two-significant-digit format.
sns.heatmap(df_matrix, cmap="Blues", annot=True, fmt="d", annot_kws={"size": 16});

In [None]:
#Get probabilities of predictions

pred_proba = xgb_model.predict_proba(X_validation_scaled)
pred_proba

In [None]:
proba_df = pd.DataFrame(pred_proba.round(2))

In [None]:
#Formatting probabilities DataFrame

proba_df["Actual"] = y_validation.reset_index(drop=True)
proba_df.columns = ["Sell Probability", "Pass Probability", "Buy Probability", "Actual"] #"'Pass' Probability", "'Buy' Probability", "'Strong Buy' Probability", "'Very High Return' Prob.", "Actual"]
proba_df

In [None]:
#Labeling categories

# Translate the numeric class codes to names in one pass. Writing strings into a
# numeric column through masked .loc assignments relies on an implicit dtype
# change that current pandas warns about; map() builds the new column outright.
# Values without an entry (including names already converted on a re-run) are
# left unchanged.
class_names = {0: "Sell", 1: "Pass", 2: "Buy"}
proba_df["Actual"] = proba_df["Actual"].map(lambda code: class_names.get(code, code))

proba_df.head(50)

In [None]:
#Get ROC AUC score

roc_auc_score(y_validation, pred_proba, multi_class="ovr")

## 4.2.3.) Determining Feature Importance for XGBoost Model

In [None]:
#Feature importances

importances = xgb_model.feature_importances_

In [None]:
# Pair each feature name with its XGBoost importance. A list (rather than a bare
# zip iterator) can be read more than once, so the next cell still works when it
# is re-run.
important_features = list(zip(X_validation.columns, xgb_model.feature_importances_))

In [None]:
importances_df = pd.DataFrame(important_features)

In [None]:
importances_df = importances_df.rename(columns={0: 'Feature', 1: 'Importance'})

In [None]:
importances_df = importances_df.set_index('Feature')

In [None]:
importances_df = importances_df.sort_values(by='Importance', ascending=False)

In [None]:
importances_df

In [None]:
# Horizontal bar chart of the ten highest-ranked features (the frame is already
# sorted by importance, descending).
top_ten_features = importances_df.iloc[:10]
top_ten_features.plot(kind='barh', color='green', title='Feature Importance', legend=True)

## 4.3.1.) Hyperparameter Tuning for KNN

In [None]:
# Exhaustive 10-fold grid search over three KNN settings.
# NOTE(review): 49 x 29 x 2 combinations at 10 folds each is a very large number
# of fits; per the scikit-learn docs leaf_size influences tree build/query speed
# rather than which neighbours are returned - confirm it needs to be searched.
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]

hyperparameters = {'leaf_size': leaf_size, 'n_neighbors': n_neighbors, 'p': p}

# Fresh estimator for the search
knn_2 = KNeighborsClassifier()
clf = GridSearchCV(knn_2, hyperparameters, cv=10)

# fit() returns the fitted search object
best_model = clf.fit(X_train_scaled, y_train)

# Report the winning value of each searched parameter
winning_params = best_model.best_estimator_.get_params()
for param_name in ('leaf_size', 'p', 'n_neighbors'):
    print('Best ' + param_name + ':', winning_params[param_name])

## 4.3.2.) Implementing Tuned Hyperparameters for KNN

In [None]:
# Fit KNN with the chosen settings and score it on the validation set: accuracy,
# weighted precision / recall / F1, the per-class report and the confusion matrix.

kn_model = KNeighborsClassifier(n_neighbors=10, leaf_size=1, p=2)
kn_model.fit(X_train_scaled, y_train)

predictions = kn_model.predict(X_validation_scaled)

print(accuracy_score(y_validation, predictions))

weighted_metrics = (
    ('Weighted Precision', precision_score),
    ('Weighted Recall', recall_score),
    ('Weighted F1-score', f1_score),
)
for metric_label, metric_function in weighted_metrics:
    value = metric_function(y_validation, predictions, average='weighted')
    print('{}: {:.2f}'.format(metric_label, value))

print(classification_report(y_validation, predictions, target_names=["Sell", "Pass", "Buy"])) #, "Buy", "Strong Buy", "Very High Return"]))

print(confusion_matrix(y_validation, predictions))

In [None]:
#Visualization of the Confusion Matrix

# Label the axes with every class seen in either vector; using only the classes
# in y_validation breaks the frame when the model predicts a class that the
# validation window does not contain.
class_labels = np.union1d(y_validation, predictions)
df_matrix = pd.DataFrame(
    confusion_matrix(y_validation, predictions, labels=class_labels),
    columns=class_labels,
    index=class_labels,
)
df_matrix.index.name = 'Actual'
df_matrix.columns.name = 'Predicted'

# fmt="d" prints whole counts instead of the default two-significant-digit format.
sns.heatmap(df_matrix, cmap="Blues", annot=True, fmt="d", annot_kws={"size": 16});

# __APPENDIX__

## Extra Models to Test

In [None]:
#et_model = ExtraTreesClassifier()
#et_model.fit(X_train_scaled, y_train)

In [None]:
#ab_model = AdaBoostClassifier()
#ab_model.fit(X_train_scaled, y_train)

In [None]:
#gb_model = GradientBoostingClassifier()
#gb_model.fit(X_train_scaled, y_train)