# __Predicting Outcomes of Call Option Contracts: *Validation Data Preparation Only*__

# SECTION 1: Preparation

## 1.1.) Loading the Python Packages

In [1]:

from sklearn import datasets

# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
import random

# Load libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot
from pandas import read_csv, set_option
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, GridSearchCV
from evolutionary_search import EvolutionaryAlgorithmSearchCV

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier

#importing classification_report, confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix

#importing accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

#from sklearn.metrics import balanced_accuracy_score
#from sklearn.metrics import confusion_matrix
#from imblearn.metrics import classification_report_imbalanced

#Libraries for Saving the Model
from pickle import dump
from pickle import load

import warnings
warnings.filterwarnings('ignore')



## 1.2.) Loading the Data

In [2]:
#Load the call-option quotes that will be prepared as validation data.
#Input file choice:
#  test.csv     -> just Q1 2022
#  test_2qs.csv -> Q4 21 + Q1 22
df = pd.read_csv(Path("../Resources/test_2qs.csv"))

In [3]:
#Preview the first five rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,...,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%,VIX PRICE
0,2021-11-29 14:00:00,477.5 2021-12-03,466.25,2021-12-03,4.08,0.03813,0.01323,0.04436,-0.04721,0.00247,...,137,0.09,477.5,0.024,11.25,0.01,-88.888889,0.0,0.9,22.06
1,2021-11-29 14:00:00,478.0 2021-12-03,466.25,2021-12-03,4.08,0.03425,0.01197,0.04012,-0.04335,0.00231,...,1218,0.08,478.0,0.025,11.75,0.02,-75.0,0.0,0.9,22.06
2,2021-11-29 14:00:00,479.0 2021-12-03,466.25,2021-12-03,4.08,0.02334,0.00877,0.02989,-0.03026,0.00188,...,392,0.05,479.0,0.027,12.75,0.02,-60.0,0.0,0.9,22.06
3,2021-11-29 14:30:00,462.0 2021-12-03,465.93,2021-12-03,4.06,0.66737,0.03894,0.19384,-0.36992,0.04046,...,3299,6.0,462.0,0.008,-3.93,0.53,-91.166667,0.0,0.9,22.18
4,2021-11-29 14:30:00,462.5 2021-12-03,465.93,2021-12-03,4.06,0.64929,0.04032,0.19768,-0.37281,0.03856,...,3272,5.59,462.5,0.007,-3.43,0.45,-91.949911,0.0,0.9,22.18


# SECTION 2: Exploratory Data Analysis

## 2.1.) Descriptive Statistics

In [4]:
#Dimensions of the loaded frame as (rows, columns).
df.shape

(2952, 21)

In [5]:
#Check for any null values and remove the null values
print('Null Values =', df.isnull().values.any())

#Drop NaNs. dropna() returns a new frame and leaves df untouched, so the
#result must be assigned back for the rows to actually be removed.
df = df.dropna()

#Show the cleaned frame as the cell output.
df

Null Values = False


Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,...,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%,VIX PRICE
0,2021-11-29 14:00:00,477.5 2021-12-03,466.25,2021-12-03,4.08,0.03813,0.01323,0.04436,-0.04721,0.00247,...,137,0.09,477.5,0.024,11.25,0.01,-88.888889,0.0,0.9,22.06
1,2021-11-29 14:00:00,478.0 2021-12-03,466.25,2021-12-03,4.08,0.03425,0.01197,0.04012,-0.04335,0.00231,...,1218,0.08,478.0,0.025,11.75,0.02,-75.000000,0.0,0.9,22.06
2,2021-11-29 14:00:00,479.0 2021-12-03,466.25,2021-12-03,4.08,0.02334,0.00877,0.02989,-0.03026,0.00188,...,392,0.05,479.0,0.027,12.75,0.02,-60.000000,0.0,0.9,22.06
3,2021-11-29 14:30:00,462.0 2021-12-03,465.93,2021-12-03,4.06,0.66737,0.03894,0.19384,-0.36992,0.04046,...,3299,6.00,462.0,0.008,-3.93,0.53,-91.166667,0.0,0.9,22.18
4,2021-11-29 14:30:00,462.5 2021-12-03,465.93,2021-12-03,4.06,0.64929,0.04032,0.19768,-0.37281,0.03856,...,3272,5.59,462.5,0.007,-3.43,0.45,-91.949911,0.0,0.9,22.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2947,2021-12-29 16:00:00,483.0 2021-12-31,477.48,2021-12-31,2.00,0.07746,0.03704,0.05983,-0.08835,0.00294,...,10517,0.14,483.0,0.012,5.52,0.14,0.000000,1.0,0.8,17.04
2948,2021-12-29 16:00:00,484.0 2021-12-31,477.48,2021-12-31,2.00,0.05140,0.02616,0.04388,-0.06105,0.00162,...,6389,0.09,484.0,0.014,6.52,0.09,0.000000,1.0,0.8,17.04
2949,2021-12-29 16:00:00,485.0 2021-12-31,477.48,2021-12-31,2.00,0.03414,0.01829,0.03162,-0.04283,0.00111,...,14034,0.06,485.0,0.016,7.52,0.06,0.000000,1.0,0.8,17.04
2950,2021-12-29 16:00:00,486.0 2021-12-31,477.48,2021-12-31,2.00,0.02645,0.01408,0.02574,-0.03567,0.00078,...,3114,0.05,486.0,0.018,8.52,0.05,0.000000,1.0,0.8,17.04


In [6]:
#Show both ends of the frame. head() goes through display() because only the
#last expression of a cell is rendered automatically; tail() is that last expression.
display(df.head())
df.tail()

Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,...,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%,VIX PRICE
0,2021-11-29 14:00:00,477.5 2021-12-03,466.25,2021-12-03,4.08,0.03813,0.01323,0.04436,-0.04721,0.00247,...,137,0.09,477.5,0.024,11.25,0.01,-88.888889,0.0,0.9,22.06
1,2021-11-29 14:00:00,478.0 2021-12-03,466.25,2021-12-03,4.08,0.03425,0.01197,0.04012,-0.04335,0.00231,...,1218,0.08,478.0,0.025,11.75,0.02,-75.0,0.0,0.9,22.06
2,2021-11-29 14:00:00,479.0 2021-12-03,466.25,2021-12-03,4.08,0.02334,0.00877,0.02989,-0.03026,0.00188,...,392,0.05,479.0,0.027,12.75,0.02,-60.0,0.0,0.9,22.06
3,2021-11-29 14:30:00,462.0 2021-12-03,465.93,2021-12-03,4.06,0.66737,0.03894,0.19384,-0.36992,0.04046,...,3299,6.0,462.0,0.008,-3.93,0.53,-91.166667,0.0,0.9,22.18
4,2021-11-29 14:30:00,462.5 2021-12-03,465.93,2021-12-03,4.06,0.64929,0.04032,0.19768,-0.37281,0.03856,...,3272,5.59,462.5,0.007,-3.43,0.45,-91.949911,0.0,0.9,22.18


Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,...,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%,VIX PRICE
2947,2021-12-29 16:00:00,483.0 2021-12-31,477.48,2021-12-31,2.0,0.07746,0.03704,0.05983,-0.08835,0.00294,...,10517,0.14,483.0,0.012,5.52,0.14,0.0,1.0,0.8,17.04
2948,2021-12-29 16:00:00,484.0 2021-12-31,477.48,2021-12-31,2.0,0.0514,0.02616,0.04388,-0.06105,0.00162,...,6389,0.09,484.0,0.014,6.52,0.09,0.0,1.0,0.8,17.04
2949,2021-12-29 16:00:00,485.0 2021-12-31,477.48,2021-12-31,2.0,0.03414,0.01829,0.03162,-0.04283,0.00111,...,14034,0.06,485.0,0.016,7.52,0.06,0.0,1.0,0.8,17.04
2950,2021-12-29 16:00:00,486.0 2021-12-31,477.48,2021-12-31,2.0,0.02645,0.01408,0.02574,-0.03567,0.00078,...,3114,0.05,486.0,0.018,8.52,0.05,0.0,1.0,0.8,17.04
2951,2021-12-29 16:00:00,487.0 2021-12-31,477.48,2021-12-31,2.0,0.02102,0.01072,0.02131,-0.0287,0.00111,...,2626,0.03,487.0,0.02,9.52,0.03,0.0,1.0,0.8,17.04


In [7]:
#Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,SPY PRICE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%,VIX PRICE
count,2952.0,2952.0,2952.0,2952.0,2952.0,2952.0,2952.0,2952.0,2952.0,2952.0,2952.0,2952.0,2952.0,2952.0,2952.0,2952.0,2952.0,2952.0
mean,466.678892,3.057683,0.30999,0.031292,0.106347,-0.214649,0.01371,0.161425,6797.763211,2.365999,472.028963,0.017851,5.350071,2.836396,27.047099,0.941057,0.84878,22.109743
std,6.985508,0.801285,0.293825,0.021203,0.065914,0.15109,0.012774,0.03766,12314.674282,3.287417,8.169684,0.012661,8.597976,4.10026,112.351017,0.782679,0.049994,3.561312
min,450.44,2.0,0.00206,0.00086,0.00278,-0.62247,-0.00035,0.09111,0.0,0.01,454.0,0.0,-15.44,0.01,-95.69378,0.0,0.8,17.03
25%,461.3725,2.19,0.043925,0.012985,0.043245,-0.320823,0.002058,0.132928,542.0,0.12,467.0,0.007,-0.7925,0.07,-50.0,0.0,0.8,18.8
50%,467.08,3.1,0.200655,0.02971,0.10737,-0.215385,0.009645,0.16314,2453.5,0.78,472.0,0.016,5.58,0.57,0.0,1.0,0.8,21.99
75%,472.36,4.02,0.544025,0.04566,0.163953,-0.067088,0.02326,0.187713,7427.5,3.37,478.0,0.026,11.44,4.0,39.698427,2.0,0.9,23.72
max,478.5,4.27,0.95766,0.10078,0.22009,-0.00446,0.04454,0.2714,120973.0,15.15,487.0,0.063,28.56,15.97,476.315789,2.0,0.9,32.17


In [8]:
#Per-column dtypes; columns reported as object are the non-numeric ones.
df.dtypes

QUOTE_READTIME          object
CONTRACT                object
SPY PRICE              float64
EXPIRE_DATE             object
DTE                    float64
C_DELTA                float64
C_GAMMA                float64
C_VEGA                 float64
C_THETA                float64
C_RHO                  float64
C_IV                   float64
C_VOLUME                 int64
C_LAST                 float64
STRIKE                 float64
STRIKE_DISTANCE_PCT    float64
STRIKE DISTANCE        float64
PRICECLOSE             float64
ROI %                  float64
y                      float64
INFLATION%             float64
VIX PRICE              float64
dtype: object

## 2.2.) Feature Analysis and Exploration

### Eliminate Uncorrelated Features

In [9]:
#Calculate the absolute correlation of each numeric feature with 'y'

#Only numeric columns can be correlated. Selecting them explicitly keeps this
#cell working on pandas >= 2.0, where DataFrame.corr() no longer skips the
#object columns (QUOTE_READTIME, CONTRACT, EXPIRE_DATE) and raises instead.
correlation = df.select_dtypes(include="number").corr()

#Series indexed by column name; the sign is discarded because only the
#strength of the relationship is used for ranking.
correlation_df = correlation['y'].abs()

In [10]:
#Rank the columns from most to least correlated (in absolute value) with 'y'.
correlation_df.sort_values(ascending=False)

y                      1.000000
ROI %                  0.815206
C_IV                   0.350567
PRICECLOSE             0.334246
INFLATION%             0.187168
STRIKE                 0.171492
SPY PRICE              0.133385
C_RHO                  0.132147
C_LAST                 0.126304
C_THETA                0.120636
C_DELTA                0.110018
C_VEGA                 0.091008
VIX PRICE              0.080651
C_VOLUME               0.061823
STRIKE DISTANCE        0.054580
C_GAMMA                0.039419
DTE                    0.011203
STRIKE_DISTANCE_PCT    0.001680
Name: y, dtype: float64

In [11]:
#Drop variables with less than 1.5% absolute correlation with contract outcome ('y'), i.e. the 0.015 threshold used below (currently disabled)

#drop_list_corr = sorted(list(correlation_df[correlation_df < 0.015].index))
#print(drop_list_corr)

In [12]:
#Drop C_IV because it distorts the outcome for a reason that is not yet understood, possibly a bug (currently disabled)

#df.drop(labels=["C_IV"], axis=1, inplace=True)

### Drop Columns Not Needed for Machine Learning

In [13]:
#Columns that must not reach the model, removed in a single step instead of
#five copy-pasted cells:
#  QUOTE_READTIME, EXPIRE_DATE, CONTRACT - object-typed columns (see df.dtypes)
#  ROI %, PRICECLOSE - NOTE(review): presumably known only after the contract
#                      outcome, so they would leak the label 'y' - confirm
NON_FEATURE_COLUMNS = ["QUOTE_READTIME", "ROI %", "PRICECLOSE", "EXPIRE_DATE", "CONTRACT"]

#drop() returns a new frame, which is bound back to df for the cells below.
#errors="ignore" makes the cell safe to re-run: the in-place version raised
#KeyError on a second execution because the columns were already gone.
df = df.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")

In [18]:
#Write the prepared frame to disk without the row index. index is a boolean
#option; the previous index=None only worked because None is falsy.
df.to_csv("../Resources/2qs_test_ready.csv", index=False)

In [None]:
#df.to_csv("../Resources/bull_test_ready.csv", index=None)

# __*STOP HERE AND CONTINUE TO "forests_no_random_splits.ipynb"*__

In [None]:
# Scale the features with a StandardScaler fitted on the training data only,
# then apply that same fitted scaler to both the training and validation features.
# NOTE(review): X_train and X_validation are not defined in the visible part of
# this notebook - confirm where they are expected to come from.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_validation_scaled = (
    X_scaler.transform(features) for features in (X_train, X_validation)
)

## 3.2.) Quick Check of Models and Algorithms

In [None]:
#Spot check Classification algorithms
#Each entry is a (label, estimator) pair built with the library's default settings.

models = [
    #Boosting methods
    ('XGB', XGBClassifier()),
    ('AB', AdaBoostClassifier()),
    ('GBM', GradientBoostingClassifier()),
    #Bagging methods
    ('RF', RandomForestClassifier()),
    ('ET', ExtraTreesClassifier()),
    #SVC
    ('SVC', SVC()),
]

In [None]:
#Test options for classification:
#number of cross-validation folds and the seed used when shuffling them.

num_folds, seed = 10, 7

In [None]:
#Cross-validate every candidate model on the scaled training data and print
#one line per model in the form "<name>: <mean score> (<std of scores>)".

scoring = 'accuracy'
results, names = [], []

for name, model in models:
    #Shuffled stratified folds, rebuilt with the same seed for every model.
    kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model, X_train_scaled, y_train, scoring=scoring, cv=kfold)

    #Keep the per-fold scores and labels aligned for the comparison plot.
    names.append(name)
    results.append(cv_results)

    findings_summary = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(findings_summary)

In [None]:
#Compare the model outcomes: one box per model, drawn from its per-fold scores.

fig, ax = pyplot.subplots(figsize=(8, 4))
fig.suptitle('Model Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

## 3.3.) Selecting Random Forest (or whichever tree method is best) and Calculating Its Baseline

In [None]:
#Estimate accuracy on training set

#Seed the forest so the baseline is reproducible: without random_state the
#trees are grown differently on every run and the reported score drifts,
#even though the fold split itself is already seeded.
rf_model = RandomForestClassifier(random_state=seed)

cv = StratifiedKFold(n_splits=num_folds, random_state=seed, shuffle=True)
scores = cross_val_score(rf_model, X_train_scaled, y_train, scoring='accuracy', cv=cv)

#Mean accuracy across the folds, shown as the cell output.
avg_score = np.mean(scores)
avg_score

In [None]:
#Estimate accuracy on validation set

#Seeded so the fitted forest, and therefore every metric printed below,
#is the same on each run.
rf_model = RandomForestClassifier(random_state=seed)

rf_model.fit(X_train_scaled, y_train)

predictions = rf_model.predict(X_validation_scaled)

#Overall accuracy, followed by precision / recall / F1 using average='weighted'.
print(accuracy_score(y_validation, predictions))

print('Weighted Precision: {:.2f}'.format(precision_score(y_validation, predictions, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_validation, predictions, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_validation, predictions, average='weighted')))

#NOTE(review): six class names are supplied, but the 'y' column summarised
#earlier in this notebook only ranges over 0-2. classification_report raises
#when the number of names differs from the number of classes present -
#confirm that y_validation really contains six classes.
print(classification_report(y_validation, predictions, target_names=["Strong Sell", "Sell", "Pass", "Buy", "Strong Buy", "Very High Return"]))

print(confusion_matrix(y_validation, predictions))

In [None]:
#Visualization of the Confusion Matrix

#Label the axes with every class seen in either the true labels or the
#predictions. confusion_matrix sizes its output from that union, so labelling
#with np.unique(y_validation) alone fails with a shape mismatch whenever the
#model predicts a class that never occurs in y_validation.
class_labels = np.union1d(y_validation, predictions)

df_matrix = pd.DataFrame(
    confusion_matrix(y_validation, predictions, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)
df_matrix.index.name = 'Actual'
df_matrix.columns.name = 'Predicted'

#fmt="d" prints the integer counts in full; the default ".2g" format
#abbreviates counts of 100 or more into scientific notation.
sns.heatmap(df_matrix, cmap="Blues", annot=True, fmt="d", annot_kws={"size": 16})