# __Predicting Outcomes of Call Option Contracts: *Validation Data Preparation Only*__

# SECTION 1: Preparation

##  1.1.) Loading the python packages

In [1]:

from sklearn import datasets

# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
import random

# Load libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot
from pandas import read_csv, set_option
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, GridSearchCV
from evolutionary_search import EvolutionaryAlgorithmSearchCV

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
#from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
#from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier

#importing classification_report, confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix

#importing accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

#from sklearn.metrics import balanced_accuracy_score
#from sklearn.metrics import confusion_matrix
#from imblearn.metrics import classification_report_imbalanced

#Libraries for Saving the Model
from pickle import dump
from pickle import load

import warnings
# Install a blanket 'ignore' filter: no category, message or module is given,
# so every warning is suppressed for the rest of the session.
warnings.filterwarnings('ignore')



## 1.2.) Loading the Data

In [2]:
# Load the call-option quotes that serve as the validation set.
# Available input files under ../Resources:
#   test.csv      -> Q1 2022 only
#   test_2qs.csv  -> Q4 2021 + Q1 2022
df = pd.read_csv(Path("../Resources/test_2qs.csv"))

In [3]:
# Preview the first rows to confirm the CSV parsed into the expected columns.
df.head()

Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,...,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%,VIX PRICE
0,2021-11-29 14:00:00,477.5 2021-12-03,466.25,2021-12-03,4.08,0.03813,0.01323,0.04436,-0.04721,0.00247,...,137,0.09,477.5,0.024,11.25,0.01,-88.888889,0.0,0.9,22.06
1,2021-11-29 14:00:00,478.0 2021-12-03,466.25,2021-12-03,4.08,0.03425,0.01197,0.04012,-0.04335,0.00231,...,1218,0.08,478.0,0.025,11.75,0.02,-75.0,0.0,0.9,22.06
2,2021-11-29 14:00:00,479.0 2021-12-03,466.25,2021-12-03,4.08,0.02334,0.00877,0.02989,-0.03026,0.00188,...,392,0.05,479.0,0.027,12.75,0.02,-60.0,0.0,0.9,22.06
3,2021-11-29 14:30:00,462.0 2021-12-03,465.93,2021-12-03,4.06,0.66737,0.03894,0.19384,-0.36992,0.04046,...,3299,6.0,462.0,0.008,-3.93,0.53,-91.166667,0.0,0.9,22.18
4,2021-11-29 14:30:00,462.5 2021-12-03,465.93,2021-12-03,4.06,0.64929,0.04032,0.19768,-0.37281,0.03856,...,3272,5.59,462.5,0.007,-3.43,0.45,-91.949911,0.0,0.9,22.18


# SECTION 2: Exploratory Data Analysis

## 2.1.) Descriptive Statistics

In [4]:
# (row count, column count) of the frame as loaded.
df.shape

(3657, 21)

In [5]:
# Report whether any cell in the frame is missing.
print('Null Values =', df.isnull().values.any())

# Discard incomplete rows. DataFrame.dropna() returns a NEW frame and leaves the
# original untouched, so the result has to be assigned back for the rows to
# actually be removed from `df`.
df = df.dropna()

# Keep the cleaned frame as the cell's last expression so it is still displayed.
df

Null Values = False


Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,...,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%,VIX PRICE
0,2021-11-29 14:00:00,477.5 2021-12-03,466.25,2021-12-03,4.08,0.03813,0.01323,0.04436,-0.04721,0.00247,...,137,0.09,477.5,0.024,11.25,0.01,-88.888889,0.0,0.9,22.06
1,2021-11-29 14:00:00,478.0 2021-12-03,466.25,2021-12-03,4.08,0.03425,0.01197,0.04012,-0.04335,0.00231,...,1218,0.08,478.0,0.025,11.75,0.02,-75.000000,0.0,0.9,22.06
2,2021-11-29 14:00:00,479.0 2021-12-03,466.25,2021-12-03,4.08,0.02334,0.00877,0.02989,-0.03026,0.00188,...,392,0.05,479.0,0.027,12.75,0.02,-60.000000,0.0,0.9,22.06
3,2021-11-29 14:30:00,462.0 2021-12-03,465.93,2021-12-03,4.06,0.66737,0.03894,0.19384,-0.36992,0.04046,...,3299,6.00,462.0,0.008,-3.93,0.53,-91.166667,0.0,0.9,22.18
4,2021-11-29 14:30:00,462.5 2021-12-03,465.93,2021-12-03,4.06,0.64929,0.04032,0.19768,-0.37281,0.03856,...,3272,5.59,462.5,0.007,-3.43,0.45,-91.949911,0.0,0.9,22.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3652,2022-01-05 14:30:00,490.0 2022-01-07,473.41,1/7/2022,2.06,0.00280,0.00153,0.00322,-0.00421,-0.00022,...,1914,0.01,490.0,0.035,16.59,0.02,100.000000,2.0,0.8,18.23
3653,2022-01-05 14:30:00,491.0 2022-01-07,473.41,1/7/2022,2.06,0.00497,0.00210,0.00546,-0.00950,-0.00016,...,38,0.02,491.0,0.037,17.59,0.01,-50.000000,0.0,0.8,18.23
3654,2022-01-05 15:00:00,474.0 2022-01-07,471.69,1/7/2022,2.04,0.34400,0.06538,0.15074,-0.40846,0.01179,...,9952,1.28,474.0,0.005,2.31,0.55,-57.031250,0.0,0.8,18.47
3655,2022-01-05 15:00:00,475.0 2022-01-07,471.69,1/7/2022,2.04,0.27673,0.06096,0.13782,-0.35256,0.00969,...,18824,0.94,475.0,0.007,3.31,0.39,-58.510638,0.0,0.8,18.47


In [6]:
# Show both ends of the frame: display() renders the head explicitly, and the
# tail is rendered as the cell's final expression.
display(df.head())
df.tail()

Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,...,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%,VIX PRICE
0,2021-11-29 14:00:00,477.5 2021-12-03,466.25,2021-12-03,4.08,0.03813,0.01323,0.04436,-0.04721,0.00247,...,137,0.09,477.5,0.024,11.25,0.01,-88.888889,0.0,0.9,22.06
1,2021-11-29 14:00:00,478.0 2021-12-03,466.25,2021-12-03,4.08,0.03425,0.01197,0.04012,-0.04335,0.00231,...,1218,0.08,478.0,0.025,11.75,0.02,-75.0,0.0,0.9,22.06
2,2021-11-29 14:00:00,479.0 2021-12-03,466.25,2021-12-03,4.08,0.02334,0.00877,0.02989,-0.03026,0.00188,...,392,0.05,479.0,0.027,12.75,0.02,-60.0,0.0,0.9,22.06
3,2021-11-29 14:30:00,462.0 2021-12-03,465.93,2021-12-03,4.06,0.66737,0.03894,0.19384,-0.36992,0.04046,...,3299,6.0,462.0,0.008,-3.93,0.53,-91.166667,0.0,0.9,22.18
4,2021-11-29 14:30:00,462.5 2021-12-03,465.93,2021-12-03,4.06,0.64929,0.04032,0.19768,-0.37281,0.03856,...,3272,5.59,462.5,0.007,-3.43,0.45,-91.949911,0.0,0.9,22.18


Unnamed: 0,QUOTE_READTIME,CONTRACT,SPY PRICE,EXPIRE_DATE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,...,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%,VIX PRICE
3652,2022-01-05 14:30:00,490.0 2022-01-07,473.41,1/7/2022,2.06,0.0028,0.00153,0.00322,-0.00421,-0.00022,...,1914,0.01,490.0,0.035,16.59,0.02,100.0,2.0,0.8,18.23
3653,2022-01-05 14:30:00,491.0 2022-01-07,473.41,1/7/2022,2.06,0.00497,0.0021,0.00546,-0.0095,-0.00016,...,38,0.02,491.0,0.037,17.59,0.01,-50.0,0.0,0.8,18.23
3654,2022-01-05 15:00:00,474.0 2022-01-07,471.69,1/7/2022,2.04,0.344,0.06538,0.15074,-0.40846,0.01179,...,9952,1.28,474.0,0.005,2.31,0.55,-57.03125,0.0,0.8,18.47
3655,2022-01-05 15:00:00,475.0 2022-01-07,471.69,1/7/2022,2.04,0.27673,0.06096,0.13782,-0.35256,0.00969,...,18824,0.94,475.0,0.007,3.31,0.39,-58.510638,0.0,0.8,18.47
3656,2022-01-05 15:00:00,476.0 2022-01-07,471.69,1/7/2022,2.04,0.21498,0.05423,0.11988,-0.29226,0.00813,...,22826,0.65,476.0,0.009,4.31,0.29,-55.384615,0.0,0.8,18.47


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,SPY PRICE,DTE,C_DELTA,C_GAMMA,C_VEGA,C_THETA,C_RHO,C_IV,C_VOLUME,C_LAST,STRIKE,STRIKE_DISTANCE_PCT,STRIKE DISTANCE,PRICECLOSE,ROI %,y,INFLATION%,VIX PRICE
count,3657.0,3657.0,3657.0,3657.0,3657.0,3657.0,3657.0,3657.0,3657.0,3657.0,3657.0,3657.0,3657.0,3657.0,3657.0,3657.0,3657.0,3657.0
mean,468.636637,3.088154,0.294222,0.03273,0.106175,-0.202349,0.013228,0.15166,6188.77878,2.107616,474.041427,0.016972,5.404791,2.312324,7.943296,0.77878,0.839377,21.152024
std,7.465659,0.801777,0.284381,0.022394,0.067399,0.147695,0.012521,0.039551,11387.678346,3.057081,8.720731,0.012229,8.070282,3.837292,109.173926,0.792476,0.048865,3.755845
min,450.44,2.0,0.00206,0.00086,0.00278,-0.62247,-0.00035,0.08913,0.0,0.01,454.0,0.0,-15.44,0.01,-95.69378,0.0,0.8,16.52
25%,462.05,2.21,0.0391,0.01272,0.04019,-0.30679,0.00186,0.11435,525.0,0.1,468.0,0.007,-0.27,0.05,-73.096447,0.0,0.8,17.65
50%,468.13,3.12,0.18646,0.03057,0.10736,-0.20047,0.00896,0.14593,2240.0,0.68,475.0,0.015,5.58,0.29,-11.693548,1.0,0.8,20.8
75%,476.65,4.02,0.51453,0.04996,0.16528,-0.05839,0.02287,0.18199,6849.0,2.9,481.0,0.025,11.07,3.03,18.811881,1.0,0.9,22.93
max,479.65,4.27,0.95766,0.10078,0.22191,-0.00371,0.04454,0.2714,120973.0,15.15,491.0,0.063,28.56,15.97,476.315789,2.0,0.9,32.17


In [8]:
# Per-column dtypes; useful for spotting non-numeric ('object') columns.
df.dtypes

QUOTE_READTIME          object
CONTRACT                object
SPY PRICE              float64
EXPIRE_DATE             object
DTE                    float64
C_DELTA                float64
C_GAMMA                float64
C_VEGA                 float64
C_THETA                float64
C_RHO                  float64
C_IV                   float64
C_VOLUME                 int64
C_LAST                 float64
STRIKE                 float64
STRIKE_DISTANCE_PCT    float64
STRIKE DISTANCE        float64
PRICECLOSE             float64
ROI %                  float64
y                      float64
INFLATION%             float64
VIX PRICE              float64
dtype: object

## 2.2.) Feature Analysis and Exploration

### Eliminate Uncorrelated Features

In [9]:
# Absolute Pearson correlation of every numeric feature with the target 'y'.
#
# The frame still contains string columns at this point (QUOTE_READTIME,
# CONTRACT, EXPIRE_DATE). DataFrame.corr() no longer skips non-numeric columns
# silently in pandas >= 2.0 and raises instead, so restrict the frame to its
# numeric columns explicitly; this gives the same matrix on older versions too.
correlation = df.select_dtypes(include="number").corr()

# Only the strength of the relationship matters here, not its sign.
correlation_df = correlation["y"].abs()

In [10]:
# Rank features from strongest to weakest absolute correlation with 'y'.
correlation_df.sort_values(ascending=False)

y                      1.000000
ROI %                  0.841604
C_IV                   0.481310
PRICECLOSE             0.401209
SPY PRICE              0.325585
STRIKE                 0.305364
VIX PRICE              0.279853
C_LAST                 0.172761
C_THETA                0.146722
C_GAMMA                0.122783
C_RHO                  0.119542
C_DELTA                0.119378
STRIKE_DISTANCE_PCT    0.092771
DTE                    0.057557
C_VEGA                 0.038600
STRIKE DISTANCE        0.028783
C_VOLUME               0.013905
INFLATION%             0.012401
Name: y, dtype: float64

In [11]:
#Drop variables with less than 1.5% absolute correlation (threshold 0.015) with contract outcome ('y') -- step currently disabled

#drop_list_corr = sorted(list(correlation_df[correlation_df < 0.015].index))
#print(drop_list_corr)

### Drop Columns Not Needed for Machine Learning

In [12]:
# Remove the quote timestamp column; rebinding `df` avoids in-place mutation.
df = df.drop(columns=["QUOTE_READTIME"])

In [13]:
# Remove 'ROI %'.
# NOTE(review): presumably excluded because it is derived from the contract
# outcome (it shows ~0.84 absolute correlation with 'y') -- confirm.
df = df.drop(columns=["ROI %"])

In [14]:
# Remove 'PRICECLOSE'.
# NOTE(review): presumably a post-outcome value that would leak the label -- confirm.
df = df.drop(columns=["PRICECLOSE"])

In [15]:
# Remove the expiry-date column (stored as strings, see the dtypes listing).
df = df.drop(columns=["EXPIRE_DATE"])

In [16]:
# Remove the contract identifier column (stored as strings, see the dtypes listing).
df = df.drop(columns=["CONTRACT"])

In [17]:
# Persist the prepared validation frame for the modelling notebook.
# The row index is not written to the file.
df.to_csv("../Resources/2qs_test_ready.csv", index=False)

# __*STOP HERE AND CONTINUE TO "forests_no_random_splits.ipynb"*__