## Data cleasning and preprocessing of the option pricing data

In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import yfinance as yf


In [95]:
# Read in the data from Jan 2023 to June 2023
df_2023_h1 = pd.DataFrame()
for i in [202301, 202302, 202303, 202304,  202305]:
    df_2023_h1 = pd.concat([df_2023_h1, pd.read_table(f'data/spy_eod_{i}.txt', sep=',')], ignore_index=True)
df_2023_h1.columns = df_2023_h1.columns.str.strip()

  df_2023_h1 = pd.concat([df_2023_h1, pd.read_table(f'data/spy_eod_{i}.txt', sep=',')], ignore_index=True)


In [96]:
# also drop expiration date later than 2024
df_2023_h1 = df_2023_h1[df_2023_h1['[EXPIRE_DATE]'] <= ' 2023-12-31']
df_2023_h1 = df_2023_h1.reset_index()

In [97]:
# get target data from yfinance
target = pd.DataFrame(yf.download(['SPY'], start="2023-01-01", end="2023-12-31")['Adj Close'])
target

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2023-01-03,375.118683
2023-01-04,378.014709
2023-01-05,373.700256
2023-01-06,382.270020
2023-01-09,382.053284
...,...
2023-12-22,473.649994
2023-12-26,475.649994
2023-12-27,476.510010
2023-12-28,476.690002


In [98]:
df_2023_h1['[EXPIRE_DATE]'] = df_2023_h1['[EXPIRE_DATE]'].str.strip().astype('datetime64[ns]')
df_2023_h1['[EXPIRE_DATE]'].unique()

<DatetimeArray>
['2023-01-03 00:00:00', '2023-01-04 00:00:00', '2023-01-05 00:00:00',
 '2023-01-06 00:00:00', '2023-01-09 00:00:00', '2023-01-10 00:00:00',
 '2023-01-11 00:00:00', '2023-01-12 00:00:00', '2023-01-13 00:00:00',
 '2023-01-20 00:00:00',
 ...
 '2023-06-01 00:00:00', '2023-06-05 00:00:00', '2023-06-06 00:00:00',
 '2023-06-07 00:00:00', '2023-06-08 00:00:00', '2023-07-07 00:00:00',
 '2023-06-12 00:00:00', '2023-06-13 00:00:00', '2023-06-14 00:00:00',
 '2023-11-17 00:00:00']
Length: 125, dtype: datetime64[ns]

In [99]:
target['[EXPIRE_DATE]'] = target.index
target['[EXPIRE_DATE]'].astype('datetime64[ns]')

Date
2023-01-03   2023-01-03
2023-01-04   2023-01-04
2023-01-05   2023-01-05
2023-01-06   2023-01-06
2023-01-09   2023-01-09
                ...    
2023-12-22   2023-12-22
2023-12-26   2023-12-26
2023-12-27   2023-12-27
2023-12-28   2023-12-28
2023-12-29   2023-12-29
Name: [EXPIRE_DATE], Length: 250, dtype: datetime64[ns]

In [100]:
df_2023_h1 = pd.merge(df_2023_h1, target, on = '[EXPIRE_DATE]')

In [103]:
naive_price = df_2023_h1['Adj Close'] - df_2023_h1['[STRIKE]']

In [104]:
naive_price # this is the first thing we might test for 

0         65.118683
1         55.118683
2         51.118683
3         50.118683
4         49.118683
            ...    
331604    18.969696
331605    13.969696
331606     8.969696
331607     3.969696
331608    -1.030304
Length: 331609, dtype: float64

In [5]:
# ignoring underlying interest rate, lets just use direct profit comparison to try first
# we can use target price (real adj close price) - strike price as our to be predicted quote price. 
# still need to fit target into our original dataframe tho. 
# df_2023_h1.join(target, on= 'Date') # Date = Expire Date here. 

In [6]:
"""# Now do some feature engineering. First deal with moneyness. Moneyness, # probably change it to boolean? 
# by definition, refers to the strike price of an option relative to the current trading price of the underlying asset.
df_2023_h1['Moneyness'] = 'ATM'
df_2023_h1.loc[df_2023_h1['[STRIKE]'] < df_2023_h1['[UNDERLYING_LAST]'], 'Moneyness'] = 'ITM'
df_2023_h1.loc[df_2023_h1['[STRIKE]'] > df_2023_h1['[UNDERLYING_LAST]'], 'Moneyness'] = 'OTM'"""

"# Now do some feature engineering. First deal with moneyness. Moneyness, # probably change it to boolean? \n# by definition, refers to the strike price of an option relative to the current trading price of the underlying asset.\ndf_2023_h1['Moneyness'] = 'ATM'\ndf_2023_h1.loc[df_2023_h1['[STRIKE]'] < df_2023_h1['[UNDERLYING_LAST]'], 'Moneyness'] = 'ITM'\ndf_2023_h1.loc[df_2023_h1['[STRIKE]'] > df_2023_h1['[UNDERLYING_LAST]'], 'Moneyness'] = 'OTM'"

In [105]:
df_2023_h1.columns  # someone pls check this and fill it into the standard scalar

Index(['index', '[QUOTE_UNIXTIME]', '[QUOTE_READTIME]', '[QUOTE_DATE]',
       '[QUOTE_TIME_HOURS]', '[UNDERLYING_LAST]', '[EXPIRE_DATE]',
       '[EXPIRE_UNIX]', '[DTE]', '[C_DELTA]', '[C_GAMMA]', '[C_VEGA]',
       '[C_THETA]', '[C_RHO]', '[C_IV]', '[C_VOLUME]', '[C_LAST]', '[C_SIZE]',
       '[C_BID]', '[C_ASK]', '[STRIKE]', '[P_BID]', '[P_ASK]', '[P_SIZE]',
       '[P_LAST]', '[P_DELTA]', '[P_GAMMA]', '[P_VEGA]', '[P_THETA]',
       '[P_RHO]', '[P_IV]', '[P_VOLUME]', '[STRIKE_DISTANCE]',
       '[STRIKE_DISTANCE_PCT]', 'Adj Close'],
      dtype='object')

In [106]:
df_2023_h1 = df_2023_h1[['[QUOTE_UNIXTIME]', '[EXPIRE_UNIX]', '[STRIKE]', '[UNDERLYING_LAST]', '[C_DELTA]', '[C_GAMMA]', '[C_VEGA]',
       '[C_THETA]', '[C_RHO]', '[C_IV]', '[C_VOLUME]','[C_BID]', '[C_ASK]', '[P_DELTA]', '[P_GAMMA]', '[P_VEGA]', '[P_THETA]',
       '[P_RHO]', '[P_IV]', '[P_VOLUME]', '[P_BID]', '[P_ASK]']]
for column in df_2023_h1.columns:
    print(type(df_2023_h1[column][0]))

<class 'numpy.int64'>
<class 'numpy.int64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'str'>
<class 'str'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'str'>
<class 'str'>
<class 'numpy.float64'>
<class 'numpy.float64'>


In [107]:
df_2023_h1 = df_2023_h1.replace(r'^\s*$', 0, regex=True)

In [108]:
for column in ['[C_IV]', '[C_VOLUME]', '[P_IV]', '[P_VOLUME]']:
    df_2023_h1[column] = df_2023_h1[column].str.strip().astype('float64')
    print(type(df_2023_h1[column][0]))

<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>


In [109]:
df_2023_h1 = df_2023_h1.fillna(0)

In [110]:
# Basic normalization and standardization
# run block of code and catch warnings
import warnings
from sklearn.preprocessing import StandardScaler
with warnings.catch_warnings():
	# ignore all caught warnings
	warnings.filterwarnings("ignore")
	# execute code that will generate warnings
	numeric_cols = ['[QUOTE_UNIXTIME]', '[EXPIRE_UNIX]', '[STRIKE]', '[UNDERLYING_LAST]', '[C_DELTA]', '[C_GAMMA]', '[C_VEGA]',
       '[C_THETA]', '[C_RHO]', '[C_IV]', '[C_VOLUME]','[C_BID]', '[C_ASK]', '[P_DELTA]', '[P_GAMMA]', '[P_VEGA]', '[P_THETA]',
       '[P_RHO]', '[P_IV]', '[P_VOLUME]', '[P_BID]', '[P_ASK]']  # not sure about all this, we ball
	scaler = StandardScaler()
	df_2023_h1[numeric_cols] = scaler.fit_transform(df_2023_h1[numeric_cols])

In [111]:
df_2023_h1

Unnamed: 0,[QUOTE_UNIXTIME],[EXPIRE_UNIX],[STRIKE],[UNDERLYING_LAST],[C_DELTA],[C_GAMMA],[C_VEGA],[C_THETA],[C_RHO],[C_IV],...,[C_ASK],[P_DELTA],[P_GAMMA],[P_VEGA],[P_THETA],[P_RHO],[P_IV],[P_VOLUME],[P_BID],[P_ASK]
0,-1.69160,-1.531564,-1.054517,-2.406592,1.054125,-0.052714,0.000146,-0.391189,-0.307878,11.711458,...,0.676304,1.120791,-0.013192,0.001135,0.086874,0.592295,3.380989,-0.100406,-0.563368,-0.566211
1,-1.69160,-1.531564,-0.936052,-2.406592,1.041020,0.036229,0.000190,-0.390353,-0.308470,10.342762,...,0.479078,1.120350,-0.013031,0.001152,0.086843,0.592878,2.599981,-0.100406,-0.563368,-0.566211
2,-1.69160,-1.531564,-0.888665,-2.406592,1.035470,0.083539,0.000199,-0.393217,-0.308536,9.795964,...,0.400187,1.119196,-0.013111,0.001152,0.086442,0.592606,2.292638,-0.100406,-0.563368,-0.566211
3,-1.69160,-1.531564,-0.876819,-2.406592,1.048893,0.103725,0.000140,0.031159,-0.308217,9.553426,...,0.389734,1.120644,-0.013015,0.001157,0.086741,0.591011,2.214716,-0.100406,-0.563368,-0.566211
4,-1.69160,-1.531564,-0.864972,-2.406592,1.030873,0.108141,0.000225,-0.438792,-0.308574,9.529689,...,0.371195,1.119712,-0.013079,0.001120,0.086482,0.591167,2.139460,-0.100406,-0.563368,-0.566211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331604,1.78216,1.881977,0.367073,1.410238,-0.175447,0.125803,0.058474,0.064804,0.473889,-0.825637,...,-0.386156,-0.261286,0.003340,0.057390,0.064104,-2.657456,-0.483741,-0.100406,-0.191656,-0.130110
331605,1.78216,1.881977,0.426306,1.410238,-0.288188,0.141573,0.057498,0.121952,0.401497,-0.825637,...,-0.472936,-0.405104,0.004291,0.054827,0.067442,-2.681954,-0.504299,-0.100406,-0.142171,-0.085819
331606,1.78216,1.881977,0.485538,1.410238,-0.406236,0.147881,0.055627,0.179100,0.323910,-0.825637,...,-0.521651,-0.565709,0.005225,0.050409,0.071071,-2.573385,-0.528800,-0.100406,-0.090211,-0.033009
331607,1.78216,1.881977,0.544771,1.410238,-0.524528,0.142835,0.052804,0.249133,0.243663,-0.825637,...,-0.564252,-0.714509,0.005499,0.044680,0.074684,-2.371958,-0.536906,-0.100406,-0.024928,0.033049


In [14]:
# Extract features from datetime and encode categorical features
# Do we still use datetime ?? or just use unix 
"""df_2023_h1['[QUOTE_MONTH]'] = df_2023_h1['[QUOTE_DATE]'].dt.month
df_2023_h1['[EXPIRE_WEEKDAY]'] = df_2023_h1['[EXPIRE_DATE]'].dt.weekday
df_2023_h1=pd.get_dummies(df_2023_h1, columns = ['Moneyness'])"""

"df_2023_h1['[QUOTE_MONTH]'] = df_2023_h1['[QUOTE_DATE]'].dt.month\ndf_2023_h1['[EXPIRE_WEEKDAY]'] = df_2023_h1['[EXPIRE_DATE]'].dt.weekday\ndf_2023_h1=pd.get_dummies(df_2023_h1, columns = ['Moneyness'])"

In [15]:
# # Outlier detection
# num_col = df_2023_h1.select_dtypes(include=['int64', 'float64']).columns
# for i in num_col:
#     q75, q25 = np.percentile(df_2023_h1[i], [75 ,25])
#     iqr = q75 - q25
#     min = q25 - (iqr*1.5)
#     max = q75 + (iqr*1.5)
    

In [16]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

In [112]:
batch_size = 64

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

ValueError: too many values to unpack (expected 2)