## Data cleasning and preprocessing of the option pricing data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import yfinance as yf


In [2]:
# Read in the data from Jan 2023 to June 2023
df_2023_h1 = pd.DataFrame()
for i in [202301, 202302, 202303, 202304, 202305, 202306]:
    df_2023_h1 = pd.concat([df_2023_h1, pd.read_table(f'data/spy_eod_{i}.txt', sep=',')], ignore_index=True)
df_2023_h1.columns = df_2023_h1.columns.str.strip()

  df_2023_h1 = pd.concat([df_2023_h1, pd.read_table(f'data/spy_eod_{i}.txt', sep=',')], ignore_index=True)


In [3]:
# also drop expiration date later than 2024
df_2023_h1 = df_2023_h1[df_2023_h1['[EXPIRE_DATE]'] <= ' 2023-12-31']
df_2023_h1 = df_2023_h1.reset_index()

In [4]:
df_2023_h1

Unnamed: 0,index,[QUOTE_UNIXTIME],[QUOTE_READTIME],[QUOTE_DATE],[QUOTE_TIME_HOURS],[UNDERLYING_LAST],[EXPIRE_DATE],[EXPIRE_UNIX],[DTE],[C_DELTA],...,[P_LAST],[P_DELTA],[P_GAMMA],[P_VEGA],[P_THETA],[P_RHO],[P_IV],[P_VOLUME],[STRIKE_DISTANCE],[STRIKE_DISTANCE_PCT]
0,0,1672779600,2023-01-03 16:00,2023-01-03,16.0,380.82,2023-01-03,1672779600,0.00,0.96551,...,0.01,-0.00075,0.00015,0.00072,-0.00483,-0.00015,1.210050,0.000000,70.8,0.186
1,1,1672779600,2023-01-03 16:00,2023-01-03,16.0,380.82,2023-01-03,1672779600,0.00,0.96015,...,0.02,-0.00093,0.00025,0.00104,-0.00487,0.00000,0.996160,0.000000,60.8,0.160
2,2,1672779600,2023-01-03 16:00,2023-01-03,16.0,380.82,2023-01-03,1672779600,0.00,0.95788,...,0.00,-0.00140,0.00020,0.00105,-0.00538,-0.00007,0.911990,,56.8,0.149
3,3,1672779600,2023-01-03 16:00,2023-01-03,16.0,380.82,2023-01-03,1672779600,0.00,0.96337,...,0.02,-0.00081,0.00026,0.00115,-0.00500,-0.00048,0.890650,0.000000,55.8,0.147
4,4,1672779600,2023-01-03 16:00,2023-01-03,16.0,380.82,2023-01-03,1672779600,0.00,0.95600,...,0.01,-0.00119,0.00022,0.00043,-0.00533,-0.00044,0.870040,0.000000,54.8,0.144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
388675,481184,1688155200,2023-06-30 16:00,2023-06-30,16.0,443.23,2023-12-29,1703883600,182.04,0.05919,...,0.00,-1.00000,0.00000,0.00000,0.00000,0.00000,,,66.8,0.151
388676,481185,1688155200,2023-06-30 16:00,2023-06-30,16.0,443.23,2023-12-29,1703883600,182.04,0.04538,...,0.00,-1.00000,0.00000,0.00000,0.00000,0.00000,,,71.8,0.162
388677,481186,1688155200,2023-06-30 16:00,2023-06-30,16.0,443.23,2023-12-29,1703883600,182.04,0.03511,...,119.30,-1.00000,0.00000,0.00000,0.00000,0.00000,,0.000000,76.8,0.173
388678,481187,1688155200,2023-06-30 16:00,2023-06-30,16.0,443.23,2023-12-29,1703883600,182.04,0.02477,...,112.78,-1.00000,0.00000,0.00000,0.00000,0.00000,,0.000000,81.8,0.184


In [5]:
# get target data from yfinance
target = pd.DataFrame(yf.download(['SPY'], start="2023-01-01", end="2023-12-31")['Adj Close'])
target

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2023-01-03,375.118744
2023-01-04,378.014709
2023-01-05,373.700256
2023-01-06,382.269989
2023-01-09,382.053284
...,...
2023-12-22,473.649994
2023-12-26,475.649994
2023-12-27,476.510010
2023-12-28,476.690002


In [6]:
# ignoring underlying interest rate, lets just use direct profit comparison to try first
# we can use target price (real adj close price) - strike price as our to be predicted quote price. 
# still need to fit target into our original dataframe tho. 
# df_2023_h1.join(target, on= 'Date') # Date = Expire Date here. 

In [7]:
# convert date to datetime 
"""date_cols = ['[QUOTE_READTIME]', '[QUOTE_DATE]', '[EXPIRE_DATE]']
for col in date_cols:
    df_2023_h1[col] = pd.to_datetime(df_2023_h1[col])"""
# no need, we use unix time 

"date_cols = ['[QUOTE_READTIME]', '[QUOTE_DATE]', '[EXPIRE_DATE]']\nfor col in date_cols:\n    df_2023_h1[col] = pd.to_datetime(df_2023_h1[col])"

In [8]:
# Now do some feature engineering. First deal with moneyness. Moneyness, 
# by definition, refers to the strike price of an option relative to the current trading price of the underlying asset.
df_2023_h1['Moneyness'] = 'ATM'
df_2023_h1.loc[df_2023_h1['[STRIKE]'] < df_2023_h1['[UNDERLYING_LAST]'], 'Moneyness'] = 'ITM'
df_2023_h1.loc[df_2023_h1['[STRIKE]'] > df_2023_h1['[UNDERLYING_LAST]'], 'Moneyness'] = 'OTM'

In [9]:
df_2023_h1.columns  # someone pls check this and fill it into the standard scalar

Index(['index', '[QUOTE_UNIXTIME]', '[QUOTE_READTIME]', '[QUOTE_DATE]',
       '[QUOTE_TIME_HOURS]', '[UNDERLYING_LAST]', '[EXPIRE_DATE]',
       '[EXPIRE_UNIX]', '[DTE]', '[C_DELTA]', '[C_GAMMA]', '[C_VEGA]',
       '[C_THETA]', '[C_RHO]', '[C_IV]', '[C_VOLUME]', '[C_LAST]', '[C_SIZE]',
       '[C_BID]', '[C_ASK]', '[STRIKE]', '[P_BID]', '[P_ASK]', '[P_SIZE]',
       '[P_LAST]', '[P_DELTA]', '[P_GAMMA]', '[P_VEGA]', '[P_THETA]',
       '[P_RHO]', '[P_IV]', '[P_VOLUME]', '[STRIKE_DISTANCE]',
       '[STRIKE_DISTANCE_PCT]', 'Moneyness'],
      dtype='object')

In [10]:
# Basic normalization and standardization
# run block of code and catch warnings
import warnings
from sklearn.preprocessing import StandardScaler
with warnings.catch_warnings():
	# ignore all caught warnings
	warnings.filterwarnings("ignore")
	# execute code that will generate warnings
	scaler = StandardScaler()
	numeric_cols = ['[UNDERLYING_LAST]', '[DTE]', '[C_DELTA]', '[C_GAMMA]', '[C_VEGA]', '[C_THETA]', '[C_RHO]', '[STRIKE_DISTANCE]', '[STRIKE_DISTANCE_PCT]']  # there are more numeric columns ... 
	df_2023_h1[numeric_cols] = scaler.fit_transform(df_2023_h1[numeric_cols])

In [12]:
# Extract features from datetime and encode categorical features
# Do we still use datetime ?? or just use unix 
"""df_2023_h1['[QUOTE_MONTH]'] = df_2023_h1['[QUOTE_DATE]'].dt.month
df_2023_h1['[EXPIRE_WEEKDAY]'] = df_2023_h1['[EXPIRE_DATE]'].dt.weekday
df_2023_h1=pd.get_dummies(df_2023_h1, columns = ['Moneyness'])"""

"df_2023_h1['[QUOTE_MONTH]'] = df_2023_h1['[QUOTE_DATE]'].dt.month\ndf_2023_h1['[EXPIRE_WEEKDAY]'] = df_2023_h1['[EXPIRE_DATE]'].dt.weekday\ndf_2023_h1=pd.get_dummies(df_2023_h1, columns = ['Moneyness'])"

In [13]:
# # Outlier detection
# num_col = df_2023_h1.select_dtypes(include=['int64', 'float64']).columns
# for i in num_col:
#     q75, q25 = np.percentile(df_2023_h1[i], [75 ,25])
#     iqr = q75 - q25
#     min = q25 - (iqr*1.5)
#     max = q75 + (iqr*1.5)
    

In [14]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

In [15]:
 # start torching
batch_size = 64

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

NameError: name 'training_data' is not defined