## Data cleansing and preprocessing of the option pricing data

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import yfinance as yf
import sqlite3
import math

In [35]:
# Read in the monthly SPY end-of-day option files for January through May 2023.
# NOTE(review): this comment used to say "Jan 2023 to June 2023", but only the files
# 202301-202305 are listed below -- add 202306 if June quotes are really wanted.
# All files are read first and concatenated once: growing a frame with pd.concat inside
# the loop re-copies the accumulated rows on every iteration, and concatenating onto an
# empty DataFrame triggers a pandas FutureWarning.
monthly_frames = [
    pd.read_table(f'data/spy_eod_{i}.txt', sep=',')
    for i in [202301, 202302, 202303, 202304, 202305]
]
df_2023_h1 = pd.concat(monthly_frames, ignore_index=True)
# Remove the whitespace padding around the column names.
df_2023_h1.columns = df_2023_h1.columns.str.strip()

  df_2023_h1 = pd.concat([df_2023_h1, pd.read_table(f'data/spy_eod_{i}.txt', sep=',')], ignore_index=True)


In [36]:
# Keep only the contracts expiring between 2023-06-01 and 2023-12-31 (both inclusive);
# anything expiring in 2024 or later is dropped.
# The bounds are compared as plain strings and start with a space on purpose --
# presumably the raw [EXPIRE_DATE] text carries the same leading space; verify against the files.
df_2023_h1 = df_2023_h1.loc[
    lambda frame: (frame['[EXPIRE_DATE]'] <= ' 2023-12-31')
    & (frame['[EXPIRE_DATE]'] >= ' 2023-06-01')
].reset_index()  # reset_index() keeps the previous row labels in a new 'index' column

In [37]:
# Download the 2023 SPY price history through yfinance and keep only the
# 'Adj Close' field, wrapped in a DataFrame.
spy_history = yf.download(['SPY'], start="2023-01-01", end="2023-12-31")
target = pd.DataFrame(spy_history['Adj Close'])
target

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2023-01-03,373.956818
2023-01-04,376.843842
2023-01-05,372.542786
2023-01-06,381.085968
2023-01-09,380.869934
...,...
2023-12-22,472.182892
2023-12-26,474.176697
2023-12-27,475.034058
2023-12-28,475.213501


In [38]:
# Trim surrounding whitespace from the expiration strings, then cast them to datetime64[ns].
expire_text = df_2023_h1['[EXPIRE_DATE]'].str.strip()
df_2023_h1['[EXPIRE_DATE]'] = expire_text.astype('datetime64[ns]')
# List the distinct expiration dates that remain after filtering.
df_2023_h1['[EXPIRE_DATE]'].unique()

<DatetimeArray>
['2023-06-16 00:00:00', '2023-06-30 00:00:00', '2023-09-15 00:00:00',
 '2023-09-29 00:00:00', '2023-12-15 00:00:00', '2023-12-29 00:00:00',
 '2023-07-21 00:00:00', '2023-08-18 00:00:00', '2023-10-20 00:00:00',
 '2023-06-02 00:00:00', '2023-06-09 00:00:00', '2023-06-23 00:00:00',
 '2023-06-01 00:00:00', '2023-06-05 00:00:00', '2023-06-06 00:00:00',
 '2023-06-07 00:00:00', '2023-06-08 00:00:00', '2023-07-07 00:00:00',
 '2023-06-12 00:00:00', '2023-06-13 00:00:00', '2023-06-14 00:00:00',
 '2023-11-17 00:00:00']
Length: 22, dtype: datetime64[ns]

In [39]:
# Copy the price index into an '[EXPIRE_DATE]' column so the price frame has a
# column with the same name as the option frame's expiration date.
target['[EXPIRE_DATE]'] = target.index
# astype returns a new Series; the result has to be assigned back, otherwise the cast
# is only displayed and the stored column keeps whatever dtype the index had.
target['[EXPIRE_DATE]'] = target['[EXPIRE_DATE]'].astype('datetime64[ns]')
target['[EXPIRE_DATE]']

Date
2023-01-03   2023-01-03
2023-01-04   2023-01-04
2023-01-05   2023-01-05
2023-01-06   2023-01-06
2023-01-09   2023-01-09
                ...    
2023-12-22   2023-12-22
2023-12-26   2023-12-26
2023-12-27   2023-12-27
2023-12-28   2023-12-28
2023-12-29   2023-12-29
Name: [EXPIRE_DATE], Length: 250, dtype: datetime64[ns]

In [40]:
# Attach the price columns to every option row by matching on '[EXPIRE_DATE]'.
# Inner join (the pandas default): option rows without a matching price row are dropped.
df_2023_h1 = df_2023_h1.merge(target, how='inner', on='[EXPIRE_DATE]')

Revising the target here. With a call option, the loss cannot exceed the option price (the premium paid), so it is better to split the target and the tables into calls and puts.

In [41]:
import inspect
from data_cleansing_function import target_setting

# Print the helper's source so the formula is visible in the notebook,
# then apply it to the merged option/price frame.
target_setting_source = inspect.getsource(target_setting)
print(target_setting_source)
df_2023_h1 = target_setting(df_2023_h1)

def target_setting(df, rate=0.04):
    """
    Vectorized calculation of the discounted strike-minus-close target.

    The input frame is left untouched; the result is a new DataFrame that
    contains every input column whose name does not start with ``Unnamed``
    plus four derived columns:

    * ``-rt``              -- ``-rate * (EXPIRE_UNIX - QUOTE_UNIXTIME) / seconds in a 365-day year``
    * ``price_diff``       -- ``[STRIKE] - Adj Close``
    * ``exp(-rt)``         -- continuous discount factor ``exp(-rt)``
    * ``discounted_price`` -- ``price_diff * exp(-rt)``

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``[EXPIRE_UNIX]``, ``[QUOTE_UNIXTIME]``
        (both in seconds), ``[STRIKE]`` and ``Adj Close``.
    rate : float, default 0.04
        Annual rate used in the exponent; the default reproduces the
        previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns appended.
    """
    import numpy as np  # local import so the helper does not rely on module-level names

    seconds_per_year = 3600 * 365 * 24  # unix time is based on seconds

    # Drop the 'Unnamed*' columns first and take an explicit copy: the caller's frame
    # is no longer mutated, and the assignments below never write into a slice
    # (which is what triggers SettingWithCopyWarning).
    out = df.loc[:, ~df.columns.str.contains('^Unnamed')].copy()

    out['-rt'] = -rate * (out['[EXPIRE_UNIX]'] - out['[QUOTE_UNIXTIME]']) / seconds_per_year
    out['price_diff'] = out['[STRIKE]'] - out['Adj Close']
    # np.exp works on the whole column at once instead of calling math.exp per row.
    out['exp(-rt)'] = np.exp(out['-rt'])
    out['discounted_price'] = out['price_diff'] * out['exp(-rt)']
    return out



In [42]:
# Split the discounted price difference into two one-sided targets (useful if calls and
# puts are to be learned separately). Rows that do not satisfy a side's condition are
# left as NaN in that side's column.
# NOTE(review): price_diff is strike minus 'Adj Close', so price_diff > 0 means the strike
# is ABOVE the price. Under the standard payoff definitions that is the in-the-money
# case for a put, not a call -- confirm the call/put labels below are not swapped.
discounted_diff = df_2023_h1['price_diff'] * df_2023_h1['exp(-rt)']
df_2023_h1['adj_call_target'] = discounted_diff.where(df_2023_h1['price_diff'] > 0)
df_2023_h1['adj_put_target'] = (-discounted_diff).where(df_2023_h1['price_diff'] < 0)

In [43]:
# Replace every NaN in the frame with 0 (this includes the one-sided call/put targets,
# which are NaN on the rows where their condition did not hold).
# Rebinding instead of inplace=True avoids mutating a frame that may be a slice of
# another one and matches the later fillna cell in this notebook.
df_2023_h1 = df_2023_h1.fillna(0) # fill na

In [44]:
# Disabled exploration code, kept as a bare string literal so it is never executed
# (running the cell only echoes the text). If enabled it would count the rows of every
# (quote date, expiration date) pair and report the smallest count; the trailing
# note records 11 as that result.
"""strike_amount = []
for date in df_2023_h1['[QUOTE_DATE]'].unique():
    for expire_date in df_2023_h1[df_2023_h1['[QUOTE_DATE]'] == date]['[EXPIRE_DATE]'].unique():
        length = len(df_2023_h1[df_2023_h1['[QUOTE_DATE]'] ==date][df_2023_h1['[EXPIRE_DATE]'] == expire_date])
        strike_amount.append(length)
min(strike_amount)""" # 11

"strike_amount = []\nfor date in df_2023_h1['[QUOTE_DATE]'].unique():\n    for expire_date in df_2023_h1[df_2023_h1['[QUOTE_DATE]'] == date]['[EXPIRE_DATE]'].unique():\n        length = len(df_2023_h1[df_2023_h1['[QUOTE_DATE]'] ==date][df_2023_h1['[EXPIRE_DATE]'] == expire_date])\n        strike_amount.append(length)\nmin(strike_amount)"

In [45]:
# Identity apply: display the rows arranged under their (quote date, expiration date) keys.
group_keys = ['[QUOTE_DATE]', '[EXPIRE_DATE]']
df_2023_h1.groupby(group_keys).apply(lambda group: group)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,index,[QUOTE_UNIXTIME],[QUOTE_READTIME],[QUOTE_DATE],[QUOTE_TIME_HOURS],[UNDERLYING_LAST],[EXPIRE_DATE],[EXPIRE_UNIX],[DTE],[C_DELTA],...,[P_VOLUME],[STRIKE_DISTANCE],[STRIKE_DISTANCE_PCT],Adj Close,-rt,price_diff,exp(-rt),discounted_price,adj_call_target,adj_put_target
[QUOTE_DATE],[EXPIRE_DATE],Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2023-01-03,2023-06-16,0,2382,1672779600,2023-01-03 16:00,2023-01-03,16.0,380.82,2023-06-16,1686945600,163.96,0.98313,...,135.000000,230.8,0.606,434.796082,-0.017968,-284.796082,0.982192,-279.724554,0.000000,279.724554
2023-01-03,2023-06-16,1,2383,1672779600,2023-01-03 16:00,2023-01-03,16.0,380.82,2023-06-16,1686945600,163.96,0.98258,...,101.000000,220.8,0.580,434.796082,-0.017968,-274.796082,0.982192,-269.902630,0.000000,269.902630
2023-01-03,2023-06-16,2,2384,1672779600,2023-01-03 16:00,2023-01-03,16.0,380.82,2023-06-16,1686945600,163.96,0.97997,...,27.000000,210.8,0.554,434.796082,-0.017968,-264.796082,0.982192,-260.080706,0.000000,260.080706
2023-01-03,2023-06-16,3,2385,1672779600,2023-01-03 16:00,2023-01-03,16.0,380.82,2023-06-16,1686945600,163.96,0.97810,...,215.000000,200.8,0.527,434.796082,-0.017968,-254.796082,0.982192,-250.258782,0.000000,250.258782
2023-01-03,2023-06-16,4,2386,1672779600,2023-01-03 16:00,2023-01-03,16.0,380.82,2023-06-16,1686945600,163.96,0.97648,...,0.000000,195.8,0.514,434.796082,-0.017968,-249.796082,0.982192,-245.347819,0.000000,245.347819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-31,2023-12-29,90611,403301,1685563200,2023-05-31 16:00,2023-05-31,16.0,417.80,2023-12-29,1703883600,212.04,0.02990,...,,92.2,0.221,473.837769,-0.023237,36.162231,0.977030,35.331602,35.331602,0.000000
2023-05-31,2023-12-29,90612,403302,1685563200,2023-05-31 16:00,2023-05-31,16.0,417.80,2023-12-29,1703883600,212.04,0.02566,...,,97.2,0.233,473.837769,-0.023237,41.162231,0.977030,40.216754,40.216754,0.000000
2023-05-31,2023-12-29,90613,403303,1685563200,2023-05-31 16:00,2023-05-31,16.0,417.80,2023-12-29,1703883600,212.04,0.02121,...,0.000000,102.2,0.245,473.837769,-0.023237,46.162231,0.977030,45.101907,45.101907,0.000000
2023-05-31,2023-12-29,90614,403304,1685563200,2023-05-31 16:00,2023-05-31,16.0,417.80,2023-12-29,1703883600,212.04,0.01735,...,0.000000,107.2,0.257,473.837769,-0.023237,51.162231,0.977030,49.987059,49.987059,0.000000


In [46]:
# Scrapped approach, kept as an inert string literal (never executed).
# If enabled it would, per (quote date, expiration date) group, store the absolute
# [STRIKE_DISTANCE], rank the rows by the raw (signed) [STRIKE_DISTANCE], and keep only
# the rows ranked 11 or lower.
"""df_2023_h1['abs_strike_distance'] = df_2023_h1.groupby(['[QUOTE_DATE]', '[EXPIRE_DATE]'])['[STRIKE_DISTANCE]'].apply(abs).reset_index()['[STRIKE_DISTANCE]']
df_2023_h1['rank'] = df_2023_h1.groupby(['[QUOTE_DATE]', '[EXPIRE_DATE]'])['[STRIKE_DISTANCE]'].rank()
df_2023_h1 = df_2023_h1[df_2023_h1['rank'] <= 11].reset_index()"""

"df_2023_h1['abs_strike_distance'] = df_2023_h1.groupby(['[QUOTE_DATE]', '[EXPIRE_DATE]'])['[STRIKE_DISTANCE]'].apply(abs).reset_index()['[STRIKE_DISTANCE]']\ndf_2023_h1['rank'] = df_2023_h1.groupby(['[QUOTE_DATE]', '[EXPIRE_DATE]'])['[STRIKE_DISTANCE]'].rank()\ndf_2023_h1 = df_2023_h1[df_2023_h1['rank'] <= 11].reset_index()"

In [47]:
# Count how many rows each quote date has (dates taken in order of first appearance).
quote_dates = df_2023_h1['[QUOTE_DATE]']
amount = [int((quote_dates == date).sum()) for date in quote_dates.unique()]
min(amount)  # smallest number of rows recorded for any single quote date

765

In [48]:
# Down-sample every quote date to the same number of rows.
# The sample size is the smallest per-date row count, computed here rather than
# hard-coded (it was the literal 765): sampling without replacement raises a
# ValueError as soon as any group has fewer rows than n, so a hard-coded value breaks
# whenever the input data changes.
rows_per_quote_date = int(df_2023_h1.groupby(['[QUOTE_DATE]']).size().min())
df_2023_h1 = df_2023_h1.groupby(['[QUOTE_DATE]']).sample(n = rows_per_quote_date, random_state = 42)

In [49]:
# Run every value of both date columns through np.datetime64.
for date_col in ('[QUOTE_DATE]', '[EXPIRE_DATE]'):
    df_2023_h1[date_col] = df_2023_h1[date_col].apply(np.datetime64)

In [50]:
# Keep only the identifier, feature and target columns needed downstream.
# .copy() makes the subset an independent frame, so later column assignments do not
# write into a slice of the wider frame.
selected_columns = [
    '[EXPIRE_UNIX]', '[QUOTE_DATE]', '[EXPIRE_DATE]', '[STRIKE]', '[UNDERLYING_LAST]',
    '[C_DELTA]', '[C_GAMMA]', '[C_VEGA]', '[C_THETA]', '[C_RHO]', '[C_IV]', '[C_VOLUME]',
    '[C_BID]', '[C_ASK]',
    '[P_DELTA]', '[P_GAMMA]', '[P_VEGA]', '[P_THETA]', '[P_RHO]', '[P_IV]', '[P_VOLUME]',
    '[P_BID]', '[P_ASK]',
    'adj_call_target', 'adj_put_target', 'discounted_price',
]
df_2023_h1 = df_2023_h1[selected_columns].copy()
# Print the Python type of the first value of every column to spot text-typed columns.
# .iloc[0] reads the first row by position; a plain [0] is a label lookup and raises
# KeyError when the frame's index does not contain the label 0.
for column in df_2023_h1.columns:
    print(type(df_2023_h1[column].iloc[0]))

<class 'numpy.int64'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'str'>
<class 'str'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'str'>
<class 'str'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>


In [51]:
# Identity apply: display the rows arranged under their quote date.
df_2023_h1.groupby(by=['[QUOTE_DATE]']).apply(lambda group: group)

Unnamed: 0_level_0,Unnamed: 1_level_0,[EXPIRE_UNIX],[QUOTE_DATE],[EXPIRE_DATE],[STRIKE],[UNDERLYING_LAST],[C_DELTA],[C_GAMMA],[C_VEGA],[C_THETA],[C_RHO],...,[P_VEGA],[P_THETA],[P_RHO],[P_IV],[P_VOLUME],[P_BID],[P_ASK],adj_call_target,adj_put_target,discounted_price
[QUOTE_DATE],Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2023-01-03,35834,1694808000,2023-01-03,2023-09-15,265.0,380.82,0.92227,0.00116,0.42399,-0.04696,1.58341,...,0.39418,-0.02328,-0.18374,0.333300,2.000000,3.19,3.28,0.000000,170.384111,-170.384111
2023-01-03,22736,1688155200,2023-01-03,2023-06-30,364.0,380.82,0.66618,0.00543,0.95754,-0.08510,1.05430,...,0.96215,-0.05433,-0.61092,0.246080,0.000000,15.58,15.66,0.000000,73.135228,-73.135228
2023-01-03,79332,1703883600,2023-01-03,2023-12-29,425.0,380.82,0.39002,0.00491,1.44059,-0.05273,1.28108,...,1.26217,-0.02139,-1.46953,0.205950,,49.51,54.50,0.000000,46.948530,-46.948530
2023-01-03,193,1686945600,2023-01-03,2023-06-16,610.0,380.82,0.00156,0.00004,0.01151,-0.00107,0.00213,...,0.00000,0.00000,0.00000,,,228.67,229.45,172.083962,0.000000,172.083962
2023-01-03,22810,1688155200,2023-01-03,2023-06-30,545.0,380.82,0.00621,0.00034,0.04473,-0.00286,0.01035,...,0.00000,0.00000,0.00000,,,163.58,164.55,104.369057,0.000000,104.369057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-05-31,35797,1688155200,2023-05-31,2023-06-30,480.0,417.80,0.00106,0.00017,0.00422,-0.00102,0.00074,...,0.16031,-0.06989,-0.11405,0.290430,0.000000,61.89,63.04,41.288503,0.000000,41.288503
2023-05-31,118969,1685736000,2023-05-31,2023-06-02,451.0,417.80,0.00190,0.00052,0.00219,-0.00479,0.00014,...,0.00000,0.00000,0.00000,,,32.61,33.36,29.181927,0.000000,29.181927
2023-05-31,22552,1686945600,2023-05-31,2023-06-16,366.0,417.80,0.91525,0.00291,0.13140,-0.16689,1.01291,...,0.05407,-0.05017,-0.00558,0.324910,0.000000,0.29,0.30,0.000000,68.675558,-68.675558
2023-05-31,126350,1686168000,2023-05-31,2023-06-07,450.0,417.80,0.00123,0.00055,0.00299,-0.00246,-0.00009,...,0.00000,0.00000,0.00000,,,31.59,32.80,29.516167,0.000000,29.516167


In [52]:
# Swap every cell that is an empty or whitespace-only string for the number 0.
df_2023_h1 = df_2023_h1.replace(to_replace=r'^\s*$', value=0, regex=True)

In [53]:
# Convert the text-typed IV / volume columns to float64.
# astype(str) comes first because these columns can hold non-string values (e.g. a
# numeric 0 used as a placeholder for blank cells): the .str accessor returns NaN for
# any non-string element, which would silently turn such placeholders into NaN.
# It also lets the cell be re-run after the columns have already become float.
for column in ['[C_IV]', '[C_VOLUME]', '[P_IV]', '[P_VOLUME]']:
    df_2023_h1[column] = df_2023_h1[column].astype(str).str.strip().astype('float64')
    # .iloc[0] reads the first row by position; a plain [0] is a label lookup and
    # raises KeyError when the frame's index does not contain the label 0.
    print(type(df_2023_h1[column].iloc[0]))

<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>


In [54]:
# Zero-fill whatever NaN values are still present in the frame.
df_2023_h1 = df_2023_h1.fillna(value=0)

In [55]:
# Basic standardization of the numeric feature columns with sklearn's StandardScaler,
# fitted on (and applied to) the full frame. All warnings raised while doing so are
# suppressed.
import warnings
from sklearn.preprocessing import StandardScaler

# NOTE(review): the note that used to sit here said not to standardize the unix-time
# columns ('[QUOTE_UNIXTIME]', '[EXPIRE_UNIX]'), yet '[EXPIRE_UNIX]' is the first entry
# of the list below and therefore IS standardized -- confirm which one is intended.
numeric_cols = [
    '[EXPIRE_UNIX]', '[STRIKE]', '[UNDERLYING_LAST]',
    '[C_DELTA]', '[C_GAMMA]', '[C_VEGA]', '[C_THETA]', '[C_RHO]', '[C_IV]', '[C_VOLUME]',
    '[C_BID]', '[C_ASK]',
    '[P_DELTA]', '[P_GAMMA]', '[P_VEGA]', '[P_THETA]', '[P_RHO]', '[P_IV]', '[P_VOLUME]',
    '[P_BID]', '[P_ASK]',
]  # the author was unsure about this exact column selection
scaler = StandardScaler()
with warnings.catch_warnings():
    # silence everything emitted inside this block
    warnings.simplefilter("ignore")
    df_2023_h1[numeric_cols] = scaler.fit_transform(df_2023_h1[numeric_cols])

In [56]:
# Display the frame to inspect its current contents.
df_2023_h1

Unnamed: 0,[EXPIRE_UNIX],[QUOTE_DATE],[EXPIRE_DATE],[STRIKE],[UNDERLYING_LAST],[C_DELTA],[C_GAMMA],[C_VEGA],[C_THETA],[C_RHO],...,[P_VEGA],[P_THETA],[P_RHO],[P_IV],[P_VOLUME],[P_BID],[P_ASK],adj_call_target,adj_put_target,discounted_price
35834,0.189493,2023-01-03,2023-09-15,-1.208188,-2.415805,0.986203,-0.640510,-0.138950,0.179521,0.634474,...,-0.284184,0.354851,0.493729,0.579386,-0.121446,-0.583124,-0.586779,0.000000,170.384111,-170.384111
22736,-0.877875,2023-01-03,2023-06-30,-0.303386,-2.415805,0.280390,0.331213,0.420779,-0.867682,0.220711,...,1.126744,-0.901717,-0.803960,-0.031508,-0.123763,-0.399403,-0.404368,0.000000,73.135228,-73.135228
79332,1.645572,2023-01-03,2023-12-29,0.254118,-2.415805,-0.480739,0.212876,0.927530,0.021095,0.398053,...,1.872041,0.431338,-3.412249,-0.312580,-0.123763,0.103719,0.167914,0.000000,46.948530,-46.948530
193,-1.071942,2023-01-03,2023-06-16,1.944909,-2.415805,-1.551380,-0.895388,-0.571669,1.439515,-0.602083,...,-1.263390,1.296973,1.051895,-1.755065,-0.123763,2.760342,2.745688,172.083962,0.000000,172.083962
22810,-0.877875,2023-01-03,2023-06-30,1.350847,-2.415805,-1.538564,-0.827117,-0.536819,1.390367,-0.595655,...,-1.263390,1.296973,1.051895,-1.755065,-0.123763,1.795173,1.789429,104.369057,0.000000,104.369057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35797,-0.877875,2023-05-31,2023-06-30,0.756785,1.378693,-1.552758,-0.865804,-0.579317,1.440888,-0.603170,...,-0.865155,-1.531418,0.705434,0.279122,-0.123763,0.287292,0.293745,41.288503,0.000000,41.288503
118969,-1.266009,2023-05-31,2023-06-02,0.491742,1.378693,-1.550443,-0.786155,-0.581446,1.337376,-0.603639,...,-1.263390,1.296973,1.051895,-1.755065,-0.123763,-0.146878,-0.143570,29.181927,0.000000,29.181927
22552,-1.071942,2023-05-31,2023-06-16,-0.285108,1.378693,0.966855,-0.242263,-0.445896,-3.113375,0.188345,...,-1.129072,-0.733366,1.034944,0.520622,-0.123763,-0.626126,-0.630687,0.000000,68.675558,-68.675558
126350,-1.196700,2023-05-31,2023-06-07,0.482603,1.378693,-1.552289,-0.779328,-0.580607,1.401350,-0.603819,...,-1.263390,1.296973,1.051895,-1.755065,-0.123763,-0.162003,-0.151821,29.516167,0.000000,29.516167


In [57]:
# Call-side table: the shared date/strike/underlying columns, the call (C_*) columns
# and the call target.
call_columns = [
    '[QUOTE_DATE]', '[EXPIRE_DATE]', '[EXPIRE_UNIX]', '[STRIKE]', '[UNDERLYING_LAST]',
    '[C_DELTA]', '[C_GAMMA]', '[C_VEGA]', '[C_THETA]', '[C_RHO]', '[C_IV]', '[C_VOLUME]',
    '[C_BID]', '[C_ASK]', 'adj_call_target',
]
df_2023_h1_call = df_2023_h1.loc[:, call_columns]

In [58]:
# Put-side table: the shared date/strike/underlying columns, the put (P_*) columns
# and the put target.
put_columns = [
    '[QUOTE_DATE]', '[EXPIRE_DATE]', '[EXPIRE_UNIX]', '[STRIKE]', '[UNDERLYING_LAST]',
    '[P_DELTA]', '[P_GAMMA]', '[P_VEGA]', '[P_THETA]', '[P_RHO]', '[P_IV]', '[P_VOLUME]',
    '[P_BID]', '[P_ASK]', 'adj_put_target',
]
df_2023_h1_put = df_2023_h1.loc[:, put_columns]

In [59]:
# Persist the call and put tables to a SQLite database, replacing any tables of the
# same name from an earlier run.
conn = sqlite3.connect("data/tables_split.db")
try:
    df_2023_h1_call.to_sql("df_2023_h1_call", conn, if_exists = "replace", index=False)
    df_2023_h1_put.to_sql("df_2023_h1_put", conn, if_exists = "replace", index=False)
finally:
    # Always release the database handle, even when one of the writes fails.
    conn.close()

In [60]:
 # try to put '[QUOTE_DATE]' as timestamps and 'adj_call_target' as target see what happens. 
# Build one frame per quote date (dates in order of first appearance): the rows of that
# date, renumbered from 0 and without the '[EXPIRE_DATE]' column.
time_series_call = [
    df_2023_h1_call[df_2023_h1_call['[QUOTE_DATE]'] == quote_day]
    .reset_index(drop=True)
    .drop(columns=['[EXPIRE_DATE]'])
    for quote_day in df_2023_h1_call['[QUOTE_DATE]'].unique()
]
time_series_call

[    [QUOTE_DATE]  [EXPIRE_UNIX]  [STRIKE]  [UNDERLYING_LAST]  [C_DELTA]  \
 0     2023-01-03       0.189493 -1.208188          -2.415805   0.986203   
 1     2023-01-03      -0.877875 -0.303386          -2.415805   0.280390   
 2     2023-01-03       1.645572  0.254118          -2.415805  -0.480739   
 3     2023-01-03      -1.071942  1.944909          -2.415805  -1.551380   
 4     2023-01-03      -0.877875  1.350847          -2.415805  -1.538564   
 ..           ...            ...       ...                ...        ...   
 760   2023-01-03      -1.071942 -0.486175          -2.415805   0.558399   
 761   2023-01-03      -1.071942 -0.166295          -2.415805   0.045706   
 762   2023-01-03      -0.877875 -0.202853          -2.415805   0.110200   
 763   2023-01-03       0.383559 -1.299582          -2.415805   1.006488   
 764   2023-01-03      -1.071942 -0.202853          -2.415805   0.115023   
 
      [C_GAMMA]  [C_VEGA]  [C_THETA]   [C_RHO]    [C_IV]  [C_VOLUME]   [C_BID]  \
 0  

In [61]:
 # split target into two df
# Disabled code kept as an inert string literal (never executed). If enabled it would
# move 'discounted_price' out of df_2023_h1 into a separate `target` object.
"""target = df_2023_h1['discounted_price']
df_2023_h1 = df_2023_h1.drop('discounted_price', axis=1)"""

"target = df_2023_h1['discounted_price']\ndf_2023_h1 = df_2023_h1.drop('discounted_price', axis=1)"

In [62]:
# Disabled code kept as an inert string literal (never executed). If enabled it would
# move 'adj_call_target' out of df_2023_h1_call into a separate `target_call` object.
"""target_call = df_2023_h1_call['adj_call_target']
df_2023_h1_call = df_2023_h1_call.drop('adj_call_target', axis=1)"""

"target_call = df_2023_h1_call['adj_call_target']\ndf_2023_h1_call = df_2023_h1_call.drop('adj_call_target', axis=1)"

In [63]:
# output the df_2023_h1 to a csv file
# df_2023_h1.to_csv(r'data/df_2023_h1.csv', index = False, header=True)
# target.to_csv(r'data/target.csv', index = False, header=True)

In [64]:
# Optional export to a SQLite database -- disabled: the code below is an inert string
# literal and is never executed. If enabled it would write df_2023_h1 and `target` to
# data/tables.db as the tables df_2023_h1_feature and df_2023_h1_target.
"""conn = sqlite3.connect("data/tables.db")
df_2023_h1.to_sql("df_2023_h1_feature", conn, if_exists = "replace", index=False)
target.to_sql("df_2023_h1_target", conn, if_exists = "replace", index=False)
conn.close()"""

'conn = sqlite3.connect("data/tables.db")\ndf_2023_h1.to_sql("df_2023_h1_feature", conn, if_exists = "replace", index=False)\ntarget.to_sql("df_2023_h1_target", conn, if_exists = "replace", index=False)\nconn.close()'

In [65]:
# Disabled export kept as an inert string literal (never executed). If enabled it would
# write df_2023_h1_call and `target_call` to data/tables_split.db as the tables
# df_2023_h1_feature and df_2023_h1_target.
"""conn = sqlite3.connect("data/tables_split.db")
df_2023_h1_call.to_sql("df_2023_h1_feature", conn, if_exists = "replace", index=False)
target_call.to_sql("df_2023_h1_target", conn, if_exists = "replace", index=False)
conn.close()"""

'conn = sqlite3.connect("data/tables_split.db")\ndf_2023_h1_call.to_sql("df_2023_h1_feature", conn, if_exists = "replace", index=False)\ntarget_call.to_sql("df_2023_h1_target", conn, if_exists = "replace", index=False)\nconn.close()'