In [18]:
import pandas as pd
import numpy as np
import os
import datetime

#### Extracting end-of-day quotes from CBOE-format CSV files into a DataFrame

In [94]:
# Load every CSV in spy_data/ and keep only the 16:00:00 rows of each file.
# Each file is treated as a single trading day: the date of its first row is
# used as the quote date for every row in that file.
DATA_DIR = 'spy_data'
EOD_TIME = datetime.time(16, 0, 0)
KEEP_COLUMNS = ['quote_date', 'expiration', 'strike', 'option_type', 'close',
                'active_underlying_price', 'implied_volatility']

frames = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the combined data reproducible from run to run.
for file in sorted(os.listdir(DATA_DIR)):
    if not file.endswith('.csv'):
        continue

    df = pd.read_csv(os.path.join(DATA_DIR, file))
    if df.empty:
        # A header-only file has no first row to take the quote date from.
        continue

    df['quote_datetime'] = pd.to_datetime(df['quote_datetime'])
    df['expiration'] = pd.to_datetime(df['expiration'])

    # Positional access: does not rely on the index containing the label 0.
    quote_day = df['quote_datetime'].iloc[0].date()
    df['quote_date'] = quote_day
    df['quote_date'] = pd.to_datetime(df['quote_date'])

    # Keep only the rows stamped exactly at 16:00:00 on the quote date,
    # restricted to the columns used downstream.
    eod = datetime.datetime.combine(quote_day, EOD_TIME)
    df = df.loc[df['quote_datetime'] == eod, KEEP_COLUMNS]

    frames.append(df)
        

In [95]:
# Stack the per-file frames into one table. ignore_index=True gives the result
# a fresh 0..n-1 index; without it each frame keeps its own row labels, so the
# combined index can contain duplicates.
options = pd.concat(frames, ignore_index=True)

#### Calculating ML features

In [118]:
# T: whole days between the quote date and the expiration date.
time_to_expiry = options['expiration'] - options['quote_date']
options['T'] = time_to_expiry.dt.days
# moneyness: strike divided by the active underlying price.
options['moneyness'] = options['strike'].div(options['active_underlying_price'])

#### Filtering to the research paper's criteria
Keep options with a non-zero closing price, moneyness strictly between 0.7 and 1.3, and more than 14 days to expiration.

In [121]:
# Build one boolean mask per criterion, then keep rows that satisfy all three.
has_price = options['close'].ne(0)
beyond_two_weeks = options['T'].gt(14)
within_moneyness = options['moneyness'].gt(0.7) & options['moneyness'].lt(1.3)
filtered = options.loc[has_price & beyond_two_weeks & within_moneyness]


In [127]:
# Split the filtered rows by option type, keeping only the feature columns.
feature_columns = ['T', 'moneyness', 'implied_volatility']
is_call = filtered['option_type'].eq('C')
is_put = filtered['option_type'].eq('P')
calls = filtered.loc[is_call, feature_columns]
puts = filtered.loc[is_put, feature_columns]

In [129]:
# Write each feature table to its own CSV in the working directory.
for out_name, table in (("SPY_calls.csv", calls), ("SPY_puts.csv", puts)):
    table.to_csv(out_name)