# Generating the Feature Sets

This notebook contains a function that takes the following inputs:
1. Expiry date
2. Strike Price
3. Date under consideration 
4. Option Data
5. Time Series Data


The function should output a row of feature values. Each such row is used as one record of the feature set.


In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from scipy.stats import skew, kurtosis, linregress
from matplotlib import pyplot as plt
import multiprocessing
from multiprocessing import Pool
import os
import math

In [2]:
# Load the three input tables from disk.
# `clean`: option rows (presumably the cleaned NIFTY50 option data, judging by
# the file name); any row containing a missing value is discarded right away.
clean = pd.read_csv('/home/sharan/Desktop/Option Pricing Work/Work/ttm_cleaned_NIFTY50_2014_2018_raw.csv').dropna()
# `filt`: the rows for which a feature vector is generated further down.
filt = pd.read_csv('/home/sharan/Desktop/Option Pricing Work/Work/ttm_filter_param_0.02_NIFTY50_2015_2018_new.csv')
# `ts_data`: index-value time series with Date/Open/High/Low/Close columns.
ts_data = pd.read_csv('/home/sharan/Desktop/Option Pricing Work/Work/INDEXVALUE_NIFTY50_2015_2018_combined_raw.csv')

In [3]:
# Expose the lower-case 'ttm' column under the name 'TTM', which is the key
# the row loop further down reads (row['TTM']).
filt['TTM']=filt['ttm']

In [4]:
# Parse the date columns, order every table chronologically and renumber rows.
filt['Date'] = pd.to_datetime(filt['Date'])
filt = filt.sort_values(by='Date').reset_index(drop=True)

clean['Date'] = pd.to_datetime(clean['Date'])
clean['Expiry'] = pd.to_datetime(clean['Expiry'])
# drop=False keeps the pre-sort row labels in a new 'index' column.
clean = clean.sort_values(by='Date').reset_index(drop=False)

ts_data['Date'] = pd.to_datetime(ts_data['Date'])
# As for `clean`, the old row labels survive in an 'index' column.
ts_data = ts_data.sort_values(by='Date').reset_index(drop=False)

In [5]:
# Number of option rows left in `clean` (displayed as the cell output).
len(clean)

121063

In [6]:
def stats(data):
    """Return the pair ``(mean, standard deviation)`` of ``data``.

    Both numbers come from ``np.mean`` and ``np.std`` called with their
    default arguments.
    """
    average = np.mean(data)
    deviation = np.std(data)
    return average, deviation

In [7]:
def spreadEMA(data):
    """Return the mean Close/Open ratio of ``data``.

    Side effect: the per-row ratio is stored on the caller's frame in a
    'Spread' column. Despite the name, the result is a plain ``np.mean``;
    no exponential weighting is applied.
    """
    close_over_open = data['Close'] / data['Open']
    data['Spread'] = close_over_open
    return np.mean(data['Spread'])

def jumpEMA(data):
    """Return the mean ratio of each Open to the previous row's Close.

    Side effect: the per-row ratio is stored on the caller's frame in a
    'Jump' column (the first row has no predecessor, so its value is NaN).
    Rows are then selected by label from 1 onwards before averaging, which
    assumes the frame carries a 0-based integer index.
    """
    previous_close = data['Close'].shift(1)
    data['Jump'] = data['Open'] / previous_close
    from_second_row = data.loc[1:, :]
    return np.mean(from_second_row['Jump'])

In [8]:
def atr_data(ext):
    '''
    Mean ratio-based "true range" of an OHLC frame.

    For every row t after the first, the per-row value is
        max( High_t / Low_t,
             High_t  vs Close_t-1  (larger divided by smaller),
             Low_t   vs Close_t-1  (larger divided by smaller) )
    and the function returns the average of those values. Note that the
    quantities are ratios, not absolute differences.

    Parameters
    ----------
    ext : DataFrame with 'High', 'Low' and 'Close' columns, rows in
        chronological order. Rows are paired by position (row t with row t-1).

    Returns
    -------
    float
        The mean per-row value; NaN when `ext` has fewer than two rows.
        Rows whose value is NaN are left out of the average.
    '''
    # Align "today" (rows 1..n-1) with "yesterday's close" (rows 0..n-2).
    high = np.asarray(ext['High'], dtype=float)[1:]
    low = np.asarray(ext['Low'], dtype=float)[1:]
    prev_close = np.asarray(ext['Close'], dtype=float)[:-1]

    range_ratio = high / low
    high_gap = np.where(high > prev_close, high / prev_close, prev_close / high)
    low_gap = np.where(low > prev_close, low / prev_close, prev_close / low)

    # Running maximum written as two "replace if strictly greater" steps so a
    # NaN candidate never displaces the current value, exactly like the
    # builtin max(range_ratio, high_gap, low_gap) applied row by row.
    true_range = np.where(high_gap > range_ratio, high_gap, range_ratio)
    true_range = np.where(low_gap > true_range, low_gap, true_range)

    # Series.mean skips NaN entries and yields NaN (no exception) when empty.
    return float(pd.Series(true_range, dtype=float).mean())
       

### Option Data Functions

In [9]:
def ret(data, date, OHLC ,mode='log'):
    '''
    Return either log returns of one price column or a raw trailing window.

    mode == 'log' (default)
        One-column frame 'LR' holding log(data[OHLC]_t) - log(data[OHLC]_t-1)
        for every row of `data` after the first. `date` is not used in this
        mode; the caller is expected to pass the window it wants returns for.

    mode == 'e'
        The rows of `data` whose index labels run from k-19 to k, where k is
        the label of the first row with data['Date'] == date, restricted to
        the columns Date, Low, High, Close, Open (in that order). `OHLC` is
        not used in this mode. Fewer rows come back when less history is
        available. Raises IndexError when `date` is absent from `data`.

    In both modes the result is re-indexed from 0.

    Raises ValueError for any other `mode`.

    # Demo func
    ret(df, '31-Jan-2019', 'Close')
    ret(df, '31-Jan-2019', 'Close', mode='log')
    ret(df, '31-Jan-2019', '', mode='e')
    '''

    if mode == 'log':
        log_price = np.log(data[OHLC])
        # The first difference has no predecessor, so it is dropped.
        val = pd.DataFrame({'LR': (log_price - log_price.shift(1)).iloc[1:]})
    elif mode == 'e':
        k = data.index[data['Date']==date][0]
        # Label-based slice: both ends are inclusive.
        val = data.loc[k-19:k, ['Date', 'Low', 'High', 'Close', 'Open']]
    else:
        raise ValueError("mode must be 'log' or 'e', got %r" % (mode,))

    return val.reset_index(drop=True)

### Feature Generator

In [10]:
def gen(ts_data, ttm, atmDate, atmError, percentage, op_data, op_data_jump, strike,spot,opPrice, prevOp):
    """Assemble one feature row for a single option observation.

    Inputs used for computation:
      * ts_data      - OHLC window; its Close mean, ATR-style range and the
                       log returns of Close/Open/High/Low are derived here.
      * op_data      - option history; only the last 'Open Int' is read.
      * op_data_jump - option history; only the last 'Change in OI' is read.
      * strike       - scales the Close mean into ``avg_atmError``.
    ttm, atmError, spot, opPrice, percentage and prevOp are passed through
    unchanged.

    Returns a flat list laid out as:
      [atmDate,
       mean & sd of Close, Open, High, Low log returns (8 values),
       six pairwise covariance terms of those log returns, each replaced by
       the square root of its magnitude carrying the covariance's sign,
       change_oi, avg_atmError, ts_atr, prevOp,
       ttm, atmError, strike, spot, opPrice, percentage]
    """
    # Latest change in open interest as a percentage of open interest.
    last_oi = op_data['Open Int'].iloc[-1]
    last_oi_change = op_data_jump['Change in OI'].iloc[-1]
    change_oi = (last_oi_change / last_oi) * 100

    # Average close of the window relative to the strike.
    close_level, _ = stats(ts_data['Close'])
    avg_atmError = close_level / strike

    ts_atr = atr_data(ts_data)

    # Log-return series per price field, in the order the output expects.
    price_fields = ('Close', 'Open', 'High', 'Low')
    lr_frames = [ret(ts_data, atmDate, field, mode='log') for field in price_fields]

    moments = []
    for frame in lr_frames:
        moments.extend(stats(frame['LR']))

    vcov = pd.DataFrame()
    for field, frame in zip(price_fields, lr_frames):
        vcov[field.lower() + '_lr'] = frame['LR']
    covMatrix = vcov.cov()

    # Upper triangle of the covariance matrix (diagonal excluded), row by row.
    upper_pairs = [(i, j) for i in range(4) for j in range(i + 1, 4)]
    signed_roots = []
    for i, j in upper_pairs:
        cov_ij = round(covMatrix.iloc[i, j], 8)
        magnitude = math.sqrt(abs(cov_ij))
        signed_roots.append(magnitude if cov_ij > 0 else -1 * magnitude)

    derived = [change_oi, avg_atmError, ts_atr, prevOp]
    passthrough = [ttm, atmError, strike, spot, opPrice, percentage]
    return [atmDate] + moments + signed_roots + derived + passthrough

In [11]:
# Names of the output columns, in the same order as the list returned by gen().
cov_list = ['close_lr', 'open_lr', 'high_lr', 'low_lr']

# One "<a>_<b>_cov" label per unordered pair of log-return series.
cvList = [
    first + "_" + second + "_cov"
    for pos, first in enumerate(cov_list)
    for second in cov_list[pos + 1:]
]

col_name = (
    ['date']
    + ['close_mean', 'close_sd', 'open_mean', 'open_sd',
       'high_mean', 'high_sd', 'low_mean', 'low_sd']
    + cvList
    + ['change_oi', 'avg_atmError', 'ts_atr', 'prevOp']
    + ['ttm', 'atmError', 'strike', 'spot', 'opPrice', 'percentage']
)

In [12]:
# Display the assembled list of output column names.
col_name

['date',
 'close_mean',
 'close_sd',
 'open_mean',
 'open_sd',
 'high_mean',
 'high_sd',
 'low_mean',
 'low_sd',
 'close_lr_open_lr_cov',
 'close_lr_high_lr_cov',
 'close_lr_low_lr_cov',
 'open_lr_high_lr_cov',
 'open_lr_low_lr_cov',
 'high_lr_low_lr_cov',
 'change_oi',
 'avg_atmError',
 'ts_atr',
 'prevOp',
 'ttm',
 'atmError',
 'strike',
 'spot',
 'opPrice',
 'percentage']

In [13]:
# Number of rows in `filt`, i.e. how many times the row loop below iterates.
len(filt)

26978

In [14]:
# Renumber the rows of `filt` from 0, discarding the old labels.
# (`filt` was already re-indexed right after sorting, so this is a safeguard.)
filt.reset_index(drop=True,inplace=True)
# filt

In [15]:
count = 0

feat = []
params = []
# The index-value window depends only on the date, and many option rows share
# a date, so each window is extracted once and reused.
ts_windows = {}
for index, row in filt.iterrows():
    ttm = row['TTM']
    strike = row['Strike Price']
    spot = row['Underlying Value']
    atmDate = row['Date']
    expDate = row['Expiry']
    close = row['Close']
    opPrice = close

    # Full history of this contract (same strike and expiry), renumbered
    # from 0. The 'index' column is the leftover from the earlier
    # reset_index(drop=False) on `clean`.
    same_contract = (clean['Strike Price'] == strike) & (clean['Expiry'] == expDate)
    opsTS = clean.loc[same_contract].drop(columns="index").reset_index(drop=True)

    # Position of the row for the date under consideration; raises IndexError
    # when the contract has no row for that date.
    ind = opsTS.index[opsTS['Date'] == atmDate][0]
    if ind < 1:
        # No earlier observation of this contract, so nothing to look back on.
        continue
    window = min(ind, 20)

    percentage = (close / strike) * 100

    # Up to 20 rows strictly before the date under consideration.
    ops = opsTS.loc[ind - window:ind - 1, :].reset_index(drop=True)

    # Window that ends on the date itself: 20 rows (19 back plus today) when
    # enough history exists, otherwise everything from the first row.
    adjWindow = 19 if window == 20 else window
    opsJump = opsTS.loc[ind - adjWindow:ind, :].reset_index(drop=True)

    ts = ts_windows.get(atmDate)
    if ts is None:
        ts = ret(ts_data, atmDate, '', mode='e')
        ts_windows[atmDate] = ts

    atmError = (spot / strike)
    prevOp = (opsTS.loc[ind - 1, 'Close'] / strike)
    params.append((ts, ttm, atmDate, atmError, percentage, ops, opsJump,
                   strike, spot, opPrice, prevOp))

# starmap blocks until every row has been processed and returns the results
# in input order; the context manager then shuts the worker processes down.
with Pool() as pool:
    feat.extend(pool.starmap(gen, params))


  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':
  if __name__ == '__main__':


In [18]:
# Turn the list of feature rows into a table, one column per entry of
# `col_name`, and display it.
results=pd.DataFrame.from_records(feat, columns=col_name)
results

Unnamed: 0,date,close_mean,close_sd,open_mean,open_sd,high_mean,high_sd,low_mean,low_sd,close_lr_open_lr_cov,...,change_oi,avg_atmError,ts_atr,prevOp,ttm,atmError,strike,spot,opPrice,percentage
0,2015-01-01,-0.001752,0.008652,-0.001934,0.008566,-0.002067,0.007243,-0.001742,0.008358,0.005222,...,8.917197e-01,0.962110,1.011703,0.018849,84.0,0.963256,8600.0,8284.00,186.25,2.165698
1,2015-01-01,-0.001752,0.008652,-0.001934,0.008566,-0.002067,0.007243,-0.001742,0.008358,0.005222,...,0.000000e+00,1.027844,1.011703,0.035596,28.0,1.029068,8050.0,8284.00,335.05,4.162112
2,2015-01-01,-0.001752,0.008652,-0.001934,0.008566,-0.002067,0.007243,-0.001742,0.008358,0.005222,...,5.321905e-01,1.021499,1.011703,0.030167,28.0,1.022716,8100.0,8284.00,292.10,3.606173
3,2015-01-01,-0.001752,0.008652,-0.001934,0.008566,-0.002067,0.007243,-0.001742,0.008358,0.005222,...,-3.089944e+00,1.034268,1.011703,0.039788,28.0,1.035500,8000.0,8284.00,372.55,4.656875
4,2015-01-01,-0.001752,0.008652,-0.001934,0.008566,-0.002067,0.007243,-0.001742,0.008358,0.005222,...,3.176404e+00,0.985017,1.011703,0.017893,56.0,0.986190,8400.0,8284.00,175.50,2.089286
5,2015-01-01,-0.001752,0.008652,-0.001934,0.008566,-0.002067,0.007243,-0.001742,0.008358,0.005222,...,1.314861e+01,0.973429,1.011703,0.005441,28.0,0.974588,8500.0,8284.00,55.35,0.651176
6,2015-01-01,-0.001752,0.008652,-0.001934,0.008566,-0.002067,0.007243,-0.001742,0.008358,0.005222,...,-2.602487e+01,1.002926,1.011703,0.017933,28.0,1.004121,8250.0,8284.00,180.80,2.191515
7,2015-01-01,-0.001752,0.008652,-0.001934,0.008566,-0.002067,0.007243,-0.001742,0.008358,0.005222,...,6.168026e+00,0.967736,1.011703,0.004029,28.0,0.968889,8550.0,8284.00,40.70,0.476023
8,2015-01-01,-0.001752,0.008652,-0.001934,0.008566,-0.002067,0.007243,-0.001742,0.008358,0.005222,...,3.761330e+00,0.962110,1.011703,0.002953,28.0,0.963256,8600.0,8284.00,29.85,0.347093
9,2015-01-01,-0.001752,0.008652,-0.001934,0.008566,-0.002067,0.007243,-0.001742,0.008358,0.005222,...,-1.776838e+01,0.990915,1.011703,0.011808,28.0,0.992096,8350.0,8284.00,120.90,1.447904


In [17]:
# Persist the feature table as CSV; index=False leaves the row labels out.
results.to_csv('/home/sharan/Desktop/Option Pricing Work/Work/nifty_2015_2018_feature_set_COV_new_step1.csv', index = False)