# Security Feature Engineering

Goal: Reduce the securities data to only the features that will be used in our model. Transform the date column in the securities data to a common format (yyyymmdd) that will be used in every data frame; the data frames need the same date format to be joined properly.

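Create a derived volatility feature for each date: the standard deviation of the day-to-day price changes over the last 20 days of stock close prices (historical volatility, used as a stand-in for implied volatility). It will be used as an additional input to the model.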

In [1]:
import pandas as pd
import numpy as np

In [2]:
%cd '/Users/benjochem/Desktop/Junior/Research'

/Users/benjochem/Desktop/Junior/Research


In [3]:
# Columns the model needs; every other column in the raw file is dropped.
keep_cols = ['date', 'close', 'volume', 'return']
securities = pd.read_csv('Project/data/raw/vxx_security_prices.csv')[keep_cols]
securities.head()

Unnamed: 0,date,close,volume,return
0,1/30/2009,104.58,215700,
1,2/2/2009,104.25,308571,-0.003155
2,2/3/2009,99.37,177709,-0.046811
3,2/4/2009,99.74,156854,0.003723
4,2/5/2009,99.13,243819,-0.006116


In [4]:
#function to match data structure of the securities dates to treasury data / options data dates (yyyymmdd)
def date_to_numeric(date=None):
    """Convert 'm/d/yyyy' date strings to zero-padded 'yyyymmdd' strings.

    Parameters
    ----------
    date : iterable of str, optional
        Dates written as month/day/year separated by '/', e.g. '1/30/2009'.
        Surrounding whitespace is ignored. Defaults to no dates.

    Returns
    -------
    list of str
        One 'yyyymmdd' string per input, in input order
        ('1/30/2009' -> '20090130').
    """
    # None instead of a mutable [] default, which would be shared across calls.
    if date is None:
        return []

    converted = []
    for d in date:
        parts = d.strip().split('/')
        # Input order is month/day/year, not day/month/year.
        month, day, year = parts[0], parts[1], parts[2]

        # zfill adds the leading zero to single-digit months/days.
        converted.append(year + month.zfill(2) + day.zfill(2))

    return converted

In [5]:
# Add a yyyymmdd-formatted 'Date' column built from the raw 'date' strings.
securities['Date'] = date_to_numeric(securities['date'])
securities.head()

Unnamed: 0,date,close,volume,return,Date
0,1/30/2009,104.58,215700,,20090130
1,2/2/2009,104.25,308571,-0.003155,20090202
2,2/3/2009,99.37,177709,-0.046811,20090203
3,2/4/2009,99.74,156854,0.003723,20090204
4,2/5/2009,99.13,243819,-0.006116,20090205


In [6]:
# Remove the original 'date' string column in place; 'Date' is kept.
securities.drop(columns='date', inplace=True)
securities.head()

Unnamed: 0,close,volume,return,Date
0,104.58,215700,,20090130
1,104.25,308571,-0.003155,20090202
2,99.37,177709,-0.046811,20090203
3,99.74,156854,0.003723,20090204
4,99.13,243819,-0.006116,20090205


In [7]:
# Display the whole frame to check the first and last rows after the conversion.
securities

Unnamed: 0,close,volume,return,Date
0,104.58,215700,,20090130
1,104.25,308571,-0.003155,20090202
2,99.37,177709,-0.046811,20090203
3,99.74,156854,0.003723,20090204
4,99.13,243819,-0.006116,20090205
...,...,...,...,...
2994,14.94,10315489,-0.009940,20191224
2995,14.92,13591197,-0.001339,20191226
2996,15.23,26440757,0.020777,20191227
2997,15.76,34261087,0.034800,20191230


In [8]:
# Volatility over the life of the option is assumed to equal the historical volatility
# of the last 20 trading days.
# rolling(20) slides a window over the last 20 closes and applies the function to each
# window; the first 19 rows have no full window and come out as NaN.
# The lambda turns the 20 closes into 19 day-to-day fractional price changes and takes
# their standard deviation.
# raw=True hands each window to the lambda as a NumPy array, so .std() is NumPy's
# population standard deviation (ddof=0) on every pandas version. Without it, newer
# pandas passes a Series (whose .std() uses ddof=1) and older pandas emits a FutureWarning.

securities['vol_20'] = securities.close.rolling(20).apply(
    lambda x: (np.diff(x) / x[:-1]).std(), raw=True
)

  """


In [9]:
# Save the reduced frame (with 'Date' and 'vol_20') as an interim file; index = False keeps the row index out of the CSV.
securities.to_csv('Project/data/interim/securities_w_date.csv', index = False)

In [8]:
# Reload the interim file into `securities`.
# NOTE(review): read_csv will presumably infer 'Date' as an integer column rather than
# the strings produced by date_to_numeric — confirm the downstream joins expect that.
securities = pd.read_csv('Project/data/interim/securities_w_date.csv')