## Feature Engineering

In [5]:
# Load the libraries and set the path to the dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
# silence every warning for the rest of the session (this also hides deprecation notices)
warnings.filterwarnings('ignore')

# HDF5 store holding the input tables; the path is relative to the working directory
data_path = '../Findata/data.h5'


### Read the data

In [52]:
# Sample window; the two labels bound the slice on the first index level of the price table
start_year = '1990'
end_year = '2000'

# Pull both tables out of the HDF5 store, then align them in memory
with pd.HDFStore(data_path) as store:

    # adjusted close prices inside the window, pivoted so each ticker becomes a column
    date_window = pd.IndexSlice[str(start_year):str(end_year), :]
    stocks_prices = (
        store['quandl/wiki/prices']
        .loc[date_window, 'adj_close']
        .unstack('ticker')
    )

    # per-stock metadata, restricted to the fields used here
    meta_columns = ['marketcap', 'ipoyear', 'sector']
    stocks_meta = store['us_equities/stocks'].loc[:, meta_columns]

# tickers that appear both as a price column and as a metadata row
common = stocks_prices.columns.intersection(stocks_meta.index)

# keep only the shared tickers, with the price columns following the metadata order
stocks_meta = stocks_meta.loc[common]
stocks_prices = stocks_prices[stocks_meta.index]


### Get returns for different lags

In [57]:
def _clipped_monthly_return(prices, lag, cutoff):
    """Return the winsorized, per-month compounded return over `lag` periods.

    `prices` is a wide frame of prices (one column per ticker). The result is a
    long Series obtained by stacking the columns into the index.
    """
    # simple return over `lag` rows, reshaped from wide to long
    raw = prices.pct_change(lag).stack()

    # clip the outliers at the `cutoff` and `1 - cutoff` quantiles of all observations
    lower_bound = raw.quantile(cutoff)
    upper_bound = raw.quantile(1 - cutoff)
    clipped = raw.clip(lower=lower_bound, upper=upper_bound)

    # geometric mean: convert the lag-period return into a per-period rate
    return (clipped + 1) ** (1 / lag) - 1


# month-end frequency, keeping the last available price of each month
stocks_prices_monthly = stocks_prices.resample('M').last()

# return horizons in months, from one month up to one year
lags = [1,3,6,9,12]
outlier_cutoff = 0.01

# one column per horizon; every column after the first is aligned to the index
# set by the first assignment
monthly_stats = pd.DataFrame()
for lag in lags:
    monthly_stats[f'return_{lag}'] = _clipped_monthly_return(
        stocks_prices_monthly, lag, outlier_cutoff
    )

# put the ticker level first, then drop rows with a missing value for any horizon
monthly_stats = monthly_stats.swaplevel()
monthly_stats = monthly_stats.dropna()


In [49]:
# Sanity check: the price frame has exactly one column per metadata row
n_price_columns = stocks_prices.shape[1]
n_meta_rows = stocks_meta.shape[0]
assert n_price_columns == n_meta_rows
