In [80]:
import pandas as pd
import numpy as np
import os
import gc
import sys
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')


%matplotlib inline
# Let pandas render up to 100 rows / 100 columns and use a 120-character
# display width before truncating or wrapping output.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 120)

In [81]:
def get_target(df, keys):
    """Return one ticker's rows with percentage-change target columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Price table containing at least the ``key`` and ``fClose`` columns.
    keys : scalar
        The single ticker value to select from ``df["key"]``. The plural
        parameter name is kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows with three extra columns:
        ``change_one_week``, ``change_two_weeks`` and ``change_one_month``,
        i.e. ``fClose.pct_change`` over 5, 10 and 20 rows respectively.
        The input frame is not modified.

    Notes
    -----
    ``pct_change(n)`` compares each row with the row ``n`` positions earlier
    in the frame's current order.
    NOTE(review): the preview of ``price.csv`` in this notebook lists the
    newest rows first, so "earlier row" would mean a later date -- confirm
    that this direction is what the targets are meant to capture.
    """
    # Filter on the argument itself rather than on a notebook-level `key`
    # variable, and copy so the column assignments below write to an
    # independent frame instead of a slice of `df`.
    ticker_rows = df[df["key"] == keys].copy()

    close = ticker_rows["fClose"]
    ticker_rows["change_one_week"] = close.pct_change(5)
    ticker_rows["change_two_weeks"] = close.pct_change(10)
    ticker_rows["change_one_month"] = close.pct_change(20)

    return ticker_rows

In [77]:
def get_features(df, key, windows=(("1month", 20), ("2month", 40), ("3month", 60))):
    """Return one ticker's rows with rolling volatility and MA-gap features.

    Parameters
    ----------
    df : pandas.DataFrame
        Price table containing at least the ``key`` and ``fClose`` columns.
    key : scalar
        Ticker value to select from ``df["key"]``.
    windows : iterable of (str, int), optional
        ``(label, window_length_in_rows)`` pairs. For each pair two columns
        are produced: ``volatility_<label>`` and ``MA_gap_<label>``. The
        default reproduces the 20/40/60-row windows labelled
        ``1month``/``2month``/``3month``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows with the feature columns
        appended (all volatility columns first, then all MA-gap columns).
        The input frame is not modified.

    Notes
    -----
    Rolling windows cover the current row and the rows before it in the
    frame's current order; the first ``window - 1`` rows (one more for the
    volatility columns, because of the ``diff``) are NaN.
    NOTE(review): the preview of ``price.csv`` in this notebook lists the
    newest rows first, so these windows would span later dates -- confirm
    the row order before using the features for prediction.
    """
    # Copy so the column assignments below write to an independent frame
    # rather than to a slice of `df`.
    ticker_rows = df[df["key"] == key].copy()
    close = ticker_rows["fClose"]

    # Close-price volatility: rolling standard deviation of log returns.
    log_returns = np.log(close).diff()
    for label, window in windows:
        ticker_rows[f"volatility_{label}"] = log_returns.rolling(window).std()

    # Gap between the close price and its moving average, as a ratio.
    for label, window in windows:
        ticker_rows[f"MA_gap_{label}"] = close / close.rolling(window).mean()

    return ticker_rows

# Read data

In [63]:
# Load the raw historical price table; the path is relative to the
# notebook's working directory.
price = pd.read_csv("./price.csv")

# Preview the first rows. Only a cell's last expression is rendered, so the
# preview is the single trailing expression here.
price.head(3)

Unnamed: 0,change,changeOverTime,changePercent,close,fClose,fHigh,fLow,fOpen,fVolume,high,id,key,label,low,marketChangeOverTime,open,subkey,symbol,uClose,uHigh,uLow,uOpen,uVolume,updated,volume
0,0.0,0.0,0.0,123.0,123.0,124.18,122.49,123.66,75089134,124.18,HISTORICAL_PRICES,AAPL,"Apr 1, 21",122.49,0.0,123.66,,AAPL,123.0,124.18,122.49,123.66,75089134,2021-04-02 00:48:07,75089134
1,-0.85,-0.006911,-0.0069,122.15,122.15,123.52,121.15,121.65,118323826,123.52,HISTORICAL_PRICES,AAPL,"Mar 31, 21",121.15,-0.006911,121.65,,AAPL,122.15,123.52,121.15,121.65,118323826,2021-04-01 00:48:05,118323826
2,-2.25,-0.025203,-0.0184,119.9,119.9,120.4031,118.86,120.11,85671919,120.4031,HISTORICAL_PRICES,AAPL,"Mar 30, 21",118.86,-0.025203,120.11,,AAPL,119.9,120.4031,118.86,120.11,85671919,2021-03-31 01:16:10,85671919


# Check NaN percentage

In [72]:
# Build the null mask once and derive both statistics from it.
null_mask = price.isnull()
null_counts = null_mask.sum()

# Per-column missing-value count and share of rows, each sorted from most
# to least missing.
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)

# Side-by-side summary; columns are aligned on the column-name index.
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

print(missing_data.head(3))

                 Total  Percent
subkey           11788      1.0
change_one_week      0      0.0
uHigh                0      0.0


In [73]:
# Remove the `subkey` column (reported as 100% missing in the check above).
# Rebinding with errors="ignore" instead of an in-place drop keeps this cell
# safe to re-run after the column is already gone.
price = price.drop(columns=["subkey"], errors="ignore")

# Get list of tickers

In [35]:
# Distinct ticker values, in order of first appearance in the table.
keys = price["key"].drop_duplicates().tolist()

# Target creation

In [74]:
# Compute the target columns ticker by ticker. A plain for-loop keeps `key`
# bound at notebook scope during each get_target call.
target_frames = []
for key in tqdm(keys):
    target_frames.append(get_target(price, key))

# Stack the per-ticker frames into one table.
price_target = pd.concat(target_frames)
gc.collect()

# Keep only rows without any missing value, then renumber the index from 0.
price_target = price_target.dropna().reset_index(drop=True)

price_target[["change_one_week", "change_two_weeks", "change_one_month"]].head()

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,change_one_week,change_two_weeks,change_one_month
0,-0.003229,0.001559,-0.028016
1,-0.039652,-0.027773,-0.066602
2,-0.049325,-0.076558,-0.104243
3,-0.066517,-0.097401,-0.1152
4,-0.0844,-0.088777,-0.087169


# Feature Engineering

In [79]:
# Compute the rolling features ticker by ticker on top of the target table.
feature_frames = []
for key in tqdm(keys):
    feature_frames.append(get_features(price_target, key))

# Stack the per-ticker frames into one table.
price_tar_fe = pd.concat(feature_frames)
gc.collect()

# Rows inside each rolling warm-up period contain NaN features; drop them
# and renumber the index from 0.
price_tar_fe = price_tar_fe.dropna().reset_index(drop=True)

price_tar_fe.head(3)

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,change,changeOverTime,changePercent,close,fClose,fHigh,fLow,fOpen,fVolume,high,id,key,label,low,marketChangeOverTime,open,symbol,uClose,uHigh,uLow,uOpen,uVolume,updated,volume,change_one_week,change_two_weeks,change_one_month,volatility_1month,volatility_2month,volatility_3month,MA_gap_1month,MA_gap_2month,MA_gap_3month
0,-2.0,-0.08122,-0.0174,113.01,112.647,112.9112,109.9432,110.1426,165944820,113.275,HISTORICAL_PRICES,AAPL,"Aug 12, 20",110.2975,-0.08122,110.4975,AAPL,452.04,453.1,441.19,441.99,41486205,2021-02-05 02:17:34,165944820,-0.023314,-0.106799,-0.00423,0.033604,0.029159,0.0289,0.932013,0.965962,0.968603
1,-3.635,-0.110772,-0.0322,109.375,109.0237,112.1212,108.7562,111.6091,187902376,112.4825,HISTORICAL_PRICES,AAPL,"Aug 11, 20",109.1067,-0.110772,111.9688,AAPL,437.5,449.93,436.4267,447.875,46975594,2021-02-05 02:16:08,187902376,-0.053542,-0.123773,-0.067721,0.033386,0.029601,0.028858,0.905,0.936033,0.938195
2,3.3525,-0.083516,0.0307,112.7275,112.3654,113.4096,109.6467,112.2384,212403424,113.775,HISTORICAL_PRICES,AAPL,"Aug 10, 20",110.0,-0.083516,112.6,AAPL,450.91,455.1,440.0,450.4,53100856,2021-02-05 02:16:30,212403424,-0.016404,-0.104325,-0.00082,0.033091,0.029899,0.028665,0.932775,0.964812,0.966636
