In [12]:
import sys
sys.path.append('../utils')
from preprocessing import get_events, get_garman_class_vol, cusum_filter, add_vertical_barrier

In [13]:
import pandas as pd
from pathlib import Path
import numpy as np
from pandas_datareader.data import DataReader
import yfinance as yf
import plotly.graph_objects as go

from datetime import datetime
from omegaconf import OmegaConf

import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode
import seaborn as sns

# Notebook display setup: initialise plotly's offline notebook mode, render
# matplotlib figures inline, then apply the seaborn and matplotlib styles.
init_notebook_mode(connected = True)
%matplotlib inline
sns.set_style('whitegrid')
# Applied after the seaborn style, so this style sheet is the last one set.
plt.style.use('fivethirtyeight')



In [14]:
# Every CSV located exactly two directory levels below the stock data root
# (pattern '*/*/*.csv'). Path.glob returns a one-shot generator, so this can be
# iterated only once; re-run the cell to scan again.
dataset_paths = Path('../data/stock_market_data').glob('*/*/*.csv')

# Identify datasets that are viable for ML trading (enough history and enough dollar volume).
def is_viable_dataset(df, min_rows=4000, min_avg_dollar_volume=1e6, lookback_rows=365):
    """Return True when a price history is long and liquid enough to use.

    Args:
        df: DataFrame with a 'Dollar Volume' column. Rows are assumed to be
            ordered oldest to newest, because the trailing rows are treated
            as the most recent period.
        min_rows: The frame must contain strictly more rows than this.
        min_avg_dollar_volume: The mean 'Dollar Volume' over the lookback
            must be strictly greater than this.
        lookback_rows: Number of trailing rows to average. This counts rows,
            not calendar days.

    Returns:
        bool: True if both the length and the liquidity criteria are met.
    """
    if len(df) <= min_rows:
        return False
    # Use the *last* `lookback_rows` rows; `iloc[:-n]` would select everything
    # except the recent period.
    recent_avg_dollar_volume = df['Dollar Volume'].iloc[-lookback_rows:].mean()
    return bool(recent_avg_dollar_volume > min_avg_dollar_volume)


# Add a dollar-volume column and keep only rows dated 2004-01-01 or later.
def process_dataset(df, start_date='2004-01-01'):
    """Add a dollar-volume column, parse dates, and drop rows before `start_date`.

    The input frame is left untouched; a new, independent frame is returned.

    Args:
        df: DataFrame with 'Date', 'Close' and 'Volume' columns.
        start_date: Earliest date to keep (inclusive). Anything accepted by
            `pd.Timestamp`.

    Returns:
        DataFrame with 'Date' converted to datetimes, an added
        'Dollar Volume' column (Close * Volume), and only rows whose Date is
        on or after `start_date`. The original row index is preserved.
    """
    processed = df.assign(
        Date=pd.to_datetime(df['Date']),
        **{'Dollar Volume': df['Close'] * df['Volume']},
    )
    on_or_after_start = processed['Date'] >= pd.Timestamp(start_date)
    # .copy() so later column assignments on the result never touch `processed`.
    return processed.loc[on_or_after_start].copy()
# Disabled one-off scan, kept as a bare string literal so that it does not
# execute (the notebook echoes the string as this cell's output). When enabled
# it reads each CSV in dataset_paths, records the file stem of every dataset
# that passes is_viable_dataset, and saves that list as YAML.
# NOTE(review): the bare `except:` inside hides every failure behind a
# one-line message.
'''
viable_datasets=[]
for path in dataset_paths:
    try:
        raw_df = pd.read_csv(path)
        df = process_dataset(raw_df)
        if is_viable_dataset(df):
            viable_datasets.append(path.stem)
    except:
        print(f'error on {path.stem}')
        
OmegaConf.save(OmegaConf.create(viable_datasets), f='../data/stock_market_data/viable_symbols.yaml')
'''


"\nviable_datasets=[]\nfor path in dataset_paths:\n    try:\n        raw_df = pd.read_csv(path)\n        df = process_dataset(raw_df)\n        if is_viable_dataset(df):\n            viable_datasets.append(path.stem)\n    except:\n        print(f'error on {path.stem}')\n        \nOmegaConf.save(OmegaConf.create(viable_datasets), f='../data/stock_market_data/viable_symbols.yaml')\n"

In [15]:

# NOTE: overlaps are currently resolved first-come, first-served in event order. Open question: does this greedy choice maximize the number of retained events?
def remove_overlapping_events(raw_events):
    """Greedily drop events whose feature window starts before the last kept event.

    Events are visited in row order. The first event is always kept. Each
    later event is dropped when its 'tf' value is strictly earlier than the
    index label of the most recently kept event; otherwise it is kept and
    becomes the new reference.

    Args:
        raw_events: DataFrame indexed by event start, with a 'tf' column
            comparable to the index labels.

    Returns:
        A new DataFrame containing only the kept rows. The input is not
        modified. An empty input yields an empty copy.
    """
    if raw_events.empty:
        return raw_events.copy()

    kept_positions = [0]
    last_kept_t0 = raw_events.index[0]

    for pos, (t0, tf) in enumerate(raw_events['tf'].iloc[1:].items(), start=1):
        if tf < last_kept_t0:
            continue  # feature window reaches back past the last kept event
        kept_positions.append(pos)
        last_kept_t0 = t0

    # Select once at the end instead of dropping rows one at a time.
    return raw_events.iloc[kept_positions].copy()

def add_bet_results(df, events):
    """Attach each event's simple return between its start and its 't1' time.

    For every event the result is Close[t1] / Close[t0] - 1, where t0 is the
    event's index label and t1 comes from the 't1' column. Prices are looked
    up in `df` after rows containing NaN have been dropped.

    Args:
        df: Price DataFrame with a 'Close' column, indexed by the same kind of
            labels used for the event index and the 't1' column.
        events: DataFrame indexed by event start with a 't1' column. It is
            modified in place: a 'result' column is added or overwritten.

    Returns:
        The same `events` object, with the 'result' column populated. The
        column is created even when `events` is empty, so the schema is
        stable.

    Raises:
        KeyError: if an event start or 't1' label is missing from the cleaned
            price index.
    """
    if events.empty:
        events['result'] = pd.Series(index=events.index, dtype='float64')
        return events

    close = df.dropna()['Close']
    entry_close = close.loc[events.index.to_numpy()].to_numpy()
    exit_close = close.loc[events['t1'].to_numpy()].to_numpy()
    events['result'] = exit_close / entry_close - 1
    return events
    

def fix_feature_time_window(cusum, df, window=50):
    """Pair each event with the start of its look-back window.

    For every event label in `cusum`, the window start is the index label of
    `df` located `window` rows before the event's row. Events that do not have
    `window` rows of history before them are discarded.

    Args:
        cusum: Index of event labels; each must be present in `df.index`.
        df: DataFrame whose index defines the row positions.
        window: Number of rows to look back from each event.

    Returns:
        tuple: (`cusum` restricted to the kept events, Series named 'tf'
        holding the window-start labels and indexed by the kept events).

    Raises:
        KeyError: if an event label is not found in `df.index`.
    """
    keep = []
    feature_start = []
    for t0 in cusum:
        start_pos = df.index.get_loc(t0) - window
        # Position 0 is a valid window start; only negative positions (which
        # would wrap around in iloc) mean there is not enough history.
        has_full_window = start_pos >= 0
        keep.append(has_full_window)
        if has_full_window:
            feature_start.append(df.index[start_pos])

    # Filter once with a boolean mask instead of dropping labels one by one.
    cusum = cusum[np.array(keep, dtype=bool)]
    feature_start = pd.Series(feature_start, name='tf', index=cusum)
    return cusum, feature_start

def get_training_data(stock, start='2010-01-01', end=None, cusum_vol_mult=2,
                      target_vol_mult=4, pt_sl=(1, 1.5), min_ret=0.02,
                      vertical_barrier_days=21, num_threads=3):
    """Build the labelled, non-overlapping event table for one symbol.

    Pipeline, in order:
      1. Download prices for `stock` and sort them by date.
      2. Compute the volatility series with `get_garman_class_vol`.
      3. Sample event times with `cusum_filter` on log closes.
      4. Keep events with a full look-back window and record its start ('tf').
      5. Add vertical barriers and build events with `get_events`.
      6. Drop incomplete events, remove overlapping ones, and add the
         realised 'result' of each.

    Args:
        stock: Ticker symbol passed to the data reader.
        start: First date to download.
        end: Last date to download; defaults to the current time at call.
        cusum_vol_mult: Multiplier on the volatility series used as the
            `cusum_filter` threshold.
        target_vol_mult: Multiplier on the volatility series used as the
            `get_events` target.
        pt_sl: Two-element pair forwarded to `get_events` as `pt_sl`.
        min_ret: Forwarded to `get_events` as `min_ret`.
        vertical_barrier_days: Forwarded to `add_vertical_barrier` as
            `num_days`.
        num_threads: Forwarded to `get_events` as `num_threads`.

    Returns:
        DataFrame with one row per kept event: a 't0' column (the former
        index), the columns produced by `get_events`, plus 'tf', 'result'
        and 'symbol'.
    """
    if end is None:
        end = datetime.now()

    # NOTE(review): the 'yahoo' source of pandas_datareader may no longer be
    # served; yfinance is imported in this notebook but not used here.
    prices = DataReader(stock, data_source='yahoo', start=start, end=end)
    prices = prices.sort_index(ascending=True)
    vol = get_garman_class_vol(prices.Open, prices.High, prices.Low, prices.Close)

    cusum = cusum_filter(np.log(prices['Close']), threshold=vol * cusum_vol_mult)
    cusum, feature_start = fix_feature_time_window(cusum, prices)
    vertical_barriers = add_vertical_barrier(
        t_events=cusum, close=prices.Close, num_days=vertical_barrier_days)

    raw_events = get_events(
        close=prices['Close'],
        t_events=cusum,
        pt_sl=list(pt_sl),
        target=vol * target_vol_mult,
        min_ret=min_ret,
        num_threads=num_threads,
        vertical_barrier_times=vertical_barriers,
        verbose=False,
    )
    raw_events = raw_events.dropna()
    # Aligned on the event index, so only surviving events receive a 'tf'.
    raw_events['tf'] = feature_start

    events = remove_overlapping_events(raw_events)
    events = add_bet_results(prices, events)
    events['symbol'] = stock
    # Expose the event start as a regular 't0' column.
    events.index = events.index.set_names('t0')
    events = events.reset_index()

    return events


# The saved symbol list is loaded and then immediately replaced by a small
# hard-coded subset, so only the three symbols below are processed.
viable_symbols = OmegaConf.load('../data/viable_symbols.yaml')
viable_symbols = ['AAPL', 'GOOG', 'MSFT']
events = None

# Collect per-symbol frames and combine them with pd.concat
# (DataFrame.append was removed in pandas 2.0).
event_frames = []
for symbol in viable_symbols:
    event_frames.append(get_training_data(symbol))
    events = pd.concat(event_frames, ignore_index=True)
    # Rewrite the CSV after every symbol so a later failure keeps earlier work.
    events.to_csv('../data/v1/events.csv', index=False)
    print(f'saved: {symbol}')


saved: AAPL
saved: GOOG
saved: MSFT


In [16]:
import tsfresh
from tsfresh import extract_features, select_features, extract_relevant_features
from tsfresh.utilities.dataframe_functions import roll_time_series, make_forecasting_frame
from tsfresh.utilities.dataframe_functions import impute


events = pd.read_csv('../data/v1/events.csv')
symbols = events.symbol.unique()

# Maps '<symbol>_<event row label>' to the closing-price window of that event,
# as a frame with 'Date', 'Close' and 'id' columns.
feature_input_dict = {}
for symbol in symbols:
    symbol_prices = DataReader(symbol, data_source='yahoo', start='2004-01-01', end=datetime.now()).filter(['Close'])
    symbol_events = events[events.symbol == symbol]

    # Only this symbol's events: iterating over all of `events` would slice
    # other symbols' windows from this symbol's prices and mislabel them.
    for idx, row in symbol_events.iterrows():
        event_id = f'{symbol}_{idx}'
        # .assign returns a new frame, so no column is written onto a slice.
        feature_input = symbol_prices.loc[row.tf:row.t0].assign(id=event_id)
        feature_input_dict[event_id] = feature_input.reset_index()
        




ImportError: cannot import name 'extract_features' from 'tsfresh' (unknown location)

In [None]:
# tsfresh reads a dict input as one time-series "kind" per key, which yields a
# separate feature set for every event id. Stack the per-event windows into a
# single long frame, distinguished by the 'id' column, instead.
feature_input_long = pd.concat(list(feature_input_dict.values()), ignore_index=True)

X = extract_features(feature_input_long,
                column_id="id", column_sort="Date", column_value="Close",
                impute_function=None, show_warnings=False)


Feature Extraction: 100%|██████████| 20/20 [01:28<00:00,  4.41s/it]


MemoryError: Unable to allocate 35.4 KiB for an array with shape (4532,) and data type uint64

In [None]:
X

{'AAPL_0':                Close      id
 Date                        
 2004-09-15  0.628571  AAPL_0
 2004-09-16  0.649107  AAPL_0
 2004-09-17  0.663214  AAPL_0
 2004-09-20  0.673393  AAPL_0
 2004-09-21  0.678750  AAPL_0
 2004-09-22  0.659286  AAPL_0
 2004-09-23  0.665536  AAPL_0
 2004-09-24  0.665893  AAPL_0
 2004-09-27  0.670179  AAPL_0
 2004-09-28  0.679286  AAPL_0
 2004-09-29  0.690714  AAPL_0
 2004-09-30  0.691964  AAPL_0
 2004-10-01  0.690536  AAPL_0
 2004-10-04  0.692679  AAPL_0
 2004-10-05  0.703036  AAPL_0
 2004-10-06  0.725714  AAPL_0
 2004-10-07  0.707500  AAPL_0
 2004-10-08  0.697500  AAPL_0
 2004-10-11  0.689107  AAPL_0
 2004-10-12  0.683750  AAPL_0
 2004-10-13  0.709821  AAPL_0
 2004-10-14  0.803214  AAPL_0
 2004-10-15  0.812500  AAPL_0
 2004-10-18  0.852679  AAPL_0
 2004-10-19  0.846786  AAPL_0
 2004-10-20  0.847679  AAPL_0
 2004-10-21  0.856071  AAPL_0
 2004-10-22  0.846607  AAPL_0
 2004-10-25  0.849107  AAPL_0
 2004-10-26  0.856607  AAPL_0
 2004-10-27  0.898214  AAPL_0
