In [4]:
import pandas as pd
from pathlib import Path
import numpy as np
from pandas_datareader.data import DataReader
import yfinance as yf
import plotly.graph_objects as go

from datetime import datetime
from omegaconf import OmegaConf

import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode
import seaborn as sns

init_notebook_mode(connected = True)
%matplotlib inline
sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')



In [5]:
dataset_paths = Path('../data/stock_market_data').glob('*/*/*.csv')

#find all datasets that are viable for ML trading
def is_viable_dataset(df):
    avg_vol_last_year = df.iloc[:-365]['Dollar Volume'].mean() 
    if len(df) > 4000 and avg_vol_last_year > 1e6:
        return True
    else:
        return False


# add dollar volume and start dataset from 2004+
def process_dataset(df):
    df['Dollar Volume'] = df.Close * df.Volume
    df.Date = pd.to_datetime(df.Date)
    df = df[df.Date >= np.datetime64('2004-01-01')]
    return df
'''
viable_datasets=[]
for path in dataset_paths:
    try:
        raw_df = pd.read_csv(path)
        df = process_dataset(raw_df)
        if is_viable_dataset(df):
            viable_datasets.append(path.stem)
    except:
        print(f'error on {path.stem}')
        
OmegaConf.save(OmegaConf.create(viable_datasets), f='../data/stock_market_data/viable_symbols.yaml')
'''


"\nviable_datasets=[]\nfor path in dataset_paths:\n    try:\n        raw_df = pd.read_csv(path)\n        df = process_dataset(raw_df)\n        if is_viable_dataset(df):\n            viable_datasets.append(path.stem)\n    except:\n        print(f'error on {path.stem}')\n        \nOmegaConf.save(OmegaConf.create(viable_datasets), f='../data/stock_market_data/viable_symbols.yaml')\n"

In [6]:
import mlfinlab as ml

#currently removes in first come first serve event order -- does this optimize number of events?
def remove_overlapping_events(raw_events):
    events=raw_events.copy()
    current_end_time = events.index[0]
    overlapped_indexes = []

    for idx, row in events.iloc[1:].iterrows():
        if row.tf < current_end_time:
            events.drop(idx,inplace=True)
        else:
            current_end_time = idx

    return events

def add_bet_results(df, events):
    df=df.dropna()
    for idx, row in events.iterrows():
        events.loc[idx, 'result'] = (df.loc[row.t1,'Close'] / df.loc[idx,'Close']) - 1
    return events
    

def fix_feature_time_window(cusum, df, window=50):
    feature_start=[]
    for t0 in cusum:
        tf_iloc = df.index.get_loc(t0) - window
        if tf_iloc > 0:
            feature_start.append(df.iloc[tf_iloc].name)
        else:
            cusum=cusum.drop(t0)
    feature_start=pd.Series(feature_start, name='tf', index=cusum)
    return cusum, feature_start

def get_training_data(stock):

    df = DataReader(stock, data_source='yahoo', start='2010-01-01', end=datetime.now())
    df=df.sort_index(ascending=True)
    vol = ml.util.get_garman_class_vol(df.Open,df.High,df.Low,df.Close)

    cusum = ml.filters.cusum_filter(np.log(df['Close']), threshold=vol*2)
    cusum, feature_start = fix_feature_time_window(cusum, df)
    vertical_barriers = ml.labeling.add_vertical_barrier(t_events=cusum, close=df.Close, num_days=21)
    raw_events = ml.labeling.get_events(close=df['Close'],
                                                t_events=cusum,
                                                pt_sl=[1,1.5],
                                                target=vol*4,
                                                min_ret=0.02,
                                                num_threads=3,
                                                vertical_barrier_times=vertical_barriers,
                                                verbose=False)
    raw_events=raw_events.dropna()
    raw_events['tf']=feature_start
    events = remove_overlapping_events(raw_events)
    events = add_bet_results(df, events)
    events['symbol'] = stock
    events.index=events.index.set_names('t0')
    events=events.reset_index()
    

    return events


viable_symbols = OmegaConf.load('../data/viable_symbols.yaml')
viable_symbols=['AAPL', 'GOOG', 'MSTF']
events=None

for symbol in viable_symbols:
    try:
        if events is not None:
            new_events = get_training_data(symbol)
            events=events.append(new_events,ignore_index=True)
        else:
            events = get_training_data(symbol)
        events.to_csv(f'../data/v1/events.csv', index=False)
        print(f'saved: {symbol}')
    except:
        print(f'error: {symbol}')


saved: AAPL
saved: GOOG
error: MSTF






In [10]:
import tsfresh
from tsfresh import extract_features, select_features, extract_relevant_features
from tsfresh.utilities.dataframe_functions import roll_time_series, make_forecasting_frame
from tsfresh.utilities.dataframe_functions import impute


events = pd.read_csv('../data/v1/events.csv')
symbols = events.symbol.unique()

feature_input_dict = {}
for  symbol in symbols: 
    symbol_prices =  DataReader(symbol, data_source='yahoo', start='2004-01-01', end=datetime.now()).filter(['Close'])
    symbol_events = events[events.symbol == symbol]

    for idx, row in events.iterrows():
        feature_input = symbol_prices.loc[row.tf:row.t0]
        id = f'{symbol}_{idx}'
        feature_input['id']=id
        feature_input_dict[id] = feature_input.reset_index()
        






A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [11]:
X = extract_features(feature_input_dict, 
                column_id="id", column_sort="Date", column_value="Close", 
                impute_function=None, show_warnings=False)


Feature Extraction: 100%|██████████| 20/20 [00:20<00:00,  1.03s/it]


In [12]:
X

Unnamed: 0,AAPL_0__variance_larger_than_standard_deviation,AAPL_0__has_duplicate_max,AAPL_0__has_duplicate_min,AAPL_0__has_duplicate,AAPL_0__sum_values,AAPL_0__abs_energy,AAPL_0__mean_abs_change,AAPL_0__mean_change,AAPL_0__mean_second_derivative_central,AAPL_0__median,...,GOOG_99__permutation_entropy__dimension_6__tau_1,GOOG_99__permutation_entropy__dimension_7__tau_1,GOOG_99__query_similarity_count__query_None__threshold_0.0,"GOOG_99__matrix_profile__feature_""min""__threshold_0.98","GOOG_99__matrix_profile__feature_""max""__threshold_0.98","GOOG_99__matrix_profile__feature_""mean""__threshold_0.98","GOOG_99__matrix_profile__feature_""median""__threshold_0.98","GOOG_99__matrix_profile__feature_""25""__threshold_0.98","GOOG_99__matrix_profile__feature_""75""__threshold_0.98",GOOG_99__mean_n_absolute_max__number_of_maxima_7
AAPL_0,0.0,0.0,0.0,0.0,379.980358,2838.99323,0.102557,0.013757,0.001239,7.418571,...,,,,,,,,,,
AAPL_1,,,,,,,,,,,...,,,,,,,,,,
AAPL_10,,,,,,,,,,,...,,,,,,,,,,
AAPL_11,,,,,,,,,,,...,,,,,,,,,,
AAPL_12,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GOOG_95,,,,,,,,,,,...,,,,,,,,,,
GOOG_96,,,,,,,,,,,...,,,,,,,,,,
GOOG_97,,,,,,,,,,,...,,,,,,,,,,
GOOG_98,,,,,,,,,,,...,,,,,,,,,,
