In [3]:
import yfinance as yf
import pandas as pd
import math
from tqdm import tqdm as tqdm
import pickle
import numpy as np
import random

# NOTE(review): no cell visible in this notebook reads this flag; presumably it
# was meant to gate the yfinance download cell — confirm before relying on it.
DOWNLOAD = True

In [None]:
# Symbols requested from Yahoo Finance, in download order.
# '.TO' is Yahoo Finance's suffix for Toronto-listed symbols.
tickers = [
    'TSLA', 'AAPL', 'MSFT', 'U', 'CCL', 'TD', 'SPY', 'FB', 'V', 'DIS',
    'CNR', 'HD', 'UNH', 'MCD', 'MMM', 'ATVI', 'ADBE', 'AMD', 'GOOG', 'AMZN',
    'AXP', 'BAC', 'BA', 'CVX', 'C', 'KO', 'DOW', 'GM', 'GILD', 'INTC',
    'MA', 'NVDA', 'TXN', 'XRX',
    'RY.TO', 'CP.TO', 'TRI.TO', 'ATD-B.TO', 'L.TO', 'DOL.TO', 'BB.TO',
    'DOO.TO', 'WEED.TO', 'SNC.TO', 'SHOP', 'SU.TO', 'CM.TO', 'TD.TO',
    'ENB.TO', 'APHA.TO',
    'XIU.TO',  # s&p/tsx composite 60
    'AC.TO',
]

In [None]:
# Raw price history per ticker, keyed by symbol (one network request each).
raws = {}

for symbol in tqdm(tickers):
    # Ten years of daily bars, exactly as returned by yfinance.
    raws[symbol] = yf.Ticker(symbol).history(period='10y', interval='1d')
    

In [None]:
# Per-ticker frames with the 'Dividends' and 'Stock Splits' columns removed.
# DataFrame.drop returns a new frame, so the entries in `raws` stay untouched.
col_d = {
    symbol: raws[symbol].drop(columns=['Dividends', 'Stock Splits'])
    for symbol in tqdm(tickers)
}

col_d['TSLA'].head()

In [None]:
# Per-ticker frames of day-over-day percent changes plus a scaled day-of-week column.
good = {}

for ticker in tqdm(tickers):
    df = col_d[ticker]
    # In-place on purpose: the later cell that windows col_d[ticker] then sees
    # the same NaN-free frame.
    df.dropna(inplace=True)  # drops NaN rows

    pct = df.pct_change()

    # A zero in the previous row makes pct_change yield +/-inf (e.g. the day after
    # a zero-volume session). dropna() does not treat inf as missing, so convert
    # those to NaN first; otherwise infinities end up in the saved data.
    pct = pct.replace([np.inf, -np.inf], np.nan)

    pct['DayOfWeek'] = [ts.dayofweek / 4 for ts in df.index]  # day of the week on [0,1]

    # Removes the first row (no previous day to compare against) and any row
    # that held an inf/NaN change.
    good[ticker] = pct.dropna()

# Show a small sample instead of printing the whole frame.
good['TSLA'].head()

    

In [None]:

# Serialize the percent-change frames for future use.
# The './data' directory has to exist already; open() will not create it.
with open('./data/std-stocks.pkl', 'wb') as out_file:
    pickle.dump(good, out_file, protocol=pickle.HIGHEST_PROTOCOL)

## Variables
- Open (n)
- High (n)
- Low (n)
- Close (n)
- Volume (n)
- Day of Week (excluding weekends)


In [None]:
# NOTE(review): `train_seq` and `val_seq` are not defined in any cell visible in
# this notebook; this cell looks like a leftover from an earlier approach — confirm.
with open('./data/train-sequences.pkl', 'wb') as train_out:
    pickle.dump(train_seq, train_out, protocol=pickle.HIGHEST_PROTOCOL)

# The validation sequences are written under the "test" file name.
with open('./data/test-sequences.pkl', 'wb') as val_out:
    pickle.dump(val_seq, val_out, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Sequence counts for one ticker in each split.
# NOTE(review): `train_seq` / `val_seq` are not defined in any visible cell — confirm.
n_train_apha = len(train_seq['APHA.TO'])
print(n_train_apha)
n_val_apha = len(val_seq['APHA.TO'])
print(n_val_apha)

## New Normalization Approach

Use built-in pandas methods for the z-scoring so that the normalization is as error-proof as possible (or at least so that any errors are not caused by me).

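Build sequences of 11 consecutive rows, z-score every column within each sequence, take the standardized Close of the last (11th) row as the future price (the target), and then drop that last row so that 10 input rows remain.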

In [None]:
# df is a dataframe of seq_len + 1 rows (11 with the default seq_len)
def z_score(df, seq_len=10):
    """Standardize one window of rows and split it into inputs and target.

    A 'DayOfWeek' column (0 = Monday) is added from the index and every column,
    including 'DayOfWeek', is z-scored over all rows of the window. The
    standardized Close of row ``seq_len`` becomes the target and the first
    ``seq_len`` rows become the inputs.

    NOTE(review): the mean/std are computed over the whole window, so the target
    row influences the scaling of the inputs. That matches the procedure
    described in the notebook text, so it is kept as is.

    Args:
        df: DataFrame with a datetime-like index and a 'Close' column, holding
            at least ``seq_len + 1`` rows. It is not modified.
        seq_len: number of input rows to keep; the row right after them
            provides the target. Defaults to 10.

    Returns:
        ``([inputs, future], nan_flag)`` where ``inputs`` is a 2-D array of
        shape ``(seq_len, n_columns + 1)``, ``future`` is the standardized Close
        of the target row, and ``nan_flag`` is True when the inputs or the
        target contain a NaN.

    Raises:
        IndexError: if ``df`` has fewer than ``seq_len + 1`` rows.
    """
    # Work on a private copy: callers pass .iloc slices of a larger frame, and
    # assigning columns on such a slice triggers SettingWithCopyWarning and
    # would otherwise alter the caller's object.
    df = df.copy()

    # DAY OF WEEK ALSO GETS STANDARDIZED
    df['DayOfWeek'] = [ts.dayofweek for ts in df.index]

    for column in df.columns:
        std = df[column].std()

        # if all values are the same, then std=0, so dividing by std would make a NaN
        if std != 0:
            df[column] = (df[column] - df[column].mean()) / std
        else:
            df[column] = 0.0

    # Positional lookup: the index holds dates, so a bare integer key would rely
    # on the deprecated positional fallback of Series.__getitem__.
    future = df['Close'].iloc[seq_len]

    final_seq_df = df.iloc[0:seq_len]

    # Flag a NaN in the target as well; a NaN label is as unusable as a NaN input.
    nan_flag = bool(final_seq_df.isnull().values.any()) or bool(pd.isna(future))

    # return first seq_len rows of df, excluding the future row, also nan flag
    return [final_seq_df.values, future], nan_flag
    

# Rows per window: 10 input rows plus the 1 row that supplies the future price.
SEQ_ROWS = 11

# Per-ticker list of [inputs, future] sequences produced by z_score.
all_seq = {}

for ticker in tqdm(tickers):
    frame = col_d[ticker]
    sequences = []

    # A frame of n rows holds n - SEQ_ROWS + 1 complete windows (the last one
    # starts at row n - SEQ_ROWS), hence the "+ 1". A frame shorter than
    # SEQ_ROWS gives an empty range and therefore no sequences.
    for start in range(len(frame.index) - SEQ_ROWS + 1):
        sequence, nan_flag = z_score(frame.iloc[start:start + SEQ_ROWS])

        if nan_flag:
            print('Found a NaN! Dropping sequence')
        else:
            sequences.append(sequence)

    all_seq[ticker] = sequences
        
        



In [None]:
# Inspect the first TSLA sequence: a two-item list of [input rows array, future value].
all_seq['TSLA'][0]

In [None]:

train = []
test = []

# Fraction of each ticker's sequences held out for testing, taken from the end
# of that ticker's list.
RATIO = 0.05

for ticker in tqdm(tickers):
    seqs = all_seq[ticker]

    n_test = math.floor(len(seqs) * RATIO)

    # Split on an explicit boundary. Slicing with -n_test breaks when n_test is 0:
    # seqs[:-0] is empty and seqs[-0:] is the whole list, which would send every
    # sequence of a short ticker into the test set.
    boundary = len(seqs) - n_test

    train.extend(seqs[:boundary])
    test.extend(seqs[boundary:])

# NOTE(review): no random seed is set in the cells visible here, so the shuffled
# order differs from run to run.
random.shuffle(train)
random.shuffle(test)

    


In [None]:

# Persist the shuffled training sequences.
with open('./data/new-train.pkl', 'wb') as train_out:
    pickle.dump(train, train_out, protocol=pickle.HIGHEST_PROTOCOL)

# Persist the shuffled test sequences.
with open('./data/new-test.pkl', 'wb') as test_out:
    pickle.dump(test, test_out, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:


# Reload both splits from disk.
# pickle.load can execute arbitrary code, so only point it at files that were
# produced by a trusted run of this notebook.
test = []
with open('./data/new-test.pkl', 'rb') as test_in:
    test = pickle.load(test_in)

train = []
with open('./data/new-train.pkl', 'rb') as train_in:
    train = pickle.load(train_in)
    


In [None]:
# Sanity check on the reloaded test split: every sample's input block must have
# 10 rows, and every row 6 values.
for sample in test:
    window = sample[0]
    assert len(window) == 10
    assert all(len(row) == 6 for row in window)
    