In [2]:
import yfinance as yf
import pandas as pd
import math
from tqdm import tqdm as tqdm
import pickle
import numpy as np
import random
from multiprocessing import Process, Pool

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

# NOTE(review): DOWNLOAD is not read by any cell visible here — confirm whether it is still needed.
DOWNLOAD = True

In [3]:
# Symbols to download, in the notation passed to yfinance (52 entries, order preserved).
tickers = [
    'TSLA', 'AAPL', 'MSFT', 'U', 'CCL', 'TD', 'SPY', 'FB',
    'V', 'DIS', 'CNR', 'HD', 'UNH', 'MCD', 'MMM', 'ATVI',
    'ADBE', 'AMD', 'GOOG', 'AMZN', 'AXP', 'BAC', 'BA', 'CVX',
    'C', 'KO', 'DOW', 'GM', 'GILD', 'INTC', 'MA', 'NVDA',
    'TXN', 'XRX',
    'RY.TO', 'CP.TO', 'TRI.TO', 'ATD-B.TO', 'L.TO', 'DOL.TO',
    'BB.TO', 'DOO.TO', 'WEED.TO', 'SNC.TO', 'SHOP', 'SU.TO',
    'CM.TO', 'TD.TO', 'ENB.TO', 'APHA.TO',
    'XIU.TO',  # s&p/tsx composite 60
    'AC.TO',
]

In [4]:
# Fetch ten years of daily history for every symbol, keyed by ticker.
raws = {
    ticker: yf.Ticker(ticker).history(period = '10y', interval = '1d')
    for ticker in tqdm(tickers)
}
    

100%|██████████| 52/52 [00:17<00:00,  3.04it/s]


In [5]:
# Per-ticker frames with the 'Dividends' and 'Stock Splits' columns removed.
col_d = {
    ticker: raws[ticker].drop(columns=['Dividends', 'Stock Splits'])
    for ticker in tqdm(tickers)
}

col_d['TSLA'].head()

100%|██████████| 52/52 [00:00<00:00, 859.97it/s]


Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-01-28,4.976,4.976,4.75,4.802,5242000
2011-01-31,4.81,4.824,4.7,4.82,4151500
2011-02-01,4.862,4.946,4.708,4.782,3539000
2011-02-02,4.832,4.836,4.734,4.788,2847500
2011-02-03,4.764,4.78,4.63,4.726,2560000


In [10]:
# Build a list of every per-ticker frame; as the cell's last expression it is echoed in the output.
list(col_d.values())

[                  Open        High         Low       Close    Volume
 Date                                                                
 2011-01-28    4.976000    4.976000    4.750000    4.802000   5242000
 2011-01-31    4.810000    4.824000    4.700000    4.820000   4151500
 2011-02-01    4.862000    4.946000    4.708000    4.782000   3539000
 2011-02-02    4.832000    4.836000    4.734000    4.788000   2847500
 2011-02-03    4.764000    4.780000    4.630000    4.726000   2560000
 ...                ...         ...         ...         ...       ...
 2021-01-22  834.309998  848.000000  828.619995  846.640015  20066500
 2021-01-25  855.000000  900.400024  838.820007  880.799988  41173400
 2021-01-26  891.380005  895.900024  871.599976  883.090027  23131600
 2021-01-27  870.349976  891.500000  858.659973  864.159973  26782300
 2021-01-28  820.000000  847.996582  801.000000  839.969971  18734504
 
 [2517 rows x 5 columns],
                   Open        High         Low       Close   

In [None]:

# Serialize `good` to ./data/std-stocks.pkl for future use.
# NOTE(review): `good` is not defined in any cell visible here — confirm where it comes from.

with open('./data/std-stocks.pkl', 'wb') as f:
    pickle.dump(good, f, protocol=pickle.HIGHEST_PROTOCOL)

## Variables
- Open (n)
- High (n)
- Low (n)
- Close (n)
- Volume (n)
- Day of Week (excluding weekends)


In [None]:
# Pickle the two sequence collections to ./data.
# NOTE(review): `train_seq` and `val_seq` are not defined in any cell visible here, and
# `val_seq` is written to a file named 'test-sequences.pkl' — confirm both are intended.
with open('./data/train-sequences.pkl', 'wb') as f:
    pickle.dump(train_seq, f, protocol=pickle.HIGHEST_PROTOCOL)
    
with open('./data/test-sequences.pkl', 'wb') as f:
    pickle.dump(val_seq, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Print the number of entries stored under the 'APHA.TO' key in each collection.
print(len(train_seq['APHA.TO']))
print(len(val_seq['APHA.TO']))

## New Normalization Approach

Use built-in pandas methods for the z-scoring so that the normalization is as error-proof as possible (or at least free of errors introduced by me).

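Build sequences of 11 rows, z-score them, take the close of the last row as the future price, and then drop that last row. (Note: the implementation below instead uses 10-row windows plus three future closes, which are scaled with the window's mean and standard deviation but excluded from computing them.)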

In [12]:
def z_score(df, f1, f2, f3):
    """Z-score one window of price rows and scale three future closes with it.

    A `DayOfWeek` column (read from the index) is appended and, like every
    other column, standardized with that column's own mean and sample
    standard deviation over the window. The future closes are scaled with the
    window's `Close` statistics but do not contribute to them.

    Args:
        df: Window of rows whose index entries expose `.dayofweek` and which
            has a `Close` column. It is copied first, so the caller's frame
            is left untouched.
        f1, f2, f3: Future closing prices to express in the window's scale.

    Returns:
        Tuple `(values, f1, f2, f3, nan_flag)`: `values` is the standardized
        window as a 2-D array (original columns plus `DayOfWeek`), the three
        futures are the scaled targets, and `nan_flag` is True when the
        window or any of the targets contains a NaN.
    """
    # Work on a private copy. Callers pass slices of the full history, and
    # writing columns into such a slice can modify (or warn about modifying)
    # the parent frame that later, overlapping windows are cut from.
    df = df.copy()

    # DAY OF WEEK ALSO GETS STANDARDIZED
    df['DayOfWeek'] = [stamp.dayofweek for stamp in df.index]

    for column in df.columns:
        std = df[column].std()
        mean = df[column].mean()

        # if all values are the same, then std=0, so dividing by std would make a NaN
        df[column] = (df[column] - mean) / std if std != 0 else 0

        # future values are excluded from std and mean calculations
        if column == 'Close':
            f1, f2, f3 = [(f - mean) / std if std != 0 else 0 for f in (f1, f2, f3)]

    # A NaN target is as unusable as a NaN feature, so both set the flag.
    nan_flag = bool(df.isnull().values.any() or pd.isna([f1, f2, f3]).any())

    return df.values, f1, f2, f3, nan_flag
    

#all_seq = {}

# Accumulators filled by the worker results below: one independent list per target horizon.
a1, a2, a3 = [], [], []

def sequencify(df):
    """Cut one ticker's history into z-scored windows with three look-ahead targets.

    Every run of 10 consecutive rows that is followed by at least three more
    rows becomes a sample; the closes of those three following rows are its
    targets. Windows for which `z_score` reports a NaN are skipped.

    Args:
        df: Per-ticker frame with a `Close` column
            (assumed to be ordered oldest to newest — TODO confirm).

    Returns:
        `[s1, s2, s3]`, where `sK` is a list of `[window_values, target]`
        pairs whose target is the scaled close K rows after the window. The
        same `window_values` array is shared by the three lists.
    """
    window = 10   # rows per input sequence
    horizon = 3   # number of future closes used as targets

    s1, s2, s3 = [], [], []

    # The last usable start index is len - (window + horizon), so the range
    # has to stop one past it. Stopping at len - 13 would silently discard
    # the most recent valid sample. A too-short history yields an empty range.
    for start in range(len(df.index) - (window + horizon) + 1):
        end = start + window
        sequence, f1, f2, f3, nan_flag = z_score(
            df.iloc[start:end],
            df['Close'].iloc[end],
            df['Close'].iloc[end + 1],
            df['Close'].iloc[end + 2],
        )

        if nan_flag:
            continue

        s1.append([sequence, f1])
        s2.append([sequence, f2])
        s3.append([sequence, f3])

    return [s1, s2, s3]

# Run sequencify over every per-ticker frame in a pool of 8 worker processes.
# Results are consumed as they are yielded by imap_unordered; each one carries
# all three horizons, so a1/a2/a3 stay aligned with one another.
with Pool(8) as pool:
    for horizon1, horizon2, horizon3 in pool.imap_unordered(sequencify, list(col_d.values())):
        a1.append(horizon1)
        a2.append(horizon2)
        a3.append(horizon3)
    
        


        


Found a NaN! Dropping sequence
Found a NaN! Dropping sequence
Found a NaN! Dropping sequence
Found a NaN! Dropping sequence
Found a NaN! Dropping sequence
Found a NaN! Dropping sequence
Found a NaN! Dropping sequence
Found a NaN! Dropping sequence
Found a NaN! Dropping sequence
Found a NaN! Dropping sequence


In [14]:
train1, test1 = [], []
train2, test2 = [], []
train3, test3 = [], []

# Fraction of each ticker's samples held out for testing (test : total).
RATIO = 0.05

# Split every per-ticker history: the leading part goes to train, the trailing
# `floor(len * RATIO)` samples go to test.
for histories, train_out, test_out in (
    (a1, train1, test1),
    (a2, train2, test2),
    (a3, train3, test3),
):
    for hist in histories:
        # Slice with an explicit cut index. With a negative-offset slice, a
        # history too short to yield any test sample (offset 0) would give
        # hist[:-0] == [] and hist[-0:] == hist, i.e. everything lands in test.
        cut = len(hist) - math.floor(len(hist) * RATIO)
        train_out.extend(hist[:cut])
        test_out.extend(hist[cut:])

# NOTE(review): the shuffles are unseeded, so the sample order differs between runs.
for bucket in (train1, train2, train3, test1, test2, test3):
    random.shuffle(bucket)

In [15]:

# Pickle each split to its own file under ./data (written in the listed order).
outputs = {
    './data/train1.pkl': train1,
    './data/test1.pkl': test1,
    './data/train2.pkl': train2,
    './data/test2.pkl': test2,
    './data/train3.pkl': train3,
    './data/test3.pkl': test3,
}

for path, payload in outputs.items():
    with open(path, 'wb') as f:
        pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:


# Load previously pickled test/train collections from ./data.
# NOTE(review): 'new-test.pkl' and 'new-train.pkl' are not written by any cell visible
# here — confirm they exist. Only unpickle files you trust: pickle.load can execute
# arbitrary code.
test = []
with open('./data/new-test.pkl', 'rb') as f:
    test = pickle.load(f)
    
train = []
with open('./data/new-train.pkl', 'rb') as f:
    train = pickle.load(f)
    


In [17]:
# Shape check on the first test set: each sample's sequence has 10 rows of 6 values.
for sample in test1:
    window = sample[0]
    assert len(window) == 10
    assert all(len(row) == 6 for row in window)

# The test split must be the smaller one, and sets 1 and 2 must be the same size.
assert len(train1) > len(test1)
assert len(test2) == len(test1)
assert len(train2) == len(train1)

    