#Load Data

In [66]:
import pandas as pd

df = pd.read_csv('history.csv')

print(df)

        Symbol                 Date       Open      Close       High  \
0          RPD  2018-01-02 00:00:00  18.660000  19.010000  19.090000   
1          RPD  2018-01-03 00:00:00  19.040001  19.350000  19.650000   
2          RPD  2018-01-04 00:00:00  19.389999  19.980000  20.000000   
3          RPD  2018-01-05 00:00:00  20.000000  20.010000  20.100000   
4          RPD  2018-01-08 00:00:00  20.020000  20.350000  20.500000   
...        ...                  ...        ...        ...        ...   
5177034    RNG  2023-03-14 00:00:00  32.180000  31.590000  32.680000   
5177035    OGI  2023-03-15 00:00:00   0.650000   0.640000   0.651000   
5177036    RNG  2023-03-15 00:00:00  31.320000  32.380001  32.549999   
5177037    OGI  2023-03-16 00:00:00   0.640000   0.649000   0.667000   
5177038    RNG  2023-03-16 00:00:00  32.419998  31.969999  32.580002   

               Low     Volume   AdjClose  
0        18.500000   124200.0  19.010000  
1        19.040001   204100.0  19.350000  
2     

#Generate Features

In [67]:
#Add in special date columns
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df.apply(lambda row: row['Date'].year, axis=1)
df['Month'] = df.apply(lambda row: row['Date'].month, axis=1)
df['DayOfWeek'] = df.apply(lambda row: row['Date'].weekday(), axis=1)
df['WeekOfYear'] = df.apply(lambda row: row['Date'].isocalendar()[1], axis=1)

print(df)

        Symbol       Date       Open      Close       High        Low  \
0          RPD 2018-01-02  18.660000  19.010000  19.090000  18.500000   
1          RPD 2018-01-03  19.040001  19.350000  19.650000  19.040001   
2          RPD 2018-01-04  19.389999  19.980000  20.000000  19.389999   
3          RPD 2018-01-05  20.000000  20.010000  20.100000  19.719999   
4          RPD 2018-01-08  20.020000  20.350000  20.500000  19.950001   
...        ...        ...        ...        ...        ...        ...   
5177034    RNG 2023-03-14  32.180000  31.590000  32.680000  31.230000   
5177035    OGI 2023-03-15   0.650000   0.640000   0.651000   0.629000   
5177036    RNG 2023-03-15  31.320000  32.380001  32.549999  30.990000   
5177037    OGI 2023-03-16   0.640000   0.649000   0.667000   0.629000   
5177038    RNG 2023-03-16  32.419998  31.969999  32.580002  31.180000   

            Volume   AdjClose  Year  Month  DayOfWeek  WeekOfYear  
0         124200.0  19.010000  2018      1          1  

In [68]:
import numpy as np

def generate_cyclical_features(df, col_name, period, start_num=0):
    kwargs = {
        f'sin_{col_name}' : lambda x: np.sin(2*np.pi*(df[col_name]-start_num)/period),
        f'cos_{col_name}' : lambda x: np.cos(2*np.pi*(df[col_name]-start_num)/period)    
             }
    return df.assign(**kwargs)

df = generate_cyclical_features(df, 'DayOfWeek', 7, 0)
df = generate_cyclical_features(df, 'Month', 12, 1)
df = generate_cyclical_features(df, 'WeekOfYear', 52, 0)

print(df.columns)

Index(['Symbol', 'Date', 'Open', 'Close', 'High', 'Low', 'Volume', 'AdjClose',
       'Year', 'Month', 'DayOfWeek', 'WeekOfYear', 'sin_DayOfWeek',
       'cos_DayOfWeek', 'sin_Month', 'cos_Month', 'sin_WeekOfYear',
       'cos_WeekOfYear'],
      dtype='object')


In [69]:
import holidays
us_holidays = holidays.US()

def is_holiday(date):
    date = date.replace(hour = 0)
    return 1 if (date in us_holidays) else 0

def add_holiday_col(df, holidays):
    return df.assign(is_holiday = df['Date'].apply(is_holiday))

df = add_holiday_col(df, us_holidays)

print(df.columns)

Index(['Symbol', 'Date', 'Open', 'Close', 'High', 'Low', 'Volume', 'AdjClose',
       'Year', 'Month', 'DayOfWeek', 'WeekOfYear', 'sin_DayOfWeek',
       'cos_DayOfWeek', 'sin_Month', 'cos_Month', 'sin_WeekOfYear',
       'cos_WeekOfYear', 'is_holiday'],
      dtype='object')


In [76]:
#Convert symbol to number
symbol_map = {}
symbol_map_rev = {}

symbols = df['Symbol'].unique()
for i, symbol in enumerate(symbols):
    symbol_map[symbol] = i
    symbol_map_rev[i] = symbol

df['Symbol_Num'] = df.apply(lambda row: symbol_map[row['Symbol']], axis=1)
df = df.drop('Symbol', axis=1)



#Split into Training and Test sets

Do this by breaking data up into date ranges, as we are mirroing a live setup where we have the full 
past data, and need to predict the future. We are not trying to do things like, given random selctions
of dates in the past predict a future date. 

In [78]:
train_x = df[(df['Year']==2018) & (df['Month'] < 12)]
train_y = df[(df['Year']==2018) & (df['Month'] == 12)]
train_y = train_y.groupby('Symbol_Num').agg({'Volume':["mean"]})

val_x = df[(df['Year']==2019) & (df['Month'] < 6)]
val_y = df[(df['Year']==2019) & (df['Month'] == 6)]
val_y = val_y.groupby('Symbol_Num').agg({'Volume':["mean"]})

test_x = df[(df['Year']==2019) & (df['Month'] < 12) & (df['Month'] > 6)]
test_y = df[(df['Year']==2019) & (df['Month'] == 12)]
test_y = test_y.groupby('Symbol_Num').agg({'Volume':["mean"]})

cols_to_drop = ['Month', 'Date', 'DayOfWeek', 'WeekOfYear']
train_x = train_x.drop(cols_to_drop, axis=1)
test_x = test_x.drop(cols_to_drop, axis=1)
val_x = val_x.drop(cols_to_drop, axis=1)

train_x.columns


Index(['Open', 'Close', 'High', 'Low', 'Volume', 'AdjClose', 'Year',
       'sin_DayOfWeek', 'cos_DayOfWeek', 'sin_Month', 'cos_Month',
       'sin_WeekOfYear', 'cos_WeekOfYear', 'is_holiday', 'Symbol_Num'],
      dtype='object')

#Scale the data

In [79]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_arr = scaler.fit_transform(train_x)
X_val_arr = scaler.transform(val_x)
X_test_arr = scaler.transform(test_x)

y_train_arr = scaler.fit_transform(train_y)
y_val_arr = scaler.transform(val_y)
y_test_arr = scaler.transform(test_y)


#Create Dataloaders

In [81]:

from torch.utils.data import TensorDataset, DataLoader
import torch

batch_size = 64

train_features = torch.Tensor(X_train_arr)
train_targets = torch.Tensor(y_train_arr)
val_features = torch.Tensor(X_val_arr)
val_targets = torch.Tensor(y_val_arr)
test_features = torch.Tensor(X_test_arr)
test_targets = torch.Tensor(y_test_arr)

train = TensorDataset(train_features, train_targets)
val = TensorDataset(val_features, val_targets)
test = TensorDataset(test_features, test_targets)

train_loader = DataLoader(train, batch_size=batch_size, shuffle=False, drop_last=True)
val_loader = DataLoader(val, batch_size=batch_size, shuffle=False, drop_last=True)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False, drop_last=True)
test_loader_one = DataLoader(test, batch_size=1, shuffle=False, drop_last=True)

AssertionError: Size mismatch between tensors