#Load Data

In [66]:
import pandas as pd

df = pd.read_csv('history.csv')

print(df)

        Symbol                 Date       Open      Close       High  \
0          RPD  2018-01-02 00:00:00  18.660000  19.010000  19.090000   
1          RPD  2018-01-03 00:00:00  19.040001  19.350000  19.650000   
2          RPD  2018-01-04 00:00:00  19.389999  19.980000  20.000000   
3          RPD  2018-01-05 00:00:00  20.000000  20.010000  20.100000   
4          RPD  2018-01-08 00:00:00  20.020000  20.350000  20.500000   
...        ...                  ...        ...        ...        ...   
5177034    RNG  2023-03-14 00:00:00  32.180000  31.590000  32.680000   
5177035    OGI  2023-03-15 00:00:00   0.650000   0.640000   0.651000   
5177036    RNG  2023-03-15 00:00:00  31.320000  32.380001  32.549999   
5177037    OGI  2023-03-16 00:00:00   0.640000   0.649000   0.667000   
5177038    RNG  2023-03-16 00:00:00  32.419998  31.969999  32.580002   

               Low     Volume   AdjClose  
0        18.500000   124200.0  19.010000  
1        19.040001   204100.0  19.350000  
2     

#Generate Features

In [67]:
#Add in special date columns
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df.apply(lambda row: row['Date'].year, axis=1)
df['Month'] = df.apply(lambda row: row['Date'].month, axis=1)
df['DayOfWeek'] = df.apply(lambda row: row['Date'].weekday(), axis=1)
df['WeekOfYear'] = df.apply(lambda row: row['Date'].isocalendar()[1], axis=1)

print(df)

        Symbol       Date       Open      Close       High        Low  \
0          RPD 2018-01-02  18.660000  19.010000  19.090000  18.500000   
1          RPD 2018-01-03  19.040001  19.350000  19.650000  19.040001   
2          RPD 2018-01-04  19.389999  19.980000  20.000000  19.389999   
3          RPD 2018-01-05  20.000000  20.010000  20.100000  19.719999   
4          RPD 2018-01-08  20.020000  20.350000  20.500000  19.950001   
...        ...        ...        ...        ...        ...        ...   
5177034    RNG 2023-03-14  32.180000  31.590000  32.680000  31.230000   
5177035    OGI 2023-03-15   0.650000   0.640000   0.651000   0.629000   
5177036    RNG 2023-03-15  31.320000  32.380001  32.549999  30.990000   
5177037    OGI 2023-03-16   0.640000   0.649000   0.667000   0.629000   
5177038    RNG 2023-03-16  32.419998  31.969999  32.580002  31.180000   

            Volume   AdjClose  Year  Month  DayOfWeek  WeekOfYear  
0         124200.0  19.010000  2018      1          1  

In [68]:
import numpy as np

def generate_cyclical_features(df, col_name, period, start_num=0):
    kwargs = {
        f'sin_{col_name}' : lambda x: np.sin(2*np.pi*(df[col_name]-start_num)/period),
        f'cos_{col_name}' : lambda x: np.cos(2*np.pi*(df[col_name]-start_num)/period)    
             }
    return df.assign(**kwargs)

df = generate_cyclical_features(df, 'DayOfWeek', 7, 0)
df = generate_cyclical_features(df, 'Month', 12, 1)
df = generate_cyclical_features(df, 'WeekOfYear', 52, 0)

print(df.columns)

Index(['Symbol', 'Date', 'Open', 'Close', 'High', 'Low', 'Volume', 'AdjClose',
       'Year', 'Month', 'DayOfWeek', 'WeekOfYear', 'sin_DayOfWeek',
       'cos_DayOfWeek', 'sin_Month', 'cos_Month', 'sin_WeekOfYear',
       'cos_WeekOfYear'],
      dtype='object')


In [69]:
import holidays
us_holidays = holidays.US()

def is_holiday(date):
    date = date.replace(hour = 0)
    return 1 if (date in us_holidays) else 0

def add_holiday_col(df, holidays):
    return df.assign(is_holiday = df['Date'].apply(is_holiday))

df = add_holiday_col(df, us_holidays)

print(df.columns)

Index(['Symbol', 'Date', 'Open', 'Close', 'High', 'Low', 'Volume', 'AdjClose',
       'Year', 'Month', 'DayOfWeek', 'WeekOfYear', 'sin_DayOfWeek',
       'cos_DayOfWeek', 'sin_Month', 'cos_Month', 'sin_WeekOfYear',
       'cos_WeekOfYear', 'is_holiday'],
      dtype='object')


In [76]:
#Convert symbol to number
symbol_map = {}
symbol_map_rev = {}

symbols = df['Symbol'].unique()
for i, symbol in enumerate(symbols):
    symbol_map[symbol] = i
    symbol_map_rev[i] = symbol

df['Symbol_Num'] = df.apply(lambda row: symbol_map[row['Symbol']], axis=1)
df = df.drop('Symbol', axis=1)



#Split into Training and Test sets

Do this by breaking data up into date ranges, as we are mirroing a live setup where we have the full 
past data, and need to predict the future. We are not trying to do things like, given random selctions
of dates in the past predict a future date. 

In [136]:
# cols_to_drop = ['Month', 'Date', 'DayOfWeek', 'WeekOfYear']

# train_x = df[(df['Year']==2018) & (df['Month'] < 12)]
# i1 = train_x.set_index(['Symbol_Num']).index
# train_x = train_x.drop(cols_to_drop, axis=1)
# train_y = df[(df['Year']==2018) & (df['Month'] == 12)]
# i2 = train_y.set_index(['Symbol_Num']).index
# train_y = train_y[i2.isin(i1)]
# train_y = train_y.groupby('Symbol_Num').agg({'Volume':["mean"]})

# print(len(train_x), len(train_y))

# val_x = df[(df['Year']==2019) & (df['Month'] < 6)]
# i1 = val_x.set_index(['Symbol_Num']).index
# val_x = val_x.drop(cols_to_drop, axis=1)
# val_y = df[(df['Year']==2019) & (df['Month'] == 6)]
# i2 = val_y.set_index(['Symbol_Num']).index
# val_y = val_y[i2.isin(i1)]
# val_y = val_y.groupby('Symbol_Num').agg({'Volume':["mean"]})

# print(len(val_x), len(val_y))


# test_x = df[(df['Year']==2019) & (df['Month'] < 12) & (df['Month'] > 6)]
# i1 = test_x.set_index(['Symbol_Num']).index
# test_x = test_x.drop(cols_to_drop, axis=1).groupby('Symbol_Num')
# test_y = df[(df['Year']==2019) & (df['Month'] == 12)]
# i2 = test_y.set_index(['Symbol_Num']).index
# test_y = test_y[i2.isin(i1)]
# test_y = test_y.groupby('Symbol_Num').agg({'Volume':["mean"]})

# print(len(test_x), len(test_y))


750141 3332
351138 3421
372819 3523


In [145]:
cols_to_drop = ['Month', 'Date', 'DayOfWeek', 'WeekOfYear']

train_x = df[(df['Year']==2018) & (df['Month'] < 12)].drop(cols_to_drop, axis=1)
train_y = df[(df['Year']==2018) & (df['Month'] == 12)].drop(cols_to_drop, axis=1)

val_x = df[(df['Year']==2019) & (df['Month'] < 6)].drop(cols_to_drop, axis=1)
val_y = df[(df['Year']==2019) & (df['Month'] == 6)].drop(cols_to_drop, axis=1)

test_x = df[(df['Year']==2019) & (df['Month'] < 12) & (df['Month'] > 6)].drop(cols_to_drop, axis=1)
test_y = df[(df['Year']==2019) & (df['Month'] == 12)].drop(cols_to_drop, axis=1)


#Scale the data

In [163]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# t = scaler.fit_transform(train_x)
# print(t.shape)
#X_train_arr = transfrom_group(train_x, scaler)
# X_val_arr = transfrom_group(val_x, scaler)
# X_test_arr = transfrom_group(test_x, scaler)


# y_train_arr = scaler.fit_transform(train_y)
# y_val_arr = scaler.transform(val_y)
# y_test_arr = scaler.transform(test_y)

# print(len(X_train_arr), len(y_train_arr))
# print(len(X_val_arr), len(y_val_arr))
# print(len(X_test_arr), len(y_test_arr))

#TODO: This doesnt work right as valuations and test will not have same symbol num,
#we probably need to figure out the right way to encode the symbol

train_x_scaled = pd.DataFrame(scaler.fit_transform(train_x),columns = train_x.columns)
train_y_scaled = pd.DataFrame(scaler.transform(train_y),columns = train_y.columns)

val_x_scaled = pd.DataFrame(scaler.fit_transform(val_x),columns = val_x.columns)
val_y_scaled = pd.DataFrame(scaler.transform(val_y),columns = val_y.columns)

test_x_scaled = pd.DataFrame(scaler.fit_transform(test_x),columns = test_x.columns)
test_y_scaled = pd.DataFrame(scaler.transform(test_y),columns = test_y.columns)



#Create Dataloaders

In [245]:
from torch.utils.data import DataLoader, Dataset
import torch
from torch.nn.utils.rnn import pad_sequence

# Create a custom dataset class
class TimeSeriesDataset(Dataset):
    def __init__(self, x_data, y_data):
        self.x_data = x_data
        y = y_data.groupby('Symbol_Num').agg({'Volume':["mean"]})
        self.y_data = dict(zip(y.index, y[('Volume', 'mean')]))

        symbols = x_data.groupby('Symbol_Num').groups.keys()
        y_symbols = y_data.groupby('Symbol_Num').groups
        self.symbol_nums = {}
        count = 0
        for key in symbols:
            if key in y_symbols:
                self.symbol_nums[count] = key
                count += 1
        # y = set(y_symbols.keys())
        # x = set(self.symbol_nums.values())
        # print(len(x.intersection(y)))
        # print(-1.7371406599594579 in y)
        self.len = count
        
    def __len__(self):
        return self.len
        
    def __getitem__(self, idx):
        symbol = self.symbol_nums[idx]
        y = self.y_data[symbol]
        x_data = self.x_data[self.x_data["Symbol_Num"]==symbol]
        t = torch.tensor(x_data.values)
        return t, y, len(x_data)

train_dataset = TimeSeriesDataset(train_x_scaled, train_y_scaled)
val_dataset = TimeSeriesDataset(val_x_scaled, val_y_scaled)
test_dataset = TimeSeriesDataset(test_x_scaled, test_y_scaled)

def time_series_collate(batch):
    # Pad sequences with zeros to make them the same length
    return pad_sequence(batch, batch_first=True, padding_value=0)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=time_series_collate)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=time_series_collate)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=time_series_collate)
test_loader_one = DataLoader(test_dataset, batch_size=1, drop_last=True)



In [238]:
print(len(train_dataset))
train_dataset[0]

3332


(tensor([[-0.0210, -0.0213, -0.0202,  ...,  1.6189, -0.0944, -1.7371],
         [-0.0209, -0.0213, -0.0201,  ...,  1.6189, -0.0944, -1.7371],
         [-0.0209, -0.0212, -0.0201,  ...,  1.6189, -0.0944, -1.7371],
         ...,
         [-0.0197, -0.0198, -0.0189,  ...,  1.4588, -0.0944, -1.7371],
         [-0.0195, -0.0198, -0.0188,  ...,  1.4588, -0.0944, -1.7371],
         [-0.0194, -0.0197, -0.0187,  ...,  1.4588, -0.0944, -1.7371]],
        dtype=torch.float64),
 -0.1289282317971904,
 232)

# Define model

In [241]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class GRUModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, dropout_prob):
        super(GRUModel, self).__init__()

        # Defining the number of layers and the nodes in each layer
        self.layer_dim = layer_dim
        self.hidden_dim = hidden_dim

        # GRU layers
        self.gru = nn.GRU(
            input_dim, hidden_dim, layer_dim, batch_first=True, dropout=dropout_prob
        )

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths):
        # Initializing hidden state for first input with zeros
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        #unpad 
        x = pack_padded_sequence(x, lengths)

        # Forward propagation by passing in the input and hidden state into the model
        out, _ = self.gru(x, h0.detach())

        #pad back to expected length
        out, lengths = pad_packed_sequence(out)

        # Reshaping the outputs in the shape of (batch_size, seq_length, hidden_size)
        # so that it can fit into the fully connected layer
        out = out[:, -1, :]

        # Convert the final state to our desired output shape (batch_size, output_dim)
        out = self.fc(out)

        return out

In [252]:
from datetime import datetime
import matplotlib.pyplot as plt

device = "cpu"

class Optimization:
    def __init__(self, model, loss_fn, optimizer):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.train_losses = []
        self.val_losses = []
    
    def train_step(self, x, y, lengths):
        # Sets model to train mode
        self.model.train()

        # Makes predictions
        yhat = self.model(x, lengths)

        # Computes loss
        loss = self.loss_fn(y, yhat)

        # Computes gradients
        loss.backward()

        # Updates parameters and zeroes gradients
        self.optimizer.step()
        self.optimizer.zero_grad()

        # Returns the loss
        return loss.item()

    def train(self, train_loader, val_loader, batch_size=64, n_epochs=50, n_features=1):
        model_path = f'models/{self.model}_{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}'

        for epoch in range(1, n_epochs + 1):
            batch_losses = []
            for x_batch, y_batch, lengths in train_loader:
                x_batch = x_batch.view([batch_size, -1, n_features]).to(device)
                y_batch = y_batch.to(device)
                loss = self.train_step(x_batch, y_batch, lengths)
                batch_losses.append(loss)
            training_loss = np.mean(batch_losses)
            self.train_losses.append(training_loss)

            with torch.no_grad():
                batch_val_losses = []
                for x_val, y_val, lengths in val_loader:
                    x_val = x_val.view([batch_size, -1, n_features]).to(device)
                    y_val = y_val.to(device)
                    self.model.eval()
                    yhat = self.model(x_val, lengths)
                    val_loss = self.loss_fn(y_val, yhat).item()
                    batch_val_losses.append(val_loss)
                validation_loss = np.mean(batch_val_losses)
                self.val_losses.append(validation_loss)

            if (epoch <= 10) | (epoch % 50 == 0):
                print(
                    f"[{epoch}/{n_epochs}] Training loss: {training_loss:.4f}\t Validation loss: {validation_loss:.4f}"
                )

        torch.save(self.model.state_dict(), model_path)

    def evaluate(self, test_loader, batch_size=1, n_features=1):
        with torch.no_grad():
            predictions = []
            values = []
            for x_test, y_test, lengths in test_loader:
                x_test = x_test.view([batch_size, -1, n_features]).to(device)
                y_test = y_test.to(device)
                self.model.eval()
                yhat = self.model(x_test, lengths)
                predictions.append(yhat.to(device).detach().numpy())
                values.append(y_test.to(device).detach().numpy())

        return predictions, values
    
    def plot_losses(self):
        plt.plot(self.train_losses, label="Training loss")
        plt.plot(self.val_losses, label="Validation loss")
        plt.legend()
        plt.title("Losses")
        plt.show()
        plt.close()

In [244]:

def get_model(model, model_params):
    models = {
        "gru": GRUModel,
    }
    return models.get(model.lower())(**model_params)

In [251]:
import torch.optim as optim

input_dim = len(train_x.columns)
output_dim = 1
hidden_dim = 64
layer_dim = 3
batch_size = 64
dropout = 0.2
n_epochs = 2
learning_rate = 1e-3
weight_decay = 1e-6

model_params = {'input_dim': input_dim,
                'hidden_dim' : hidden_dim,
                'layer_dim' : layer_dim,
                'output_dim' : output_dim,
                'dropout_prob' : dropout}

model = get_model('gru', model_params)

loss_fn = nn.MSELoss(reduction="mean")
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

opt = Optimization(model=model, loss_fn=loss_fn, optimizer=optimizer)
opt.train(train_loader, val_loader, batch_size=batch_size, n_epochs=n_epochs, n_features=input_dim)
opt.plot_losses()

predictions, values = opt.evaluate(test_loader_one, batch_size=1, n_features=input_dim)


TypeError: expected Tensor as element 0 in argument 0, but got tuple