## imports

In [5]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler
from torch_geometric.utils import dense_to_sparse
from torch_geometric_temporal.signal import DynamicGraphTemporalSignal
from torch_geometric_temporal.signal import temporal_signal_split

## utils

In [3]:
def transform_and_split(data):
    """Normalise node features, binarise labels and cast tensors to float64.

    ``data`` is modified in place and also returned. Despite the name, no
    train/test split happens here: the split code that used to live in this
    function was disabled, so only the transformation step remains.

    Args:
        data: graph object exposing ``x`` (array accepted by sklearn's
            ``normalize``), ``y`` and ``edge_attr`` (tensors).

    Returns:
        The same ``data`` object after transformation.
    """
    # Row-wise max normalisation of the node features, then numpy -> tensor.
    scaled_features = normalize(data.x, axis=1, norm='max')
    data.x = torch.from_numpy(scaled_features).to(torch.float64)

    # Collapse y into {0, 1} for binary classification (positive -> 1).
    # apply_ rewrites the label tensor in place before the cast.
    data.y = data.y.apply_(lambda value: 1 if value > 0 else 0).to(torch.float64)

    data.edge_attr = data.edge_attr.to(torch.float64)

    print_basic_info(data)
    return data

def print_basic_info(data):
    """Print a short structural summary of a graph object.

    Args:
        data: object exposing ``num_nodes``, ``num_edges`` and the methods
            ``has_isolated_nodes()``, ``has_self_loops()`` and
            ``is_undirected()``.

    Returns:
        None. Everything is written to stdout.
    """
    print()
    print(data)
    print('===========================================================================================================')

    num_nodes = data.num_nodes
    num_edges = data.num_edges
    # With zero (or otherwise falsy) nodes the ratio is undefined; report NaN
    # instead of letting the summary crash with ZeroDivisionError.
    avg_degree = num_edges / num_nodes if num_nodes else float('nan')

    print(f'Number of nodes: {num_nodes}')
    print(f'Number of edges: {num_edges}')
    print(f'Average node degree: {avg_degree:.2f}')
    print(f'Has isolated nodes: {data.has_isolated_nodes()}')
    print(f'Has self-loops: {data.has_self_loops()}')
    print(f'Is undirected: {data.is_undirected()}')

## get and split data

In [4]:
# Enumerate 59 consecutive calendar months as [year, month] pairs,
# starting with October 2016.
year, month = 2016, 10
month_year = []
for i in range(59):
    month_year.append([year, month])
    # Step to the next month, rolling December over into January.
    year, month = (year + 1, 1) if month == 12 else (year, month + 1)
        

In [6]:
# Load the raw stock prices, keeping only the columns used below and
# parsing "Date" as datetimes so the .dt accessors work later.
stock_csv_path = os.path.join('../data/raw', "stock", "raw.csv")
stock_df = pd.read_csv(
    stock_csv_path,
    usecols=["ticker_symbol", "Date", "Close"],
    parse_dates=["Date"],
)

In [7]:
# Peek at the first rows to check the three requested columns were loaded.
stock_df.head()

Unnamed: 0,Date,Close,ticker_symbol
0,2016-10-03,28.129999,aapl
1,2016-10-04,28.25,aapl
2,2016-10-05,28.262501,aapl
3,2016-10-06,28.4725,aapl
4,2016-10-07,28.514999,aapl


In [8]:
def _monthly_close_matrix(year, month):
    """Return the closing prices of one calendar month as a (tickers x dates) array."""
    in_month = (stock_df.Date.dt.year == year) & (stock_df.Date.dt.month == month)
    by_date = stock_df[in_month].pivot_table(
        index="Date", columns="ticker_symbol", values="Close"
    )
    # pivot_table yields dates x tickers; transpose so each row is one ticker.
    return by_date.values.T


# For every month collect the Twitter adjacency matrix and an (X, y) pair:
#   X - per-ticker daily log returns within the month
#   y - relative change of the mean close from this month to the next one
# NOTE(review): y needs the month after the last entry of month_year to be
# present in stock_df - confirm the CSV extends that far.
X_y = []
mat_list = []
directory = '../data/raw/twitter/'
for yr, month in month_year:
    mat_list.append(np.load(directory + str(yr) + '_' + str(month) + '.npy'))

    ########################################################
    # prepare X (change this if you want to add SEC emb, etc.)
    ########################################################
    X = _monthly_close_matrix(yr, month)
    # Day-over-day log returns; a leading zero column keeps one column per
    # trading day (the first day has no previous close to compare with).
    leading_zeros = np.zeros((X.shape[0], 1))
    x_normalized = np.append(leading_zeros, np.diff(np.log(X)), 1)
    X_tensor = torch.tensor(x_normalized).to(torch.float64)

    ########################################################
    # prepare y (change this if you want to change labels)
    ########################################################
    # The label month is the following calendar month.
    y_yr, y_month = (yr + 1, 1) if month == 12 else (yr, month + 1)
    next_close = _monthly_close_matrix(y_yr, y_month)
    month_mean = X.mean(1)
    y = (next_close.mean(1) - month_mean) / month_mean
    X_y.append((X_tensor, torch.tensor(y)))
    
# Convert each dense monthly adjacency matrix into sparse edge form
# (an edge_index tensor plus one attribute per edge), then hand plain
# numpy arrays to the temporal-signal container.
# Iterating over mat_list directly (rather than a fixed range(59)) keeps
# this in step with however many months were loaded.
edge_idx = []
edge_att = []
for adjacency in mat_list:
    edge_index, edge_attr = dense_to_sparse(torch.from_numpy(adjacency))
    edge_idx.append(edge_index)
    edge_att.append(edge_attr)
edge_indices = [index_tensor.numpy() for index_tensor in edge_idx]
edge_weights = [attr_tensor.numpy() for attr_tensor in edge_att]
# Build the per-month model inputs and binary labels from X_y.
# Iterating over X_y directly (rather than a fixed range(59)) avoids an
# IndexError or silently dropped months if the month list changes.
features = []
targets = []
for month_returns, month_change in X_y:
    # Row-wise max normalisation of each ticker's log returns. To feed the
    # raw log returns instead, append month_returns.numpy() unnormalised.
    features.append(normalize(month_returns.numpy(), axis=1, norm='max'))
    # Label 1 when the mean close rose into the next month, else 0.
    # NaN changes compare False and therefore become 0.
    targets.append([1 if change > 0 else 0 for change in month_change.numpy()])
targets = np.asarray(targets)

In [13]:
# Months have different numbers of trading days, so right-pad every
# (tickers x days) matrix to 23 columns using np.pad's 'mean' mode.
# NOTE(review): 23 is presumably the largest number of trading days in any
# month of the data; np.pad raises if a month is wider - confirm.
padded_features = []
for month_features in features:
    missing_days = 23 - month_features.shape[1]
    padded_features.append(
        np.pad(month_features, [(0, 0), (0, missing_days)], 'mean')
    )

# Load one SEC embedding per .npy file, in sorted filename order.
# NOTE(review): this assumes sorted filenames line up with the ticker order
# of the feature rows - verify against how the files are named.
comp_emb = []
for fp in sorted(os.listdir("../data/raw/sec/")):
    full_path = os.path.join("../data/raw", "sec", fp)
    if fp.split(".")[-1]=='npy':
        comp_emb.append(torch.from_numpy(np.load(full_path)))
comp_emb = torch.stack(comp_emb)

# The embeddings do not change over time: repeat them once per month
# (tied to the number of monthly feature matrices instead of a literal 59)
# and append them to the price features along the feature axis.
comp_emb = np.asarray([comp_emb.numpy() for _ in range(len(padded_features))])
padded_features = np.concatenate((padded_features, comp_emb), axis = 2)

In [14]:
# Wrap the monthly graphs (edges, edge weights, node features, labels) in a
# temporal signal: one snapshot per month.
temporal_signal = DynamicGraphTemporalSignal(
    edge_indices=edge_indices,
    edge_weights=edge_weights,
    features=padded_features,
    targets=targets,
)

# Hold out the last part of the snapshot sequence for testing.
# (A stray trailing "b" after this call used to make the cell a SyntaxError.)
train_dataset, test_dataset = temporal_signal_split(temporal_signal, train_ratio=0.8)

ModuleNotFoundError: No module named 'torch_geometric_temporal'

## model definition

In [11]:
import torch
import torch.nn.functional as F
from torch_geometric_temporal.nn.recurrent import DCRNN
from torch_geometric_temporal import TemporalConv
from torch_geometric_temporal import EvolveGCNO
from torch_geometric_temporal import GConvGRU
class RecurrentGCN(torch.nn.Module):
    """Per-node two-class scorer built around a single GConvGRU layer.

    ``forward`` applies GConvGRU -> ReLU -> Linear(64, 2) -> sigmoid and
    returns two scores for every node.

    NOTE(review): ``evol``, ``recurrent`` and ``dropout`` are constructed but
    never used in ``forward``; they are kept so the set of registered
    submodules and the construction order stay as before.
    NOTE(review): the training cell passes this sigmoid output to
    ``CrossEntropyLoss``, which is documented to expect unnormalised logits,
    while the evaluation cells threshold the same output at 0.5 - confirm
    this combination is intended before changing either side.

    Args:
        node_features: number of input features per node.
    """

    def __init__(self, node_features):
        super().__init__()
        self.evol = EvolveGCNO(node_features)
        self.recurrent = DCRNN(node_features, 16, 1)
        self.conv = GConvGRU(node_features, 64, 3)
        # Maps the 64 GConvGRU channels to two class scores.
        self.linear = torch.nn.Linear(64, 2)
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, x, edge_index, edge_weight):
        """Return sigmoid-squashed class scores, one row of two per node."""
        hidden = F.relu(self.conv(x, edge_index, edge_weight))
        return torch.sigmoid(self.linear(hidden))

## model training

In [None]:
from tqdm import tqdm

# 791 input features per node = 23 padded price columns plus the SEC
# embedding width (presumably 768 - confirm against the .npy files).
model = RecurrentGCN(node_features = 791)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Build the loss module once instead of once per snapshot.
# NOTE(review): the model output has already been passed through sigmoid,
# whereas CrossEntropyLoss is documented to take raw logits - confirm.
criterion = torch.nn.CrossEntropyLoss()

model.train()

for epoch in tqdm(range(200)):
    # Sum the loss over all training snapshots and take one optimiser step
    # per epoch.
    loss = 0
    for snapshot in train_dataset:
        y_hat = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        loss += criterion(y_hat, snapshot.y.long())

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

In [None]:
# Collect the model output for every test snapshot.
y_hat_l = []
model.eval()
# Inference only: skip autograd bookkeeping so no graph is built or kept
# alive for each prediction.
with torch.no_grad():
    for snapshot in test_dataset:
        y_hat = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        y_hat_l.append(y_hat)

In [None]:
# Flatten the per-snapshot outputs into one list holding, for every node of
# every test snapshot, the score in column 1 (the positive class).
# This overwrites y_hat_l, so the cell must be run exactly once after the
# inference cell.
y_hat_l = [
    node_scores[1]
    for snapshot_scores in y_hat_l
    for node_scores in np.squeeze(snapshot_scores.detach().numpy())
]

In [None]:
# Ground-truth labels of all test snapshots as one flat list of ints, in
# the same snapshot-then-node order as the flattened predictions.
true_label = [
    int(label)
    for snapshot in test_dataset
    for label in snapshot.y.detach().numpy()
]

## experiment results

In [None]:
# Area under the ROC curve of the positive-class scores on the test set.
fpr, tpr, thresholds = metrics.roc_curve(true_label, np.array(y_hat_l))
metrics.auc(fpr, tpr)

In [None]:
# Turn the scores into hard labels at a 0.5 threshold and print per-class
# precision, recall and F1.
y_true = true_label
y_hat_list = [int(score > 0.5) for score in y_hat_l]
target_names = ['class 0', 'class 1']
report = metrics.classification_report(y_true, y_hat_list, target_names=target_names)
print(report)