To use pytorch geometric temporal, make sure you have torch 1.9.0 installed (uninstall 1.10.0 before).

In [98]:
import torch
print(torch.__version__)

1.9.0+cpu


In [99]:
import torch
import numpy as np
import pandas as pd

In [100]:
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-geometric
!pip install torch-geometric-temporal

Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html


In [100]:
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

def transform_and_split(data):
    # Normalize node features and transform data type
    data.x = normalize(data.x, axis=1, norm='max')
    data.x = torch.from_numpy(data.x).to(torch.float64)
    data.y = data.y.apply_(lambda x:  1 if (x > 0) else 0) # Change y into {0, 1} for binary classification
    data.y = data.y.to(torch.float64)    
    data.edge_attr = data.edge_attr.to(torch.double)


    # Split into train/test set
#    split = nodeSplit(split="train_rest", num_splits = 1, num_val = 0.0, num_test= 0.2)
#    masked_data = split(data)

#    print("Training samples:", torch.sum(masked_data.train_mask).item())
#    print("Validation samples:", torch.sum(masked_data.val_mask ).item())
#    print("Test samples:", torch.sum(masked_data.test_mask ).item())
    print_basic_info(data)
    return data

In [101]:
def print_basic_info(data):
    print()
    print(data)
    print('===========================================================================================================')

    print(f'Number of nodes: {data.num_nodes}')
    print(f'Number of edges: {data.num_edges}')
    print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
    print(f'Has isolated nodes: {data.has_isolated_nodes()}')
    print(f'Has self-loops: {data.has_self_loops()}')
    print(f'Is undirected: {data.is_undirected()}')

### Get and split data

In [103]:
year = 2016
month = 10
month_year = []
for i in range(59):
    if month == 13:
        month = 1
        year += 1
    month_year.append([year, month])
    month += 1
        

In [104]:
import os
stock_df = pd.read_csv(
            os.path.join('../data/raw',"stock","raw.csv"),
            usecols=["ticker_symbol", "Date", "Close"],
            parse_dates=["Date"],
        )

In [105]:
stock_df

Unnamed: 0,Date,Close,ticker_symbol
0,2016-10-03,28.129999,aapl
1,2016-10-04,28.250000,aapl
2,2016-10-05,28.262501,aapl
3,2016-10-06,28.472500,aapl
4,2016-10-07,28.514999,aapl
...,...,...,...
36506,2021-09-27,142.250000,wmt
36507,2021-09-28,140.500000,wmt
36508,2021-09-29,140.440002,wmt
36509,2021-09-30,139.380005,wmt


In [106]:
X_y = []
mat_list = []
for yr_month in month_year:
    yr = yr_month[0]
    month = yr_month[1]
    directory = '../data/raw/twitter/'
    mat_list.append(np.load(directory+str(yr)+'_'+str(month)+'.npy'))
    ######################################################## 
    # prepare X (change this if you want to add SEC emb, etc.)
    ########################################################
    curr = stock_df[(stock_df.Date.dt.year == yr) & (stock_df.Date.dt.month == month)]
    X = curr.pivot_table(
            index="Date", columns="ticker_symbol", values="Close"
        ).values.T
    x_logret = np.diff(np.log(X))
    col_zeros = np.zeros((X.shape[0], 1))
    x_normalized = np.append(col_zeros, x_logret, 1)
    X_tensor = torch.tensor(x_normalized)
    X_tensor = X_tensor.to(torch.float64)
    
    ########################################################
    # prepare y (change this if you want to change labels)
    ########################################################
    if month == 12:
        y_yr = yr+1
        y_month = 1
    else:
        y_yr = yr
        y_month = month + 1
    
    nxt = stock_df[(stock_df.Date.dt.year == y_yr) & (stock_df.Date.dt.month == y_month)]
    y = nxt.pivot_table(
            index="Date", columns="ticker_symbol", values="Close"
        ).values.T
    y = (y.mean(1) - X.mean(1)) / X.mean(1)
    y_tensor = torch.tensor(y)
    X_y.append((X_tensor,y_tensor))

In [59]:
from torch_geometric.utils import dense_to_sparse
edge_idx = []
edge_att = []
for i in range(59):
    edge_index, edge_attr = dense_to_sparse(torch.from_numpy(mat_list[i]))
    edge_idx.append(edge_index)
    edge_att.append(edge_attr)

In [60]:
edge_indices = [i.numpy() for i in edge_idx]
edge_weights = [i.numpy() for i in edge_att]

In [61]:
features = []
targets = []
for i in range(59):
    features.append(normalize(X_y[i][0].numpy(), axis=1, norm='max'))
    #features.append(X_y[i][0].numpy())
    targets.append([1 if a > 0 else 0 for a in X_y[i][1].numpy()])
targets = np.asarray(targets)

In [62]:
targets[3]

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1])

In [63]:
mat_list

[array([[   0,    0,    2,  651,   61,    6,    0,    3,   29,  145,   65,
           77,    2,    1,    0,    0,  424,   22, 1906,    0,    9,   16,
           32,    1,    0, 4962,   21,    8,    0],
        [   0,    0,    2,  651,   61,    6,    0,    3,   29,  145,   65,
           77,    2,    1,    0,    0,  424,   22, 1906,    0,    9,   16,
           32,    1,    0, 4962,   21,    8,    0],
        [   2,    2,    2,  653,   63,    8,    2,    5,   30,  147,   67,
           79,    4,    3,    2,    2,  426,   24, 1908,    2,   11,   18,
           34,    3,    2, 4962,   23,   10,    2],
        [ 651,  651,  653,  651,  704,  657,  651,  652,  671,  786,  709,
          722,  653,  652,  651,  651,  997,  672, 2537,  651,  660,  661,
          682,  652,  651, 5342,  672,  651,  651],
        [  61,   61,   63,  704,   61,   67,   61,   64,   90,  199,  126,
          136,   63,   62,   61,   61,  483,   81, 1966,   61,   70,   77,
           93,   62,   61, 4995,   82,   6

In [64]:
features[1].shape

(29, 21)

In [65]:
padded_features = []
for i in features:
    padded_features.append(np.pad(i, [(0, 0), (0, 23-i.shape[1])], 'mean'))

In [66]:
comp_emb = []
for fp in sorted(os.listdir("../data/raw/sec/")):
    full_path = os.path.join("../data/raw", "sec", fp)
    if fp.split(".")[-1]=='npy':
        comp_emb.append(torch.from_numpy(np.load(full_path)))
comp_emb = torch.stack(comp_emb)

In [67]:
comp_emb = np.asarray([comp_emb.numpy() for i in range(59)])

In [68]:
padded_features = np.concatenate((padded_features, comp_emb), axis = 2)

In [69]:
padded_features.shape

(59, 29, 791)

In [70]:
from torch_geometric_temporal.signal import DynamicGraphTemporalSignal

In [71]:
temporal_signal = DynamicGraphTemporalSignal(edge_indices = edge_indices , edge_weights = edge_weights, features = padded_features, targets = targets)

In [72]:
temporal_signal

<torch_geometric_temporal.signal.dynamic_graph_temporal_signal.DynamicGraphTemporalSignal at 0x2ad1ae187c0>

In [73]:
from torch_geometric_temporal.signal import temporal_signal_split

train_dataset, test_dataset = temporal_signal_split(temporal_signal, train_ratio=0.8)

In [74]:
import torch
import torch.nn.functional as F
from torch_geometric_temporal.nn.recurrent import DCRNN
from torch_geometric_temporal import TemporalConv
from torch_geometric_temporal import EvolveGCNO
from torch_geometric_temporal import GConvGRU
class RecurrentGCN(torch.nn.Module):
    def __init__(self, node_features):
        super(RecurrentGCN, self).__init__()
        self.evol = EvolveGCNO(node_features)
        self.recurrent = DCRNN(node_features, 16, 1)
        self.conv = GConvGRU(node_features, 64, 3)
        #self.linear = torch.nn.Linear(16, 1)
        self.linear = torch.nn.Linear(64, 2)
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, x, edge_index, edge_weight):
#        h = self.recurrent(x, edge_index, edge_weight)
#        h = self.dropout(h)
        h = self.conv(x, edge_index, edge_weight)
        #h = self.dropout(h)
        h = F.relu(h)
        h = self.linear(h)
        h = torch.sigmoid(h)
        return h

In [75]:
from tqdm import tqdm

model = RecurrentGCN(node_features = 791)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

model.train()

for epoch in tqdm(range(200)):
    loss = 0
    for time, snapshot in enumerate(train_dataset):
        y_hat = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        loss += torch.nn.CrossEntropyLoss()(y_hat, snapshot.y.long())
#        loss += torch.mean((y_hat-snapshot.y)**2)
#        loss = loss / (time+1)
        
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

100%|██████████| 200/200 [03:20<00:00,  1.00s/it]


In [76]:
y_hat_l = []
model.eval()
cost = 0
for time, snapshot in enumerate(test_dataset):
    y_hat = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
    #cost = cost + torch.mean((y_hat-snapshot.y)**2)
    y_hat_l.append(y_hat)
#cost = cost / (time+1)
#cost = cost.item()
#print("MSE: {:.4f}".format(cost))


In [77]:
y_hat_l

[tensor([[0.8068, 0.2182],
         [0.7598, 0.2399],
         [0.9793, 0.0228],
         [0.9287, 0.0799],
         [0.9311, 0.0776],
         [0.9860, 0.0165],
         [0.9725, 0.0298],
         [0.9717, 0.0339],
         [0.9881, 0.0124],
         [0.9814, 0.0217],
         [0.8559, 0.1443],
         [0.4229, 0.5677],
         [0.9413, 0.0597],
         [0.2174, 0.7876],
         [0.9704, 0.0332],
         [0.8026, 0.2204],
         [0.9212, 0.0835],
         [0.0213, 0.9785],
         [0.9434, 0.0657],
         [0.9911, 0.0097],
         [0.7318, 0.2706],
         [0.8286, 0.1654],
         [0.1876, 0.7834],
         [0.9955, 0.0056],
         [0.7120, 0.3004],
         [0.9364, 0.0675],
         [0.9977, 0.0026],
         [0.9314, 0.0645],
         [0.8699, 0.1321]], grad_fn=<SigmoidBackward>),
 tensor([[1.1312e-01, 8.6419e-01],
         [4.0606e-04, 9.9954e-01],
         [9.8713e-01, 1.4269e-02],
         [1.8079e-01, 8.1311e-01],
         [1.0656e-03, 9.9879e-01],
         [4.1

In [78]:
y_hat_l = [list(np.squeeze(i.detach().numpy())) for i in y_hat_l]
y_hat_l = [z for y in y_hat_l for z in y]

In [79]:
y_hat_l = [y[1] for y in y_hat_l]

In [80]:
y_hat_l

[0.21815945,
 0.23994403,
 0.022844344,
 0.079914235,
 0.07762006,
 0.016465086,
 0.02983402,
 0.033913877,
 0.012437577,
 0.021742323,
 0.1442695,
 0.56769544,
 0.05974656,
 0.78758454,
 0.033242512,
 0.22042239,
 0.08348477,
 0.97847307,
 0.06571783,
 0.009687315,
 0.2706278,
 0.16538452,
 0.7834206,
 0.005573214,
 0.30044538,
 0.06746661,
 0.0025515615,
 0.06446375,
 0.13209759,
 0.8641938,
 0.9995358,
 0.014268763,
 0.8131128,
 0.99878997,
 0.96020246,
 0.09003078,
 0.54424596,
 0.9997428,
 0.6158212,
 0.35401478,
 0.99321693,
 0.9999943,
 0.9980685,
 0.07859284,
 0.99271494,
 0.99816966,
 0.98924214,
 0.99928963,
 0.783865,
 0.9515104,
 0.6212229,
 0.984055,
 0.9999913,
 0.9961832,
 0.9999927,
 0.7072106,
 0.28444862,
 0.9944747,
 0.99996364,
 0.10777703,
 0.9647999,
 0.9475789,
 0.9967045,
 0.9981456,
 0.9952958,
 0.6059759,
 0.99389726,
 0.9944758,
 0.92489344,
 0.9996024,
 0.9999981,
 0.99996614,
 0.86596876,
 0.8501247,
 0.6359334,
 0.9990503,
 0.9859936,
 0.83242136,
 0.95532

In [81]:
true_label = []
for time, snapshot in enumerate(test_dataset):
    true_label.append(list(snapshot.y.detach().numpy()))

In [82]:
true_label = [int(z) for y in true_label for z in y]

In [83]:
import numpy as np
from sklearn import metrics
y = true_label
pred = np.array(y_hat_l)
fpr, tpr, thresholds = metrics.roc_curve(y, pred)
metrics.auc(fpr, tpr)

0.6561956561956562

In [96]:
y_hat_list = [1 if x > 0.5 else 0 for x in y_hat_l]

In [97]:
from sklearn.metrics import classification_report
y_true = true_label
target_names = ['class 0', 'class 1']
print(classification_report(y_true, y_hat_list, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.47      0.47      0.47       117
     class 1       0.73      0.74      0.73       231

    accuracy                           0.65       348
   macro avg       0.60      0.60      0.60       348
weighted avg       0.65      0.65      0.65       348

