To use PyTorch Geometric Temporal, make sure torch 1.9.0 is installed (uninstall torch 1.10.0 first if it is present).

In [1]:
import torch
print(torch.__version__)

1.9.0+cpu


In [2]:
import torch
import numpy as np
import pandas as pd

In [3]:
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-geometric
!pip install torch-geometric-temporal

Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html


In [4]:
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

def transform_and_split(data):
    """Normalise a graph ``data`` object and binarise its labels.

    Despite the name, no train/test split is performed: the function only
    rewrites ``data.x``, ``data.y`` and ``data.edge_attr``, prints a summary
    via ``print_basic_info`` and returns the same object.

    Args:
        data: object exposing ``x``, ``y`` (a torch tensor) and ``edge_attr``
            (a torch tensor); ``x`` must be accepted by sklearn's ``normalize``.

    Returns:
        The same ``data`` object, with
        * ``x``: float64 tensor, each row scaled with ``norm='max'``,
        * ``y``: float64 tensor of 0/1 (1 where the original label was > 0),
        * ``edge_attr``: cast to float64.
    """
    # Row-wise max normalisation; sklearn returns a NumPy array, so convert back.
    scaled = normalize(data.x, axis=1, norm='max')
    data.x = torch.from_numpy(scaled).to(torch.float64)

    # Binary target for classification. A vectorised comparison replaces the
    # former `apply_` + lambda, which only works on CPU tensors, runs a Python
    # call per element and mutated the caller's label tensor in place.
    data.y = (data.y > 0).to(torch.float64)

    data.edge_attr = data.edge_attr.to(torch.double)

    print_basic_info(data)
    return data

In [5]:
def print_basic_info(data):
    """Print a structural summary of a graph ``data`` object to stdout.

    Emits a blank line, the object's own string form, a separator rule, and
    then node/edge counts, the edges-per-node ratio and the results of
    ``has_isolated_nodes()``, ``has_self_loops()`` and ``is_undirected()``.
    Returns nothing.
    """
    # Blank line followed by the object itself.
    print('\n{}'.format(data))
    print('===========================================================================================================')

    print('Number of nodes: {}'.format(data.num_nodes))
    print('Number of edges: {}'.format(data.num_edges))
    # Ratio of edge count to node count, shown with two decimals.
    print('Average node degree: {:.2f}'.format(data.num_edges / data.num_nodes))
    print('Has isolated nodes: {}'.format(data.has_isolated_nodes()))
    print('Has self-loops: {}'.format(data.has_self_loops()))
    print('Is undirected: {}'.format(data.is_undirected()))

### Get and split data

In [12]:
# Load the news articles table (columns shown below: Date, url, texts).
df = pd.read_csv('../data/raw/news/news_data_weekly.csv')

In [13]:
# Drop every row that has a missing value in any column.
# NOTE: rebinds `df`, so the unfiltered frame is gone unless the load cell is re-run.
df = df.dropna()

In [14]:
# Order rows by Date. At this point Date has not been parsed yet (that happens in a
# later cell); the displayed values are YYYY-MM-DD, which sort chronologically as text.
df = df.sort_values('Date')

In [16]:
df

Unnamed: 0,Date,url,texts
2746,2016-10-08,https://www.nytimes.com/2016/10/04/opinion/the...,Donald Trump is a thug. He’s a thug who talks...
33,2016-10-08,https://www.nytimes.com/2016/10/08/us/politics...,In lucrative paid speeches that Hillary Clint...
32,2016-10-08,https://www.nytimes.com/2016/10/07/technology/...,"SAN FRANCISCO — Marc Benioff, the founder and..."
31,2016-10-08,https://www.nytimes.com/2016/10/05/business/pr...,Prepaid debit cards are a financial lifeline ...
30,2016-10-08,https://www.nytimes.com/2016/10/04/world/ameri...,RIO DE JANEIRO — It was not a banner day for ...
...,...,...,...
1518,2021-10-02,https://www.nytimes.com/2021/09/30/sports/socc...,Looking to expand its global footprint beyond...
1517,2021-10-02,https://www.nytimes.com/2021/10/01/business/cr...,Despite the popularity of mobile apps promisi...
1516,2021-10-02,https://www.nytimes.com/2021/10/02/your-money/...,Introducing your child to the real-world use ...
2736,2021-10-02,https://www.nytimes.com/2021/09/26/fashion/wat...,Like their counterparts in industries such as...


In [17]:
# Parse Date into datetimes so the next cell can bin rows by month with pd.Grouper.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

In [18]:
# One DataFrame per calendar-month bin (freq='M' = month-end frequency), in time order.
# NOTE(review): a month without any article presumably still yields an (empty) frame,
# which keeps list positions aligned with consecutive months — confirm.
df_list = [part for _, part in df.groupby(pd.Grouper(key='Date', freq='M'))]

In [19]:
df_list[0]

Unnamed: 0,Date,url,texts
2746,2016-10-08,https://www.nytimes.com/2016/10/04/opinion/the...,Donald Trump is a thug. He’s a thug who talks...
33,2016-10-08,https://www.nytimes.com/2016/10/08/us/politics...,In lucrative paid speeches that Hillary Clint...
32,2016-10-08,https://www.nytimes.com/2016/10/07/technology/...,"SAN FRANCISCO — Marc Benioff, the founder and..."
31,2016-10-08,https://www.nytimes.com/2016/10/05/business/pr...,Prepaid debit cards are a financial lifeline ...
30,2016-10-08,https://www.nytimes.com/2016/10/04/world/ameri...,RIO DE JANEIRO — It was not a banner day for ...
36,2016-10-08,https://www.nytimes.com/2016/10/07/business/de...,WASHINGTON — Nearly five years after Jon S. C...
37,2016-10-08,https://www.nytimes.com/2016/10/08/business/in...,LONDON — As Europe has grappled with the trau...
38,2016-10-08,https://www.nytimes.com/2016/10/08/world/europ...,LONDON — For those blithely inclined toward t...
34,2016-10-08,https://www.nytimes.com/2016/10/04/business/de...,The Janus Capital Group and the Henderson Gro...
2747,2016-10-08,https://www.nytimes.com/2016/10/09/world/middl...,TEHRAN — Rushing for a plane to Tehran becaus...


In [20]:
import yaml

# Load the company/alias configuration. The resulting dict is bound to the global
# name `data`, which get_matrix() reads directly.
# NOTE(review): yaml.FullLoader can build more than plain dicts/lists/scalars; this is a
# local, trusted file, but yaml.safe_load would likely be sufficient here — confirm.
with open('../configs/dow_jones.yaml') as f:
    
    data = yaml.load(f, Loader=yaml.FullLoader)
    print(data)

{'companies': [{'wba': {'alias': ['$wba', 'wba', 'walgreen boots alliance inc', 'walgreen boots alliance', 'walgreenbootsalliance']}}, {'v': {'alias': ['$v', 'v', 'visa inc class a', 'visa']}}, {'crm': {'alias': ['$crm', 'crm', 'salesforce.com inc', 'salesforce']}}, {'cvx': {'alias': ['$cvx', 'cvx', 'chevron corp', 'chevron']}}, {'pg': {'alias': ['$pg', 'pg', 'procter & gamble', 'procter&gamble']}}, {'vz': {'alias': ['$vz', 'vz', 'verizon communications inc', 'verizon']}}, {'wmt': {'alias': ['$wmt', 'wmt', 'walmart stores inc', 'walmart stores', 'walmart']}}, {'unh': {'alias': ['$unh', 'unh', 'unitedhealth group inc', 'unitedhealth group', 'unitedhealthgroup']}}, {'trv': {'alias': ['$trv', 'trv', 'travelers companies inc', 'travelers companies', 'travelers', 'travelerscompanies']}}, {'mcd': {'alias': ['$mcd', 'mcd', 'mcdonalds corp', 'mcdonalds']}}, {'mmm': {'alias': ['$mmm', 'mmm', '3m', '3m']}}, {'nke': {'alias': ['$nke', 'nke', 'nike inc class b', 'nike']}}, {'mrk': {'alias': ['$mrk

In [25]:
def get_matrix(df, config=None):
    """Build the company-by-company mention-count matrix for a set of articles.

    Entry ``[i, j]`` is the number of rows in ``df.texts`` that contain at
    least one alias of company ``i`` OR at least one alias of company ``j``;
    the diagonal ``[i, i]`` is the number of rows mentioning company ``i``.
    The matrix is therefore symmetric.

    Matching is a case-sensitive *substring* search, so very short aliases
    (e.g. ``'v'``) match almost any text.
    NOTE(review): off-diagonal entries count "either company", not "both" —
    confirm this is the intended edge weight.

    Args:
        df: DataFrame with a string column ``texts``.
        config: optional dict shaped like the YAML config
            (``{'companies': [{ticker: {'alias': [...]}}, ...]}``). Defaults to
            the notebook-level ``data`` dict, preserving the original call
            signature ``get_matrix(df)``.

    Returns:
        ``np.ndarray`` of shape ``(n_companies, n_companies)``, dtype int64,
        ordered like ``config['companies']``.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    if config is None:
        config = data  # global loaded from ../configs/dow_jones.yaml

    entries = config['companies']
    companies = [next(iter(entry)) for entry in entries]
    aliases = [entry[ticker]["alias"] for entry, ticker in zip(entries, companies)]

    texts = df.texts
    # mentions[i, k] == 1 iff text k contains some alias of company i.
    # One scan per company instead of one per company *pair*.
    mentions = np.zeros((len(companies), len(texts)), dtype=np.int64)
    for row, names in enumerate(aliases):
        # Aliases are literal strings: escape regex metacharacters so that
        # '$wba' matches the text "$wba" (unescaped, '$' is an end-of-string
        # anchor and the alias could never match) and '.' is not a wildcard.
        pattern = "|".join(re.escape(str(name)) for name in names)
        mentions[row] = texts.str.contains(pattern, na=False).to_numpy(dtype=bool)

    # |A ∪ B| = |A| + |B| - |A ∩ B|; on the diagonal this reduces to |A|.
    solo = mentions.sum(axis=1)
    both = mentions @ mentions.T
    return solo[:, None] + solo[None, :] - both

In [26]:
# One mention-count matrix per monthly frame, in the same order as df_list.
mat_list = [get_matrix(month_df) for month_df in df_list]

In [27]:
# Label the first 60 monthly matrices with consecutive calendar months starting at
# October 2016, and save each one as ../data/raw/news/<year>_<month>.npy.
year, month = 2016, 10
month_year = []
for idx in range(60):
    if month == 13:  # stepped past December: roll over to January of the next year
        year, month = year + 1, 1
    month_year.append([year, month])
    np.save(f'../data/raw/news/{year}_{month}.npy', mat_list[idx])
    month += 1
        

In [28]:
import os
# Price table: load only ticker, date and close, with Date parsed to datetimes.
stock_csv_path = os.path.join('../data/raw', "stock", "raw.csv")
stock_df = pd.read_csv(
    stock_csv_path,
    usecols=["ticker_symbol", "Date", "Close"],
    parse_dates=["Date"],
)

In [29]:
stock_df

Unnamed: 0,Date,Close,ticker_symbol
0,2016-10-03,28.129999,aapl
1,2016-10-04,28.250000,aapl
2,2016-10-05,28.262501,aapl
3,2016-10-06,28.472500,aapl
4,2016-10-07,28.514999,aapl
...,...,...,...
36506,2021-09-27,142.250000,wmt
36507,2021-09-28,140.500000,wmt
36508,2021-09-29,140.440002,wmt
36509,2021-09-30,139.380005,wmt


In [30]:
def _monthly_close_matrix(prices, year, month):
    """Return closes for one calendar month as an array of shape (tickers, dates)."""
    in_month = (prices.Date.dt.year == year) & (prices.Date.dt.month == month)
    by_date = prices[in_month].pivot_table(
        index="Date", columns="ticker_symbol", values="Close"
    )
    return by_date.values.T


# For every labelled month build (features, target):
#   features: per-ticker log returns within the month, with a leading zero column so
#             the width equals the number of dates in that month;
#   target:   relative change of the next month's mean close vs. this month's mean close.
X_y = []
for yr, month in month_year:
    # ---- X (extend here to add SEC embeddings, etc.) ----
    X = _monthly_close_matrix(stock_df, yr, month)
    x_logret = np.diff(np.log(X))
    x_normalized = np.hstack([np.zeros((X.shape[0], 1)), x_logret])
    X_tensor = torch.tensor(x_normalized, dtype=torch.float64)

    # ---- y (change here to use a different label) ----
    y_yr, y_month = (yr + 1, 1) if month == 12 else (yr, month + 1)
    y = _monthly_close_matrix(stock_df, y_yr, y_month)
    month_mean = X.mean(1)
    y = (y.mean(1) - month_mean) / month_mean

    X_y.append((X_tensor, torch.tensor(y)))

In [31]:
from torch_geometric.utils import dense_to_sparse
# Convert each of the first 60 dense matrices into (edge_index, edge_attr) form.
sparse_graphs = [dense_to_sparse(torch.from_numpy(mat_list[t])) for t in range(60)]
edge_idx = [index for index, _ in sparse_graphs]
edge_att = [attr for _, attr in sparse_graphs]

In [32]:
# NumPy copies of the per-month edge lists and weights, as passed to the temporal signal below.
edge_indices = [index_tensor.numpy() for index_tensor in edge_idx]
edge_weights = [weight_tensor.numpy() for weight_tensor in edge_att]

In [33]:
# Per month: max-normalised feature rows, and 0/1 labels (1 where the target is > 0).
features = [normalize(X_y[t][0].numpy(), axis=1, norm='max') for t in range(60)]
targets = np.asarray(
    [[1 if change > 0 else 0 for change in X_y[t][1].numpy()] for t in range(60)]
)

In [34]:
targets[3]

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1])

In [35]:
mat_list

[array([[ 1, 32,  1,  2,  1,  1,  1,  3,  1,  1,  1, 10,  1,  1,  1,  8,
          1, 23,  8, 12,  1, 11, 28, 25,  5,  2, 32,  1,  1],
        [32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
         32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32],
        [ 1, 32,  0,  1,  0,  0,  0,  2,  0,  0,  0, 10,  0,  0,  0,  8,
          0, 22,  7, 11,  0, 10, 28, 24,  4,  1, 32,  0,  0],
        [ 2, 32,  1,  1,  1,  1,  1,  3,  1,  1,  1, 10,  1,  1,  1,  9,
          1, 22,  8, 11,  1, 10, 28, 24,  5,  2, 32,  1,  1],
        [ 1, 32,  0,  1,  0,  0,  0,  2,  0,  0,  0, 10,  0,  0,  0,  8,
          0, 22,  7, 11,  0, 10, 28, 24,  4,  1, 32,  0,  0],
        [ 1, 32,  0,  1,  0,  0,  0,  2,  0,  0,  0, 10,  0,  0,  0,  8,
          0, 22,  7, 11,  0, 10, 28, 24,  4,  1, 32,  0,  0],
        [ 1, 32,  0,  1,  0,  0,  0,  2,  0,  0,  0, 10,  0,  0,  0,  8,
          0, 22,  7, 11,  0, 10, 28, 24,  4,  1, 32,  0,  0],
        [ 3, 32,  2,  3,  2,  2,  2,  2,  2,  2,  2, 11

In [36]:
features[1].shape

(29, 21)

In [37]:
# Monthly feature matrices differ in width (e.g. 21 columns above); right-pad each one
# to 23 columns using np.pad's 'mean' mode so they can be stacked.
padded_features = [
    np.pad(month_feats, [(0, 0), (0, 23 - month_feats.shape[1])], 'mean')
    for month_feats in features
]

In [38]:
# Stack every .npy array found in ../data/raw/sec/ (sorted by file name) into one tensor.
sec_dir = os.path.join("../data/raw", "sec")
comp_emb = torch.stack([
    torch.from_numpy(np.load(os.path.join(sec_dir, fp)))
    for fp in sorted(os.listdir("../data/raw/sec/"))
    if fp.split(".")[-1] == 'npy'
])

In [39]:
# Repeat the same embedding array 60 times along a new leading axis.
comp_emb = np.stack([comp_emb.numpy()] * 60)

In [40]:
# Append the embeddings to the padded features along the last (feature) axis.
# NOTE: this rebinds `padded_features`; running the cell a second time would append
# `comp_emb` again, so re-run the padding cell first if this one must be repeated.
padded_features = np.concatenate((padded_features, comp_emb), axis = 2)

In [41]:
padded_features.shape

(60, 29, 791)

In [42]:
from torch_geometric_temporal.signal import DynamicGraphTemporalSignal

In [43]:
# Bundle the per-month graphs, node features and labels into one temporal dataset.
temporal_signal = DynamicGraphTemporalSignal(
    edge_indices=edge_indices,
    edge_weights=edge_weights,
    features=padded_features,
    targets=targets,
)

In [44]:
temporal_signal

<torch_geometric_temporal.signal.dynamic_graph_temporal_signal.DynamicGraphTemporalSignal at 0x183814e0910>

In [45]:
from torch_geometric_temporal.signal import temporal_signal_split

# Split the snapshots into a training and a test part with an 80/20 ratio.
# NOTE(review): presumably the split is chronological (earliest 80% train) — confirm.
train_dataset, test_dataset = temporal_signal_split(temporal_signal, train_ratio=0.8)

In [46]:
import torch
import torch.nn.functional as F
from torch_geometric_temporal.nn.recurrent import DCRNN
from torch_geometric_temporal import TemporalConv
from torch_geometric_temporal import EvolveGCNO
from torch_geometric_temporal import GConvGRU
class RecurrentGCN(torch.nn.Module):
    """Per-node two-way scorer: GConvGRU -> ReLU -> Linear(64, 2) -> sigmoid.

    Args:
        node_features: number of input features per node.

    NOTE(review): ``evol``, ``recurrent`` and ``dropout`` are constructed but
    never used in ``forward``; they still contribute parameters to
    ``model.parameters()``.
    NOTE(review): ``forward`` calls ``conv`` without passing a hidden state,
    so state is presumably not carried from one snapshot to the next — confirm.
    NOTE(review): the training cell feeds this sigmoid output into
    ``CrossEntropyLoss``, which expects unnormalised logits.
    """

    def __init__(self, node_features):
        super().__init__()
        self.evol = EvolveGCNO(node_features)
        self.recurrent = DCRNN(node_features, 16, 1)
        self.conv = GConvGRU(node_features, 64, 3)
        self.linear = torch.nn.Linear(64, 2)
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, x, edge_index, edge_weight):
        """Return a (num_nodes, 2) tensor of sigmoid-squashed scores."""
        hidden = F.relu(self.conv(x, edge_index, edge_weight))
        return torch.sigmoid(self.linear(hidden))

In [47]:
from tqdm import tqdm

# 791 input features per node: must equal the last dimension of padded_features.
model = RecurrentGCN(node_features = 791)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Build the loss module once; it was previously re-created for every snapshot.
# NOTE(review): CrossEntropyLoss expects raw logits, but the model's forward()
# ends with a sigmoid — consider returning logits and squashing only at inference.
criterion = torch.nn.CrossEntropyLoss()

model.train()

# One optimiser step per epoch on the loss summed over all training snapshots.
for epoch in tqdm(range(200)):
    loss = 0
    for snapshot in train_dataset:
        y_hat = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        loss = loss + criterion(y_hat, snapshot.y.long())

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

100%|██████████| 200/200 [04:45<00:00,  1.43s/it]


In [49]:
# Collect the model's outputs for every test snapshot.
# Inference runs under no_grad() so no autograd graph is built or kept alive;
# the returned tensors can still be passed through .detach().numpy() afterwards.
y_hat_l = []
model.eval()
with torch.no_grad():
    for snapshot in test_dataset:
        y_hat = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        y_hat_l.append(y_hat)


In [50]:
y_hat_l

[tensor([[7.7283e-01, 2.3925e-01],
         [2.0537e-04, 9.9979e-01],
         [9.8682e-01, 1.3444e-02],
         [9.0894e-01, 8.1061e-02],
         [4.5048e-05, 9.9995e-01],
         [3.1220e-01, 6.5268e-01],
         [9.9030e-01, 1.0398e-02],
         [8.9722e-01, 1.0728e-01],
         [1.0262e-02, 9.8935e-01],
         [9.1738e-01, 7.8568e-02],
         [9.1942e-01, 7.6260e-02],
         [6.4136e-02, 9.4372e-01],
         [3.5018e-04, 9.9965e-01],
         [2.0442e-01, 8.1504e-01],
         [9.0524e-01, 1.0841e-01],
         [3.1424e-04, 9.9964e-01],
         [2.6675e-01, 7.2271e-01],
         [2.8449e-04, 9.9972e-01],
         [2.7432e-03, 9.9751e-01],
         [5.5804e-01, 4.3045e-01],
         [1.2317e-01, 8.7893e-01],
         [1.3413e-01, 8.6382e-01],
         [3.0423e-05, 9.9997e-01],
         [2.1026e-07, 1.0000e+00],
         [4.3049e-03, 9.9665e-01],
         [2.7186e-01, 7.3935e-01],
         [3.3692e-04, 9.9971e-01],
         [5.0079e-01, 5.5544e-01],
         [1.0548e-01

In [51]:
# Flatten the per-snapshot outputs into one list with one 2-element score row per node.
# NOTE: rebinds `y_hat_l`; re-run the evaluation cell before repeating this one.
y_hat_l = [
    node_scores
    for snapshot_out in y_hat_l
    for node_scores in np.squeeze(snapshot_out.detach().numpy())
]

In [52]:
# Keep only the second output column (index 1), used below as the score for class 1.
y_hat_l = [y[1] for y in y_hat_l]

In [53]:
y_hat_l

[0.23925272,
 0.99979454,
 0.013444317,
 0.08106069,
 0.9999541,
 0.65268016,
 0.010398388,
 0.1072829,
 0.98934644,
 0.078568235,
 0.07626017,
 0.9437219,
 0.99965394,
 0.8150422,
 0.10840962,
 0.9996414,
 0.7227076,
 0.9997186,
 0.99750996,
 0.43045107,
 0.878933,
 0.86382234,
 0.99996865,
 0.99999976,
 0.99665314,
 0.73934954,
 0.9997112,
 0.55544424,
 0.88824743,
 0.9994885,
 0.22375722,
 0.72197133,
 0.9919849,
 0.9747985,
 0.93706155,
 0.9973538,
 0.38454056,
 0.99782526,
 0.99987745,
 0.58961546,
 0.99794334,
 0.999998,
 0.9999465,
 0.7277189,
 0.93592197,
 0.079343595,
 0.9834688,
 0.99902296,
 0.7697284,
 0.92925173,
 0.99999976,
 0.11691703,
 0.86763954,
 0.52332026,
 0.99934095,
 0.97383016,
 0.04013679,
 0.99999833,
 0.13157012,
 0.0059828307,
 0.8741419,
 0.007647242,
 0.49588388,
 0.34299353,
 0.119552016,
 0.053867493,
 0.0784617,
 0.97235495,
 0.240469,
 0.80101335,
 0.5810852,
 0.17988978,
 0.9968413,
 0.8100992,
 0.9776292,
 0.170082,
 0.17706394,
 0.087513864,
 0.986

In [54]:
# Ground-truth labels of each test snapshot, one inner list per snapshot.
true_label = [list(snapshot.y.detach().numpy()) for snapshot in test_dataset]

In [55]:
# Flatten to a single list of Python ints, in the same node order as y_hat_l.
true_label = [int(label) for snapshot_labels in true_label for label in snapshot_labels]

In [56]:
import numpy as np
from sklearn import metrics
# ROC AUC of the class-1 scores against the true labels on the test snapshots.
y = true_label
pred = np.array(y_hat_l)
fpr, tpr, thresholds = metrics.roc_curve(y, pred)
# Last expression of the cell: the area under the ROC curve is displayed as output.
metrics.auc(fpr, tpr)

0.6885456885456886

In [61]:
# Hard predictions: class 1 only when the score exceeds 0.7.
# NOTE(review): the 0.7 cut-off looks hand-picked; if it was tuned on the test
# snapshots the report below is optimistic — confirm how it was chosen.
y_hat_list = [1 if x > 0.7 else 0 for x in y_hat_l]

In [62]:
from sklearn.metrics import classification_report
# Precision / recall / F1 per class for the thresholded predictions.
y_true = true_label
target_names = ['class 0', 'class 1']
print(classification_report(y_true, y_hat_list, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.54      0.56      0.55       126
     class 1       0.75      0.73      0.74       222

    accuracy                           0.67       348
   macro avg       0.64      0.64      0.64       348
weighted avg       0.67      0.67      0.67       348

