To use pytorch geometric temporal, make sure you have torch 1.9.0 installed (uninstall 1.10.0 before).

In [147]:
import torch
print(torch.__version__)

1.9.0+cpu


In [148]:
import torch
import numpy as np
import pandas as pd

In [100]:
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-geometric
!pip install torch-geometric-temporal

Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html


In [149]:
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

def transform_and_split(data):
    # Normalize node features and transform data type
    data.x = normalize(data.x, axis=1, norm='max')
    data.x = torch.from_numpy(data.x).to(torch.float64)
    data.y = data.y.apply_(lambda x:  1 if (x > 0) else 0) # Change y into {0, 1} for binary classification
    data.y = data.y.to(torch.float64)    
    data.edge_attr = data.edge_attr.to(torch.double)


    # Split into train/test set
#    split = nodeSplit(split="train_rest", num_splits = 1, num_val = 0.0, num_test= 0.2)
#    masked_data = split(data)

#    print("Training samples:", torch.sum(masked_data.train_mask).item())
#    print("Validation samples:", torch.sum(masked_data.val_mask ).item())
#    print("Test samples:", torch.sum(masked_data.test_mask ).item())
    print_basic_info(data)
    return data

In [150]:
def print_basic_info(data):
    print()
    print(data)
    print('===========================================================================================================')

    print(f'Number of nodes: {data.num_nodes}')
    print(f'Number of edges: {data.num_edges}')
    print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
    print(f'Has isolated nodes: {data.has_isolated_nodes()}')
    print(f'Has self-loops: {data.has_self_loops()}')
    print(f'Is undirected: {data.is_undirected()}')

### Get and split data

In [151]:
df = pd.read_csv('../data/raw/news_data.csv', index_col = 0)

In [152]:
df = df.dropna()

In [153]:
df = df.sort_values('Date')

In [154]:
df

Unnamed: 0,Date,url,full_text
1257,2016-10-02,https://www.nytimes.com/2016/09/02/science/spa...,A spectacular explosion of a SpaceX rocket on...
3954,2016-10-02,https://www.nytimes.com/2016/09/19/business/tr...,The Treasury’s schedule of financing this wee...
6585,2016-10-02,https://www.nytimes.com/2016/09/08/technology/...,SAN FRANCISCO — Michael Dell and Meg Whitman ...
6584,2016-10-02,https://www.nytimes.com/2016/10/01/business/de...,Remember the echo of doom we mentioned yester...
4535,2016-10-02,https://www.nytimes.com/2016/09/23/business/de...,"For the last three years, JPMorgan Chase’s hi..."
...,...,...,...
7242,2021-09-04,https://www.nytimes.com/2021/08/05/us/politics...,WASHINGTON — The Biden administration is deve...
7243,2021-09-04,https://www.nytimes.com/2021/08/20/world/asia/...,The members of the Bordia family thought they...
7244,2021-09-04,https://www.nytimes.com/2021/08/18/travel/blin...,"April DeMuth and her partner, Warren Watson, ..."
6188,2021-09-04,https://www.nytimes.com/2021/09/02/business/de...,The Republican-controlled Texas Legislature t...


In [155]:
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

In [156]:
df_list = [part for _, part in df.groupby(pd.Grouper(key='Date', freq='M'))]

In [157]:
df_list[0]

Unnamed: 0,Date,url,full_text
1257,2016-10-02,https://www.nytimes.com/2016/09/02/science/spa...,A spectacular explosion of a SpaceX rocket on...
3954,2016-10-02,https://www.nytimes.com/2016/09/19/business/tr...,The Treasury’s schedule of financing this wee...
6585,2016-10-02,https://www.nytimes.com/2016/09/08/technology/...,SAN FRANCISCO — Michael Dell and Meg Whitman ...
6584,2016-10-02,https://www.nytimes.com/2016/10/01/business/de...,Remember the echo of doom we mentioned yester...
4535,2016-10-02,https://www.nytimes.com/2016/09/23/business/de...,"For the last three years, JPMorgan Chase’s hi..."
...,...,...,...
57,2016-10-02,https://www.nytimes.com/2016/09/29/opinion/the...,When Mohammad Azam started his shift on May 2...
56,2016-10-02,https://www.nytimes.com/2016/09/28/business/fu...,Public anger over the cost of drugs has burne...
55,2016-10-02,https://www.nytimes.com/2016/09/02/us/slaves-g...,WASHINGTON — Nearly two centuries after Georg...
6723,2016-10-02,https://www.nytimes.com/2016/09/02/movies/movi...,Read reviews of new releases and search our f...


In [158]:
import yaml

with open('../configs/dow_jones.yaml') as f:
    
    data = yaml.load(f, Loader=yaml.FullLoader)
    print(data)

{'companies': [{'wba': {'alias': ['$wba', 'wba', 'walgreen boots alliance inc', 'walgreen boots alliance', 'walgreenbootsalliance']}}, {'v': {'alias': ['$v', 'v', 'visa inc class a', 'visa']}}, {'crm': {'alias': ['$crm', 'crm', 'salesforce.com inc', 'salesforce']}}, {'cvx': {'alias': ['$cvx', 'cvx', 'chevron corp', 'chevron']}}, {'pg': {'alias': ['$pg', 'pg', 'procter & gamble', 'procter&gamble']}}, {'vz': {'alias': ['$vz', 'vz', 'verizon communications inc', 'verizon']}}, {'wmt': {'alias': ['$wmt', 'wmt', 'walmart stores inc', 'walmart stores', 'walmart']}}, {'unh': {'alias': ['$unh', 'unh', 'unitedhealth group inc', 'unitedhealth group', 'unitedhealthgroup']}}, {'trv': {'alias': ['$trv', 'trv', 'travelers companies inc', 'travelers companies', 'travelers', 'travelerscompanies']}}, {'mcd': {'alias': ['$mcd', 'mcd', 'mcdonalds corp', 'mcdonalds']}}, {'mmm': {'alias': ['$mmm', 'mmm', '3m', '3m']}}, {'nke': {'alias': ['$nke', 'nke', 'nike inc class b', 'nike']}}, {'mrk': {'alias': ['$mrk

In [159]:
def get_matrix(df):
    companies = [list(com.keys())[0] for com in data['companies']]
    alias = list(map(lambda x: list(x.items())[0][1]["alias"], data['companies']))
    res = pd.DataFrame(0, index=companies, columns=companies)
    for company1, search_items1 in zip(companies, alias):
        for company2, search_items2 in zip(companies, alias):
            if company1 != company2:
                search_items = search_items1 + search_items2
            else:
                search_items = search_items1
            pat = "|".join(search_items)
            res[company1][company2] += df.full_text.str.contains(
                pat
            ).sum()
    return res.values

In [160]:
mat_list = [get_matrix(df) for df in df_list]

In [162]:
year = 2016
month = 10
month_year = []
for i in range(60):
    if month == 13:
        month = 1
        year += 1
    month_year.append([year, month])
    np.save('../data/raw/news/'+str(year)+'_'+str(month)+'.npy', mat_list[i])
    month += 1
        

In [163]:
import os
stock_df = pd.read_csv(
            os.path.join('../data/raw',"stock","raw.csv"),
            usecols=["ticker_symbol", "Date", "Close"],
            parse_dates=["Date"],
        )

In [164]:
stock_df

Unnamed: 0,Date,Close,ticker_symbol
0,2016-10-03,28.129999,aapl
1,2016-10-04,28.250000,aapl
2,2016-10-05,28.262501,aapl
3,2016-10-06,28.472500,aapl
4,2016-10-07,28.514999,aapl
...,...,...,...
36506,2021-09-27,142.250000,wmt
36507,2021-09-28,140.500000,wmt
36508,2021-09-29,140.440002,wmt
36509,2021-09-30,139.380005,wmt


In [165]:
X_y = []
for yr_month in month_year:
    yr = yr_month[0]
    month = yr_month[1]
    
    ######################################################## 
    # prepare X (change this if you want to add SEC emb, etc.)
    ########################################################
    curr = stock_df[(stock_df.Date.dt.year == yr) & (stock_df.Date.dt.month == month)]
    X = curr.pivot_table(
            index="Date", columns="ticker_symbol", values="Close"
        ).values.T
    x_logret = np.diff(np.log(X))
    col_zeros = np.zeros((X.shape[0], 1))
    x_normalized = np.append(col_zeros, x_logret, 1)
    X_tensor = torch.tensor(x_normalized)
    X_tensor = X_tensor.to(torch.float64)
    
    ########################################################
    # prepare y (change this if you want to change labels)
    ########################################################
    if month == 12:
        y_yr = yr+1
        y_month = 1
    else:
        y_yr = yr
        y_month = month + 1
    
    nxt = stock_df[(stock_df.Date.dt.year == y_yr) & (stock_df.Date.dt.month == y_month)]
    y = nxt.pivot_table(
            index="Date", columns="ticker_symbol", values="Close"
        ).values.T
    y = (y.mean(1) - X.mean(1)) / X.mean(1)
    y_tensor = torch.tensor(y)
    X_y.append((X_tensor,y_tensor))

In [166]:
from torch_geometric.utils import dense_to_sparse
edge_idx = []
edge_att = []
for i in range(60):
    edge_index, edge_attr = dense_to_sparse(torch.from_numpy(mat_list[i]))
    edge_idx.append(edge_index)
    edge_att.append(edge_attr)

In [167]:
edge_indices = [i.numpy() for i in edge_idx]
edge_weights = [i.numpy() for i in edge_att]

In [168]:
features = []
targets = []
for i in range(60):
    features.append(normalize(X_y[i][0].numpy(), axis=1, norm='max'))
    #features.append(X_y[i][0].numpy())
    targets.append([1 if a > 0 else 0 for a in X_y[i][1].numpy()])
targets = np.asarray(targets)

In [169]:
targets[3]

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1])

In [170]:
mat_list

[array([[  6, 124,   6,   6,   8,   6,   6,  12,   9,   6,   6,  46,   6,
           6,   6,  21,   6,  92,  16,  49,   6,  24, 110,  85,  23,   6,
         119,   6,   6],
        [124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
         124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124, 124,
         124, 124, 124],
        [  6, 124,   0,   0,   2,   0,   0,   6,   3,   0,   0,  40,   0,
           0,   0,  15,   0,  88,  10,  45,   0,  18, 106,  83,  20,   0,
         119,   0,   0],
        [  6, 124,   0,   0,   2,   0,   0,   6,   3,   0,   0,  40,   0,
           0,   0,  15,   0,  88,  10,  45,   0,  18, 106,  83,  20,   0,
         119,   0,   0],
        [  8, 124,   2,   2,   2,   2,   2,   8,   4,   2,   2,  42,   2,
           2,   2,  17,   2,  88,  12,  46,   2,  20, 107,  84,  22,   2,
         119,   2,   2],
        [  6, 124,   0,   0,   2,   0,   0,   6,   3,   0,   0,  40,   0,
           0,   0,  15,   0,  88,  10,  45,   0,  18, 106,  8

In [171]:
features[1].shape

(29, 21)

In [172]:
padded_features = []
for i in features:
    padded_features.append(np.pad(i, [(0, 0), (0, 23-i.shape[1])], 'mean'))

In [173]:
comp_emb = []
for fp in sorted(os.listdir("../data/raw/sec/")):
    full_path = os.path.join("../data/raw", "sec", fp)
    if fp.split(".")[-1]=='npy':
        comp_emb.append(torch.from_numpy(np.load(full_path)))
comp_emb = torch.stack(comp_emb)

In [174]:
comp_emb = np.asarray([comp_emb.numpy() for i in range(60)])

In [175]:
padded_features = np.concatenate((padded_features, comp_emb), axis = 2)

In [176]:
padded_features.shape

(60, 29, 791)

In [177]:
from torch_geometric_temporal.signal import DynamicGraphTemporalSignal

In [178]:
temporal_signal = DynamicGraphTemporalSignal(edge_indices = edge_indices , edge_weights = edge_weights, features = padded_features, targets = targets)

In [179]:
temporal_signal

<torch_geometric_temporal.signal.dynamic_graph_temporal_signal.DynamicGraphTemporalSignal at 0x1d43d57f730>

In [180]:
from torch_geometric_temporal.signal import temporal_signal_split

train_dataset, test_dataset = temporal_signal_split(temporal_signal, train_ratio=0.8)

In [181]:
import torch
import torch.nn.functional as F
from torch_geometric_temporal.nn.recurrent import DCRNN
from torch_geometric_temporal import TemporalConv
from torch_geometric_temporal import EvolveGCNO
from torch_geometric_temporal import GConvGRU
class RecurrentGCN(torch.nn.Module):
    def __init__(self, node_features):
        super(RecurrentGCN, self).__init__()
        self.evol = EvolveGCNO(node_features)
        self.recurrent = DCRNN(node_features, 16, 1)
        self.conv = GConvGRU(node_features, 64, 3)
        #self.linear = torch.nn.Linear(16, 1)
        self.linear = torch.nn.Linear(64, 2)
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, x, edge_index, edge_weight):
#        h = self.recurrent(x, edge_index, edge_weight)
#        h = self.dropout(h)
        h = self.conv(x, edge_index, edge_weight)
        #h = self.dropout(h)
        h = F.relu(h)
        h = self.linear(h)
        h = torch.sigmoid(h)
        return h

In [182]:
from tqdm import tqdm

model = RecurrentGCN(node_features = 791)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

model.train()

for epoch in tqdm(range(200)):
    loss = 0
    for time, snapshot in enumerate(train_dataset):
        y_hat = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        loss += torch.nn.CrossEntropyLoss()(y_hat, snapshot.y.long())
#        loss += torch.mean((y_hat-snapshot.y)**2)
#        loss = loss / (time+1)
        
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

100%|██████████| 200/200 [04:09<00:00,  1.25s/it]


In [183]:
y_hat_l = []
model.eval()
cost = 0
for time, snapshot in enumerate(test_dataset):
    y_hat = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
    #cost = cost + torch.mean((y_hat-snapshot.y)**2)
    y_hat_l.append(y_hat)
#cost = cost / (time+1)
#cost = cost.item()
#print("MSE: {:.4f}".format(cost))


In [184]:
y_hat_l

[tensor([[6.7365e-01, 3.4908e-01],
         [5.7886e-04, 9.9957e-01],
         [9.8632e-01, 1.1676e-02],
         [7.7369e-01, 2.3265e-01],
         [1.1040e-04, 9.9988e-01],
         [2.6077e-02, 9.7447e-01],
         [9.7934e-01, 2.0144e-02],
         [4.3791e-01, 5.5233e-01],
         [3.8970e-04, 9.9966e-01],
         [9.1638e-01, 7.6830e-02],
         [8.5650e-01, 1.3917e-01],
         [3.9446e-04, 9.9962e-01],
         [1.7430e-03, 9.9851e-01],
         [6.1549e-02, 9.4237e-01],
         [8.6118e-01, 1.4613e-01],
         [4.4005e-04, 9.9945e-01],
         [3.5023e-03, 9.9707e-01],
         [1.1311e-04, 9.9989e-01],
         [6.6353e-04, 9.9927e-01],
         [3.1236e-01, 6.5081e-01],
         [3.0795e-01, 7.1445e-01],
         [6.8178e-02, 9.2024e-01],
         [4.1018e-05, 9.9997e-01],
         [8.1422e-07, 1.0000e+00],
         [2.0870e-03, 9.9759e-01],
         [6.7640e-01, 2.9670e-01],
         [1.1128e-04, 9.9988e-01],
         [7.5257e-01, 2.4475e-01],
         [1.6366e-01

In [185]:
y_hat_l = [list(np.squeeze(i.detach().numpy())) for i in y_hat_l]
y_hat_l = [z for y in y_hat_l for z in y]

In [186]:
y_hat_l = [y[1] for y in y_hat_l]

In [187]:
y_hat_l

[0.34908077,
 0.9995685,
 0.0116755655,
 0.23265238,
 0.99987996,
 0.97447246,
 0.020144349,
 0.5523271,
 0.99966204,
 0.07683049,
 0.13916643,
 0.9996246,
 0.9985115,
 0.9423714,
 0.1461251,
 0.9994523,
 0.9970721,
 0.999895,
 0.9992704,
 0.6508115,
 0.71444577,
 0.9202393,
 0.9999653,
 0.9999993,
 0.99758804,
 0.29669628,
 0.99988186,
 0.24474835,
 0.8471245,
 0.99648213,
 0.292066,
 0.9590402,
 0.8802642,
 0.99680316,
 0.97572714,
 0.9989273,
 0.5129543,
 0.99934953,
 0.9978916,
 0.21985061,
 0.99970883,
 0.9999963,
 0.99987316,
 0.2217586,
 0.9403469,
 0.279077,
 0.9988129,
 0.99718374,
 0.8869059,
 0.9866636,
 0.9999826,
 0.30708215,
 0.5505598,
 0.15586576,
 0.99724895,
 0.9831798,
 0.01782463,
 0.9999833,
 0.022246977,
 0.009405541,
 0.799387,
 0.0043808785,
 0.14222065,
 0.42095903,
 0.3995422,
 0.004125261,
 0.17196748,
 0.91006047,
 0.016623722,
 0.89147097,
 0.37791058,
 0.40185675,
 0.98097503,
 0.6834005,
 0.97445166,
 0.11670937,
 0.18866284,
 0.02282482,
 0.984641,
 0.01

In [188]:
true_label = []
for time, snapshot in enumerate(test_dataset):
    true_label.append(list(snapshot.y.detach().numpy()))

In [189]:
true_label = [int(z) for y in true_label for z in y]

In [190]:
import numpy as np
from sklearn import metrics
y = true_label
pred = np.array(y_hat_l)
fpr, tpr, thresholds = metrics.roc_curve(y, pred)
metrics.auc(fpr, tpr)

0.6780351780351781

In [199]:
y_hat_list = [1 if x > 0.5 else 0 for x in y_hat_l]

In [198]:
from sklearn.metrics import classification_report
y_true = true_label
target_names = ['class 0', 'class 1']
print(classification_report(y_true, y_hat_list, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.49      0.36      0.41       126
     class 1       0.68      0.79      0.73       222

    accuracy                           0.64       348
   macro avg       0.59      0.57      0.57       348
weighted avg       0.62      0.64      0.62       348

