To use PyTorch Geometric Temporal, make sure torch 1.9.0 is installed (uninstall torch 1.10.0 first if it is present).

In [1]:
import torch
# Show the installed torch build; the wheel index URLs in the install cell target torch 1.9.0+cpu.
print(torch.__version__)

1.9.0+cpu


In [2]:
import torch
import numpy as np
import pandas as pd

In [3]:
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
!pip install torch-geometric
!pip install torch-geometric-temporal

Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html


In [4]:
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

def transform_and_split(data):
    """Normalise node features, binarise the labels and cast tensors to float64.

    Despite the name, no train/test split happens here: the split step that
    used to follow the conversions is disabled, so the function only
    transforms ``data`` in place, prints a summary of it and returns it.

    Parameters
    ----------
    data : graph data object exposing ``x``, ``y`` and ``edge_attr``.
        ``y`` and ``edge_attr`` must be tensors; ``x`` is handed to
        ``sklearn.preprocessing.normalize``.

    Returns
    -------
    The same ``data`` object, modified in place.
    """
    def _to_binary(value):
        # Strictly positive -> 1, everything else -> 0.
        return 1 if (value > 0) else 0

    # Row-wise max-norm scaling of the node features, then back to a tensor.
    scaled_features = normalize(data.x, axis=1, norm='max')
    data.x = torch.from_numpy(scaled_features).to(torch.float64)

    # apply_ rewrites the label tensor in place before the dtype cast.
    data.y = data.y.apply_(_to_binary).to(torch.float64)
    data.edge_attr = data.edge_attr.to(torch.double)

    print_basic_info(data)
    return data

In [5]:
def print_basic_info(data):
    """Print a short structural summary of a graph data object.

    Parameters
    ----------
    data : object
        Must expose ``num_nodes`` and ``num_edges`` plus the methods
        ``has_isolated_nodes()``, ``has_self_loops()`` and ``is_undirected()``.

    Returns
    -------
    None. Everything is written to stdout.
    """
    print()
    print(data)
    print('===========================================================================================================')

    node_count = data.num_nodes
    edge_count = data.num_edges
    # An empty graph has no meaningful degree; report 0.00 instead of raising
    # ZeroDivisionError halfway through the summary.
    average_degree = edge_count / node_count if node_count else 0.0

    print(f'Number of nodes: {node_count}')
    print(f'Number of edges: {edge_count}')
    print(f'Average node degree: {average_degree:.2f}')
    print(f'Has isolated nodes: {data.has_isolated_nodes()}')
    print(f'Has self-loops: {data.has_self_loops()}')
    print(f'Is undirected: {data.is_undirected()}')

### Get and split data

In [7]:
# Load the news articles; the first CSV column becomes the row index.
df = pd.read_csv('../data/raw/news_data.csv', index_col = 0)

In [8]:
# Drop every row that has a missing value in any column.
# NOTE: `df` is rebound here, so the unfiltered frame is no longer reachable after this cell.
df = df.dropna()

In [9]:
# Order the articles by their 'Date' column (still unparsed at this point; it is converted to datetime in a later cell).
df = df.sort_values('Date')

In [10]:
df

Unnamed: 0,Date,url,full_text
1257,2016-10-02,https://www.nytimes.com/2016/09/02/science/spa...,A spectacular explosion of a SpaceX rocket on...
3954,2016-10-02,https://www.nytimes.com/2016/09/19/business/tr...,The Treasury’s schedule of financing this wee...
6585,2016-10-02,https://www.nytimes.com/2016/09/08/technology/...,SAN FRANCISCO — Michael Dell and Meg Whitman ...
6584,2016-10-02,https://www.nytimes.com/2016/10/01/business/de...,Remember the echo of doom we mentioned yester...
4535,2016-10-02,https://www.nytimes.com/2016/09/23/business/de...,"For the last three years, JPMorgan Chase’s hi..."
...,...,...,...
7242,2021-09-04,https://www.nytimes.com/2021/08/05/us/politics...,WASHINGTON — The Biden administration is deve...
7243,2021-09-04,https://www.nytimes.com/2021/08/20/world/asia/...,The members of the Bordia family thought they...
7244,2021-09-04,https://www.nytimes.com/2021/08/18/travel/blin...,"April DeMuth and her partner, Warren Watson, ..."
6188,2021-09-04,https://www.nytimes.com/2021/09/02/business/de...,The Republican-controlled Texas Legislature t...


In [11]:
# Parse 'Date' into datetimes so the rows can be grouped by calendar quarter below.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

In [12]:
# One DataFrame per calendar quarter (freq='Q'), in chronological order; the group keys are discarded.
df_list = [part for _, part in df.groupby(pd.Grouper(key='Date', freq='Q'))]

In [13]:
df_list[0]

Unnamed: 0,Date,url,full_text
1257,2016-10-02,https://www.nytimes.com/2016/09/02/science/spa...,A spectacular explosion of a SpaceX rocket on...
3954,2016-10-02,https://www.nytimes.com/2016/09/19/business/tr...,The Treasury’s schedule of financing this wee...
6585,2016-10-02,https://www.nytimes.com/2016/09/08/technology/...,SAN FRANCISCO — Michael Dell and Meg Whitman ...
6584,2016-10-02,https://www.nytimes.com/2016/10/01/business/de...,Remember the echo of doom we mentioned yester...
4535,2016-10-02,https://www.nytimes.com/2016/09/23/business/de...,"For the last three years, JPMorgan Chase’s hi..."
...,...,...,...
2624,2016-12-03,https://www.nytimes.com/2016/11/16/business/de...,"Until last year, John S. Weinberg was a top c..."
2623,2016-12-03,https://www.nytimes.com/2016/11/27/us/politics...,When Julia Jones arrived at her office in San...
2622,2016-12-03,https://www.nytimes.com/2016/11/29/us/politics...,"WASHINGTON — Steven Mnuchin, a financier with..."
2244,2016-12-03,https://www.nytimes.com/2016/11/22/upshot/what...,"Donald J. Trump, through decades in public li..."


In [14]:
import yaml

# Load the Dow Jones company/alias configuration into the module-level `data`
# dict, which get_matrix() reads.
# safe_load builds only plain Python containers and scalars, which is all this
# config contains, and never instantiates arbitrary Python objects from YAML tags.
with open('../configs/dow_jones.yaml') as f:
    data = yaml.safe_load(f)
    print(data)

{'companies': [{'wba': {'alias': ['$wba', 'wba', 'walgreen boots alliance inc', 'walgreen boots alliance', 'walgreenbootsalliance']}}, {'v': {'alias': ['$v', 'v', 'visa inc class a', 'visa']}}, {'crm': {'alias': ['$crm', 'crm', 'salesforce.com inc', 'salesforce']}}, {'cvx': {'alias': ['$cvx', 'cvx', 'chevron corp', 'chevron']}}, {'pg': {'alias': ['$pg', 'pg', 'procter & gamble', 'procter&gamble']}}, {'vz': {'alias': ['$vz', 'vz', 'verizon communications inc', 'verizon']}}, {'wmt': {'alias': ['$wmt', 'wmt', 'walmart stores inc', 'walmart stores', 'walmart']}}, {'unh': {'alias': ['$unh', 'unh', 'unitedhealth group inc', 'unitedhealth group', 'unitedhealthgroup']}}, {'trv': {'alias': ['$trv', 'trv', 'travelers companies inc', 'travelers companies', 'travelers', 'travelerscompanies']}}, {'mcd': {'alias': ['$mcd', 'mcd', 'mcdonalds corp', 'mcdonalds']}}, {'mmm': {'alias': ['$mmm', 'mmm', '3m', '3m']}}, {'nke': {'alias': ['$nke', 'nke', 'nike inc class b', 'nike']}}, {'mrk': {'alias': ['$mrk

In [15]:
def get_matrix(df):
    """Build the company-by-company article-count matrix for one set of articles.

    Companies and their search aliases are read from the module-level ``data``
    dict (``data['companies']`` is a list of ``{ticker: {'alias': [...]}}``).

    Entry ``[j, i]`` is the number of rows of ``df`` whose ``full_text``
    contains an alias of company ``i`` OR an alias of company ``j``; the
    diagonal counts rows that mention the company itself. The matrix is
    therefore symmetric.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a string column ``full_text``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_companies, n_companies)``, in the order
        the companies appear in the config.

    Notes
    -----
    * Aliases are escaped before being joined into a pattern, so characters
      such as ``$`` and ``.`` are matched literally instead of being read as
      regex syntax (unescaped, ``$wba`` can never match and ``.`` matches any
      character).
    * NOTE(review): matching is case-sensitive substring search, so the
      lower-case aliases will not match capitalised names, and a one-letter
      alias such as ``v`` matches almost every article. Confirm whether
      case-insensitive, word-bounded matching was intended.
    * NOTE(review): off-diagonal entries count articles mentioning EITHER
      company, not both. Confirm that this is the intended edge weight.
    """
    import re  # local import: the notebook has no top-level `import re`

    company_entries = data['companies']
    companies = [next(iter(entry)) for entry in company_entries]
    alias_lists = [next(iter(entry.values()))["alias"] for entry in company_entries]

    # One regex scan per company (instead of one per company pair): a row
    # matches "aliases of A or aliases of B" exactly when it matches A's
    # pattern or B's pattern, so the pairwise counts come from OR-ing masks.
    mention_masks = []
    for aliases in alias_lists:
        pattern = "|".join(re.escape(alias) for alias in aliases)
        matched = df.full_text.str.contains(pattern, na=False)
        mention_masks.append(matched.to_numpy(dtype=bool))

    n_companies = len(companies)
    counts = np.zeros((n_companies, n_companies), dtype=np.int64)
    # Direct array writes replace the chained `res[a][b] += ...` assignment,
    # which pandas does not guarantee to write through to the frame.
    for col, col_mask in enumerate(mention_masks):
        for row, row_mask in enumerate(mention_masks):
            counts[row, col] = np.count_nonzero(col_mask | row_mask)
    return counts

In [16]:
# One company-by-company count matrix per quarterly DataFrame, in the same order as df_list.
mat_list = [get_matrix(df) for df in df_list]

In [17]:
len(mat_list)

20

In [55]:
# Label each quarterly matrix with its (year, quarter) and save it as
# ../data/raw/news_1/<year>_q<quarter>.npy.
# The walk starts at 2016 Q4 and covers however many matrices mat_list holds,
# rather than a hard-coded count.
year = 2016
quarter = 4
quarter_year = []
for quarter_matrix in mat_list:
    if quarter == 5:
        # Roll over from Q4 to Q1 of the next year.
        quarter = 1
        year += 1
    # 2021 Q3 is still written to disk but left out of quarter_year.
    # NOTE(review): presumably because no following quarter of stock data exists to label it; confirm.
    if (year, quarter) != (2021, 3):
        quarter_year.append([year, quarter])
    np.save(f'../data/raw/news_1/{year}_q{quarter}.npy', quarter_matrix)
    quarter += 1
        

In [56]:
quarter_year

[[2016, 4],
 [2017, 1],
 [2017, 2],
 [2017, 3],
 [2017, 4],
 [2018, 1],
 [2018, 2],
 [2018, 3],
 [2018, 4],
 [2019, 1],
 [2019, 2],
 [2019, 3],
 [2019, 4],
 [2020, 1],
 [2020, 2],
 [2020, 3],
 [2020, 4],
 [2021, 1],
 [2021, 2]]

In [57]:
import os
# Load closing prices from ../data/raw/stock/raw.csv, reading only the three columns used below
# and parsing 'Date' as datetimes so the .dt accessors work.
stock_df = pd.read_csv(
            os.path.join('../data/raw',"stock","raw.csv"),
            usecols=["ticker_symbol", "Date", "Close"],
            parse_dates=["Date"],
        )

In [58]:
stock_df

Unnamed: 0,Date,Close,ticker_symbol
0,2016-10-03,28.129999,aapl
1,2016-10-04,28.250000,aapl
2,2016-10-05,28.262501,aapl
3,2016-10-06,28.472500,aapl
4,2016-10-07,28.514999,aapl
...,...,...,...
36506,2021-09-27,142.250000,wmt
36507,2021-09-28,140.500000,wmt
36508,2021-09-29,140.440002,wmt
36509,2021-09-30,139.380005,wmt


In [59]:
def _quarter_close_matrix(frame, year, quarter):
    """Return the closing prices of one calendar quarter as a (ticker x date) array."""
    in_quarter = (frame.Date.dt.year == year) & (frame.Date.dt.quarter == quarter)
    return frame[in_quarter].pivot_table(
        index="Date", columns="ticker_symbol", values="Close"
    ).values.T


# For every labelled quarter build a pair (X, y):
#   X - closing prices of that quarter, one row per ticker (change this if you want to add SEC emb, etc.)
#   y - relative change of each ticker's mean close from this quarter to the next
#       (change this if you want to change labels)
# NOTE(review): pivot_table orders the ticker columns by label, so the rows here follow
# sorted ticker order, while get_matrix() orders companies as listed in the YAML config.
# Confirm that node i refers to the same company in the features and in the graph.
X_y = []
for yr, quarter in quarter_year:
    X = _quarter_close_matrix(stock_df, yr, quarter)
    X_tensor = torch.tensor(X)

    # The label quarter is the one right after the feature quarter (Q4 rolls into next year's Q1).
    y_yr, y_quarter = (yr + 1, 1) if quarter == 4 else (yr, quarter + 1)
    y = _quarter_close_matrix(stock_df, y_yr, y_quarter)

    print(y.shape)
    print(X.shape)

    current_mean = X.mean(1)
    y = (y.mean(1) - current_mean) / current_mean
    y_tensor = torch.tensor(y)
    X_y.append((X_tensor, y_tensor))

(29, 62)
(29, 63)
(29, 63)
(29, 62)
(29, 63)
(29, 63)
(29, 63)
(29, 63)
(29, 61)
(29, 63)
(29, 64)
(29, 61)
(29, 63)
(29, 64)
(29, 63)
(29, 63)
(29, 61)
(29, 63)
(29, 63)
(29, 61)
(29, 64)
(29, 63)
(29, 64)
(29, 64)
(29, 62)
(29, 64)
(29, 63)
(29, 62)
(29, 64)
(29, 63)
(29, 64)
(29, 64)
(29, 61)
(29, 64)
(29, 63)
(29, 61)
(29, 64)
(29, 63)


In [109]:
X_y

[(tensor([[ 28.1300,  28.2500,  28.2625,  ...,  29.1900,  29.1825,  28.9550],
          [167.3400, 167.5500, 167.2400,  ..., 147.6700, 147.7800, 146.2100],
          [ 63.8100,  63.9100,  64.3600,  ...,  74.3700,  73.9200,  74.0800],
          ...,
          [ 51.8800,  51.2600,  50.2700,  ...,  53.4400,  53.7400,  53.3800],
          [ 80.3800,  80.2400,  80.5100,  ...,  83.5900,  83.4800,  82.7600],
          [ 72.0100,  71.7500,  71.6700,  ...,  69.3100,  69.2600,  69.1200]],
         dtype=torch.float64),
  tensor([ 0.1615,  0.1041,  0.1293,  0.1663,  0.0445,  0.1008,  0.0638,  0.0300,
           0.1293,  0.1891,  0.1003,  0.0860,  0.1008,  0.0106,  0.0342,  0.1569,
           0.0023,  0.0701,  0.0630,  0.0367,  0.0655,  0.0771,  0.0403,  0.0526,
           0.0964,  0.0700,  0.0038,  0.0141, -0.0140], dtype=torch.float64)),
 (tensor([[ 29.0375,  29.0050,  29.1525,  ...,  36.0300,  35.9825,  35.9150],
          [150.7300, 152.8700, 152.9800,  ..., 163.0600, 164.3800, 164.0700],
    

In [60]:
from torch_geometric.utils import dense_to_sparse
# Convert the dense quarterly matrices into (edge_index, edge_attr) pairs.
# Only as many quarters as there are (features, label) pairs in X_y are converted,
# so graph snapshots and feature snapshots stay the same length without a
# hard-coded count.
edge_idx = []
edge_att = []
for i in range(len(X_y)):
    edge_index, edge_attr = dense_to_sparse(torch.from_numpy(mat_list[i]))
    edge_idx.append(edge_index)
    edge_att.append(edge_attr)

In [61]:
# Convert the per-quarter edge tensors to NumPy arrays for the temporal signal built below.
edge_indices = [i.numpy() for i in edge_idx]
edge_weights = [i.numpy() for i in edge_att]

In [115]:
mat_list

[array([[ 17, 402,  17,  19,  27,  17,  17,  33,  34,  17,  17, 135,  17,
          17,  17,  62,  17, 301,  62, 163,  17,  81, 336, 281,  63,  24,
         389,  17,  17],
        [402, 402, 402, 402, 402, 402, 402, 402, 402, 402, 402, 402, 402,
         402, 402, 402, 402, 402, 402, 402, 402, 402, 402, 402, 402, 402,
         402, 402, 402],
        [ 17, 402,   0,   2,  10,   0,   0,  16,  17,   0,   0, 119,   0,
           0,   0,  46,   0, 296,  50, 149,   0,  64, 327, 271,  51,   7,
         389,   0,   0],
        [ 19, 402,   2,   2,  12,   2,   2,  18,  19,   2,   2, 120,   2,
           2,   2,  47,   2, 296,  52, 149,   2,  65, 327, 271,  53,   9,
         389,   2,   2],
        [ 27, 402,  10,  12,  10,  10,  10,  26,  25,  10,  10, 129,  10,
          10,  10,  56,  10, 296,  60, 155,  10,  73, 330, 276,  61,  17,
         389,  10,  10],
        [ 17, 402,   0,   2,  10,   0,   0,  16,  17,   0,   0, 119,   0,
           0,   0,  46,   0, 296,  50, 149,   0,  64, 327, 27

In [62]:
len(X_y)

19

In [63]:
# Build the per-quarter node features and binary targets from every (X, y) pair
# in X_y, instead of a hard-coded number of quarters.
features = []
targets = []
for quarter_prices, quarter_change in X_y:
    # Scale each ticker's price series by its row-wise max norm.
    features.append(normalize(quarter_prices.numpy(), axis=1, norm='max'))
    # Class 1 when the relative change to the next quarter is positive, else class 0.
    targets.append([1 if change > 0 else 0 for change in quarter_change.numpy()])
targets = np.asarray(targets)

In [110]:
targets[3]

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 0, 1])

In [65]:
mat_list

[array([[ 17, 402,  17,  19,  27,  17,  17,  33,  34,  17,  17, 135,  17,
          17,  17,  62,  17, 301,  62, 163,  17,  81, 336, 281,  63,  24,
         389,  17,  17],
        [402, 402, 402, 402, 402, 402, 402, 402, 402, 402, 402, 402, 402,
         402, 402, 402, 402, 402, 402, 402, 402, 402, 402, 402, 402, 402,
         402, 402, 402],
        [ 17, 402,   0,   2,  10,   0,   0,  16,  17,   0,   0, 119,   0,
           0,   0,  46,   0, 296,  50, 149,   0,  64, 327, 271,  51,   7,
         389,   0,   0],
        [ 19, 402,   2,   2,  12,   2,   2,  18,  19,   2,   2, 120,   2,
           2,   2,  47,   2, 296,  52, 149,   2,  65, 327, 271,  53,   9,
         389,   2,   2],
        [ 27, 402,  10,  12,  10,  10,  10,  26,  25,  10,  10, 129,  10,
          10,  10,  56,  10, 296,  60, 155,  10,  73, 330, 276,  61,  17,
         389,  10,  10],
        [ 17, 402,   0,   2,  10,   0,   0,  16,  17,   0,   0, 119,   0,
           0,   0,  46,   0, 296,  50, 149,   0,  64, 327, 27

In [66]:
features[1].shape

(29, 62)

In [91]:
# Quarters have different numbers of trading days, so right-pad every feature
# matrix to 64 columns; the padding value for each row is that row's mean.
padded_features = [
    np.pad(quarter_features, [(0, 0), (0, 64 - quarter_features.shape[1])], 'mean')
    for quarter_features in features
]

In [73]:
# Load every .npy file under ../data/raw/sec/ in sorted filename order and stack them into one tensor.
# NOTE(review): presumably one embedding file per company; confirm the sorted filename order matches the node order used elsewhere.
comp_emb = []
for fp in sorted(os.listdir("../data/raw/sec/")):
    full_path = os.path.join("../data/raw", "sec", fp)
    # Skip anything that is not a NumPy array file.
    if fp.split(".")[-1]=='npy':
        comp_emb.append(torch.from_numpy(np.load(full_path)))
comp_emb = torch.stack(comp_emb)

In [74]:
# Repeat the same embedding array 19 times along a new leading axis.
# NOTE(review): 19 presumably matches the number of snapshots (len(X_y)); confirm.
# NOTE: this rebinds `comp_emb` from a tensor to an ndarray, so the cell cannot be re-run (ndarray has no .numpy()).
comp_emb = np.asarray([comp_emb.numpy() for i in range(19)])

In [75]:
# Append the company embeddings to the padded price features along the feature axis.
# NOTE(review): the recorded shape shown after this cell is (19, 29, 64) and the model is built with
# node_features=64, which suggests this concatenation was not in effect in the recorded run
# (its execution count, 75, precedes the padding cell's 91). On a top-to-bottom run it widens the
# feature axis beyond 64; confirm whether this cell should be kept.
padded_features = np.concatenate((padded_features, comp_emb), axis = 2)

In [93]:
padded_features = np.asarray(padded_features)
padded_features.shape

(19, 29, 64)

In [94]:
from torch_geometric_temporal.signal import DynamicGraphTemporalSignal

In [95]:
# Bundle the per-quarter edges, edge weights, node features and targets into one dynamic-graph signal.
temporal_signal = DynamicGraphTemporalSignal(edge_indices = edge_indices , edge_weights = edge_weights, features = padded_features, targets = targets)

In [96]:
temporal_signal

<torch_geometric_temporal.signal.dynamic_graph_temporal_signal.DynamicGraphTemporalSignal at 0x28b57ebeb20>

In [97]:
from torch_geometric_temporal.signal import temporal_signal_split

# Split the snapshot sequence into train and test parts with a 0.8 train ratio.
train_dataset, test_dataset = temporal_signal_split(temporal_signal, train_ratio=0.8)

In [111]:
import torch
import torch.nn.functional as F
from torch_geometric_temporal.nn.recurrent import DCRNN
from torch_geometric_temporal import TemporalConv
from torch_geometric_temporal import EvolveGCNO
from torch_geometric_temporal import GConvGRU
class RecurrentGCN(torch.nn.Module):
    """Two-output node classifier: EvolveGCN-O layer -> ReLU -> linear head -> sigmoid.

    Parameters
    ----------
    node_features : int
        Number of input features per node. The linear head is sized from this
        value, so the model is no longer tied to exactly 64 features.
    """

    def __init__(self, node_features):
        super().__init__()
        self.evol = EvolveGCNO(node_features)
        # `recurrent`, `conv` and `dropout` are constructed but not used by
        # forward(); they are kept so attribute access, parameter order and
        # saved state dicts stay compatible.
        self.recurrent = DCRNN(node_features, 16, 1)
        self.conv = GConvGRU(node_features, 16, 2)
        # The head consumes the EvolveGCN-O output, which has `node_features`
        # columns, so size it from the argument instead of a hard-coded 64.
        self.linear = torch.nn.Linear(node_features, 2)
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, x, edge_index, edge_weight):
        """Return a (num_nodes, 2) tensor of per-class scores in (0, 1).

        NOTE(review): the scores go through a sigmoid, yet the training loop
        in this notebook feeds them to CrossEntropyLoss, which expects
        unnormalised logits. Returning raw logits (and applying softmax only
        at evaluation time) is the usual pairing; confirm and change the
        model, training and evaluation cells together.
        """
        h = self.evol(x, edge_index, edge_weight)
        h = F.relu(h)
        h = self.linear(h)
        h = torch.sigmoid(h)
        return h

In [112]:
from tqdm import tqdm

model = RecurrentGCN(node_features=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# The loss module holds no state, so one instance serves every snapshot.
# NOTE(review): CrossEntropyLoss expects unnormalised logits, but the model's
# forward() ends in a sigmoid; confirm which of the two should change.
criterion = torch.nn.CrossEntropyLoss()

model.train()

for epoch in tqdm(range(100)):
    # Accumulate the loss over all training snapshots, in order, and take a
    # single optimiser step per epoch.
    loss = 0
    for snapshot in train_dataset:
        y_hat = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        loss = loss + criterion(y_hat, snapshot.y.long())

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

100%|██████████| 100/100 [00:03<00:00, 26.12it/s]


In [113]:
# Collect the model's per-node scores for every test snapshot.
y_hat_l = []
model.eval()
# Inference only: disabling autograd avoids building a graph for every snapshot,
# so the stored outputs no longer carry a grad_fn.
with torch.no_grad():
    for snapshot in test_dataset:
        y_hat = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        y_hat_l.append(y_hat)


In [114]:
y_hat_l

[tensor([[0.3385, 0.9999],
         [0.5066, 1.0000],
         [0.3256, 0.9996],
         [0.3259, 0.9996],
         [0.3459, 0.9999],
         [0.3254, 0.9996],
         [0.3260, 0.9996],
         [0.3325, 0.9998],
         [0.3630, 1.0000],
         [0.3258, 0.9996],
         [0.3257, 0.9996],
         [0.4053, 1.0000],
         [0.3258, 0.9996],
         [0.3258, 0.9996],
         [0.3258, 0.9996],
         [0.3873, 1.0000],
         [0.3256, 0.9996],
         [0.4838, 1.0000],
         [0.3686, 1.0000],
         [0.4249, 1.0000],
         [0.3258, 0.9996],
         [0.3687, 1.0000],
         [0.4954, 1.0000],
         [0.4774, 1.0000],
         [0.3495, 1.0000],
         [0.3388, 0.9999],
         [0.5060, 1.0000],
         [0.3260, 0.9996],
         [0.3257, 0.9996]], grad_fn=<SigmoidBackward>),
 tensor([[0.3017, 0.9999],
         [0.0939, 1.0000],
         [0.3259, 0.9996],
         [0.3257, 0.9996],
         [0.2900, 0.9999],
         [0.3264, 0.9996],
         [0.3259, 0.9996],

In [116]:
# Turn each snapshot's output tensor into a list of per-node score rows, then flatten across snapshots.
# NOTE: `y_hat_l` is rebound to a different structure here, so this cell cannot be re-run without re-running the evaluation loop.
y_hat_l = [list(np.squeeze(i.detach().numpy())) for i in y_hat_l]
y_hat_l = [z for y in y_hat_l for z in y]

In [117]:
# Keep only the second output column (index 1), used below as the score for class 1.
y_hat_l = [y[1] for y in y_hat_l]

In [118]:
y_hat_l

[0.9998522,
 1.0,
 0.99959296,
 0.99959296,
 0.99992037,
 0.9995927,
 0.99959296,
 0.999803,
 0.99998426,
 0.9995931,
 0.9995931,
 0.99999964,
 0.9995931,
 0.99959284,
 0.9995931,
 0.9999988,
 0.9995931,
 1.0,
 0.9999906,
 1.0,
 0.99959296,
 0.99998534,
 1.0,
 1.0,
 0.9999585,
 0.99987423,
 1.0,
 0.99959296,
 0.99959296,
 0.99986374,
 1.0,
 0.99956197,
 0.99956185,
 0.99993074,
 0.99956197,
 0.9995621,
 0.99987435,
 0.9999771,
 0.99956185,
 0.9995621,
 0.99999976,
 0.9995621,
 0.99956197,
 0.9995621,
 0.99999607,
 0.9995621,
 1.0,
 0.9999893,
 1.0,
 0.9995622,
 0.9999889,
 1.0,
 1.0,
 0.99991643,
 0.9998053,
 1.0,
 0.99956197,
 0.9995622,
 0.9997795,
 1.0,
 0.999546,
 0.9995944,
 0.9999037,
 0.999546,
 0.9995461,
 0.99994147,
 0.99996054,
 0.999546,
 0.9995461,
 0.99999976,
 0.9995461,
 0.9995461,
 0.99954623,
 0.99999976,
 0.99954623,
 1.0,
 0.9999627,
 1.0,
 0.99954623,
 0.99995315,
 1.0,
 1.0,
 0.9999473,
 0.99977666,
 1.0,
 0.9995461,
 0.99954623,
 0.99991703,
 1.0,
 0.99963987,
 0

In [119]:
# Gather the target vector of every test snapshot, in the same order as the predictions.
true_label = []
for time, snapshot in enumerate(test_dataset):
    true_label.append(list(snapshot.y.detach().numpy()))

In [120]:
# Flatten the per-snapshot label lists into one list of Python ints.
true_label = [int(z) for y in true_label for z in y]

In [121]:
import numpy as np
from sklearn import metrics
# ROC AUC of the class-1 scores against the true labels.
# NOTE(review): the recorded result (about 0.44) is below 0.5, i.e. the scores rank nodes no better than chance.
y = true_label
pred = np.array(y_hat_l)
fpr, tpr, thresholds = metrics.roc_curve(y, pred)
metrics.auc(fpr, tpr)

0.44011627906976747

In [None]:
# Threshold the class-1 scores at 0.5 to obtain hard 0/1 predictions.
y_hat_list = [1 if x > 0.5 else 0 for x in y_hat_l]

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 for the thresholded predictions.
y_true = true_label
target_names = ['class 0', 'class 1']
print(classification_report(y_true, y_hat_list, target_names=target_names))