In [10]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.nn as dglnn

from collections import Counter

In [2]:
data = np.load('./phase1_gdata.npz')

node_feat = data['x']
node_label = data['y']
edge_pair = data['edge_index']
edge_type = data['edge_type']
edge_time = data['edge_timestamp']
    
train_mask = data['train_mask']
test_mask = data['test_mask']

g = dgl.graph(edge_pair.tolist())
ss = StandardScaler()
g.ndata['feat'] = torch.Tensor(ss.fit_transform(node_feat))
g.ndata['label'] = torch.Tensor(node_label)
g.edata['type'] = torch.Tensor(edge_type)
g.edata['time'] = torch.Tensor(edge_time)

g

Graph(num_nodes=4059035, num_edges=4962032,
      ndata_schemes={'feat': Scheme(shape=(17,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.float32)}
      edata_schemes={'type': Scheme(shape=(), dtype=torch.float32), 'time': Scheme(shape=(), dtype=torch.float32)})

In [3]:
class SAGE(nn.Module):
    def __init__(self, in_feat, h_feat, out_feat):
        super(SAGE, self).__init__()
        self.conv1 = dglnn.SAGEConv(in_feat, h_feat, aggregator_type="mean")
        self.conv2 = dglnn.SAGEConv(h_feat, out_feat, aggregator_type="mean")
#         self.linear = nn.Linear(h_feat_2, out_feat)
#         self.dropout = nn.Dropout(0.2)
        
    def forward(self, g, inputs):
        h = self.conv1(g, inputs)
        h = F.relu(h)
#         h = self.dropout(h)
        h = self.conv2(g, h)
        return h

In [33]:
model = SAGE(node_feat.shape[1], 128, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.02)

In [61]:
np.random.shuffle(train_mask)
train_idx = train_mask[:int(len(train_mask)/100*80)]
val_idx = train_mask[int(len(train_mask)/100*80):]

features = g.ndata['feat']
labels = g.ndata['label'].type(torch.long)

In [63]:
epochs = 100

for epoch in range(epochs):
    model.train()
        
    logits = model(g, features)
#     pred = logits.argmax(1)
    loss = F.nll_loss(logits[train_idx], labels[train_idx])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print("epoch {}, loss {}".format(epoch, loss))

            
    model.eval()
    with torch.no_grad():
        logits = model(g, features).view(-1)
        train_logits = logits[train_idx].numpy()
        val_logits = logits[val_idx].numpy()
        
        train_labels = labels[train_idx]
        val_labels = labels[val_idx]
        
        train_auc = roc_auc_score(train_labels, train_logits)
        val_auc = roc_auc_score(val_labels, val_logits)
        print("epoch {}, train_auc {}, val_auc {}".format(epoch, train_auc, val_auc))
        
    if epoch % 50 == 49:
        print("reduce lr...")
        for p in optimizer.param_groups:
            p['lr'] *= 0.5

epoch 0, loss -9.577591896057129
epoch 0, train_auc 0.5039595956594605, val_auc 0.5097437584523443
epoch 1, loss -12.918706893920898
epoch 1, train_auc 0.5039644630937843, val_auc 0.5096536293974707
epoch 2, loss -17.31730079650879
epoch 2, train_auc 0.503971491264474, val_auc 0.5095182542472245
epoch 3, loss -22.681764602661133


KeyboardInterrupt: 

In [7]:
model.eval()
with torch.no_grad():
    test_logits = torch.sigmoid(model(g, features)[test_mask]).numpy()

In [9]:
res = np.concatenate((1-test_logits, test_logits), axis=1)
np.save("sage_full.npy", res)

In [12]:
val_logits

array([-5.973411 , -4.1339426, -5.5519123, ..., -3.489771 , -6.6762376,
       -4.7445393], dtype=float32)

In [23]:
roc_auc_score(val_labels, logits[val_idx])

0.7487288829799463

In [25]:
sum(labels[train_idx])

tensor(7769.)

In [26]:
len(labels[train_idx])

661877

In [27]:
7769/661877

0.011737830442816415