In [10]:
# Enable IPython's autoreload extension in mode 2 so edits to imported local
# modules are picked up without restarting the kernel.
# Re-running this cell prints an "already loaded" notice; `%reload_ext autoreload`
# is the form that notice suggests for repeated runs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import os
import json
import time
import argparse

import torch
import pandas as pd
import numpy as np
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
import torch_geometric.transforms as T
from node2vec_impl_dp import Node2Vec

# Training schedule defaults.
# NOTE(review): BATCH_SIZE is assigned again (to 32) in the tuned-hyperparameter
# cell further down; under Restart & Run All that later value is the one in effect.
NUM_EPOCHS, BATCH_SIZE = 100, 64

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [12]:
device  # echo the selected device string as this cell's output

'cuda'

In [13]:
# Where input data is read from and where logs / model files are written.
DATA_PATH = '../data/'
SAVE_PATH = 'results/'

# Labels describing the hardware of this run; they are embedded in the names
# of the log and model files. NUM_GPUS is also used to scale the training loss.
TYPE = 'v100'
NUM_GPUS = 1
NUM_GPUS_STR = f'{NUM_GPUS}'

In [None]:
# Hyperparameters selected by the tuning experiments.
P = Q = 1          # forwarded to Node2Vec as p and q
WALK = 50          # forwarded to Node2Vec as walk_length
BATCH_SIZE = 32    # batch size of the walk loader (replaces the earlier value of 64)
LR = 2.5e-3        # learning rate given to the optimizer

In [14]:
# Load the name -> integer index map used to turn node names into node ids.
with open(f'{DATA_PATH}reddit_index.json') as f:
    reddit_dict = json.load(f)

# Read the whole edge list (no header row) into memory as a DataFrame.
# Columns used below: 0 = source name, 1 = target name, 2 = edge weight.
df = pd.read_csv(f'{DATA_PATH}reddit_subreddit_to_domain__gt-01-urls.csv', header=None)

# Map source/target names to their integer ids. A name that is missing from
# the index map raises KeyError here rather than producing a bad edge.
source_nodes = df.iloc[:, 0].apply(lambda x: reddit_dict[x]).tolist()
target_nodes = df.iloc[:, 1].apply(lambda x: reddit_dict[x]).tolist()
weight = df.iloc[:, 2].tolist()

# Build the PyTorch Geometric graph: edge_index is (2, num_edges), edge_attr
# is (num_edges, 1) because of the trailing [:, None].
edge_index = torch.tensor([source_nodes, target_nodes])
edge_attr = torch.tensor(weight)[:, None]

# Size the node set by the largest id that appears, not by the count of
# distinct ids: the two only coincide when the ids used by this edge list are
# exactly 0..N-1. Counting distinct ids would under-report num_nodes whenever
# the index map also contains names absent from this file, and every id in
# edge_index must be smaller than num_nodes for the transform below.
num_nodes = int(edge_index.max()) + 1 if edge_index.numel() > 0 else 0

data = Data(edge_index=edge_index, edge_attr=edge_attr)
data.num_nodes = num_nodes

# Make the graph undirected before it is handed to node2vec.
transform = T.ToUndirected()
data = transform(data)

# Load the per-domain ideology scores that serve as the evaluation target,
# keeping only the two columns that are used.
domain_ideology = pd.read_csv(f'{DATA_PATH}robertson_et_al.csv')[['domain', 'score']].copy()

# Attach each domain's node id; dict.get yields None for domains that are not
# in the index map, and those rows are dropped on the next line.
domain_ideology['id'] = domain_ideology['domain'].map(reddit_dict.get)
domain_ideology = domain_ideology.loc[domain_ideology['id'].notna()].reset_index(drop=True)

# With the missing values gone the ids can be stored as plain integers.
domain_ideology['id'] = domain_ideology['id'].astype('int64')

# Train / validation / test split of the labelled domains (fixed seeds).
#   80% -> train_df, remaining 20% -> test_df
#   80% of train_df -> train_sub, remaining 20% of train_df -> val
# The frames are deliberately NOT called `train` / `test`: the next cell
# defines functions with those names, and sharing the names meant that
# re-running this cell would silently replace the functions with DataFrames.
train_df = domain_ideology.sample(frac=0.8, random_state=42)
test_df = domain_ideology[~domain_ideology.index.isin(train_df.index)]
train_sub = train_df.sample(frac=0.8, random_state=24)
val = train_df[~train_df.index.isin(train_sub.index)]

# Node ids (features are looked up by id later) and target scores.
train_x, train_y = train_sub['id'].tolist(), train_sub['score'].tolist()
val_x, val_y = val['id'].tolist(), val['score'].tolist()

In [15]:
# Build the node2vec model on the graph's edge list and wrap it in DataParallel.
# NOTE(review): DataParallel is created without explicit device_ids, while
# train() divides the loss by NUM_GPUS -- confirm the two agree on this machine.
model = torch.nn.DataParallel(
    Node2Vec(
        data.edge_index,
        embedding_dim=128,
        walk_length=WALK,
        context_size=10,
        walks_per_node=10,
        num_negative_samples=1,
        p=P,
        q=Q,
        sparse=True,
    )
)
model.to(device)

# The loader comes from the wrapped module itself (model.module), not from the
# DataParallel wrapper.
loader = model.module.loader(batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

# sparse=True above is paired with the sparse variant of Adam here.
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=LR)

def train():
    """
    Run one pass over the walk loader, updating the model after every batch.

    Each item yielded by the loader is a (positive walks, negative walks)
    pair. The two tensors are joined along the last dimension so the
    DataParallel wrapper receives a single input tensor. The value returned by
    the model is summed and divided by NUM_GPUS before back-propagation.

    Returns the average of the per-batch loss values over the whole loader.
    """
    model.train()
    running_loss = 0
    for positive_walks, negative_walks in loader:
        optimizer.zero_grad()

        # single tensor per batch: positives and negatives side by side
        walks = torch.cat((positive_walks, negative_walks), -1).to(device)

        # model(...) goes through DataParallel; model.module(...) would call
        # the underlying module directly and bypass it
        batch_loss = model(walks).sum() / NUM_GPUS

        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(loader)

@torch.no_grad()
def test():
    """
    Score the current embeddings on the downstream ideology regression task.

    The underlying module is called with no arguments and its result is
    indexed by node id to obtain feature rows. A Ridge regressor (alpha=0.01)
    is fitted on the training rows and evaluated on both the training and
    validation rows.

    Returns a (train_mse, val_mse) tuple.
    """
    model.eval()
    embeddings = model.module()

    from sklearn.linear_model import Ridge
    from sklearn.metrics import mean_squared_error

    # Move each feature matrix to NumPy once and reuse it for fit and predict.
    train_features = embeddings[train_x].detach().cpu().numpy()
    val_features = embeddings[val_x].detach().cpu().numpy()

    regressor = Ridge(alpha=0.01).fit(train_features, train_y)
    train_mse = mean_squared_error(train_y, regressor.predict(train_features))
    val_mse = mean_squared_error(val_y, regressor.predict(val_features))

    return train_mse, val_mse

# Training driver: run NUM_EPOCHS epochs, log one CSV row per epoch, write a
# checkpoint whenever validation MSE improves, and save the final weights.

# Make sure the output directory exists, and use one path (built from
# SAVE_PATH) for both the header and the per-epoch rows.
os.makedirs(SAVE_PATH, exist_ok=True)
log_path = f'{SAVE_PATH}jupyter_{TYPE}_{NUM_GPUS_STR}_log.txt'

# A header line is appended at the start of every run.
with open(log_path, 'a') as f:
    f.write('Loss,Train MSE,Val MSE,Total Time,Train Time,Val Time\n')

# Start from +inf so the first epoch always counts as an improvement; a
# starting value of 0.0 can never be beaten by a (non-negative) MSE, which
# meant no checkpoint was ever written.
best_val_mse = float('inf')

# Timers (seconds): total is measured from here; train/val are per epoch.
start_time = time.perf_counter()

# range is inclusive of NUM_EPOCHS so that exactly NUM_EPOCHS epochs run.
for epoch in range(1, NUM_EPOCHS + 1):
    # train
    train_start = time.perf_counter()
    loss = train()
    train_time = time.perf_counter() - train_start

    # validation
    val_start = time.perf_counter()
    train_mse, val_mse = test()
    val_time = time.perf_counter() - val_start

    end_time = time.perf_counter()

    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Train MSE: {train_mse:.4f}, Val MSE: {val_mse:.4f}, Total Time: {(end_time-start_time)/60:.2f} mins, Train time: {train_time/60:.2f} mins, Val Time: {val_time/60:.2f} mins')

    with open(log_path, 'a') as f:
        f.write(f'{str(loss)},{str(train_mse)},{str(val_mse)},{str(end_time-start_time)},{str(train_time)},{str(val_time)}\n')

    # checkpoint: keep model + optimizer state for every new best validation MSE
    if val_mse < best_val_mse:
        best_val_mse = val_mse
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            'val_mse': val_mse,
        }, f'{SAVE_PATH}{epoch}.pth')

# save model
torch.save(model.state_dict(), f'{SAVE_PATH}jupyter_{TYPE}_{NUM_GPUS}.pth')

Epoch: 01, Loss: 3.5461, Train MSE: 0.1857, Val MSE: 0.1856, Total Time: 2.46 mins, Train time: 2.46 mins, Val Time: 0.00 mins
Epoch: 02, Loss: 1.3729, Train MSE: 0.1858, Val MSE: 0.1841, Total Time: 4.93 mins, Train time: 2.47 mins, Val Time: 0.00 mins
Epoch: 03, Loss: 0.9798, Train MSE: 0.1865, Val MSE: 0.1835, Total Time: 7.39 mins, Train time: 2.45 mins, Val Time: 0.00 mins
Epoch: 04, Loss: 0.8827, Train MSE: 0.1864, Val MSE: 0.1843, Total Time: 9.85 mins, Train time: 2.47 mins, Val Time: 0.00 mins
Epoch: 05, Loss: 0.8560, Train MSE: 0.1859, Val MSE: 0.1834, Total Time: 12.31 mins, Train time: 2.46 mins, Val Time: 0.00 mins
Epoch: 06, Loss: 0.8447, Train MSE: 0.1839, Val MSE: 0.1813, Total Time: 14.78 mins, Train time: 2.47 mins, Val Time: 0.00 mins
Epoch: 07, Loss: 0.8383, Train MSE: 0.1822, Val MSE: 0.1798, Total Time: 17.24 mins, Train time: 2.46 mins, Val Time: 0.00 mins
Epoch: 08, Loss: 0.8342, Train MSE: 0.1736, Val MSE: 0.1734, Total Time: 19.70 mins, Train time: 2.46 mins, 

Epoch: 65, Loss: 0.8181, Train MSE: 0.1225, Val MSE: 0.1260, Total Time: 160.89 mins, Train time: 2.48 mins, Val Time: 0.00 mins
Epoch: 66, Loss: 0.8181, Train MSE: 0.1228, Val MSE: 0.1274, Total Time: 163.37 mins, Train time: 2.48 mins, Val Time: 0.00 mins
Epoch: 67, Loss: 0.8180, Train MSE: 0.1228, Val MSE: 0.1238, Total Time: 165.84 mins, Train time: 2.46 mins, Val Time: 0.00 mins
Epoch: 68, Loss: 0.8181, Train MSE: 0.1227, Val MSE: 0.1198, Total Time: 168.29 mins, Train time: 2.45 mins, Val Time: 0.00 mins
Epoch: 69, Loss: 0.8181, Train MSE: 0.1226, Val MSE: 0.1203, Total Time: 170.76 mins, Train time: 2.46 mins, Val Time: 0.00 mins
Epoch: 70, Loss: 0.8180, Train MSE: 0.1244, Val MSE: 0.1222, Total Time: 173.23 mins, Train time: 2.47 mins, Val Time: 0.00 mins
Epoch: 71, Loss: 0.8181, Train MSE: 0.1246, Val MSE: 0.1235, Total Time: 175.70 mins, Train time: 2.47 mins, Val Time: 0.00 mins
Epoch: 72, Loss: 0.8180, Train MSE: 0.1241, Val MSE: 0.1247, Total Time: 178.17 mins, Train time: