## Tuning p

In [None]:
# installing packages
# # !pip3 install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio==0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
# # !pip3 install jupyter jupyterhub pandas matplotlib scipy scikit-learn scikit-image Pillow
# !pip3 install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
# !pip3 install torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
# !pip3 install torch-cluster -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
# !pip3 install torch-geometric -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
# !pip3 install xgboost

# importing libraries
import os
import json
import csv 
import warnings
warnings.filterwarnings(action="ignore")

import numpy as np
import torch
import pandas as pd
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
import torch_geometric.transforms as T
from torch_geometric.nn import Node2Vec

from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau, CyclicLR

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
import xgboost as xg
import time

import matplotlib.pyplot as plt

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# !nvidia-smi

In [None]:
# Name -> integer index lookup. The keys are the names found in the edge-list
# and ideology CSVs; the values are used as node ids further down.
with open('../data/reddit_index.json') as f:
    reddit_dict = json.loads(f.read())

In [None]:
# Subreddit-domain network: each CSV row is (source name, target name, weight).
df = pd.read_csv('../data/reddit_subreddit_to_domain__gt-01-urls.csv', header=None)

# Translate both endpoint columns from names to integer node ids.
source_nodes = [reddit_dict[name] for name in df.iloc[:, 0]]
target_nodes = [reddit_dict[name] for name in df.iloc[:, 1]]

# Number of distinct node ids that occur in at least one edge.
# NOTE(review): this equals max id + 1 only if the ids are contiguous and every
# indexed name appears in the edge list - confirm against reddit_index.json.
num_nodes = len(set(source_nodes) | set(target_nodes))

weight = df.iloc[:, 2].tolist()

# edge_index is a 2 x E tensor; edge_attr gets a trailing axis so it is E x 1.
edge_index = torch.tensor([source_nodes, target_nodes])
edge_attr = torch.tensor(weight).unsqueeze(1)

data = Data(edge_index=edge_index, edge_attr=edge_attr)
data.num_nodes = num_nodes

# Add the reverse of every edge so the graph is treated as undirected.
transform = T.ToUndirected()
data = transform(data)

In [None]:
# Target labels (political ideology score), restricted to the domains that
# have an entry in reddit_dict.
domain_ideology = pd.read_csv('../data/robertson_et_al.csv')[['domain', 'score']].copy()

# Domains missing from the lookup get None here and are dropped on the next line.
domain_ideology['id'] = domain_ideology['domain'].apply(lambda name: reddit_dict.get(name))
domain_ideology = domain_ideology.dropna(subset=['id']).reset_index(drop=True)

# With the missing values gone the id column can be stored as plain integers.
domain_ideology['id'] = domain_ideology['id'].astype('int64')

In [None]:
# Train / validation / test split of the labelled domains.
# 80% of the rows go to `train`; the rest form `test`.
train = domain_ideology.sample(frac=0.8, random_state=42)
test = domain_ideology.drop(train.index)

# `train` is split 80/20 again into the rows the regressor is fitted on
# (`train_sub`) and the rows used for validation (`val`).
train_sub = train.sample(frac=0.8, random_state=24)
val = train.drop(train_sub.index)

# Node ids (x) and ideology scores (y) of each partition as plain lists.
train_x = train_sub['id'].tolist()
train_y = train_sub['score'].tolist()
val_x = val['id'].tolist()
val_y = val['score'].tolist()
test_x = test['id'].tolist()
test_y = test['score'].tolist()

In [None]:
def predict(p=1, q=1, walk_length=10, classifier="Ridge", start_epochs=0, num_epochs=30,
            batch_size=128, learning_rate=0.01):
    '''
    Predict the political ideology of each node in the subreddit-domain network using:
    1. node2vec for creating (embedding) feature representations of the nodes of the graph
    2. a regressor (ridge regression by default) fitted on those embeddings to predict
       the ideology score of the labelled nodes

    After every node2vec epoch a fresh regressor is fitted on the current embeddings of
    the `train_x` nodes and scored on both the `train_x` and `val_x` nodes.

    Parameters
    ----------
    p, q : forwarded to Node2Vec as its `p` and `q` arguments.
    walk_length : forwarded to Node2Vec as `walk_length`.
    classifier : which regressor to fit on the embeddings: "Ridge", "RF"
        (RandomForestRegressor) or "XGB" (XGBRegressor).
    start_epochs, num_epochs : epochs are numbered start_epochs+1 .. num_epochs,
        so num_epochs - start_epochs epochs are run.
    batch_size : batch size of the node2vec random-walk loader.
    learning_rate : learning rate of the SparseAdam optimizer.

    Returns
    -------
    (train_loss_list, val_mse_list) : two lists with one entry per epoch, holding the
        mean node2vec loss over the loader batches and the validation MSE.

    Raises
    ------
    ValueError : if `classifier` is not one of the supported names.
    '''
    # Reject an unsupported regressor name before any (expensive) training happens.
    if classifier not in ("Ridge", "RF", "XGB"):
        raise ValueError(
            f'classifier must be one of "Ridge", "RF" or "XGB", got {classifier!r}')

    model = Node2Vec(data.edge_index, embedding_dim=128,
                 walk_length=walk_length, context_size=10,
                 walks_per_node=10, num_negative_samples=1,
                 p=p, q=q, sparse=True).to(device)

    # sparse=True makes the embedding produce sparse gradients, hence SparseAdam.
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=learning_rate)
    loader = model.loader(batch_size=batch_size, shuffle=True, num_workers=8)

    train_loss_list = []
    val_mse_list = []

    def train():
        '''Run one pass over the random-walk loader and return the mean batch loss.'''
        model.train()
        total_loss = 0
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        return total_loss / len(loader)

    @torch.no_grad()
    def test(classifier="Ridge"):
        '''Fit the chosen regressor on the current embeddings; return (train MSE, val MSE).'''
        model.eval()
        z = model()

        # Embeddings of the labelled nodes, moved to numpy for sklearn / xgboost.
        train_features = z[train_x].detach().cpu().numpy()
        val_features = z[val_x].detach().cpu().numpy()

        if classifier=="Ridge":
            clf = Ridge(alpha=0.01)
        elif classifier=="RF":
            clf = RandomForestRegressor(max_depth=10, random_state=0)
        elif classifier=="XGB":
            clf = xg.XGBRegressor(objective ='reg:squarederror',
                                  n_estimators = 10,
                                  seed = 0)
        clf.fit(train_features, train_y)

        val_preds = clf.predict(val_features)
        train_preds = clf.predict(train_features)
        return mean_squared_error(train_y, train_preds), mean_squared_error(val_y, val_preds)


    start_time = time.time()
    for epoch in range(start_epochs+1, num_epochs+1):
        loss = train()
        # Forward the caller's choice; otherwise the inner default ("Ridge") is always used.
        train_mse, val_mse = test(classifier)
        end_time = time.time()

        train_loss_list.append(loss)
        val_mse_list.append(val_mse)

        # Time is the wall-clock time elapsed since the first epoch started.
        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Train MSE: {train_mse:.4f}, Val MSE: {val_mse:.4f}, Time: {(end_time-start_time)/60:.2f} mins')
    return train_loss_list, val_mse_list

In [None]:
def _plot_curves(curves, labels, ylabel, title, filename):
    '''Draw one line per label (curves[i] for the i-th label), save to `filename`, then show.'''
    plt.figure(figsize=(10,5))
    for i, label in enumerate(labels):
        plt.plot(curves[i], marker="o", label=label)
    # Each curve is plotted against its list index, i.e. one point per epoch.
    plt.xlabel("Epoch", size=14)
    plt.ylabel(ylabel, size=14)
    plt.title(title, size=16)
    plt.legend()
    plt.savefig(filename)
    plt.show()


def visualization(train_losses, val_mses, flag, tune_range=None):
    '''
    Visualizing train loss and validation MSE
    for a specific hyperparameter across multiple experiments

    Parameters
    ----------
    train_losses : sequence of per-epoch train-loss curves, one per experiment.
    val_mses : sequence of per-epoch validation-MSE curves, one per experiment.
    flag : name of the tuned hyperparameter; used in the title, the legend labels
        and the output file names (flag+'_train_loss.png', flag+'_val_mse.png').
    tune_range : the hyperparameter value of each experiment, in the same order as
        the curves. When omitted, the curves are labelled "run 0", "run 1", ...

    Both figures are saved to the current working directory and then shown.
    '''
    titles = {
        "p": "Tuning p for Network Embedding",
        "q": "Tuning q for Network Embedding",
        "walk": "Tuning Random Walk Length for Network Embedding",
        "batch_size": "Tuning Batch-Size for Network Embedding",
        "learning_rate": "Tuning Learning Rate for Network Embedding",
    }
    # An unrecognised flag gets a generic title instead of leaving it unbound.
    title = titles.get(flag, "Tuning " + str(flag) + " for Network Embedding")

    if tune_range is None:
        labels = ["run " + str(i) for i in range(len(train_losses))]
    else:
        labels = [flag + " = " + str(param) for param in tune_range]

    _plot_curves(train_losses, labels, "Train Loss", title, flag+'_train_loss.png')
    _plot_curves(val_mses, labels, "Validation MSE", title, flag+'_val_mse.png')

In [None]:
#ranges for tuning hyperparameters for embedding algorithm
# Each list is the default `tune_range` of the matching *_tune function below.
# The defaults are bound when those functions are defined, so rebinding one of
# these names afterwards does not change an already-defined function's default.
p_range = [1, 2, 5, 10]  # values tried for the node2vec `p` argument
q_range = [1, 2, 5, 10]  # values tried for the node2vec `q` argument
walk_range = [10, 20, 50]  # values tried for `walk_length`
batch_size_range = [32, 64, 128, 256]  # values tried for the loader batch size
learning_rate_range = [0.001, 0.01, 0.1, 1]  # values tried for the optimizer learning rate

#selecting optimal p
def p_tune(flag="p", tune_range=p_range):
    '''
    Train one node2vec model per value of p in `tune_range` and compare the runs.

    The per-epoch train losses and validation MSEs of every run are stacked into
    arrays (one row per value), saved to flag+"_train_losses.npy" and
    flag+"_val_mses.npy" in the working directory, and plotted via `visualization`.

    Parameters
    ----------
    flag : hyperparameter name used for printing, file names and plot labels.
    tune_range : values of p to try.
    '''
    train_losses = []
    val_mses = []
    for param_value in tune_range:
        print("\n\n", flag+" : ", param_value, "\n\n")
        train_loss_list, val_mse_list = predict(p=param_value)
        train_losses.append(train_loss_list)
        val_mses.append(val_mse_list)

    # Persist the curves so the plots can be regenerated without retraining.
    val_mses = np.array(val_mses)
    train_losses = np.array(train_losses)
    np.save(flag+"_val_mses.npy", val_mses)
    np.save(flag+"_train_losses.npy", train_losses)

    # tune_range supplies the legend labels; visualization declares it as required.
    visualization(train_losses, val_mses, flag=flag, tune_range=tune_range)

#selecting optimal q
def q_tune(flag="q", tune_range=q_range):
    '''
    Train one node2vec model per value of q in `tune_range` and compare the runs.

    The per-epoch train losses and validation MSEs of every run are stacked into
    arrays (one row per value), saved to flag+"_train_losses.npy" and
    flag+"_val_mses.npy" in the working directory, and plotted via `visualization`.

    Parameters
    ----------
    flag : hyperparameter name used for printing, file names and plot labels.
    tune_range : values of q to try.
    '''
    train_losses = []
    val_mses = []
    for param_value in tune_range:
        print("\n\n", flag+" : ", param_value, "\n\n")
        # The swept value must go to q; p stays at predict's default.
        train_loss_list, val_mse_list = predict(q=param_value)
        train_losses.append(train_loss_list)
        val_mses.append(val_mse_list)

    # Persist the curves so the plots can be regenerated without retraining.
    val_mses = np.array(val_mses)
    train_losses = np.array(train_losses)
    np.save(flag+"_val_mses.npy", val_mses)
    np.save(flag+"_train_losses.npy", train_losses)

    # tune_range supplies the legend labels; visualization declares it as required.
    visualization(train_losses, val_mses, flag=flag, tune_range=tune_range)
    
#selecting optimal walk length
def walk_tune(flag="walk", tune_range=walk_range):
    '''
    Train one node2vec model per random-walk length in `tune_range` and compare the runs.

    The per-epoch train losses and validation MSEs of every run are stacked into
    arrays (one row per value), saved to flag+"_train_losses.npy" and
    flag+"_val_mses.npy" in the working directory, and plotted via `visualization`.

    Parameters
    ----------
    flag : hyperparameter name used for printing, file names and plot labels.
    tune_range : walk lengths to try.
    '''
    train_losses = []
    val_mses = []
    for param_value in tune_range:
        print("\n\n", flag+" : ", param_value, "\n\n")
        train_loss_list, val_mse_list = predict(walk_length=param_value)
        train_losses.append(train_loss_list)
        val_mses.append(val_mse_list)

    # Persist the curves so the plots can be regenerated without retraining.
    val_mses = np.array(val_mses)
    train_losses = np.array(train_losses)
    np.save(flag+"_val_mses.npy", val_mses)
    np.save(flag+"_train_losses.npy", train_losses)

    # tune_range supplies the legend labels; visualization declares it as required.
    visualization(train_losses, val_mses, flag=flag, tune_range=tune_range)

#selecting optimal batch size
def batch_size_tune(flag="batch_size", tune_range=batch_size_range):
    '''
    Train one node2vec model per batch size in `tune_range` and compare the runs.

    The learning rate is scaled linearly with the batch size (0.01 at batch size 128).
    The curves of all runs are written to flag+"_val_mses.npy" and
    flag+"_train_losses.npy", read back, and handed to `visualization`.
    '''
    val_path = flag+"_val_mses.npy"
    loss_path = flag+"_train_losses.npy"

    # One (train-loss curve, val-MSE curve) pair per batch size, in sweep order.
    histories = []
    for batch_size in tune_range:
        print("\n\n", flag+" : ", batch_size, "\n\n")
        histories.append(predict(batch_size=batch_size, learning_rate = 0.01*(batch_size/128)))

    train_losses = [losses for losses, _ in histories]
    val_mses = [mses for _, mses in histories]

    np.save(val_path, np.array(val_mses))
    np.save(loss_path, np.array(train_losses))

    # Reload from disk so the plots are drawn from exactly what was saved.
    val_mses = np.load(val_path)
    train_losses = np.load(loss_path)
    visualization(train_losses, val_mses, flag=flag, tune_range=tune_range)

#selecting optimal learning rate
def learning_rate_tune(flag="learning_rate", tune_range=learning_rate_range):
    '''
    Train one node2vec model per learning rate in `tune_range` and compare the runs.

    The curves of all runs are written to flag+"_val_mses.npy" and
    flag+"_train_losses.npy", read back, and handed to `visualization`.
    '''
    train_losses, val_mses = [], []
    for lr in tune_range:
        print("\n\n", flag+" : ", lr, "\n\n")
        epoch_losses, epoch_mses = predict(learning_rate=lr)
        train_losses += [epoch_losses]
        val_mses += [epoch_mses]

    # Write the validation curves first, then the train-loss curves.
    for suffix, history in (("_val_mses.npy", val_mses),
                            ("_train_losses.npy", train_losses)):
        np.save(flag+suffix, np.array(history))

    # Reload from disk so the plots are drawn from exactly what was saved.
    val_mses = np.load(flag+"_val_mses.npy")
    train_losses = np.load(flag+"_train_losses.npy")

    visualization(train_losses, val_mses, flag=flag, tune_range=tune_range)

In [None]:
# Start the sweep over p using p_tune's defaults (flag "p", values from p_range).
p_tune()