# Imports 

In [1]:
import pandas as pd 
import numpy as np
import networkx as nx
import scipy.sparse as sp
from sklearn import preprocessing
import matplotlib.pyplot as plt
import csv
import torch
import itertools 

In [2]:
!pip install torch 



# Useful functions

In [2]:
def normalize(mx):
    """Row-normalize a matrix so that every non-empty row sums to 1.

    Parameters
    ----------
    mx : scipy sparse matrix (or 2-D array)
        Matrix whose rows are rescaled.

    Returns
    -------
    The product ``diag(1 / rowsum) @ mx``. Rows whose sum is 0 are left
    as all-zero rows instead of producing inf/NaN.
    """
    # Cast to float first: NumPy refuses to raise integers to a negative
    # power, so an integer-valued adjacency matrix would otherwise fail.
    rowsum = np.asarray(mx.sum(1), dtype=float).flatten()

    # Invert only the non-zero sums; zero rows keep a scaling factor of 0,
    # which avoids the divide-by-zero warning and the inf clean-up step.
    r_inv = np.zeros_like(rowsum)
    nonzero_rows = rowsum != 0
    r_inv[nonzero_rows] = 1.0 / rowsum[nonzero_rows]

    return sp.diags(r_inv).dot(mx)

# Define the function to load the data 

**Building the node-node-label graph**:


This graph is the original graph (constructed from *edges.csv*) with additional edges. The added edges encode the labeling relationships between the common nodes and the label nodes. The label nodes are added by giving them indices from $n+1$ to $n+39$ (one per label, where $n$ is the number of common nodes). Their edges are derived from the labeling relationships listed in the file *group-edges.csv*.

In [10]:
def load_data1(data_name, n_labels=39):
    """Read the node labels of a dataset and start the node-node-label edge file.

    Reads ``<data_name>/group-edges.csv`` (one ``node,label`` pair per line)
    and builds the multi-hot label matrix. As a side effect,
    ``<data_name>/edges.csv`` is copied to
    ``<data_name>/edges_node_node_label.csv``.

    Parameters
    ----------
    data_name : str
        Directory that contains ``edges.csv`` and ``group-edges.csv``.
    n_labels : int, optional
        Number of label columns; labels outside ``1..n_labels`` are ignored.
        Defaults to 39, the value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number of distinct nodes, n_labels)`` where entry
        ``[node - 1, label - 1]`` is 1 when the pair appears in the file.
        Node ids are used directly as row indices, so they are assumed to
        be the contiguous integers ``1..n``.
    """
    print("Loading {} dataset...".format(data_name))
    edges_file = data_name + "/edges.csv"
    node_label_file = data_name + "/group-edges.csv"
    nnlg_file = data_name + "/edges_node_node_label.csv"

    # Parse the (node, label) pairs; blank lines (e.g. a trailing newline)
    # are skipped instead of crashing the tuple unpacking.
    label_raw, nodes = [], []
    with open(node_label_file) as label_handle:
        for line in label_handle:
            if not line.strip():
                continue
            node, label = line.split(",")
            label_raw.append(int(label))
            nodes.append(int(node))

    # Explicit integer dtype keeps the arrays usable as indices even when
    # the file is empty.
    label_raw = np.array(label_raw, dtype=int)
    nodes = np.array(nodes, dtype=int)
    unique_nodes = np.unique(nodes)

    # Multi-hot label matrix, filled in one vectorised assignment.
    labels = np.zeros((unique_nodes.shape[0], n_labels))
    in_range = (label_raw >= 1) & (label_raw <= n_labels)
    labels[nodes[in_range] - 1, label_raw[in_range] - 1] = 1

    # Copy the node-node edges into the node-node-label edge file. Context
    # managers guarantee both handles are closed, and "w" mode creates or
    # truncates the target without the empty-DataFrame workaround.
    with open(edges_file, "r") as source, open(nnlg_file, "w") as target:
        target.writelines(source)

    return labels

In [11]:
# Quick check of load_data1 on the BlogCatalog directory; the returned
# label matrix is displayed as the cell output.
load_data1("BlogCatalog")

Loading BlogCatalog dataset...


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [3]:
def load_data(data_name, n_labels=39):
    """Build the graph matrices of a dataset directory.

    Reads ``<data_name>/edges.csv`` (node-node edges) and
    ``<data_name>/group-edges.csv`` (``node,label`` pairs). As a side
    effect it writes two derived edge lists into the same directory:
    ``edges_node_node_label.csv`` (node-node edges followed by label-node
    edges) and ``edges_label_label_node.csv`` (label-label co-occurrence
    edges followed by label-node edges).

    Node ids are used directly as row indices and label ``l`` is
    represented by the graph node ``n + l`` (``n`` = number of distinct
    nodes), so node ids are assumed to be the contiguous integers ``1..n``.

    Parameters
    ----------
    data_name : str
        Directory that contains ``edges.csv`` and ``group-edges.csv``.
    n_labels : int, optional
        Number of label columns; defaults to 39, the value that was
        previously hard-coded.

    Returns
    -------
    E : scipy.sparse.coo_matrix
        Adjacency of the node-node-label graph, ordered as the distinct
        nodes followed by the distinct label nodes.
    F : scipy.sparse.coo_matrix
        Adjacency of the label-label-node graph, same ordering as ``E``.
    X : scipy.sparse.csr_matrix
        Adjacency of the ``edges.csv`` graph over the distinct nodes.
    C : sparse adjacency (as returned by ``nx.adjacency_matrix``)
        Label co-occurrence graph over the distinct label nodes.
    labels : numpy.ndarray
        Multi-hot matrix of shape ``(n, n_labels)``.
    C_1 : numpy.ndarray
        Dense ``(n_labels, n_labels)`` 0/1 co-occurrence matrix.
    """

    def _adjacency(graph, ordered_nodes):
        # Adjacency matrix of ``graph`` in the given node order. Nodes that
        # have no edge are added first so they yield an all-zero row
        # instead of a "node not in graph" error.
        ordered_nodes = [int(v) for v in ordered_nodes]
        graph.add_nodes_from(ordered_nodes)
        return nx.adjacency_matrix(graph, nodelist=ordered_nodes)

    print("Loading {} dataset...".format(data_name))
    edges_file = data_name + "/edges.csv"
    node_label_file = data_name + "/group-edges.csv"
    nnlg_file = data_name + "/edges_node_node_label.csv"
    llng_file = data_name + "/edges_label_label_node.csv"

    # --- Node labels -----------------------------------------------------
    # Parse the (node, label) pairs, skipping blank lines.
    label_raw, nodes = [], []
    with open(node_label_file) as label_handle:
        for line in label_handle:
            if not line.strip():
                continue
            node, label = line.split(",")
            label_raw.append(int(label))
            nodes.append(int(node))

    label_raw = np.array(label_raw, dtype=int)
    nodes = np.array(nodes, dtype=int)
    unique_nodes = np.unique(nodes)
    n_nodes = unique_nodes.shape[0]

    labels = np.zeros((n_nodes, n_labels))
    in_range = (label_raw >= 1) & (label_raw <= n_labels)
    labels[nodes[in_range] - 1, label_raw[in_range] - 1] = 1

    # --- Node-node-label graph --------------------------------------------
    # Label l becomes graph node n_nodes + l.
    label_nodes = label_raw + n_nodes
    label_ids = np.unique(label_nodes)
    n_n_l_nodes = np.concatenate((unique_nodes, label_ids))

    with open(edges_file, "r") as edges_handle:
        # Make sure every edge line is newline-terminated so the label
        # edges appended below never get glued onto the last edge.
        edge_lines = [
            line if line.endswith("\n") else line + "\n"
            for line in edges_handle
        ]
    label_node_lines = [
        "{},{}\n".format(label_node, node)
        for label_node, node in zip(label_nodes, nodes)
    ]

    # Write the file completely and close it BEFORE parsing it: reading
    # from the handle that was just written would start at end-of-file
    # and yield an empty graph.
    with open(nnlg_file, "w") as nnl_handle:
        nnl_handle.writelines(edge_lines)
        nnl_handle.writelines(label_node_lines)

    nnl_graph = nx.read_edgelist(nnlg_file, delimiter=",", nodetype=int)
    E = sp.coo_matrix(_adjacency(nnl_graph, n_n_l_nodes))

    # --- Feature matrix of the common nodes -------------------------------
    # Parsed from the path again (not from an exhausted file handle).
    node_graph = nx.read_edgelist(edges_file, delimiter=",", nodetype=int)
    X = sp.csr_matrix(_adjacency(node_graph, unique_nodes))

    # --- Label-label-node graph -------------------------------------------
    # Two labels co-occur when at least one node carries both of them.
    C_1 = ((labels.T @ labels) > 0).astype(float)
    np.fill_diagonal(C_1, 0.0)

    # Each unordered label pair is emitted exactly once (upper triangle).
    first, second = np.nonzero(np.triu(C_1, k=1))
    label_pairs = [
        (int(a) + 1 + n_nodes, int(b) + 1 + n_nodes)
        for a, b in zip(first, second)
    ]
    label_label_lines = ["{},{}\n".format(a, b) for a, b in label_pairs]

    with open(llng_file, "w") as lln_handle:
        lln_handle.writelines(label_label_lines)
        lln_handle.writelines(label_node_lines)

    # Co-occurrence adjacency over the label nodes only.
    cooccurrence_graph = nx.Graph()
    cooccurrence_graph.add_edges_from(label_pairs)
    C = _adjacency(cooccurrence_graph, label_ids)

    # Full label-label-node adjacency (co-occurrence + label-node edges).
    l_l_n_nodes = np.concatenate((unique_nodes, label_ids))
    lln_graph = nx.read_edgelist(llng_file, delimiter=",", nodetype=int)
    F = sp.coo_matrix(_adjacency(lln_graph, l_l_n_nodes))

    return E, F, X, C, labels, C_1


# TODO: load_data should eventually return Y_star, X_star, C_tilde, F, E, A_tilde (it currently returns E, F, X, C, labels, C_1).

In [251]:
# Build all matrices for the BlogCatalog directory.
# NOTE: a later cell runs `import torch.nn.functional as F`, which rebinds
# the name F assigned here.
E, F, X, C, labels, C_1 = load_data("BlogCatalog")

Loading BlogCatalog dataset...


In [4]:
# `from __future__` imports must be the first statements of the cell;
# placed after another import they make the whole cell a SyntaxError.
from __future__ import division
from __future__ import print_function

import argparse
import time

import numpy as np
import torch
# NOTE: this rebinds the name F, which the load_data cell also uses for the
# label-label-node matrix.
import torch.nn.functional as F
import torch.optim as optim

from Code.models import High_Layer, Low_Layer

In [7]:
# Training hyper-parameters, exposed through argparse so the same defaults
# can be overridden from a command line.
parser = argparse.ArgumentParser()
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='Disables CUDA training.')
parser.add_argument('--fastmode', action='store_true', default=False,
                    help='Validate during training pass.')
parser.add_argument('--seed', type=int, default=42, help='Random seed.')
parser.add_argument('--epochs', type=int, default=300,
                    help='Number of epochs to train.')
parser.add_argument('--lr', type=float, default=0.02,
                    help='Initial learning rate.')
parser.add_argument('--weight_decay', type=float, default=0,
                    help='Weight decay (L2 loss on parameters).')
parser.add_argument('--hidden', type=int, default=400,
                    help='Number of hidden units.')
parser.add_argument('--dropout', type=float, default=0.5,
                    help='Dropout rate (1 - keep probability).')
# Kept so that `args.f` still exists and the kernel's `-f <connection file>`
# argument is consumed.
parser.add_argument('-f')

# parse_known_args ignores any other argument the notebook kernel was
# launched with; parse_args would abort the cell with SystemExit on them.
args, _unknown_args = parser.parse_known_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()

# Seed every random number generator used below for reproducibility.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

In [6]:
# high_layer = High_Layer()
# optimizer = optim.SGD(high_layer.parameter(), lr = args.lr, weight_decay = args.weight_decay)

In [1]:
def threshold(output):
    """Binarise ``output`` in place with a cut-off of 0.5.

    Entries strictly greater than 0.5 become 1 and entries less than or
    equal to 0.5 become 0. The argument itself is modified and returned.
    """
    # Both masks are taken before any write; the result is the same as
    # applying the two assignments one after the other.
    above_cutoff = output > 0.5
    at_or_below_cutoff = output <= 0.5
    output[above_cutoff] = 1
    output[at_or_below_cutoff] = 0
    return output

In [None]:
def train_high_layer(Y_star):
    """Run one SGD step of a freshly built ``High_Layer`` on ``Y_star``.

    Relies on notebook-level names: ``C_tilde``, ``F`` (passed to the model),
    ``labels``, ``idx_train``, ``idx_val`` and ``args``.

    NOTE(review): the model is constructed inside this function, so its
    weights do not persist between calls; confirm this is intended.
    NOTE(review): ``F`` is handed to the model as a matrix, but the import
    cell binds ``F`` to ``torch.nn.functional``; make sure the matrix is the
    object bound to ``F`` when this runs.

    Parameters
    ----------
    Y_star : model input forwarded unchanged to ``High_Layer``.

    Returns
    -------
    (Y_new, loss_train, loss_val) : the model output, the cross-entropy on
    ``idx_train`` (already back-propagated) and the cross-entropy on
    ``idx_val``, both computed from the same forward pass.
    """
    # Local alias so the loss functions do not depend on what the
    # notebook-level name ``F`` currently refers to.
    import torch.nn.functional as functional

    high_layer = High_Layer(nfeat = C_tilde.shape[0],
                            nhid = args.hidden,
                            nclass = C_tilde.shape[0],
                            dropout = args.dropout)
    # The optimizer is now created under the same name it is used with, and
    # built from `parameters()` (the call used by global_train).
    optimizer_highLayer = optim.SGD(high_layer.parameters(), lr = args.lr, weight_decay = args.weight_decay)
    high_layer.train()
    optimizer_highLayer.zero_grad()
    Y_new = high_layer(Y_star, F, C_tilde)

    # Train loss (cross-entropy) on the model output; the original code
    # referenced an undefined name `output` here.
    loss_train = functional.cross_entropy(Y_new[idx_train], labels[idx_train])
    loss_train.backward()
    optimizer_highLayer.step()

    loss_val = functional.cross_entropy(Y_new[idx_val], labels[idx_val])

    return Y_new, loss_train, loss_val


def train_low_layer(X_star):
    """Run one SGD step of a freshly built ``Low_Layer`` on ``X_star``.

    Relies on notebook-level names: ``A_tilde``, ``C_tilde``, ``E``,
    ``labels``, ``idx_train``, ``idx_val`` and ``args``.

    NOTE(review): the model is constructed inside this function, so its
    weights do not persist between calls; confirm this is intended.

    Parameters
    ----------
    X_star : model input forwarded unchanged to ``Low_Layer``.

    Returns
    -------
    (Y_new, loss_train, loss_val) : the model output, the summed per-label
    binary cross-entropy on ``idx_train`` (already back-propagated) and the
    same quantity on ``idx_val``, both computed from the same forward pass.
    """
    # Local alias so the loss functions do not depend on what the
    # notebook-level name ``F`` currently refers to.
    import torch.nn.functional as functional

    def _summed_bce(rows):
        # One binary cross-entropy term per label column, summed with torch
        # so the result stays a differentiable tensor (np.sum cannot handle
        # tensors that require grad).
        predicted, target = Y_new[rows], labels[rows]
        return torch.stack([
            functional.binary_cross_entropy_with_logits(predicted[:, i], target[:, i])
            for i in range(C_tilde.shape[0])
        ]).sum()

    low_layer = Low_Layer(nfeat = A_tilde.shape[0],
                          nhid = args.hidden,
                          nclass = C_tilde.shape[0],
                          dropout = args.dropout)
    # The optimizer is now created under the same name it is used with, and
    # built from `parameters()` (the call used by global_train).
    optimizer_lowLayer = optim.SGD(low_layer.parameters(), lr = args.lr, weight_decay = args.weight_decay)
    low_layer.train()
    optimizer_lowLayer.zero_grad()
    Y_new = low_layer(X_star, E, A_tilde)

    # Train loss (binary cross-entropy); the original code referenced an
    # undefined name `output` here.
    loss_train = _summed_bce(idx_train)
    loss_train.backward()
    optimizer_lowLayer.step()

    loss_val = _summed_bce(idx_val)

    return Y_new, loss_train, loss_val
    

In [None]:
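# Global training loop: runs the high and low layers for `epochs` iterations.
# M and N control how often X_star and Y_star are rebuilt from the layer outputs.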
def global_train(epochs, M, N):
    """Alternate the high-layer and low-layer training steps for ``epochs`` epochs.

    The initial inputs are read from the notebook-level ``Y_star`` and
    ``X_star``; those globals are not modified. Each epoch calls
    ``train_high_layer`` and ``train_low_layer``, each of which performs its
    own backward pass and optimizer step, then prints the summed losses.

    Parameters
    ----------
    epochs : int
        Number of iterations of the loop.
    M : int
        The low-layer input is rebuilt from the latest outputs after every
        M-th epoch.
    N : int
        The high-layer input is rebuilt from the latest outputs after every
        N-th epoch.

    NOTE(review): the original tests were ``if i%M`` / ``if i%N``, which fire
    on every epoch that is NOT a multiple; "every M-th / N-th epoch" is the
    presumed intent - confirm.
    NOTE(review): the original also attempted a joint optimizer step over
    ``high_layer`` and ``low_layer`` using ``global_loss_train``. None of
    these names exist in this scope and both losses have already been
    back-propagated inside the per-layer functions, so that step could never
    run. It is omitted here; joint optimisation needs the models to be
    created once and shared.
    """
    # Work on local copies: assigning to X_star / Y_star inside the function
    # made them local names and raised UnboundLocalError on first use.
    low_input = X_star
    high_input = Y_star

    for i in range(epochs):
        Y_new, loss_train_hl, loss_val_hl = train_high_layer(high_input)
        X_new, loss_train_ll, loss_val_ll = train_low_layer(low_input)

        # Layer outputs are torch tensors attached to the autograd graph, so
        # they are detached and concatenated with torch rather than NumPy.
        if (i + 1) % M == 0:
            low_input = torch.cat((Y_new.detach(), X_new.detach()), dim=1)

        if (i + 1) % N == 0:
            high_input = torch.cat((X_new.detach(), Y_new.detach()), dim=1)

        # Combined losses, used for reporting only.
        loss_train = loss_train_hl + loss_train_ll
        loss_val = loss_val_hl + loss_val_ll

        print('Epoch: {:04d}'.format(i+1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'loss_val: {:.4f}'.format(loss_val.item()))

