In [None]:
from google.colab import drive

# Mount Google Drive at the path the data-loading cells read from.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

In [None]:
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

# Helper function for visualization.
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE


[K     |████████████████████████████████| 7.9 MB 4.5 MB/s 
[K     |████████████████████████████████| 3.5 MB 4.0 MB/s 
[K     |████████████████████████████████| 407 kB 4.1 MB/s 
[K     |████████████████████████████████| 45 kB 2.9 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


# Importing the data
Using networkx, we load the edge list, combined with features obtained from encoding, into the right tensors.

In [None]:
import networkx as nx
# Load the co-authorship edge list as an undirected graph whose node ids are ints.
g = nx.read_edgelist("drive/MyDrive/Hindex/coauthorship.edgelist",
                     create_using=nx.Graph(), nodetype=int)
# `nx.info` was removed in networkx 3.0; `str(g)` gives the same one-line
# "Graph with N nodes and M edges" summary and works on old and new versions.
str(g)

'Graph with 217801 nodes and 1718164 edges'

Retrieve the target attribute (the h-index) from `train.csv`. Nodes that are not listed there make up the test set; they are later given an arbitrary h-index of 0.

In [None]:
import csv

# Read the labelled rows of train.csv into {first column (str): second column (float)}.
# newline='' is what the csv module documents for files handed to csv.reader.
with open('drive/MyDrive/Hindex/train.csv', mode='r', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the first (header) row through the reader itself
    nodes_attr = {rows[0]: float(rows[1]) for rows in reader if rows}  # string:float dict

# NOTE(review): the keys above are strings while the graph was loaded with
# nodetype=int, so this call appears to match no node and attach nothing
# (the Data object printed later has no extra node attribute). The labels are
# consumed from `nodes_attr` directly when `data.y` is built -- confirm whether
# this call is still wanted.
nx.set_node_attributes(g, nodes_attr)

# Number of labelled nodes (dict keys are already unique).
len(nodes_attr)

174241

## Preprocessing


In [None]:
import torch
import scipy.sparse
from torch import Tensor
from torch.utils.dlpack import to_dlpack, from_dlpack
import torch_geometric.data

# Convert the networkx graph into a PyG data object; features, masks and labels
# are attached to it in later cells.
data=torch_geometric.utils.from_networkx(g)

Because the data at hand is huge, we cannot use the whole labeled set as training data. We split it into a training set and a held-out validation set, which will help us fine-tune the hyperparameters.

In [None]:
def split_train_val(nodes, train_pct, random_state=None):
    """
    Randomly split the annotated nodes into a training part and a held-out part.

    Args:
      nodes -- dictionary of annotated nodes (node id string : float label);
               in this notebook they are about 80% of all graph nodes
      train_pct -- fraction in [0, 1] of the annotated nodes to use for training
      random_state -- optional seed (or numpy Generator) for a reproducible
                      split; None draws a different split on every call

    Returns:
      dict mapping every key of `nodes` to True (training) or False (held-out).
      Training ids are inserted first, followed by the held-out ids.

    Raises:
      ValueError -- if train_pct lies outside [0, 1]
    """
    if not 0 <= train_pct <= 1:
        raise ValueError(f"train_pct must be in [0, 1], got {train_pct!r}")

    node_ids = list(nodes)
    # Sizing rule kept from the original: the held-out part gets the truncated
    # share and the training part receives whatever remains.
    n_held_out = int(len(node_ids) * (1 - train_pct))
    n_train = len(node_ids) - n_held_out

    # Shuffle positions rather than the ids themselves so any key type works.
    order = np.random.default_rng(random_state).permutation(len(node_ids))

    dic_id_set = {node_ids[pos]: True for pos in order[:n_train]}
    dic_id_set.update((node_ids[pos], False) for pos in order[n_train:])
    return dic_id_set

## Auth

In [None]:
import csv
# Cache the node features on Drive so they can be reloaded from CSV.
# NOTE(review): `features_array` is not created in any cell visible above this
# one. Opening the file in "w+" mode truncates it immediately, so running this
# cell in a fresh session would wipe the cache before failing with NameError.
# Only write when the features really exist in the current session.
if "features_array" in globals():
    with open("/content/drive/MyDrive/Hindex/features_array.csv","w+", newline='') as my_csv:
        csvWriter = csv.writer(my_csv,delimiter=';')
        csvWriter.writerows(features_array)
else:
    print("features_array is not defined in this session; leaving the cached CSV untouched.")
  

In [None]:
# Reload the cached features: semicolon-separated values, no header row.
df = pd.read_csv(
    "/content/drive/MyDrive/Hindex/features_array.csv",
    sep=";",
    header=None,
)

# Plain nested Python lists, one inner list per CSV row.
features_array = df.to_numpy().tolist()

In [None]:
# Fraction of the labelled nodes used for training. This is the value the split
# actually used before (0.1); the previous 0.2 constant was never read.
training_pct = 0.1

data.x = torch.tensor(features_array)
# Row i of the features is paired with the i-th node yielded by `g`
# (assumed -- TODO confirm the CSV was written in that node order).
assert data.x.shape[0] == g.number_of_nodes(), "feature rows must match the number of graph nodes"

dic_id_set = split_train_val(nodes_attr, training_pct)

# One pass over the nodes, in graph iteration order, builds both masks and the labels.
train_flags, held_out_flags, labels = [], [], []
for node in g:
    key = str(node)  # labels are keyed by the string form of the node id
    is_labelled = key in nodes_attr
    in_train = is_labelled and bool(dic_id_set[key])
    train_flags.append(in_train)
    held_out_flags.append(is_labelled and not in_train)
    labels.append(nodes_attr[key] if is_labelled else 0)  # unlabelled nodes get a placeholder 0

data.train_mask = torch.tensor(train_flags)  # labelled nodes used for training
data.test_mask = torch.tensor(held_out_flags)  # labelled nodes held out from training
data.y = torch.tensor(np.array(labels, dtype=np.float32))  # node labels

In [None]:
def _show_head(caption, values):
    """Print a caption, the given slice of values and a blank separator line."""
    print(caption, values, "\n")


# Shape and leading entries of every tensor attached to `data`.
print(f"Shape of the nodes features: {data.x.shape}")
_show_head("First 2 values: ", data.x[:2])

print(f"Shape of the train mask tensor: {data.train_mask.shape}")
_show_head("First ten values: ", data.train_mask[:10])

print(f"Shape of the test mask tensor: {data.test_mask.shape}")
_show_head("First ten values: ", data.test_mask[:10])

print(f"Shape of the nodes attributes (what to predict): {data.y.shape}")
_show_head("First ten values: ", data.y[:10])

Shape of the nodes features: torch.Size([217801, 50])
First 2 values:  tensor([[ 0.2918, -0.2287, -0.1525, -0.1800,  0.2448, -0.2673, -0.1754,  0.1103,
          0.2178, -0.0363,  0.0883, -0.0080,  0.1367, -0.1285, -0.4137,  0.1007,
          0.0897,  0.3045,  0.0669,  0.2519,  0.2660, -0.0081, -0.3811, -0.1297,
          0.5370, -0.0743, -0.4923, -0.0941,  0.0967,  0.0494, -0.2042,  0.3060,
         -0.0513,  0.3627, -0.0247,  0.3349, -0.5054,  0.1124,  0.6030,  0.1095,
          0.1047, -0.4054, -0.2081,  0.3487,  0.3005, -0.3497, -0.1689, -0.0650,
          0.1596,  0.1587],
        [ 0.4811, -0.9494,  0.1138, -0.7336,  1.0302, -1.5913, -1.0252, -0.3283,
          0.4168,  0.6787, -0.1083, -0.7383,  0.5772,  0.1212, -1.7845,  0.3229,
          0.5518,  1.8893,  0.1943,  1.0873,  1.0742, -0.7111, -1.3113,  0.3635,
          2.3933, -0.2809, -1.6479, -0.0770,  0.1659,  0.5000, -0.4000, -0.0677,
          0.1566,  0.0181, -0.4468,  1.0730, -1.3070,  0.5717,  1.3567, -0.1255,
          

In [None]:
# Overall description of the assembled data object, followed by summary statistics.
print(data)

graph_summary = (
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Number of training nodes: {data.train_mask.sum()}',
    f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
)
for summary_line in graph_summary:
    print(summary_line)

Data(edge_index=[2, 3436328], num_nodes=217801, x=[217801, 50], train_mask=[217801], test_mask=[217801], y=[217801])
Number of nodes: 217801
Number of edges: 3436328
Average node degree: 15.78
Number of training nodes: 17425
Training node label rate: 0.08
Has isolated nodes: False
Has self-loops: False
Is undirected: True


## Multi-layer Perceptron Network (MLP)
A simple MLP that operates on input node features, not on the graph structure. 

In [None]:
import torch
from torch.nn import Linear
import torch.nn.functional as F

# Input width of the models below: the length of the first feature row.
num_features = len(features_array[0])

class MLP(torch.nn.Module):
    """Two-layer perceptron that predicts one scalar per node from its features alone.

    Args:
        hidden_channels: width of the hidden layer.
        in_channels: size of each input feature vector; when omitted, the
            notebook-level `num_features` is used.
        dropout: dropout probability applied after the hidden activation
            (only active in training mode).
    """

    def __init__(self, hidden_channels, in_channels=None, dropout=0.5):
        super().__init__()
        if in_channels is None:
            in_channels = num_features  # fall back to the notebook-level feature width
        # Reseed torch's global RNG so the initial weights are identical on every construction.
        torch.manual_seed(12345)
        self.dropout = dropout
        self.lin_layer_1 = Linear(in_channels, hidden_channels)
        self.lin_layer_2 = Linear(hidden_channels, 1)

    def forward(self, x):
        """Map features of shape [num_nodes, in_channels] to predictions of shape [num_nodes, 1]."""
        x = self.lin_layer_1(x)
        x = x.relu()
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.lin_layer_2(x)
        return x

# Build a small instance and print its layer layout.
model = MLP(hidden_channels=16)
print(model)

MLP(
  (lin_layer_1): Linear(in_features=50, out_features=16, bias=True)
  (lin_layer_2): Linear(in_features=16, out_features=1, bias=True)
)


In [None]:
model = MLP(hidden_channels=128)
criterion = torch.nn.MSELoss()  # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)  # Define optimizer.

def train():
    """Run one full-batch optimisation step of the MLP and return the training loss.

    Returns:
        The MSE loss over the training nodes, as a 0-dim tensor.
    """
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    # The model emits [num_nodes, 1] while data.y is [num_nodes]. Without dropping
    # the trailing axis MSELoss broadcasts the pair to [n, n] (PyTorch warns about
    # it) and the loss no longer compares each prediction with its own label.
    out = model(data.x).squeeze(-1)  # Perform a single forward pass.
    loss = criterion(out[data.train_mask], data.y[data.train_mask])  # Compute the loss solely based on the training nodes.
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

for epoch in range(1, 201):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

tensor([[ 0.0716],
        [-0.0133],
        [-0.2232],
        ...,
        [ 0.1683],
        [10.7653],
        [ 0.1511]], grad_fn=<AddmmBackward0>)


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 001, Loss: 340.4165
tensor([[ 1.3083e-02],
        [-7.0432e-01],
        [-1.2452e+00],
        ...,
        [-1.9554e-01],
        [-6.5808e+01],
        [-2.3271e+00]], grad_fn=<AddmmBackward0>)
Epoch: 002, Loss: 791.9222
tensor([[ 0.1777],
        [ 0.2176],
        [-0.2927],
        ...,
        [-0.0988],
        [ 1.9274],
        [ 0.2347]], grad_fn=<AddmmBackward0>)
Epoch: 003, Loss: 359.0154
tensor([[ 0.1959],
        [ 0.5064],
        [ 0.7297],
        ...,
        [ 0.2209],
        [41.5082],
        [ 2.1801]], grad_fn=<AddmmBackward0>)
Epoch: 004, Loss: 324.9524
tensor([[ 0.2908],
        [ 0.8592],
        [ 0.6202],
        ...,
        [ 0.5401],
        [15.7827],
        [ 2.6244]], grad_fn=<AddmmBackward0>)
Epoch: 005, Loss: 555.5103
tensor([[ 0.2305],
        [ 0.3310],
        [ 0.5949],
        ...,
        [-0.2329],
        [ 5.9315],
        [ 1.1048]], grad_fn=<AddmmBackward0>)
Epoch: 006, Loss: 368.3228
tensor([[  0.2239],
        [  0.2402],
    

## Graph Neural Network (GNN)

We can easily convert our MLP to a GNN by swapping the `torch.nn.Linear` layers with PyG's GNN operators.

In [None]:
from torch_geometric.nn import GCNConv


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network that predicts one scalar per node.

    Args:
        hidden_channels: width of the hidden graph-convolution layer.
        in_channels: size of each input feature vector; when omitted, the
            notebook-level `num_features` is used.
        dropout: dropout probability applied after the hidden activation
            (only active in training mode).
    """

    def __init__(self, hidden_channels, in_channels=None, dropout=0.5):
        super().__init__()
        if in_channels is None:
            in_channels = num_features  # fall back to the notebook-level feature width
        # Reseed torch's global RNG so the initial weights are identical on every construction.
        torch.manual_seed(1234567)
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, 1)

    def forward(self, x, edge_index):
        """Apply both graph convolutions over `edge_index`; the last layer has a single output channel."""
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x

# Build an instance and print its layer layout.
model = GCN(hidden_channels=128)
print(model)

GCN(
  (conv1): GCNConv(50, 128)
  (conv2): GCNConv(128, 1)
)


In [None]:
model = GCN(hidden_channels=128)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.MSELoss()

def train():
    """Run one full-batch optimisation step of the GCN and return the training loss.

    Returns:
        The MSE loss over the training nodes, as a 0-dim tensor.
    """
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    # The model emits [num_nodes, 1] while data.y is [num_nodes]. Without dropping
    # the trailing axis MSELoss broadcasts the pair to [n, n] (PyTorch warns about
    # it) and the loss no longer compares each prediction with its own label.
    out = model(data.x, data.edge_index).squeeze(-1)  # Perform a single forward pass.
    loss = criterion(out[data.train_mask], data.y[data.train_mask])  # Compute the loss solely based on the training nodes.
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

for epoch in range(1, 101):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 001, Loss: 1323.5835
Epoch: 002, Loss: 2849.7065
Epoch: 003, Loss: 536.1916
Epoch: 004, Loss: 1004.7610
Epoch: 005, Loss: 2706.7317
Epoch: 006, Loss: 1573.7823
Epoch: 007, Loss: 3443.7905
Epoch: 008, Loss: 3125.0439
Epoch: 009, Loss: 1248.5557
Epoch: 010, Loss: 1056.4447
Epoch: 011, Loss: 4458.5669
Epoch: 012, Loss: 1490.4191
Epoch: 013, Loss: 1379.6553
Epoch: 014, Loss: 1014.0363
Epoch: 015, Loss: 769.5665
Epoch: 016, Loss: 2062.3748
Epoch: 017, Loss: 838.6727
Epoch: 018, Loss: 873.5323
Epoch: 019, Loss: 383.8441
Epoch: 020, Loss: 753.3388
Epoch: 021, Loss: 1534.9458
Epoch: 022, Loss: 992.3315
Epoch: 023, Loss: 698.8753
Epoch: 024, Loss: 1265.1472
Epoch: 025, Loss: 647.7196
Epoch: 026, Loss: 587.6000
Epoch: 027, Loss: 631.4578
Epoch: 028, Loss: 743.4376
Epoch: 029, Loss: 381.4366
Epoch: 030, Loss: 337.9151
Epoch: 031, Loss: 441.8435
Epoch: 032, Loss: 345.7970
Epoch: 033, Loss: 436.6377
Epoch: 034, Loss: 378.3740
Epoch: 035, Loss: 568.4542
Epoch: 036, Loss: 696.7635
Epoch: 037, 