# h-Index Prediction with GCNs
This notebook is our final submission for the X-INF554 Data Challenge at École Polytechnique. The aim was to predict each author's h-Index given:


*   A co-authorship graph between authors
*   A list of at most five abstracts for each author

Each author is a node in the graph, described by a feature vector built from:

*   Aggregate abstract vectors (e.g. the mean BERT embedding of the author's abstracts, doc2vec)
*   Graph-based features

Our approach achieved an MSE of **45.29** on the private leaderboard, placing us 5th out of 63 teams. For reference, the first-placed team achieved 42.66, and the mean score was 81.80 (std 35.33).




In [None]:
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split

In [None]:
# Read the "author:paper-paper-..." file and reshape it to one row per (author, paper) pair.
author_papers = (
    pd.read_csv("author_papers.txt.zip", delimiter=':', names=['author', 'paper'])
    .assign(paper=lambda frame: frame['paper'].apply(lambda ids: ids.split('-')))
    .explode('paper')
)
author_papers

Unnamed: 0,author,paper
0,1036332,1510273386
0,1036332,1827736641
0,1036332,1588673897
0,1036332,2252711322
0,1036332,2123653597
...,...,...
217800,2908499439,2081432213
217800,2908499439,2070621672
217800,2908499439,2079679191
217800,2908499439,32110345


In [None]:
def _load_author_vectors(path):
  """Load pickled per-author aggregate vectors, keeping only the mean embedding (renamed `mean_sbert`)."""
  # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
  frame = pd.read_pickle(path)
  frame = frame.drop(['max_embedding', 'first_embedding'], axis=1)
  return frame.rename(columns={'mean_embedding': 'mean_sbert'})

train = _load_author_vectors("train_author_agg_vectors.pkl")
test = _load_author_vectors("test_author_agg_vectors.pkl")
train

Unnamed: 0,author,hindex,n_papers,mean_sbert
0,1964267543,4.0,5.0,"[-0.052135132, 0.022680586, 0.041294187, -0.05..."
1,2153592714,13.0,5.0,"[-0.02547187, -0.02848028, -0.014876259, -0.04..."
2,217158525,8.0,5.0,"[-0.005974621, 0.006903825, -0.017989326, -0.0..."
3,2123103677,11.0,3.0,"[-0.033690836, 0.03599136, -0.052308355, 0.044..."
4,2067710487,3.0,2.0,"[0.0011541683, -0.09693919, -0.035319697, 0.01..."
...,...,...,...,...
174236,2225897966,1.0,1.0,"[-0.010317355, 0.07318564, -0.0676425, -0.0559..."
174237,2612161910,1.0,1.0,"[-0.065617844, 0.10500442, -0.09921767, -0.027..."
174238,2575614996,1.0,1.0,"[-0.03478216, -0.022265393, 0.0045300666, 0.04..."
174239,2078153944,1.0,2.0,"[0.0136339385, -0.05201685, -0.0034561865, 0.0..."


In [None]:
# Attach one pre-computed feature vector per author, loaded from a .npy array
# (graph / node2vec / doc2vec features, judging by the file name).
# NOTE(review): wrapping the rows in a pd.Series makes the assignment align on index labels,
# so this presumably relies on train/test still having their default 0..n-1 index and on the
# arrays being stored in the same row order as the frames — confirm.
train["extra"] = pd.Series(list(np.load("train_graph_n2v_d2v.npy")))
test["extra"] = pd.Series(list(np.load("test_graph_n2v_d2v.npy")))
test

Unnamed: 0,author,n_papers,mean_sbert,extra
0,915630815,5.0,"[-0.07671193, 0.06058638, 0.017426852, -0.0379...","[1.0, 1.0, 0.0, 10.0, 10.0, 0.0, 1.0, 1.698462..."
1,1236455448,5.0,"[-0.011177505, 0.008547784, 0.0030871811, -0.0...","[5.0, 5.0, 1.0, 4.2, 4.4, 1.0, 3.0, 5.84279909..."
2,2694593333,4.0,"[0.006413276, -0.009554359, -0.043468893, -0.0...","[16.0, 16.0, 1.0, 10.75, 11.5625, 1.0, 7.0, 1...."
3,2137926699,2.0,"[-0.0077553736, 0.039409973, -0.07448004, 0.01...","[1.0, 1.0, 0.0, 26.0, 27.0, 0.0, 1.0, 1.182820..."
4,2883694285,2.0,"[-0.09723279, -0.020298773, -0.0020541456, 0.0...","[3.0, 4.0, 1.0, 10.0, 8.75, 1.0, 3.0, 2.742900..."
...,...,...,...,...
43555,2145559725,3.0,"[-0.01877177, -0.031387135, -0.06260405, 0.023...","[2.0, 2.0, 0.0, 13.0, 13.5, 0.0, 2.0, 1.636933..."
43556,2168342616,5.0,"[-0.0007464085, -0.04247803, 0.028256837, 0.00...","[3.0, 8.0, 2.0, 11.0, 14.25, 3.5, 3.0, 2.57296..."
43557,2162797290,5.0,"[-0.05684532, -0.026625771, 0.016216638, 0.060...","[6.0, 9.0, 1.0, 91.66666666666667, 96.22222222..."
43558,294576894,5.0,"[-0.06492782, -0.03364247, -0.0504133, -0.0294...","[7.0, 7.0, 0.0, 4.571428571428571, 5.0, 0.0, 4..."


In [None]:
# Attach a second pre-computed embedding per author (mean all-MiniLM-L12-v2 embedding, judging by the file name).
# NOTE(review): same index-alignment assumption as for the `extra` column — the Series is matched
# to the frame by index label, not by position.
# NOTE(review): `extra_bert` is not included in the feature matrix X built later in this notebook.
train['extra_bert'] = pd.Series(list(np.load("embedding_all-MiniLM-L12-v2_mean_train.npy")))
test['extra_bert'] = pd.Series(list(np.load("embedding_all-MiniLM-L12-v2_mean_test.npy")))
train

Unnamed: 0,author,hindex,n_papers,mean_sbert,extra,extra_bert
0,1964267543,4.0,5.0,"[-0.052135132, 0.022680586, 0.041294187, -0.05...","[5.0, 6.0, 1.0, 22.8, 22.666666666666668, 1.0,...","[-0.05630162, 0.09752295, -0.1141102, -0.13589..."
1,2153592714,13.0,5.0,"[-0.02547187, -0.02848028, -0.014876259, -0.04...","[2.0, 6.0, 2.0, 10.0, 18.333333333333332, 2.5,...","[0.04320468, 0.14857285, -0.16441649, -0.21340..."
2,217158525,8.0,5.0,"[-0.005974621, 0.006903825, -0.017989326, -0.0...","[2.0, 3.0, 0.0, 3.5, 7.0, 0.0, 2.0, 3.42108641...","[0.00582764, 0.09294338, -0.15769197, -0.24381..."
3,2123103677,11.0,3.0,"[-0.033690836, 0.03599136, -0.052308355, 0.044...","[7.0, 7.0, 1.0, 15.714285714285714, 16.4285714...","[0.01378435, 0.07472001, -0.04285565, -0.10068..."
4,2067710487,3.0,2.0,"[0.0011541683, -0.09693919, -0.035319697, 0.01...","[2.0, 2.0, 0.0, 3.5, 3.5, 0.0, 1.0, 4.25552092...","[-0.01707689, 0.04883542, -0.02549759, -0.0859..."
...,...,...,...,...,...,...
174236,2225897966,1.0,1.0,"[-0.010317355, 0.07318564, -0.0676425, -0.0559...","[3.0, 3.0, 0.0, 12.666666666666666, 12.6666666...","[0.01870065, 0.03190672, -0.01903249, -0.02287..."
174237,2612161910,1.0,1.0,"[-0.065617844, 0.10500442, -0.09921767, -0.027...","[1.0, 1.0, 0.0, 6.0, 6.0, 0.0, 1.0, 1.68192524...","[0.00334274, 0.04985746, -0.04555392, -0.03742..."
174238,2575614996,1.0,1.0,"[-0.03478216, -0.022265393, 0.0045300666, 0.04...","[1.0, 1.0, 0.0, 13.0, 13.0, 0.0, 1.0, 1.284886...","[-0.00639988, 0.00703147, -0.01361149, -0.0672..."
174239,2078153944,1.0,2.0,"[0.0136339385, -0.05201685, -0.0034561865, 0.0...","[4.0, 4.0, 0.0, 6.75, 7.0, 0.0, 4.0, 4.3931628...","[0.02069795, 0.01978348, -0.03274036, -0.11306..."


In [None]:
# Load the co-authorship graph from a space-delimited edge list; node labels are parsed as ints.
graph = nx.read_edgelist('coauthorship.edgelist', delimiter=' ', nodetype=int)


In [None]:
# Display (number of edges, number of nodes) of the co-authorship graph.
graph.number_of_edges(), graph.number_of_nodes(), 

(1718164, 217801)

In [None]:
from tqdm import tqdm
# dict: node -> core number of that node in the co-authorship graph
core_number = nx.core_number(graph) # dict with node_number
# dict: node -> degree (number of co-authors); iterates over the same node keys as core_number
n_neighbours = {node: graph.degree(node) for node in tqdm(core_number.keys())}

100%|██████████| 217801/217801 [00:00<00:00, 331454.45it/s]


In [None]:
# Co-author lists: the nodes adjacent to each author in the co-authorship graph.
train['neighbours'] = train.author.apply(lambda x : list(graph.neighbors(x)))
test['neighbours'] = test.author.apply(lambda x : list(graph.neighbors(x)))

# Lookup table author id -> h-index, built from every labelled (train) author.
mapdict = train[['author', 'hindex']].set_index('author').to_dict()['hindex']

def neighbours_to_index(neighbours, hindex_by_author=None):
  """Return the mean known h-index over a list of neighbouring authors.

  Args:
    neighbours: iterable of author ids.
    hindex_by_author: optional mapping author id -> h-index. When omitted, the
      notebook-level `mapdict` is used, so `Series.apply(neighbours_to_index)`
      keeps working unchanged.

  Returns:
    The arithmetic mean of the h-indices of the neighbours present in the
    mapping, or None when no neighbour has a known h-index.
  """
  # Compare against None (not truthiness) so an explicitly passed empty mapping is respected.
  if hindex_by_author is None:
    hindex_by_author = mapdict

  known = [hindex_by_author[n] for n in neighbours if n in hindex_by_author]

  if not known:
    return None

  return sum(known) / len(known)


# Mean h-index of each author's labelled co-authors; missing when no neighbour has a known h-index.
# NOTE(review): `mapdict` covers every training author, including those that are later held out
# for validation, so validation labels feed into this feature — confirm this is acceptable.
train['mean_hindex_neighbours'] = train.neighbours.apply(neighbours_to_index)
test['mean_hindex_neighbours'] = test.neighbours.apply(neighbours_to_index)

train

Unnamed: 0,author,hindex,n_papers,mean_sbert,extra,extra_bert,neighbours,mean_hindex_neighbours
0,1964267543,4.0,5.0,"[-0.052135132, 0.022680586, 0.041294187, -0.05...","[5.0, 6.0, 1.0, 22.8, 22.666666666666668, 1.0,...","[-0.05630162, 0.09752295, -0.1141102, -0.13589...","[307593211, 2383048336, 1643434777, 834507041,...",21.800000
1,2153592714,13.0,5.0,"[-0.02547187, -0.02848028, -0.014876259, -0.04...","[2.0, 6.0, 2.0, 10.0, 18.333333333333332, 2.5,...","[0.04320468, 0.14857285, -0.16441649, -0.21340...","[1233913860, 2124461921]",20.000000
2,217158525,8.0,5.0,"[-0.005974621, 0.006903825, -0.017989326, -0.0...","[2.0, 3.0, 0.0, 3.5, 7.0, 0.0, 2.0, 3.42108641...","[0.00582764, 0.09294338, -0.15769197, -0.24381...","[2502610808, 2261647917]",2.000000
3,2123103677,11.0,3.0,"[-0.033690836, 0.03599136, -0.052308355, 0.044...","[7.0, 7.0, 1.0, 15.714285714285714, 16.4285714...","[0.01378435, 0.07472001, -0.04285565, -0.10068...","[2064546257, 2147281624, 2284716930, 201270321...",11.833333
4,2067710487,3.0,2.0,"[0.0011541683, -0.09693919, -0.035319697, 0.01...","[2.0, 2.0, 0.0, 3.5, 3.5, 0.0, 1.0, 4.25552092...","[-0.01707689, 0.04883542, -0.02549759, -0.0859...","[2168344074, 2043762735]",
...,...,...,...,...,...,...,...,...
174236,2225897966,1.0,1.0,"[-0.010317355, 0.07318564, -0.0676425, -0.0559...","[3.0, 3.0, 0.0, 12.666666666666666, 12.6666666...","[0.01870065, 0.03190672, -0.01903249, -0.02287...","[2110438662, 2191889675, 2071929315]",10.000000
174237,2612161910,1.0,1.0,"[-0.065617844, 0.10500442, -0.09921767, -0.027...","[1.0, 1.0, 0.0, 6.0, 6.0, 0.0, 1.0, 1.68192524...","[0.00334274, 0.04985746, -0.04555392, -0.03742...",[271172287],8.000000
174238,2575614996,1.0,1.0,"[-0.03478216, -0.022265393, 0.0045300666, 0.04...","[1.0, 1.0, 0.0, 13.0, 13.0, 0.0, 1.0, 1.284886...","[-0.00639988, 0.00703147, -0.01361149, -0.0672...",[2222643512],
174239,2078153944,1.0,2.0,"[0.0136339385, -0.05201685, -0.0034561865, 0.0...","[4.0, 4.0, 0.0, 6.75, 7.0, 0.0, 4.0, 4.3931628...","[0.02069795, 0.01978348, -0.03274036, -0.11306...","[2597672571, 2306030925, 1270584344, 1991427769]",5.500000


In [None]:
# Authors without any labelled co-author receive the training-set average of the feature.
fill_mean_hindex = train['mean_hindex_neighbours'].mean()

for frame in (train, test):
  frame['mean_hindex_neighbours'] = frame['mean_hindex_neighbours'].fillna(fill_mean_hindex)

In [None]:
# Half the standard deviation of the neighbour h-index feature; used by the training loop as the
# scale of Gaussian noise added to that feature (where it is currently multiplied by 0).
noise = train['mean_hindex_neighbours'].std() * .5

In [None]:
# Add the two structural graph features (core number and degree) to both frames.
for frame in (train, test):
  frame['core_number'] = frame['author'].map(core_number)
  frame['n_neighbours'] = frame['author'].map(n_neighbours)

In [None]:
from networkx.linalg.graphmatrix import adjacency_matrix

# Sparse adjacency matrix of the co-authorship graph.
# With no explicit nodelist, NetworkX orders rows/columns like graph.nodes — the same order
# used for `name2idx` in the next cell.
A = adjacency_matrix(graph)

In [None]:
# Map every author id to its position in graph.nodes.
nodes = list(graph.nodes)
name2idx = {name: position for position, name in enumerate(nodes)}

In [None]:
# Integer position of each author in graph.nodes (see `name2idx`).
train['nodeidx'] = train.author.map(name2idx)
test['nodeidx'] = test.author.map(name2idx)

train

Unnamed: 0,author,hindex,n_papers,mean_sbert,extra,extra_bert,neighbours,mean_hindex_neighbours,core_number,n_neighbours,nodeidx
0,1964267543,4.0,5.0,"[-0.052135132, 0.022680586, 0.041294187, -0.05...","[5.0, 6.0, 1.0, 22.8, 22.666666666666668, 1.0,...","[-0.05630162, 0.09752295, -0.1141102, -0.13589...","[307593211, 2383048336, 1643434777, 834507041,...",21.800000,5,5,43275
1,2153592714,13.0,5.0,"[-0.02547187, -0.02848028, -0.014876259, -0.04...","[2.0, 6.0, 2.0, 10.0, 18.333333333333332, 2.5,...","[0.04320468, 0.14857285, -0.16441649, -0.21340...","[1233913860, 2124461921]",20.000000,2,2,132559
2,217158525,8.0,5.0,"[-0.005974621, 0.006903825, -0.017989326, -0.0...","[2.0, 3.0, 0.0, 3.5, 7.0, 0.0, 2.0, 3.42108641...","[0.00582764, 0.09294338, -0.15769197, -0.24381...","[2502610808, 2261647917]",2.000000,2,2,165398
3,2123103677,11.0,3.0,"[-0.033690836, 0.03599136, -0.052308355, 0.044...","[7.0, 7.0, 1.0, 15.714285714285714, 16.4285714...","[0.01378435, 0.07472001, -0.04285565, -0.10068...","[2064546257, 2147281624, 2284716930, 201270321...",11.833333,6,7,89808
4,2067710487,3.0,2.0,"[0.0011541683, -0.09693919, -0.035319697, 0.01...","[2.0, 2.0, 0.0, 3.5, 3.5, 0.0, 1.0, 4.25552092...","[-0.01707689, 0.04883542, -0.02549759, -0.0859...","[2168344074, 2043762735]",16.135587,1,2,209408
...,...,...,...,...,...,...,...,...,...,...,...
174236,2225897966,1.0,1.0,"[-0.010317355, 0.07318564, -0.0676425, -0.0559...","[3.0, 3.0, 0.0, 12.666666666666666, 12.6666666...","[0.01870065, 0.03190672, -0.01903249, -0.02287...","[2110438662, 2191889675, 2071929315]",10.000000,3,3,130257
174237,2612161910,1.0,1.0,"[-0.065617844, 0.10500442, -0.09921767, -0.027...","[1.0, 1.0, 0.0, 6.0, 6.0, 0.0, 1.0, 1.68192524...","[0.00334274, 0.04985746, -0.04555392, -0.03742...",[271172287],8.000000,1,1,216172
174238,2575614996,1.0,1.0,"[-0.03478216, -0.022265393, 0.0045300666, 0.04...","[1.0, 1.0, 0.0, 13.0, 13.0, 0.0, 1.0, 1.284886...","[-0.00639988, 0.00703147, -0.01361149, -0.0672...",[2222643512],16.135587,1,1,65960
174239,2078153944,1.0,2.0,"[0.0136339385, -0.05201685, -0.0034561865, 0.0...","[4.0, 4.0, 0.0, 6.75, 7.0, 0.0, 4.0, 4.3931628...","[0.02069795, 0.01978348, -0.03274036, -0.11306...","[2597672571, 2306030925, 1270584344, 1991427769]",5.500000,4,4,43677


In [None]:
# Index both frames by graph position and sort, so that a row label doubles as a node index.
# NOTE: this rebinds train/test; re-running the cell fails because `nodeidx` is no longer a column.
train = train.set_index('nodeidx').sort_index()
test = test.set_index('nodeidx').sort_index()

In [None]:
# Row-wise flags: True for every row that has a missing value in any column.
na_train = train.isna().any(axis=1)
na_test = test.isna().any(axis=1)

In [None]:
# Fill N/A values with zeros: each mean is multiplied by 0, so it only supplies a zero scalar /
# a zero array shaped like the mean embedding (not the mean itself).
# NOTE: rows are selected with the any-column flags na_train / na_test, so both `mean_sbert` and
# `n_papers` are overwritten on every row that has a missing value in any column.
fill_n_papers = train[~na_train].n_papers.mean() * 0
fill_mean_sbert = train[~na_train].mean_sbert.mean() * 0
#fill_mean_glove = train[~na_train].mean_glove.mean()


train.loc[na_train, "mean_sbert"] = train.loc[na_train, "mean_sbert"].apply(lambda x: fill_mean_sbert)
#train.loc[na_train, "mean_glove"] = train.loc[na_train, "mean_glove"].apply(lambda x: fill_mean_glove)
train.loc[na_train, "n_papers"] = train.loc[na_train, "n_papers"].apply(lambda x: fill_n_papers)


test.loc[na_test, "mean_sbert"] = test.loc[na_test, "mean_sbert"].apply(lambda x: fill_mean_sbert)
#test.loc[na_test, "mean_glove"] = test.loc[na_test, "mean_glove"].apply(lambda x: fill_mean_glove)
test.loc[na_test, "n_papers"] = test.loc[na_test, "n_papers"].apply(lambda x: fill_n_papers)

In [None]:
# NOTE: `feature_cols` is not used when the feature matrix is assembled below.
feature_cols = ['n_papers', 'core_number', 'n_neighbours',]

# Fixed seed so the train/validation partition — and every score reported below — is reproducible.
SPLIT_SEED = 42

# Row labels are node indices; hold out 30% of the labelled nodes for validation.
train_idx = train.index.values
train_idx, val1_idx = train_test_split(train_idx, test_size=.3, random_state=SPLIT_SEED)
test_idx = test.index.values

# Stack train and test in node-index order so that row i describes graph node i.
X = pd.concat([train,test]).sort_index()
# Targets for all nodes; test rows have no label and are therefore NaN.
y = X.hindex.values#[:, None]


# Final feature matrix: sentence embedding | pre-computed `extra` vector | mean neighbour h-index.
X = np.hstack((np.vstack(X.mean_sbert.values),
               np.vstack(X.extra.values),
               np.vstack(X.mean_hindex_neighbours.values), ))

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
# Standardise every feature column; the statistics are estimated on all nodes (train and test).
feature_scaler = StandardScaler()
X = feature_scaler.fit_transform(X)


In [None]:
# Convert the adjacency matrix to COO format and stack its coordinates into a 2 x n_entries array,
# which is later passed to PyG's Data as `edge_index`.
# NOTE(review): the rows are stacked as (col, row); this matches (row, col) only when A is
# symmetric, which should hold for an undirected graph — confirm.
A_coo = A.tocoo()
edge_index = np.vstack([A_coo.col, A_coo.row])

## Graph Convolutional Networks

In [None]:
# Add this in a Google Colab cell to install the correct version of Pytorch Geometric.
import torch

if NEW_SESSION:
  def format_pytorch_version(version):
    return version.split('+')[0]

  TORCH_version = torch.__version__
  TORCH = format_pytorch_version(TORCH_version)

  def format_cuda_version(version):
    return 'cu' + version.replace('.', '')

  CUDA_version = torch.version.cuda
  CUDA = format_cuda_version(CUDA_version)

  !pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html --quiet
  !pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html --quiet
  !pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html --quiet
  !pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html --quiet
  !pip install torch-geometric --quiet

[K     |████████████████████████████████| 7.9 MB 268 kB/s 
[K     |████████████████████████████████| 3.5 MB 3.7 MB/s 
[K     |████████████████████████████████| 2.3 MB 2.6 MB/s 
[K     |████████████████████████████████| 747 kB 2.7 MB/s 
[K     |████████████████████████████████| 325 kB 5.2 MB/s 
[K     |████████████████████████████████| 407 kB 31.1 MB/s 
[K     |████████████████████████████████| 45 kB 3.3 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


In [None]:
from torch_geometric.data import Data, HeteroData
from torch_geometric.loader import DataLoader, RandomNodeSampler, NeighborLoader, ShaDowKHopSampler
from torch_geometric.datasets import TUDataset
from torch_geometric.utils.convert import from_networkx
from torch_geometric.data import Data
import torch_geometric as tg
import torch.nn as nn

In [None]:
# Let's define our masks

# Split the held-out nodes into two equally sized validation sets (seeded for reproducibility).
# NOTE: this cell rebinds `val1_idx`; re-running it without re-running the first split
# halves the validation set again.
val1_idx, val2_idx = train_test_split(val1_idx, test_size=.5, random_state=42)

mask_shape = (X.shape[0],)

def _index_mask(indices):
  """Return a boolean mask over all nodes that is True exactly at `indices`."""
  node_mask = torch.zeros(mask_shape, dtype=torch.bool)
  node_mask[indices] = True
  return node_mask

train_mask = _index_mask(train_idx)
val1_mask = _index_mask(val1_idx)
val2_mask = _index_mask(val2_idx)
test_mask = _index_mask(test_idx)

In [None]:
# Number of input features per node (columns of the scaled feature matrix).
n_features = X.shape[1]

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Wrap node features, targets and edges into a single PyG graph object.
features = torch.tensor(X)
targets = torch.tensor(y).unsqueeze(-1)
edge_idx = torch.tensor(edge_index, dtype=torch.long)

data = Data(x=features, edge_index=edge_idx, y=targets)
# Position of every node in the full graph; lets sampled batches be mapped back to global indices.
node_idx = torch.arange(data.num_nodes, dtype=torch.int64)

# Let's define the masks
for attr_name, attr_value in (
    ('train_mask', train_mask),
    ('val1_mask', val1_mask),
    ('val2_mask', val2_mask),
    ('test_mask', test_mask),
    ('node_index', node_idx),
):
  setattr(data, attr_name, attr_value)

In [None]:
from torch_geometric.nn import GCNConv, SAGEConv, TransformerConv, GATConv, Linear, GCN2Conv
from torch.nn import functional as F
from torch.nn import Linear
import torch.nn as nn

class ResGCN(torch.nn.Module):
    """Linear encoder, three GCN2Conv layers with an initial-residual input, and a linear head.

    `forward(x_0, edge_index, node_index)` returns one value per node; `node_index`
    is accepted for interface parity with the other models and is not used.

    NOTE(review): `conv4` is constructed but never called in `forward`.
    NOTE(review): `conv1` receives the raw encoder output as its `x_0`, whereas `conv2`
    and `conv3` receive the activated, dropped-out encoder output — confirm that this
    asymmetry is intended.
    """

    def __init__(self, hidden_channels):
        super().__init__()

        self.lin1 = Linear(data.num_node_features, hidden_channels)

        self.conv1 = GCN2Conv(hidden_channels, alpha=.5)
        self.conv2 = GCN2Conv(hidden_channels, alpha=.5)
        self.conv3 = GCN2Conv(hidden_channels, alpha=.5)
        self.conv4 = GCN2Conv(hidden_channels, alpha=.5)
        self.lin_out = Linear(hidden_channels, 1)

        self.act = nn.SiLU()
        self.dropout = nn.Dropout(p=.5)

    def forward(self, x_0, edge_index, node_index):
        # Project the raw features to the hidden width.
        encoded = self.lin1(x_0)
        residual = self.dropout(self.act(encoded))

        hidden = self.conv1(x=residual, x_0=encoded, edge_index=edge_index)
        hidden = self.dropout(self.act(hidden))

        # The remaining layers use the activated encoder output as their initial residual.
        for conv in (self.conv2, self.conv3):
            hidden = conv(x=hidden, x_0=residual, edge_index=edge_index)
            hidden = self.dropout(self.act(hidden))

        return self.lin_out(hidden)

class MLP(torch.nn.Module):
    """Graph-agnostic baseline: two hidden layers with a skip connection and a scalar head.

    Args:
        hidden_channels: width of both hidden layers.
        in_channels: number of input features per node. Defaults to
            `data.num_node_features` of the notebook-level graph object.

    `forward` returns a tensor of shape (num_nodes, 1). `edge_index` and `node_index`
    are accepted only for interface parity with the graph models and are ignored.
    """

    def __init__(self, hidden_channels, in_channels=None):
        super(MLP, self).__init__()

        if in_channels is None:
            in_channels = data.num_node_features

        self.lin1 = Linear(in_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, hidden_channels)
        self.lin3 = Linear(hidden_channels, 1)

        self.act = nn.ReLU()
        self.dropout = nn.Dropout()

    def forward(self, x_0, edge_index, node_index):
        x = self.lin1(x_0)
        x = self.act(x)
        x = self.dropout(x)

        skip = x

        x = self.lin2(x)
        x = self.act(x)
        x = self.dropout(x)

        # Add the skip connection in hidden space, then project to a single output.
        # (Adding lin3(skip) to x would broadcast (N, 1) onto (N, hidden) and return
        # a hidden-width output instead of one prediction per node.)
        x = self.lin3(x + skip)

        return x

class SAGENet(torch.nn.Module):
    """Two-layer GraphSAGE regressor: SAGEConv -> ReLU -> dropout -> SAGEConv (one output per node).

    `node_index` is accepted for interface parity with the other models and is not used.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(data.num_node_features, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, 1)

        self.act = nn.ReLU()
        self.dropout = nn.Dropout(p=.5)

    def forward(self, x_0, edge_index, node_index):
        hidden = self.dropout(self.act(self.conv1(x_0, edge_index)))
        return self.conv2(hidden, edge_index)

# Instantiate the model on the selected device and cast its parameters to float64
# (presumably to match the dtype of the feature tensor — confirm).
# NOTE(review): the stored output below shows a LeakyReLU activation while the class above
# uses ReLU, so the output comes from an earlier version of the cell.
model = SAGENet(hidden_channels=250).to(device).double()
model

SAGENet(
  (conv1): SAGEConv(721, 250)
  (conv2): SAGEConv(250, 1)
  (act): LeakyReLU(negative_slope=0.01)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
import os

epochs = 1000

criterion = nn.HuberLoss(delta=200) # The loss to optimize
criterion_viz = nn.MSELoss() # The loss to print

lowest_error = 1e9
best_epoch = 0

optimizer = torch.optim.Adam(model.parameters(), lr=2e-3, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.4, verbose=True, patience=15)

# Each batch is a random quarter of the nodes together with the edges among them.
loader = RandomNodeSampler(data=data, num_parts=4, shuffle=True)
#loader = ShaDowKHopSampler(data=data, depth=3, num_neighbors=5,)
#loader = NeighborLoader(data=data, num_neighbors=[5], shuffle=True)

# A checkpoint is written after every epoch; make sure the target directory exists.
os.makedirs('models', exist_ok=True)

# Multiplier on the Gaussian noise added to the last feature column during training.
# 0 disables the augmentation (the original code multiplied the noise by .0).
noise_scale = .0

def mean(ls):
  """Arithmetic mean of a non-empty list of numbers."""
  return sum(ls)/len(ls)

for epoch in range(epochs):
  losses = {'train': [], 'val': []}
  for phase in ['train', 'val',]:
    is_train = phase == 'train'
    model.train(is_train)  # train(False) is equivalent to eval()

    for batch in loader:
      if is_train:
        optimizer.zero_grad()
        if noise_scale > 0:
          batch.x[:, -1] = batch.x[:, -1] + (torch.randn(batch.x[:, -1].shape[0]) * noise * noise_scale)

      batch = batch.to(device)

      mask = batch.train_mask if is_train else batch.val1_mask

      # Only build the autograd graph when a backward pass follows.
      with torch.set_grad_enabled(is_train):
        target = batch.y[mask]
        out = model(batch.x, batch.edge_index, batch.node_index)[mask]

        # Optimised loss: predictions are capped at 35 before the Huber loss.
        loss = criterion(torch.clamp(out, max=35), target)

      # Reported loss: MSE with predictions floored at 1, as in the final evaluation cells.
      with torch.no_grad():
        viz_loss = criterion_viz(torch.clamp(out, min=1,), target)
        losses[phase].append(viz_loss.item())

      if is_train:
        loss.backward()
        optimizer.step()

    if phase == 'val':
      epoch_val_loss = mean(losses['val'])
      lr_scheduler.step(epoch_val_loss)
      torch.save(model, f'models/epoch_{epoch}_model.pt')

      if lowest_error > epoch_val_loss:
        lowest_error = epoch_val_loss
        torch.save(model, 'models/best_model.pt')

        best_epoch = epoch
        print("")


  print(f"[Epoch #{epoch}] Train Loss: {mean(losses['train'])} Val Loss: {mean(losses['val'])}")

print(f"Loading model with val loss {lowest_error} from epoch {best_epoch}")


[Epoch #0] Train Loss: 206.98584438709884 Val Loss: 162.33809848156505

[Epoch #1] Train Loss: 143.52745434266774 Val Loss: 141.731916223423
[Epoch #2] Train Loss: 142.11522922390103 Val Loss: 144.53729449481835

[Epoch #3] Train Loss: 138.24309536423925 Val Loss: 127.32363842562619

[Epoch #4] Train Loss: 115.83732891668456 Val Loss: 106.04190170097885

[Epoch #5] Train Loss: 99.90463380761308 Val Loss: 96.91323399915265

[Epoch #6] Train Loss: 92.6065654572953 Val Loss: 91.8099298142849

[Epoch #7] Train Loss: 88.60623486694257 Val Loss: 87.99141031195728

[Epoch #8] Train Loss: 84.07702096128787 Val Loss: 83.8461674832817

[Epoch #9] Train Loss: 81.03734857090159 Val Loss: 80.78936644027236

[Epoch #10] Train Loss: 77.53452593133954 Val Loss: 77.55461696501521

[Epoch #11] Train Loss: 74.95321210339989 Val Loss: 75.51186534312188

[Epoch #12] Train Loss: 72.77952435640744 Val Loss: 73.3261804973913

[Epoch #13] Train Loss: 70.3948611454488 Val Loss: 70.94786723322963

[Epoch #14] T

KeyboardInterrupt: ignored

In [None]:
model = torch.load(f'models/best_model.pt')
model = torch.load(f'models/epoch_{163}_model.pt')

!cp 'models/best_model.pt' '/content/drive/MyDrive/saved_models/best_model_hindex.pt'

In [None]:
# Predict every node in two batches and collect the outputs by global node index.
# NOTE(review): RandomNodeSampler assigns nodes to batches at random, so edges between the two
# batches are presumably not seen by the model here and repeated runs may differ — confirm
# whether a single full-graph forward pass would be preferable.
preds = []

loader = RandomNodeSampler(data=data, num_parts=2, shuffle=False)
model.eval().to(device)
with torch.no_grad():
  for batch in loader:
    batch = batch.to(device)
    out = model(batch.x, batch.edge_index, batch.node_index).cpu()
    preds.append((batch.node_index.cpu(), out))

# Concatenate the per-batch results instead of unpacking them element by element.
all_node_ids = torch.cat([ids for ids, _ in preds]).numpy()
all_outputs = torch.cat([vals for _, vals in preds]).reshape(-1).double().numpy()

# One row per node, indexed by node position (the index is named 'author' as before).
pairs = pd.DataFrame({'pred': all_outputs}, index=pd.Index(all_node_ids, name='author')).sort_index()
pairs

Unnamed: 0_level_0,pred
author,Unnamed: 1_level_1
0,6.385676
1,0.374938
2,12.859091
3,1.621687
4,0.269971
...,...
217796,18.744655
217797,9.944682
217798,3.736912
217799,1.207111


In [None]:
from sklearn.metrics import mean_squared_error
# MSE on the training nodes, with predictions floored at 1.
mask = train_mask.numpy()
rows = node_idx[mask]
train.loc[rows, "pred"] = pairs.loc[mask, "pred"].clip(lower=1)
mean_squared_error(train.loc[rows, "pred"], train.loc[rows, "hindex"])

34.13945334487358

In [None]:
# MSE on the first validation set (the one monitored during training), predictions floored at 1.
mask = val1_mask.numpy()
rows = node_idx[mask]
train.loc[rows, "pred"] = pairs.loc[mask, "pred"].clip(lower=1)
mean_squared_error(train.loc[rows, "pred"], train.loc[rows, "hindex"])

42.13393948098483

In [None]:
# MSE on the second validation set (not used by the training loop), predictions floored at 1.
mask = val2_mask.numpy()
rows = node_idx[mask]
train.loc[rows, "pred"] = pairs.loc[mask, "pred"].clip(lower=1)
mean_squared_error(train.loc[rows, "pred"], train.loc[rows, "hindex"])

40.19747084973854

In [None]:
# Write the predictions for the test nodes into a new `hindex` column, floored at 1.
mask = test_mask.numpy()
test.loc[node_idx[mask], "hindex"] = np.clip(pairs.loc[mask, "pred"], 1, None)

In [None]:
# Export (author, predicted h-index) and submit the file to the Kaggle competition.
# NOTE: running this cell performs a real submission.
test[["author", "hindex"]].to_csv("submission_hindex2.csv", index=False)
!kaggle competitions submit -c inf554-2021 -f submission_hindex2.csv -m "SageConv v9, Extended with Second Transformer and H-Index"

In [None]:
# Write the same prediction columns to a second file.
# NOTE(review): the commented-out command below refers to submission_res3.csv, not the
# submission_res4.csv written here.
test[["author", "hindex"]].to_csv("submission_res4.csv", index=False)
#!kaggle competitions submit -c inf554-2021 -f submission_res3.csv -m "Residual GCN v3"