 # Imports

In [2]:
import torch
from torch.nn import Linear
import torch.nn.functional as F
import torch.nn as nn
from torch_geometric.nn import SAGEConv,MessagePassing
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.utils import negative_sampling
import networkx as nx
import pandas as pd
import numpy as np
import itertools
from sklearn.metrics import roc_auc_score, f1_score
import numpy as np

# Data Extraction

In [3]:
import pandas as pd
import random

data = pd.read_csv('vk_friends.csv')

source = data['Source'].to_numpy()
target = data['Target'].to_numpy()

source = torch.from_numpy(source)
#source =torch.where(source >=10000000,random.randint(0,10000000),source) 
target = torch.from_numpy(target)
#target =torch.where(target >=10000000,random.randint(0,10000000),target) 

edge_index = torch.stack([source,target],dim = 0)
vertex = torch.cat([source,target],dim=-1)

x = torch.unique(vertex)
print(x.shape)
node2id = {x[i].item():i for i in range(x.shape[0])}

new_x = torch.tensor([[v] for v in node2id.values()], dtype = torch.float)

new_edge_index = torch.tensor([[node2id[edge_index[0][i].item()] for i in range(edge_index.shape[1])],[node2id[edge_index[1][i].item()] for i in range(edge_index.shape[1])]])
print(new_edge_index.shape)
data = Data(x=new_x, edge_index=new_edge_index)


torch.Size([86476])
torch.Size([2, 246220])


In [4]:
data.edge_index

tensor([[ 4766,   228,   228,  ..., 65045, 65045,  4766],
        [  228,     1,     2,  ..., 44892, 44892, 65222]])

# Splitting dataset to train/test


In [5]:
from torch.utils.data import random_split


TRAIN_BATCH_SIZE = 1000
TEST_BATCH_SIZE = 1000

train_dataset, test_dataset = random_split(data.x,[70000,16476])

train_edges, test_edges = random_split(data.edge_index.T,[190000,56220])

train_data = Data(x=train_dataset.dataset, edge_index=train_edges.dataset.T)
test_data = Data(x=test_dataset.dataset, edge_index=test_edges.dataset.T)


train_loader = DataLoader(range(train_data.edge_index.shape[1]),
                          batch_size = TRAIN_BATCH_SIZE,
                          shuffle=True)

test_loader = DataLoader(range(test_data.edge_index.shape[1]),
                         batch_size = TEST_BATCH_SIZE, 
                         shuffle=True)



# GraphSage model

In [6]:
class GraphSage(MessagePassing):

    def __init__(self, in_channels, out_channels, normalize=True,
                 bias=False, **kwargs):
        super(GraphSage, self).__init__(**kwargs)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.normalize = normalize

        self.lin_l = torch.nn.Linear(in_channels, out_channels, bias=bias)
        self.lin_r = torch.nn.Linear(in_channels, out_channels, bias=bias)

    def forward(self, x, edge_index, size=None):
        neighbor_out = self.propagate(edge_index, x=(x, x), size=size)
        out = self.lin_l(x) + self.lin_r(neighbor_out)
        if self.normalize:
            out = torch.nn.functional.normalize(out, p=2)

        return out


# Link Predictor model

In [7]:
class LinkPredictor(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout =0.2):
        super().__init__()

        self.lins = nn.ModuleList()
        self.lins.append(nn.Linear(in_channels, hidden_channels))
        for _ in range(num_layers - 2):
            self.lins.append(nn.Linear(hidden_channels, hidden_channels))
        self.lins.append(nn.Linear(hidden_channels, out_channels))

        self.dropout = dropout
        
    def forward(self, x_i, x_j):
        x = x_i * x_j # scalar-product for tensors
        for lin in self.lins[:-1]:
            x = lin(x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.lins[-1](x)
        return torch.sigmoid(x)
        
 

# Train Loop

In [9]:
NUM_EPOCHS = 3

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

model = GraphSage(in_channels = 1,
                  out_channels = 64)                    

link_predictor = LinkPredictor(in_channels= 64,
                               hidden_channels = 128,
                               out_channels = 2,
                               num_layers = 4)
  
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), link_predictor.parameters()),
                             lr = 0.001,
                             weight_decay = 0.001)

lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer = optimizer,
                                               step_size = 1,
                                               gamma = 0.5)


In [10]:
min_loss = np.inf

model.to(device)
link_predictor.to(device)

for epoch in range(NUM_EPOCHS):    
    model.train()
    link_predictor.train()

    iteration = 1
    for edge_id in train_loader:
        optimizer.zero_grad()
        node_emb = model(train_data.x.to(device), train_data.edge_index.to(device))  
        
        pos_edge = train_data.edge_index[:,edge_id]
        pos_pred = link_predictor(node_emb[pos_edge[0]].to(device), node_emb[pos_edge[1]].to(device)) 
        
        neg_edge = negative_sampling(data.edge_index, 
                                     num_nodes=train_data.x.shape[0],
                                     num_neg_samples=edge_id.shape[0],
                                     method='dense')
        
        neg_pred = link_predictor(node_emb[neg_edge[0]].to(device), node_emb[neg_edge[1]].to(device))
       
        loss = -torch.log(pos_pred + 1e-15).mean() -  torch.log(1 - neg_pred + 1e-15).mean()
        loss.backward()
       
        optimizer.step()
        
        if iteration % 10 == 0 :
            print(f' Epoch {epoch+1}/Iteration {iteration+1} : {loss.item()}')
        iteration+=1

        if min_loss > loss.item():
            min_loss = loss.item()
            torch.save(model.state_dict(), 'model.pth')
            torch.save(link_predictor.state_dict(), 'predictor.pth')

    lr_scheduler.step()

 Epoch 1/Iteration 11 : 1.3855202198028564
 Epoch 1/Iteration 21 : 1.3813390731811523
 Epoch 1/Iteration 31 : 1.364211916923523
 Epoch 1/Iteration 41 : 1.2927398681640625
 Epoch 1/Iteration 51 : 1.0750616788864136
 Epoch 1/Iteration 61 : 0.8088548183441162
 Epoch 1/Iteration 71 : 0.729498028755188
 Epoch 1/Iteration 81 : 0.7088220119476318
 Epoch 1/Iteration 91 : 0.6813292503356934
 Epoch 1/Iteration 101 : 0.6076738834381104
 Epoch 1/Iteration 111 : 0.6456466913223267
 Epoch 1/Iteration 121 : 0.6809768676757812
 Epoch 1/Iteration 131 : 0.6309910416603088
 Epoch 1/Iteration 141 : 0.6701458692550659
 Epoch 1/Iteration 151 : 0.5998407602310181
 Epoch 1/Iteration 161 : 0.6271449327468872
 Epoch 1/Iteration 171 : 0.6266093254089355
 Epoch 1/Iteration 181 : 0.6082190275192261
 Epoch 1/Iteration 191 : 0.6540667414665222


KeyboardInterrupt: 

# Loading pretrained models

In [12]:
model = GraphSage(in_channels = 1,
                      out_channels = 64)                    

predictor = LinkPredictor(in_channels= 64,
                                   hidden_channels = 128,
                                   out_channels = 2,
                                   num_layers = 4)


model.load_state_dict(torch.load('model.pth'))
predictor.load_state_dict(torch.load('predictor.pth'))

<All keys matched successfully>

# Test Function

In [32]:
def test(model, predictor):
    model.eval()
    predictor.eval()

    node_emb = model(test_data.x, test_data.edge_index)

    pos_edges = test_data.edge_index
    neg_edges = negative_sampling(test_data.edge_index, 
                                     num_nodes=test_data.x.shape[0],
                                     num_neg_samples=int(test_data.edge_index.shape[1]*0.3),
                                     method='dense')
    pos_test_preds = []

    for edge_id in test_loader:  
        edge = pos_edges[:,edge_id]
        pos_test_preds += [predictor(node_emb[edge[0]], node_emb[edge[1]])]
    pos_test_pred = torch.cat(pos_test_preds, dim=0)
    
    pos_preds = [1 if torch.sum(i)/2 >=0.7 else 0 for i in pos_test_pred.detach()] 
    
    neg_test_preds = []
    for edge_id in DataLoader(range(neg_edges.shape[1]),
                         batch_size = TEST_BATCH_SIZE, 
                         shuffle=True):
        edge = neg_edges[:,edge_id]
        neg_test_preds += [predictor(node_emb[edge[0]], node_emb[edge[1]])]
        
    neg_test_pred = torch.cat(neg_test_preds, dim=0)

    neg_preds = [1 if torch.sum(i)/2 >=0.7  else 0  for i in neg_test_pred.detach()] 

    pos_preds = np.array(pos_preds)
    neg_preds = np.array(neg_preds)
    
    check = np.hstack((np.ones(pos_preds.shape[0]),np.zeros(neg_preds.shape[0])))
    
    preds = np.hstack((pos_preds,neg_preds))
    score = f1_score(check, preds)
    
    print(f'Score : {score}')

        
        

In [33]:
test(model,predictor)

Score : 0.8917772524874938


# Prediction

In [34]:
def predict(node_1,node_2,threshhold = 0.7,model,predictor):
    node_1 = node2id[node_1]
    node_2 = node2id[node_2]
    
    node_emb = model(data.x,data.edge_index)

    pred = predictor(node_emb[node_1], node_emb[node_2]) 
    
    if torch.sum(pred)/2 >= threshhold:
         print('Nodes - connected')
    else:
         print('Nodes - are NOT connected')





In [35]:
predict(99210,3520,0.7,model,predictor)


Nodes - connected
