In [1]:
import itertools
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.nn as gnn
import numpy as np
import pandas as pd
from torch_geometric.data import DataLoader
from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset
from torch_geometric.utils import negative_sampling
from torch_geometric.nn import GCNConv 
from torch.optim.lr_scheduler import StepLR
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import roc_auc_score, ndcg_score
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [None]:
target_epoch=30   #TODO train for 30 50 100 200 DONE 100

In [2]:
# class TwitterDataset(InMemoryDataset):
#     def __init__(self, root, transform=None, pre_transform=None):
#         super(TwitterDataset, self).__init__(root, transform, pre_transform)
#         self.data, self.slices = torch.load(self.processed_paths[0])

#     @property
#     def raw_file_names(self):
#         return ['twitter_combined.txt', 'twitter_features.txt']

#     @property
#     def processed_file_names(self):
#         return ['twitter_data.pt']

#     def download(self):
#         # In this case, you're not downloading any data,
#         # but you could add code to download your data here if it's not local
#         pass

#     def process(self):
#         # Load the graph data
#         edges = pd.read_csv(self.raw_paths[0], delimiter=' ', header=None)
#         features = pd.read_csv(self.raw_paths[1], delimiter=' ', header=None)

#         # Create the graph
#         edge_index = torch.tensor(edges.values, dtype=torch.long).t().contiguous()
#         x = torch.tensor(features.values, dtype=torch.float)
    
#         data = Data(x=x, edge_index=edge_index)

#         data, slices = self.collate([data])
#         torch.save((data, slices), self.processed_paths[0])
        
# class GPlusDataset(InMemoryDataset):
#     def __init__(self, root, transform=None, pre_transform=None):
#         super(GPlusDataset, self).__init__(root, transform, pre_transform)
#         self.data, self.slices = torch.load(self.processed_paths[0])

#     @property
#     def raw_file_names(self):
#         return ['gplus_combined.txt', 'gplus_features.txt']

#     @property
#     def processed_file_names(self):
#         return ['gplus_data.pt']

#     def download(self):
#         # In this case, you're not downloading any data,
#         # but you could add code to download your data here if it's not local
#         pass

#     def process(self):
#         # Load the graph data
#         edges = pd.read_csv(self.raw_paths[0], delimiter=' ', header=None)
#         features = pd.read_csv(self.raw_paths[1], delimiter=' ', header=None)

#         # Create the graph
#         edge_index = torch.tensor(edges.values, dtype=torch.long).t().contiguous()
#         x = torch.tensor(features.values, dtype=torch.float)
    
#         data = Data(x=x, edge_index=edge_index)

#         data, slices = self.collate([data])
#         torch.save((data, slices), self.processed_paths[0])

In [3]:
class FacebookDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(FacebookDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return ['facebook_combined.txt', 'facebook_features.txt']

    @property
    def processed_file_names(self):
        return ['data.pt']

    def download(self):
        # In this case, you're not downloading any data,
        # but you could add code to download your data here if it's not local
        pass

    def process(self):
        # Load the graph data
        edges = pd.read_csv(self.raw_paths[0], delimiter=' ', header=None)
        features = pd.read_csv(self.raw_paths[1], delimiter=' ', header=None)

        # Create the graph
        edge_index = torch.tensor(edges.values, dtype=torch.long).t().contiguous()
        x = torch.tensor(features.values, dtype=torch.float)
    
        data = Data(x=x, edge_index=edge_index)

        data, slices = self.collate([data])
        torch.save((data, slices), self.processed_paths[0])

In [4]:
dataset = FacebookDataset(root='.')
data = dataset[0]

In [5]:
data

Data(x=[4039, 1406], edge_index=[2, 88234])

In [6]:
#Calculate Train Positives
# Define the percentage of edges to remove
test_percentage = 0.3

# Calculate the number of edges to remove
num_edges_to_remove = int(data.edge_index.shape[1] * test_percentage)

# Shuffle the edges
edge_indices = np.arange(data.edge_index.shape[1])
np.random.shuffle(edge_indices)

# Select the edges to keep
edges_to_keep = edge_indices[num_edges_to_remove:]

# Create a new graph with only the edges to keep(Train positives)
data_prime = Data(x=data.x, edge_index=data.edge_index[:, edges_to_keep])

In [7]:
#train pos 
train_pos=data_prime

In [8]:
#TEST POSITIVES
# Select the edges to remove (these will be your test positives)
edges_to_remove = edge_indices[:num_edges_to_remove]

# Create a new graph with only the edges to remove (Test positives)
test_pos = Data(x=data.x, edge_index=data.edge_index[:, edges_to_remove])

In [9]:
#Calculate Train negatives
positives_edges=data_prime.edge_index
# Number of negative samples to generate
num_neg_samples = data_prime.edge_index.size(1)
# Generate negative samples
negative_edge_index = negative_sampling(edge_index=data.edge_index, num_nodes=data.num_nodes, num_neg_samples=num_neg_samples)
#Train negatives
train_neg = Data(x=data.x, edge_index=negative_edge_index)

In [10]:
# TEST negatives
# Number of negative samples to generate for testing
num_test_neg_samples = test_pos.edge_index.size(1)

# Generate negative samples for testing
test_negative_edge_index = negative_sampling(edge_index=data.edge_index, num_nodes=data.num_nodes, num_neg_samples=num_test_neg_samples)

# Convert tensors to lists of tuples
train_pos_edges = [tuple(edge) for edge in train_pos.edge_index.t().tolist()]
test_pos_edges = [tuple(edge) for edge in test_pos.edge_index.t().tolist()]
test_negative_edges = [tuple(edge) for edge in test_negative_edge_index.t().tolist()]

# Ensure that these negative samples are not in the train_pos or test_pos sets
test_neg_edge_index = [edge for edge in test_negative_edges if edge not in train_pos_edges and edge not in test_pos_edges]

# Create the test_neg set
test_neg = Data(x=data.x, edge_index=torch.tensor(test_neg_edge_index).t().contiguous())

In [11]:
class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_size):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_size)
        self.conv2 = GCNConv(hidden_size, hidden_size)

    def forward(self, edge_index, x):
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return x

In [12]:
class TruncatedSVDModel(torch.nn.Module):
    def __init__(self, num_features, output_size):
        super(TruncatedSVDModel, self).__init__()
        self.svd = TruncatedSVD(n_components=output_size)
        self.fc = torch.nn.Linear(output_size, output_size)

    def forward(self, x):
        x_svd = self.svd.fit_transform(x.numpy())
        x_svd = torch.tensor(x_svd, dtype=torch.float)
        x_out = self.fc(x_svd)
        return x_out

In [13]:
class DotPredictor(torch.nn.Module):
    def forward(self, edge_index, h):
        # Get the embeddings of the source nodes and destination nodes
        source_node_embeddings = h[edge_index[0]]
        destination_node_embeddings = h[edge_index[1]]

        # Compute the dot product (score) between source and destination node embeddings
        scores = (source_node_embeddings * destination_node_embeddings).sum(dim=-1)

        return scores

In [14]:
def pipeline(model_name='GCN', hidden_size=64, epoch=100):
    
    def compute_loss(pos_score, neg_score):  # computes the loss based on binary cross entropy
        scores = torch.cat([pos_score, neg_score])
        labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
        return F.binary_cross_entropy_with_logits(scores, labels)

    def compute_auc(pos_score, neg_score):  # computes AUC (Area-Under-Curve) score
        scores = torch.cat([pos_score, neg_score]).numpy()
        labels = torch.cat(
            [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
        return roc_auc_score(labels, scores)
       
    # hidden_size is the size of the hidden layer in the neural net
    if model_name == 'GCN':
        model = GCN(data_prime.num_features, hidden_size)
    elif model_name == 'TruncatedSVD':
        model = TruncatedSVDModel(data_prime.num_features, hidden_size)
        
    pred = DotPredictor()
    optimizer = torch.optim.SGD(itertools.chain(model.parameters(), pred.parameters()), lr=0.01, momentum=0.9)
    # Use a learning rate scheduler
    scheduler = StepLR(optimizer, step_size=500, gamma=0.5)
 

        # ----------- training -------------------------------- #
    train_g = data_prime
    for e in range(epoch):
        if model_name == 'GCN':
            h = model(train_g.edge_index, train_g.x)
        elif model_name == 'TruncatedSVD':
            h = model(train_g.x)  # get node embeddings

        # forward    
        pos_score = pred(train_pos.edge_index, h)
        neg_score = pred(train_neg.edge_index, h)
        loss = compute_loss(pos_score, neg_score)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        if e % 10 == 0:
            print('In epoch {}, loss: {}'.format(e, loss))

    # ----------- test and check results ---------------- #
    with torch.no_grad():
        pos_score = pred(test_pos.edge_index, h)
        neg_score = pred(test_neg.edge_index, h)
        auc=compute_auc(pos_score, neg_score)
        print('AUC', auc)  
        print('Loss', loss)  
               
    # Print model's state_dict
    print("Model's state_dict:")
    for param_tensor in model.state_dict():
        print(param_tensor, "\t", model.state_dict()[param_tensor].size())

    # Print optimizer's state_dict
    print("Optimizer's state_dict:")
    for var_name in optimizer.state_dict():
        print(var_name, "\t", optimizer.state_dict()[var_name])

    # Print scheduler's state_dict
    print("scheduler's state_dict:")
    for var_name in scheduler.state_dict():
        print(var_name, "\t", scheduler.state_dict()[var_name])
    
    torch.save({
            'epoch': epoch,        
            'epoch_rem': e,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss': loss,
            
            }, './torch_model/model_'+model_name+'_'+str(epoch)+'_fb'+'.pt')
        
    return h  # return node embeddings

In [15]:
def generate_rec(h, user_id=0, k=10):
    # `h` represents the node embeddings, with shape [num_nodes, hidden_size]

    # generate a graph with (num_nodes - num_friends_of_user) edges
    # one end of the edge is user_id
    # the other end is a user that's NOT friends with user_id
    user_friends = set()
    user_neg_u, user_neg_v = [], []

    for n1, n2 in data.edge_index.t().tolist():   # get all friends of user_id
        if int(n1) == user_id:
            user_friends.add(int(n2))
        if int(n2) == user_id:
            user_friends.add(int(n1))

    num_nodes=data.x.shape[0]
    for i in range(num_nodes):  # generate "negative edges" for user_id
        if i != user_id and i not in user_friends:
            user_neg_u.append(user_id)
            user_neg_v.append(i)

    user_g = Data(x=data.x,edge_index=torch.tensor([user_neg_u, user_neg_v] ))

    pred = DotPredictor()

    # calculate the score of each user
    scores = []
    for i, score in enumerate(pred(user_g.edge_index, h)):
        rel=1 if ((user_id,i) in test_pos_edges) else 0
        scores.append((i, score,rel))

    # produce final ranked list
    scores.sort(key=lambda x: -x[1])

    # display results
    
    if (k !=0):
        print(f"List of 5 suggested friends for user {user_id}:")
    for i in range(k):
        print(f'- User {scores[i][0]}, score = {scores[i][1]}, rel = {scores[i][2]}')
    return scores[:10]


In [16]:
def calc_rec(h, user_id=0, k=10):
    user_friends = set()
    user_neg_u, user_neg_v = [], []

    for n1, n2 in data.edge_index.t().tolist():   # get all friends of user_id
        if int(n1) == user_id:
            user_friends.add(int(n2))
        if int(n2) == user_id:
            user_friends.add(int(n1))

    num_nodes=data.x.shape[0]
    for i in range(num_nodes):  # generate "negative edges" for user_id
        if i != user_id and i not in user_friends:
            user_neg_u.append(user_id)
            user_neg_v.append(i)

    user_g = Data(x=data.x,edge_index=torch.tensor([user_neg_u, user_neg_v] ))

    pred = DotPredictor()

    # calculate the score of each user
    scores = []
    for i, score in enumerate(pred(user_g.edge_index, h)):
        rel=1 if ((user_id,i) in test_pos_edges) else 0
        scores.append((i, score,rel))

    # produce final ranked list
    scores.sort(key=lambda x: -x[1])
    rel=[x[2] for x in scores]
    # display results
    
    return rel[:500]

In [17]:
def metrics(a,k=50):
    top_k = a[:k]
    hits=pd.DataFrame(0,index=[0],columns=top_k.columns)
    recipr=pd.DataFrame(0,index=[0],columns=top_k.columns)
    dcg=pd.DataFrame(0,index=[0],columns=top_k.columns)
    idcg=pd.DataFrame(0,index=[0],columns=top_k.columns)
    for i in list(top_k.columns):
        hits[i]=(top_k[i].sum()/k)
        recipr[i]=top_k.index[top_k[i] == 1].min()
        dcg[i]=0
        idcg[i]=0
        top_sort=top_k[i]
        top_sort=top_sort.sort_values(ascending=False)
        for j in range(0,k):
            dcg[i]+=top_k[i].iloc[j]/np.log2(j+1+1)
            idcg[i]+=top_sort.iloc[j]/np.log2(j+1+1)

    recipr=recipr.replace(np.nan,-1) +1
    recipr = recipr.loc[:, (recipr != 0).any()]
    ndcg=dcg/idcg
    ndcg=ndcg.T.replace(np.nan,0)

    ndcg_k=np.mean(ndcg)
    mrr=(1/recipr.T).sum()[0]/len(a.columns)
    hits_at_k=np.mean(hits)

    print('Hits@'+str(k)+':', hits_at_k)
    print('NDCG@'+str(k)+':', ndcg_k)
    print('MRR:', mrr)
    
    return hits_at_k, ndcg_k, mrr


In [18]:
h = pipeline("GCN",epoch=target_epoch) 

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


In epoch 0, loss: 0.6726780533790588
In epoch 10, loss: 0.6240740418434143
In epoch 20, loss: 0.5987688302993774
In epoch 30, loss: 0.5667281150817871
In epoch 40, loss: 0.5514950156211853
In epoch 50, loss: 0.5316541194915771
In epoch 60, loss: 0.5170732140541077
In epoch 70, loss: 0.5021128058433533
In epoch 80, loss: 0.4952318072319031
In epoch 90, loss: 0.48784470558166504
AUC 0.9316019061431855
Model's state_dict:
conv1.bias 	 torch.Size([64])
conv1.lin.weight 	 torch.Size([64, 1406])
conv2.bias 	 torch.Size([64])
conv2.lin.weight 	 torch.Size([64, 64])
Optimizer's state_dict:
state 	 {0: {'momentum_buffer': tensor([-0.0234, -0.0119, -0.0037, -0.0391, -0.0414, -0.0094, -0.0043, -0.0056,
         0.0018,  0.0030, -0.0172, -0.0258, -0.0056,  0.0280, -0.0259, -0.0034,
        -0.0255, -0.0043, -0.0027, -0.0481, -0.0080, -0.0451, -0.0060,  0.0080,
        -0.0220,  0.0076,  0.0004, -0.0002, -0.1212, -0.0001, -0.0220, -0.0304,
         0.0038,  0.0039,  0.0034, -0.0027,  0.0078, -0.014

In [19]:
target_users = np.unique(np.array(test_pos_edges).flatten())
rankings={}
with tqdm(total=len(target_users), position=0, leave=True) as pbar:
    for user in target_users:         
        rankings[user]=calc_rec(h,user)
        pbar.update()



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3837/3837 [3:23:47<00:00,  3.19s/it]


In [20]:
h2 = pipeline("TruncatedSVD",epoch=target_epoch)

In epoch 0, loss: 0.7797489762306213
In epoch 10, loss: 0.6444979310035706
In epoch 20, loss: 0.6294510364532471
In epoch 30, loss: 0.6225754618644714
In epoch 40, loss: 0.6175698637962341
In epoch 50, loss: 0.6125826239585876
In epoch 60, loss: 0.6074199676513672
In epoch 70, loss: 0.6032512784004211
In epoch 80, loss: 0.5992748737335205
In epoch 90, loss: 0.595876932144165
AUC 0.7861470156248194
Model's state_dict:
fc.weight 	 torch.Size([64, 64])
fc.bias 	 torch.Size([64])
Optimizer's state_dict:
state 	 {0: {'momentum_buffer': tensor([[ 6.4862e-03, -4.9031e-03, -7.0983e-03,  ..., -4.8888e-04,
         -6.7446e-05, -2.7017e-05],
        [ 3.2969e-03, -1.9973e-02, -4.0480e-02,  ..., -5.2377e-04,
         -6.8156e-04, -9.8458e-04],
        [ 3.6111e-03,  4.3694e-03, -4.1244e-02,  ...,  9.5605e-04,
          7.9940e-05, -3.7967e-05],
        ...,
        [ 1.6379e-02, -4.4449e-03,  6.0847e-03,  ..., -2.6447e-04,
         -1.4034e-04, -2.0221e-04],
        [ 6.9915e-04, -5.4450e-03, -2.

In [21]:
rankings2={}
with tqdm(total=len(target_users), position=0, leave=True) as pbar:
    for user in target_users:              
        rankings2[user]=calc_rec(h2,user)
        pbar.update()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3837/3837 [3:23:06<00:00,  3.18s/it]


In [22]:
a=pd.DataFrame(rankings)
#all users
metrics(a,k=50)

Hits@50: 0.008626531144123015
NDCG@50: 0.07611818345241139
MRR: 0.03178743088129017


(0.008626531144123015, 0.07611818345241139, 0.03178743088129017)

In [23]:
metrics(a,k=5)

Hits@5: 0.008965337503257753
NDCG@5: 0.025150078913555276
MRR: 0.01924246373034489


(0.008965337503257753, 0.025150078913555276, 0.01924246373034489)

In [24]:
a.to_csv('./model_results/dot_product_GCN_'+ str(target_epoch)+'_fb.csv', index=False) 
a.to_pickle('./model_results/dot_product_GCN_'+ str(target_epoch)+'_fb.pkl')
torch.save(h, './model_results/node_embeddings_GCN_'+ str(target_epoch)+'_fb.pkl')

In [25]:
a2=pd.DataFrame(rankings2)
#all users
metrics(a2,k=50)

Hits@50: 0.0041907740422204845
NDCG@50: 0.04343913712820476
MRR: 0.0183727636731807


(0.0041907740422204845, 0.04343913712820476, 0.0183727636731807)

In [26]:
metrics(a2,k=5)

Hits@5: 0.004743289027886371
NDCG@5: 0.014322243020868932
MRR: 0.011358700373555729


(0.004743289027886371, 0.014322243020868932, 0.011358700373555729)

In [27]:
a2.to_csv('./model_results/dot_product_TruncatedSVD_'+ str(target_epoch)+'_fb.csv', index=False) 
a2.to_pickle('./model_results/dot_product_TruncatedSVD_'+ str(target_epoch)+'_fb.pkl')
torch.save(h2, './model_results/node_embeddings_TruncatedSVD_'+ str(target_epoch)+'_fb.pkl')

epoch= 100


|model|AUC|hits@5|NDCG@5| MRR  |hits@50|NDCG@50|MRR|
|-----|---|---|------|------|-------|-------|---|
| GCN |0.9316   |0.0090|0.0252|0.0318| 0.0086|0.0761|0.0318|
|tSVD |0.7861|0.0047|0.0143|0.0114| 0.0047|0.0143|0.0114 |

epoch= 1000


|model|AUC|hits@5|NDCG@5| MRR  |hits@50|NDCG@50|MRR|
|-----|---|---|------|------|-------|-------|---|
| GCN |0.9596|0.0096|0.0273|0.0212| 0.0095|0.0868 |0.036  |
|tSVD |0.8556|0.0043|0.0126|0.0097| 0.0048|0.0448 |0.017 |

In [28]:
# Load if unloaded and get recommendations
if 'h' not in globals() or 'h' not in locals():
    h=torch.load('./model_results/node_embeddings_GCN_'+ str(target_epoch)+'_fb.pkl')
if 'a' not in globals() or 'a' not in locals():
    a = pd.read_pickle('./model_results/dot_product_GCN_'+ str(target_epoch)+'_fb.pkl')
if 'h2' not in globals() or 'h2' not in locals():
    h=torch.load('./model_results/node_embeddings_TruncatedSVD_'+ str(target_epoch)+'_fb.pkl')
if 'a2' not in globals() or 'a2' not in locals():
    a = pd.read_pickle('./model_results/dot_product_TruncatedSVD_'+ str(target_epoch)+'_fb.pkl')

In [29]:
h.shape

torch.Size([4039, 64])

In [30]:
sc=generate_rec(h, user_id=40, k=5) #k only for visualization here

List of 5 suggested friends for user 40:
- User 208, score = 2.76556658744812, rel = 0
- User 28, score = 2.76164174079895, rel = 0
- User 130, score = 2.6085283756256104, rel = 0
- User 177, score = 2.603623390197754, rel = 0
- User 137, score = 2.583993911743164, rel = 0


In [31]:
sc2=generate_rec(h2, user_id=40, k=5)

List of 5 suggested friends for user 40:
- User 1940, score = 2.9889063835144043, rel = 0
- User 2173, score = 2.6871931552886963, rel = 0
- User 2048, score = 2.6488707065582275, rel = 0
- User 2501, score = 2.5042083263397217, rel = 0
- User 1976, score = 2.4675705432891846, rel = 0
