In [1]:
import itertools
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.nn as gnn
import numpy as np
import pandas as pd
from torch_geometric.data import DataLoader
from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset
from torch_geometric.utils import negative_sampling
from torch_geometric.nn import GCNConv 
from torch.optim.lr_scheduler import StepLR
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import roc_auc_score, ndcg_score
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sklearn import preprocessing

In [2]:
class TwitterDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(TwitterDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return ['twitter_combined.txt', 'twitter_features.csv']

    @property
    def processed_file_names(self):
        return ['twitter_data.pt']
    
    def download(self):
        # In this case, you're not downloading any data,
        # but you could add code to download your data here if it's not local
        pass

    def process(self):
        # Load the graph data
        edges = pd.read_csv(self.raw_paths[0], delimiter=' ', header=None)
        features = pd.read_csv(self.raw_paths[1],index_col=0)

        #encode features
        words_list = features['features'].str.split(' ')
        flattened_list = [word for sublist in words_list.dropna() for word in sublist]
        # Convert the list into a set to find all unique words
        unique_words = set(flattened_list)
        unique_words=[x for x in unique_words]
        #         unique_words.append('')
        le = preprocessing.LabelEncoder()
        targets = le.fit_transform(unique_words)
        encoded = dict(zip(unique_words, targets))

        # Create the graph
        edge_index = torch.tensor(edges.values, dtype=torch.long).t().contiguous()
        fdict={}
        for index, row in features.iterrows():
            if pd.isna(row['features']) or row['features'] == '':
                fdict[index]=tuple([])
            else:
                words= row['features']
                wlist=words.split(" ")
                fdict[index] = tuple([encoded[key] for key in wlist])
        max_length = max(len(value) for value in fdict.values())

        padded_fdict = {key: value + (0,)* (max_length - len(value))  for key, value in fdict.items()}

        x = torch.tensor(list(padded_fdict.values()),dtype=torch.long)

        data = Data(x=x, edge_index=edge_index)
        data.node_index = list(fdict.keys())

        data, slices = self.collate([data])
        torch.save((data, slices), self.processed_paths[0])
        
        
class GPlusDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(GPlusDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return ['gplus_combined.txt', 'gplus_features.txt']

    @property
    def processed_file_names(self):
        return ['gplus_data.pt']

    def download(self):
        # In this case, you're not downloading any data,
        # but you could add code to download your data here if it's not local
        pass

    def process(self):
        # Load the graph data
        edges = pd.read_csv(self.raw_paths[0], delimiter=' ', header=None)
        features = pd.read_csv(self.raw_paths[1], delimiter=' ', header=None)

        # Create the graph
        edge_index = torch.tensor(edges.values, dtype=torch.long).t().contiguous()
        x = torch.tensor(features.values, dtype=torch.float)
    
        data = Data(x=x, edge_index=edge_index)

        data, slices = self.collate([data])
        torch.save((data, slices), self.processed_paths[0])

In [3]:
dataset = TwitterDataset(root='.')
# dataset
data = dataset[0]

In [4]:
class FacebookDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(FacebookDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return ['facebook_combined.txt', 'facebook_features.txt']

    @property
    def processed_file_names(self):
        return ['data.pt']

    def download(self):
        # In this case, you're not downloading any data,
        # but you could add code to download your data here if it's not local
        pass

    def process(self):
        # Load the graph data
        edges = pd.read_csv(self.raw_paths[0], delimiter=' ', header=None)
        features = pd.read_csv(self.raw_paths[1], delimiter=' ', header=None)

        # Create the graph
        edge_index = torch.tensor(edges.values, dtype=torch.long).t().contiguous()
        x = torch.tensor(features.values, dtype=torch.float)
    
        data = Data(x=x, edge_index=edge_index)

        data, slices = self.collate([data])
        torch.save((data, slices), self.processed_paths[0])

In [5]:
# dataset = FacebookDataset(root='.')
# data = dataset[0]

In [6]:
data

Data(x=[81306, 194], edge_index=[2, 2420766], node_index=[81306])

In [7]:
#Calculate Train Positives
# Define the percentage of edges to remove
test_percentage = 0.3

# Calculate the number of edges to remove
num_edges_to_remove = int(data.edge_index.shape[1] * test_percentage)

# Shuffle the edges
edge_indices = np.arange(data.edge_index.shape[1])
np.random.shuffle(edge_indices)

# Select the edges to keep
edges_to_keep = edge_indices[num_edges_to_remove:]

# Create a new graph with only the edges to keep(Train positives)
data_prime = Data(x=data.x, edge_index=data.edge_index[:, edges_to_keep])

In [8]:
#train pos 
train_pos=data_prime

In [9]:
#TEST POSITIVES
# Select the edges to remove (these will be your test positives)
edges_to_remove = edge_indices[:num_edges_to_remove]

# Create a new graph with only the edges to remove (Test positives)
test_pos = Data(x=data.x, edge_index=data.edge_index[:, edges_to_remove])

In [10]:
#Calculate Train negatives
positives_edges=data_prime.edge_index
# Number of negative samples to generate
num_neg_samples = data_prime.edge_index.size(1)
# Generate negative samples
negative_edge_index = negative_sampling(edge_index=data.edge_index, num_nodes=data.num_nodes, num_neg_samples=num_neg_samples)
#Train negatives
train_neg = Data(x=data.x, edge_index=negative_edge_index)

In [11]:
# TEST negatives
# Number of negative samples to generate for testing
num_test_neg_samples = test_pos.edge_index.size(1)

# Generate negative samples for testing
test_negative_edge_index = negative_sampling(edge_index=data.edge_index, num_nodes=data.num_nodes, num_neg_samples=num_test_neg_samples)

# Convert tensors to lists of tuples
train_pos_edges = [tuple(edge) for edge in train_pos.edge_index.t().tolist()]
test_pos_edges = [tuple(edge) for edge in test_pos.edge_index.t().tolist()]
test_negative_edges = [tuple(edge) for edge in test_negative_edge_index.t().tolist()]

#splitted box

In [12]:
len(test_pos_edges)

726229

In [13]:
len(train_pos_edges)

1694537

In [14]:
len(test_negative_edges)

726229

In [15]:
qqqqa=train_pos_edges+test_pos_edges

In [None]:
test_neg_edge_index = []
with tqdm(total=len(test_negative_edges), position=0, leave=True) as pbar:
    for edge in test_negative_edges:
        if edge not in qqqqa:
            test_neg_edge_index.append(edge)
        pbar.update()

# Create the test_neg set
test_neg = Data(x=data.x, edge_index=torch.tensor(test_neg_edge_index).t().contiguous())

In [18]:
#invers
cnter = []
with tqdm(total=int(len(qqqqa)/1000)+1, position=0, leave=True) as pbar:
    for batch in range(int(len(qqqqa)/1000)+1):
        if(batch+1000>len(qqqqa)):
            xd=qqqqa[batch*1000:]
        else:
            xd=qqqqa[batch*1000:batch*1000+1000]
        for edge in xd:
            if edge in test_negative_edges:
                cnter.append(edge)
        pbar.update()

# Create the test_neg set
test_neg = Data(x=data.x, edge_index=torch.tensor(test_negative_edges).t().contiguous())

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2421/2421 [9:39:06<00:00, 14.35s/it]


In [28]:
test_neg.validate()

True

In [None]:
# import concurrent.futures
# qqqqa=train_pos_edges+test_pos_edges
# # Function to check if an edge meets the condition
# def check_edge(edge):
#     if edge not in qqqqa:
#         return edge
#     return None

# # Initialize list to store results
# test_neg_edge_index = []

# # Submit tasks to ThreadPoolExecutor
# with tqdm(total=len(test_negative_edges), position=0, leave=True) as pbar:
#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         futures = [executor.submit(check_edge, edge) for edge in test_negative_edges]
#         pbar.update()

#     # Iterate over completed futures
#         for future in concurrent.futures.as_completed(futures):
#             result = future.result()
#             if result is not None:
#                 test_neg_edge_index.append(result)
            

In [None]:
# import concurrent.futures

# qqqqa=train_pos_edges+test_pos_edges
# # Function to check if an edge meets the condition
# def check_edge(edge):
#     if edge not in qqqqa:
#         return edge
#     return None

# # Initialize list to store results
# test_neg_edge_index = []

# # Submit tasks to ThreadPoolExecutor using map function
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     results = executor.map(check_edge, test_negative_edges)

#     # Iterate over results and update progress bar
#     with tqdm(total=len(test_negative_edges), position=0, leave=True) as pbar:
#         for result in results:
#             if result is not None:
#                 test_neg_edge_index.append(result)
#             pbar.update()


In [None]:
# Ensure that these negative samples are not in the train_pos or test_pos sets
test_neg_edge_index = [edge for edge in test_negative_edges if edge not in train_pos_edges and edge not in test_pos_edges]

# Create the test_neg set
test_neg = Data(x=data.x, edge_index=torch.tensor(test_neg_edge_index).t().contiguous())

In [20]:
class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_size):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_size)
        self.conv2 = GCNConv(hidden_size, hidden_size)

    def forward(self, edge_index, x):
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return x

In [21]:
class TruncatedSVDModel(torch.nn.Module):
    def __init__(self, num_features, output_size):
        super(TruncatedSVDModel, self).__init__()
        self.svd = TruncatedSVD(n_components=output_size)
        self.fc = torch.nn.Linear(output_size, output_size)

    def forward(self, x):
        x_svd = self.svd.fit_transform(x.numpy())
        x_svd = torch.tensor(x_svd, dtype=torch.float)
        x_out = self.fc(x_svd)
        return x_out

In [22]:
class DotPredictor(torch.nn.Module):
    def forward(self, edge_index, h):
        # Get the embeddings of the source nodes and destination nodes
        source_node_embeddings = h[edge_index[0]]
        destination_node_embeddings = h[edge_index[1]]

        # Compute the dot product (score) between source and destination node embeddings
        scores = (source_node_embeddings * destination_node_embeddings).sum(dim=-1)

        return scores

In [23]:
def pipeline(model_name='GCN', hidden_size=64, epoch=100):
    
    def compute_loss(pos_score, neg_score):  # computes the loss based on binary cross entropy
        scores = torch.cat([pos_score, neg_score])
        labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
        return F.binary_cross_entropy_with_logits(scores, labels)

    def compute_auc(pos_score, neg_score):  # computes AUC (Area-Under-Curve) score
        scores = torch.cat([pos_score, neg_score]).numpy()
        labels = torch.cat(
            [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
        return roc_auc_score(labels, scores)
       
    # hidden_size is the size of the hidden layer in the neural net
    if model_name == 'GCN':
        model = GCN(data_prime.num_features, hidden_size)
    elif model_name == 'TruncatedSVD':
        model = TruncatedSVDModel(data_prime.num_features, hidden_size)
        
    pred = DotPredictor()
    optimizer = torch.optim.SGD(itertools.chain(model.parameters(), pred.parameters()), lr=0.01, momentum=0.9)
    # Use a learning rate scheduler
    scheduler = StepLR(optimizer, step_size=500, gamma=0.5)
 

        # ----------- training -------------------------------- #
    train_g = data_prime
    for e in range(epoch):
        if model_name == 'GCN':
            h = model(train_g.edge_index, train_g.x)
        elif model_name == 'TruncatedSVD':
            h = model(train_g.x)  # get node embeddings

        # forward    
        pos_score = pred(train_pos.edge_index, h)
        neg_score = pred(train_neg.edge_index, h)
        loss = compute_loss(pos_score, neg_score)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        if e % 10 == 0:
            print('In epoch {}, loss: {}'.format(e, loss))

    # ----------- test and check results ---------------- #
    with torch.no_grad():
        pos_score = pred(test_pos.edge_index, h)
        neg_score = pred(test_neg.edge_index, h)
        auc=compute_auc(pos_score, neg_score)
        print('AUC', auc)  
               
    # Print model's state_dict
    print("Model's state_dict:")
    for param_tensor in model.state_dict():
        print(param_tensor, "\t", model.state_dict()[param_tensor].size())

    # Print optimizer's state_dict
    print("Optimizer's state_dict:")
    for var_name in optimizer.state_dict():
        print(var_name, "\t", optimizer.state_dict()[var_name])

    # Print scheduler's state_dict
    print("scheduler's state_dict:")
    for var_name in scheduler.state_dict():
        print(var_name, "\t", scheduler.state_dict()[var_name])
    
    torch.save({
            'epoch': epoch,        
            'epoch_rem': e,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss': loss,
            
            }, './torch_model/model_'+model_name+'_'+str(epoch)+'_'+'twitter'+'.pt')
        
    return h  # return node embeddings

In [24]:
def generate_rec(h, user_id=0, k=10):
    # `h` represents the node embeddings, with shape [num_nodes, hidden_size]

    # generate a graph with (num_nodes - num_friends_of_user) edges
    # one end of the edge is user_id
    # the other end is a user that's NOT friends with user_id
    user_friends = set()
    user_neg_u, user_neg_v = [], []

    for n1, n2 in data.edge_index.t().tolist():   # get all friends of user_id
        if int(n1) == user_id:
            user_friends.add(int(n2))
        if int(n2) == user_id:
            user_friends.add(int(n1))

    num_nodes=data.x.shape[0]
    for i in range(num_nodes):  # generate "negative edges" for user_id
        if i != user_id and i not in user_friends:
            user_neg_u.append(user_id)
            user_neg_v.append(i)

    user_g = Data(x=data.x,edge_index=torch.tensor([user_neg_u, user_neg_v] ))

    pred = DotPredictor()

    # calculate the score of each user
    scores = []
    for i, score in enumerate(pred(user_g.edge_index, h)):
        rel=1 if ((user_id,i) in test_pos_edges) else 0
        scores.append((i, score,rel))

    # produce final ranked list
    scores.sort(key=lambda x: -x[1])

    # display results
    
    if (k !=0):
        print(f"List of 5 suggested friends for user {user_id}:")
    for i in range(k):
        print(f'- User {scores[i][0]}, score = {scores[i][1]}, rel = {scores[i][2]}')
    return scores[:10]


In [25]:
def calc_rec(h, user_id=0, k=10):
    user_friends = set()
    user_neg_u, user_neg_v = [], []

    for n1, n2 in data.edge_index.t().tolist():   # get all friends of user_id
        if int(n1) == user_id:
            user_friends.add(int(n2))
        if int(n2) == user_id:
            user_friends.add(int(n1))

    num_nodes=data.x.shape[0]
    for i in range(num_nodes):  # generate "negative edges" for user_id
        if i != user_id and i not in user_friends:
            user_neg_u.append(user_id)
            user_neg_v.append(i)

    user_g = Data(x=data.x,edge_index=torch.tensor([user_neg_u, user_neg_v] ))

    pred = DotPredictor()

    # calculate the score of each user
    scores = []
    for i, score in enumerate(pred(user_g.edge_index, h)):
        rel=1 if ((user_id,i) in test_pos_edges) else 0
        scores.append((i, score,rel))

    # produce final ranked list
    scores.sort(key=lambda x: -x[1])
    rel=[x[2] for x in scores]
    # display results
    
    return rel[:500]

In [26]:
def metrics(a,k=50):
    top_k = a[:k]
    hits=pd.DataFrame(0,index=[0],columns=top_k.columns)
    recipr=pd.DataFrame(0,index=[0],columns=top_k.columns)
    dcg=pd.DataFrame(0,index=[0],columns=top_k.columns)
    idcg=pd.DataFrame(0,index=[0],columns=top_k.columns)
    for i in list(top_k.columns):
        hits[i]=(top_k[i].sum()/k)
        recipr[i]=top_k.index[top_k[i] == 1].min()
        dcg[i]=0
        idcg[i]=0
        top_sort=top_k[i]
        top_sort=top_sort.sort_values(ascending=False)
        for j in range(0,k):
            dcg[i]+=top_k[i].iloc[j]/np.log2(j+1+1)
            idcg[i]+=top_sort.iloc[j]/np.log2(j+1+1)

    recipr=recipr.replace(np.nan,-1) +1
    recipr = recipr.loc[:, (recipr != 0).any()]
    ndcg=dcg/idcg
    ndcg=ndcg.T.replace(np.nan,0)

    ndcg_k=np.mean(ndcg)
    mrr=(1/recipr.T).sum()[0]/len(a.columns)
    hits_at_k=np.mean(hits)

    print('Hits@'+str(k)+':', hits_at_k)
    print('NDCG@'+str(k)+':', ndcg_k)
    print('MRR:', mrr)
    
    return hits_at_k, ndcg_k, mrr


In [27]:
h = pipeline("GCN",epoch=1000)

RuntimeError: index 1702611 is out of bounds for dimension 0 with size 81306

In [None]:
target_users = np.unique(np.array(test_pos_edges).flatten())
rankings={}
with tqdm(total=len(target_users), position=0, leave=True) as pbar:
    for user in target_users:         
        rankings[user]=calc_rec(h,user)
        pbar.update()



In [29]:
h2 = pipeline("TruncatedSVD",epoch=1000)

IndexError: index 14955456 is out of bounds for dimension 0 with size 81306

In [None]:
rankings2={}
with tqdm(total=len(target_users), position=0, leave=True) as pbar:
    for user in target_users:              
        rankings2[user]=calc_rec(h2,user)
        pbar.update()

In [None]:
a=pd.DataFrame(rankings)
#all users
metrics(a,k=50)

In [None]:
metrics(a,k=5)

In [None]:
a.to_csv('dot_product_gcn_all'+'_'+'twitter'+'.csv', index=False) 
a.to_pickle('dot_product_gcn_all'+'_'+'twitter'+'.pkl')
torch.save(h, 'node_embeddings_gcn'+'_'+'twitter'+'.pt')

In [None]:
a2=pd.DataFrame(rankings2)
#all users
metrics(a2,k=50)

In [None]:
metrics(a2,k=5)

In [None]:
a2.to_csv('dot_product_TruncatedSVD_all'+'_'+'twitter'+'.csv', index=False) 
a2.to_pickle('dot_product_TruncatedSVD_all'+'_'+'twitter'+'.pkl')
torch.save(h2, 'node_embeddings_TruncatedSVD'+'_'+'twitter'+'.pt')

|model|AUC|hits@5|NDCG@5| MRR  |hits@50|NDCG@50|MRR|
|-----|---|---|------|------|-------|-------|---|
| GCN |0.9596|0.0096|0.0273|0.0212| 0.0095|0.0868 |0.036  |
|tSVD |0.8556|0.0043|0.0126|0.0097| 0.0048|0.0448 |0.017 |

In [None]:
# Load if unloaded and get recommendations
if 'h' not in globals() or 'h' not in locals():
    h=torch.load('node_embeddings_gcn.pt')
if 'a' not in globals() or 'a' not in locals():
    a = pd.read_pickle('dot_product_gcn_all.pkl')
if 'h2' not in globals() or 'h2' not in locals():
    h=torch.load('node_embeddings_TruncatedSVD.pt')
if 'a2' not in globals() or 'a2' not in locals():
    a = pd.read_pickle('dot_product_TruncatedSVD_all.pkl')

In [None]:
sc=generate_rec(h, user_id=40, k=5) #k only for visualization here

In [None]:
sc2=generate_rec(h2, user_id=40, k=5)