In [None]:
import sys
sys.path.append('../../')

from IPython.core.display import display
import numpy as np
import pandas as pd
import networkx as nx
from sortedcontainers import SortedDict

In [None]:
# import node embeddings
# The separator is a regular expression (a tab OR a single space). Only the
# python parsing engine supports regex separators, so it is requested
# explicitly instead of letting pandas fall back to it with a ParserWarning.
df = (
    pd.read_csv(
        "./facebook_alpha-0.1_beta-0.1_ws-10_neg-5_lr-0.025_icom-36_ind-36_k-10_ds-0.0.txt",
        sep="\t| ",
        header=None,
        engine="python",
    )
    # column 0 is named 'node' and becomes the index; the remaining columns
    # keep their integer names
    .rename(columns={0: 'node'})
    .set_index('node')
)

In [None]:
# import train (first CSV column is used as the index)
train = pd.read_csv(
    "./facebook_train.csv",
    index_col=0,
)
# import test (indexed by the (u, v) pair)
test = pd.read_csv(
    "./facebook_test.csv",
    index_col=['u', 'v'],
)

In [None]:
# import predicted labels
# The file has no header row; its first column is named 'label'.
labels = pd.read_csv("./labels_pred.txt", header=None).rename(columns={0: 'label'})
# Cast the column itself. Assigning `labels.astype(int)` (a whole DataFrame)
# to a single column only works while the file has exactly one column.
labels['label'] = labels['label'].astype(int)
# preview only; avoid dumping the entire frame into the notebook output
labels.head()

In [None]:
# Attach the predicted label to every node embedding.
# Any 'label' column left over from a previous run of this cell is dropped
# first, so re-running the cell does not fail with a column-overlap error.
# NOTE(review): the join aligns on index, i.e. the node id in `df` against the
# row number in `labels`. This assumes row i of labels_pred.txt belongs to
# node id i (not to the i-th row of the embedding file) -- TODO confirm.
df = df.drop(columns='label', errors='ignore').join(labels)

In [None]:
# Preview every table in turn: first rows, then summary statistics.
# Order: node embeddings, train df, test df.
for frame in (df, train, test):
    display(frame.head())
    display(frame.describe())

In [None]:
def emb_from_row(row):
    """Return the embedding stored in a df row as a flat numpy array.

    The 'label' entry is removed; all remaining values are kept in order.
    """
    features = row.drop('label')
    return features.to_numpy().flatten()

def emb_distance(emb1, emb2):
    """Euclidean (L2) distance between two node embeddings."""
    difference = emb1 - emb2
    return np.linalg.norm(difference)

In [None]:
# Build the networkx training graph: one edge per train row, connecting
# the node in column "u" with the node in column "v".
G_train = nx.from_pandas_edgelist(train, source="u", target="v")

In [None]:
# compute Top-N lists for test dataset
#
# For every distinct test user, rank the non-friends that share the user's
# predicted community by embedding distance (closest first).
#
# results[userId] = (top_list, distances) where
#   top_list  : SortedDict {(distance, node_id): node_id}; `.values()` yields
#               the candidates in ranking order
#   distances : dict {node_id: distance}
results = {}

# Level 0 of the test index is the user id. Each user is ranked only once,
# even when several test rows mention them.
test_users = test.index.get_level_values(0).unique()

for i, userId in enumerate(test_users):
    if i % 10 == 0:
        print(f"{i}/{len(test_users)}")

    # users without an embedding cannot be ranked
    if userId not in df.index:
        continue

    user_row = df.loc[userId]
    user_emb = emb_from_row(user_row)
    user_label = user_row.label

    # get user's non_friends in user's community
    users_in_community = df[df.label == user_label]
    # a user with no training edge is not a node of G_train; treat them as
    # having no friends instead of failing with a KeyError
    friends = list(G_train[userId]) if userId in G_train else []
    not_friends_in_community = users_in_community[
        ~users_in_community.index.isin(friends + [userId])
    ]

    # generate user's top_list from all not_friends
    # Drop the label column once and walk the raw embedding rows, rather than
    # building one Series per candidate with iterrows().
    candidate_embs = not_friends_in_community.drop(columns='label').to_numpy()
    distances = {}
    top_list = SortedDict()
    for node_id, node_emb in zip(not_friends_in_community.index, candidate_embs):
        dist = emb_distance(user_emb, node_emb)
        # Key on (distance, node_id): keying on the distance alone silently
        # overwrites candidates that are exactly equally far from the user.
        top_list[(dist, node_id)] = node_id
        distances[node_id] = dist

    results[userId] = top_list, distances

In [None]:
# Evaluate the Top-N hit rate for N = 1..MAX_N.
# Each user in `results` is scored against the first test friend listed for
# them in `test`; a hit means that friend is among the user's N closest
# candidates.
MAX_N = 19

# First test friend per user, resolved once. Running `test.query(...)` for
# every (N, user) pair repeats the same scan, and interpolating the id into
# the query string only works for numeric ids.
first_test_friend = {}
for test_u, test_v in test.index:
    first_test_friend.setdefault(test_u, test_v)

# 0-based rank of each user's test friend within their top MAX_N candidates,
# or None when the friend is not among them.
friend_ranks = []
for userId, (top_list, distances) in results.items():
    ranked = list(top_list.values()[:MAX_N])
    test_friend = first_test_friend[userId]
    friend_ranks.append(ranked.index(test_friend) if test_friend in ranked else None)

for N in range(1, MAX_N + 1):
    # the friend is in the Top-N list exactly when its 0-based rank is below N
    hits = sum(1 for rank in friend_ranks if rank is not None and rank < N)
    misses = len(friend_ranks) - hits

    print(f"{N}: {hits}/{hits+misses}")


1: 0/1169
2: 11/1169
3: 21/1169
4: 35/1169
5: 48/1169
6: 58/1169
7: 64/1169
8: 71/1169
9: 81/1169
10: 85/1169
