In [80]:
import sys
sys.path.append('../../')

from IPython.core.display import display
import numpy as np
import pandas as pd
import networkx as nx
from sortedcontainers import SortedDict
import matplotlib.pyplot as plt
import seaborn as sns
from  matplotlib.ticker import PercentFormatter
import time

In [81]:
# settings

dim = 4

In [82]:
# import node embeddings
df = pd.read_csv(
    f"./facebook_alpha-0.1_beta-0.1_ws-10_neg-5_lr-0.025_icom-36_ind-36_ds-0.0_d-{dim}_type-BGMM_k-10.txt",
    sep="\t| ",
    header=None
)
df = df.rename(columns={0: 'node'})
df.set_index(['node'], inplace=True)

  """


In [83]:
# import train
train = pd.read_csv("./facebook_train.csv", index_col=0)
# import test
test = pd.read_csv("./facebook_test.csv", index_col=['u', 'v'])

In [84]:
# import predicted labels
labels = pd.read_csv(f"./labels_pred_BGMM_d{dim}_k10.txt", header=None)
labels = labels.rename(columns={0: 'label'})
labels.label = labels.astype(int)
labels

Unnamed: 0,label
0,8
1,8
2,8
3,8
4,8
...,...
3959,9
3960,9
3961,9
3962,9


In [85]:
df = df.join(labels)

In [86]:
# node embeddings
display(df.head())
display(df.describe())

# train df
display(train.head())
display(train.describe())

# test df
display(test.head())
display(test.describe())

Unnamed: 0_level_0,1,2,3,4,label
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,-5.791565,-2.125865,-0.705079,4.920946,8.0
1,-4.867615,-1.492043,-0.698155,3.644008,8.0
2,-3.79384,-1.553433,-0.527849,4.136863,8.0
3,-5.121711,-1.41578,-0.662337,3.322037,8.0
4,-3.916894,-1.559996,-0.560903,4.01478,8.0


Unnamed: 0,1,2,3,4,label
count,3964.0,3964.0,3964.0,3964.0,3897.0
mean,-1.203034,0.838945,1.831198,1.236076,3.586092
std,3.005514,2.817995,2.120366,2.794049,2.512871
min,-8.352823,-5.690406,-2.69433,-4.385391,0.0
25%,-3.310163,-1.085476,0.399157,-0.479998,2.0
50%,-0.766376,0.929176,1.941858,1.598302,3.0
75%,0.856466,3.045442,3.527953,3.122448,5.0
max,4.269142,10.501018,6.011499,7.314298,9.0


Unnamed: 0,u,v
0,0,1
1,0,2
2,0,3
3,0,4
4,0,5


Unnamed: 0,u,v
count,86999.0,86999.0
mean,1865.069449,2154.50742
std,910.682085,894.143924
min,0.0,1.0
25%,1160.0,1601.0
50%,1983.0,2259.0
75%,2423.0,2631.0
max,4031.0,4038.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0
u,v,Unnamed: 2_level_1
0,276,0
9,21,1
21,281,2
25,141,3
26,9,4


Unnamed: 0.1,Unnamed: 0
count,1169.0
mean,584.0
std,337.605539
min,0.0
25%,292.0
50%,584.0
75%,876.0
max,1168.0


In [87]:
def emb_from_row(row):
    """get node embedding np array from df row"""
    return row.drop('label').to_numpy().flatten()

def emb_distance(emb1, emb2):
    """computes the euclidean distance between two node embeddings"""
    return np.linalg.norm(emb1-emb2)

In [88]:
# get nx train graph
G_train = nx.from_pandas_edgelist(train, "u", "v")

In [89]:
# compute Top-N lists for test dataset
# for each testing row
results = {}
results_com = {}
i = 1
start_time = time.time()
for index, _ in test.iterrows():
    userId = index[0]
    if not userId in df.index: continue
    user_emb = emb_from_row(df.loc[userId])
    user_label = df.loc[userId].label

    # get user's non_friends in user's community
    # users_in_community
    users_in_community = df[df.label == user_label]
    #print("len(users_in_community): ", len(users_in_community))
    # friends
    friends = list(G_train[userId])
    #print("len(friends): ", len(friends))
    # not_friends
    not_friends = df[~df.index.isin(friends+[userId])]
    #print("len(not_friends): ", len(not_friends))
    # not_friends_in_community
    not_friends_in_community = users_in_community[~users_in_community.index.isin(friends+[userId])]
    #print("len(not_friends_in_community): ", len(not_friends_in_community))

    # generate user's top_list from all not_friends
    distances = {}
    top_list = SortedDict()
    for movie_id, node_row in not_friends.iterrows():
        node_emb = emb_from_row(node_row)
        dist = emb_distance(user_emb, node_emb)
        top_list[dist] = movie_id
        distances[movie_id] = dist
    results[userId] = top_list, distances

    # generate user's top_list from all not_friends_in_community
    distances_com = {}
    top_list_com = SortedDict()
    for movie_id, node_row in not_friends_in_community.iterrows():
        node_emb = emb_from_row(node_row)
        dist = emb_distance(user_emb, node_emb)
        top_list_com[dist] = movie_id
        distances_com[movie_id] = dist
    results_com[userId] = top_list_com, distances_com

    if i % 10 == 0:
        elapsed_sec = (time.time() - start_time)
        elapsed_min = elapsed_sec / 60
        print(f"{i}/{len(test)} in {elapsed_min:.2f} min")
        print(f"  => done at {time.ctime(time.time() + ((len(test)-i)/i)*elapsed_sec)}")
    i += 1

KeyboardInterrupt: 

In [93]:
N = 100
hit_rates = []
hit_rates_com = []

for n in range(1, N + 1):

    # counters
    hits = 0
    misses = 0
    hits_com = 0
    misses_com = 0

    for userId in results.keys():
        top_list, _ = results[userId]
        top_list_com, _ = results_com[userId]
        print("top_list:", top_list)
        # get Top-N friend suggestions
        top_n = top_list.values()[:n]
        top_n_com = top_list_com.values()[:n]
        # get user's test friend
        test_friend = test.query(f"u == {userId}").index.get_level_values('v')[0]
        # evaluate
        hit = test_friend in top_n
        if hit:
            hits += 1
        else:
            misses += 1
        hit_com = test_friend in top_n_com
        if hit_com:
            hits_com += 1
        else:
            misses_com += 1

        #print(f"Friend suggestion {test_friend} {'found' if hit else 'not found'} in user {userId}'s Top-{N} list: ", top_N)

    hit_rate = hits / (hits+misses)
    hit_rate_com = hits_com / (hits_com+misses_com)
    print(f"{n}: {hits}/{hits+misses} = {hit_rate}")
    print(f"{n}: {hits_com}/{hits_com+misses_com} = {hit_rate_com} (com)")

    hit_rates.append(hit_rate)
    hit_rates_com.append(hit_rate_com)

top_list: SortedDict({1.1682671169300323: 67, 1.8703326155717894: 276, 2.5298012819831164: 2576, 2.5367661048017265: 2311, 2.537418779892048: 2258, 2.5491455698690926: 2613, 2.602739367066349: 1942, 2.6237919393405824: 2536, 2.6340075262627347: 2476, 2.6405553802275215: 2021, 2.6472145298883674: 2147, 2.681775857329454: 2092, 2.700214011941313: 2287, 2.7147565189057223: 2387, 2.7233972350474858: 2391, 2.725640460796333: 2067, 2.740920502162846: 2449, 2.764239010907861: 2393, 2.8040768231613433: 2055, 2.8128311130053634: 2136, 2.8247922163791332: 2234, 2.8299641645229423: 2303, 2.8568528945726603: 2606, 2.8585697380940585: 2044, 2.8777993306098706: 2058, 2.8881537878226817: 2383, 2.9670324565874906: 2350, 2.9829579189962954: 1963, 2.986939627296643: 1989, 2.9933752568804306: 2429, 3.0141306404472314: 2353, 3.0251635278700317: 2407, 3.052656869033264: 2462, 3.0605309612305427: 2060, 3.0846150745264573: 2478, 3.0910155711396334: 2392, 3.093688370305632: 2499, 3.0987305625697985: 2179, 3.1

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# plot hit-rates

Ns = np.arange(N)

hit_rate_1_df = pd.DataFrame(
    zip(Ns, hit_rates, ['Algorithm 1' for _ in range(N)]),
    columns=['N', 'hit_rate', 'Algorithm']
)
hit_rate_2_df = pd.DataFrame(
    zip(Ns, hit_rates_com, ['Algorithm 2' for _ in range(N)]),
    columns=['N', 'hit_rate', 'Algorithm']
)
hit_rate_df = pd.concat([hit_rate_1_df, hit_rate_2_df])

sns.set_style("darkgrid")

# hit rates plot

ax = sns.lineplot(x='N', y='hit_rate', hue='Algorithm', data=hit_rate_df)
ax.set(xlabel='N', ylabel='hit rate', ylim=(0, 1))
ax.yaxis.set_major_formatter(PercentFormatter(1))

fig = ax.get_figure()
fig.savefig(f"./hit_rates_d-{dim}.png", format='png', dpi=1000)

In [None]:
# hit rates plot on log scale

ax = sns.lineplot(x='N', y='hit_rate', hue='Algorithm', data=hit_rate_df)
ax.set(xlabel='N', ylabel='hit rate', ylim=(0, 1), xscale="log")
ax.yaxis.set_major_formatter(PercentFormatter(1))

fig = ax.get_figure()
fig.savefig(f"./hit_rates_d-{dim}_log.png", format='png', dpi=1000)
