In [None]:
import sys
sys.path.append('../')

from IPython.core.display import display
import numpy as np
import pandas as pd
from sortedcontainers import SortedDict

In [None]:
# import node embeddings
# The file has no header row; column 0 is renamed to 'node' and becomes the
# index, and the remaining columns are what the later cells treat as the
# embedding vector of that node.
# Fields are separated by either a tab or a single space, hence the regex
# separator. Regex separators are only supported by pandas' python parser, so
# it is requested explicitly instead of relying on the implicit fallback
# (which emits a ParserWarning on every run).
df = (
    pd.read_csv(
        "./movie_ratings/movie_ratings_alpha-0.1_beta-0.1_ws-10_neg-5_lr-0.025_icom-107_ind-107_k-300_ds-0.0.txt",
        sep="\t| ",
        header=None,
        engine="python",
    )
    .rename(columns={0: 'node'})
    .set_index(['node'])
)

In [None]:
# Load the ratings table and key it by the (userId, movieId) pair.
ratings = pd.read_csv("movie_ratings/movie_ratings_unique.csv").set_index(['userId', 'movieId'])

# Load the test table with the same key. The first CSV column is read as the
# row index by read_csv and is then replaced by the (userId, movieId) index.
test = pd.read_csv("movie_ratings/movie_ratings_test.csv", index_col=0).set_index(['userId', 'movieId'])

In [None]:
# Preview the node-embedding table: its first rows, then per-column summary statistics.
display(df.head(), df.describe())

In [None]:
# Preview the first rows of the ratings table, then of the test table.
display(ratings.head(), test.head())

In [None]:
# Drop every (userId, movieId) pair that also appears in the test table from
# the ratings, reporting the table sizes before and after the removal.
print("len(ratings): ", len(ratings))
print("len(test): ", len(test))
is_test_edge = ratings.index.isin(test.index)
ratings = ratings.loc[~is_test_edge]
print("len(ratings): ", len(ratings))

In [None]:
def emb_from_row(row):
    """Return the values of a DataFrame row as a flat (1-D) numpy array.

    `row` is any pandas object exposing `to_numpy()`; the result is a
    flattened copy of its values.
    """
    values = row.to_numpy()
    return values.flatten()

def emb_distance(emb1, emb2):
    """Return the euclidean distance between two node embeddings.

    The distance is the norm (as computed by `np.linalg.norm`) of the
    element-wise difference `emb1 - emb2`.
    """
    difference = emb1 - emb2
    return np.linalg.norm(difference)

In [None]:
# Build the movie-embedding table: every node of `df` whose index label is not
# one of the userIds present in `ratings` is kept as a movie.
userIds = ratings.index.get_level_values(level="userId").unique()
is_user_node = df.index.isin(userIds)
df_movies = df.loc[~is_user_node]

print("len(df): ", len(df))
print("len(userIds): ", len(userIds))
print("len(df_movies): ", len(df_movies))

In [None]:
# compute Top-N candidate lists for the test dataset
# `results` maps userId -> (top_list, distances) where
#   top_list  : SortedDict {distance: movieId}, iterated by ascending distance
#   distances : dict {movieId: distance}
results = {}
# A user can own several test rows but the ranking is the same for all of
# them, so each distinct userId is processed exactly once.
for userId in test.index.get_level_values("userId").unique():
    # `df` is indexed by node label, so the user's embedding has to be fetched
    # by label. A positional `.iloc[userId]` only returns the right row when
    # the embedding file happens to be sorted by node id starting at 0.
    # A user without an embedding raises KeyError here instead of silently
    # ranking against another node's vector.
    user_emb = emb_from_row(df.loc[userId])

    # get user's unwatched movies (assumes the test edges were already removed
    # from `ratings`, so the held-out movie stays among the candidates)
    movies_watched = ratings.query(f"userId == {userId}").index.get_level_values("movieId")
    df_movies_unwatched = df_movies[~df_movies.index.isin(movies_watched)]

    # euclidean distance from the user to every unwatched movie in a single
    # vectorised call instead of one np.linalg.norm call per DataFrame row
    movie_ids = df_movies_unwatched.index
    dists = np.linalg.norm(df_movies_unwatched.to_numpy() - user_emb, axis=1)

    # generate user's top_list from all unwatched movies
    # NOTE: top_list is keyed by distance, so two movies at exactly the same
    # distance share a key and only the last one is kept; `distances` always
    # holds every movie.
    top_list = SortedDict(zip(dists, movie_ids))
    distances = dict(zip(movie_ids, dists))

    results[userId] = top_list, distances
    if userId % 10 == 0: print(f"{userId}/{len(test)}")

In [None]:
# Hit-rate evaluation: for every user in `results`, check whether the user's
# test movie appears among the N closest unwatched movies.
N = 20

# counters
hits = 0
misses = 0

# `results` maps userId -> (top_list, distances), so iterate over its items.
# enumerate(results) would yield (position, userId) pairs instead, and
# unpacking the userId into (top_list, distances) raises a TypeError.
for userId, (top_list, distances) in results.items():
    # get Top-N movies to watch (top_list is ordered by ascending distance)
    top_N = top_list.values()[:N]

    # get user's test movie (the first one, if the user has several test rows)
    test_movie = test.query(f"userId == {userId}").index.get_level_values('movieId')[0]

    # evaluate
    hit = test_movie in top_N
    if hit:
        hits += 1
    else:
        misses += 1

    print(f"Movie {test_movie} {'found' if hit else 'not found'} in user {userId}'s Top-{N} list: ", top_N)


In [None]:
# Report the final hit / miss counters of the Top-N evaluation.
for label, count in (("hits: ", hits), ("misses: ", misses)):
    print(label, count)
