In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Dataset locations, all relative to this notebook.
data_base_dir = '../../../datasets/Movielens/'
data_dir = data_base_dir + 'serendipity-sac2018/'
data_dir2 = data_base_dir + 'Movielens Latest/ml-latest/'

# Directory for anything this notebook writes out.
output_dir = data_dir + 'output4/'

# Survey tables.
answers = data_dir + 'answers.csv'
recommendations = data_dir + 'recommendations.csv'

# Catalogue, ratings and tag tables.
movies = data_dir + 'movies.csv'
training = data_dir + 'training.csv'
tags = data_dir + 'tags.csv'

# Tag-genome relevance scores. An earlier assignment to 'tag_genome.csv' was
# immediately overwritten by this path, so only this one is kept.
# genome_tags = data_dir + 'genome-tags.csv'
genome_scores = data_dir + 'mlLatestgenome-scores.csv'


In [3]:
# Raw survey tables.
movies_df = pd.read_csv(movies)
answers_df = pd.read_csv(answers)
recommendations_df = pd.read_csv(recommendations)

# Users of interest: everyone appearing in the recommendations table, plus
# every user whose answers table holds exactly five rows.
count_df = answers_df.groupby('userId').count()
all_user_ids = np.union1d(
    recommendations_df['userId'].unique(),
    count_df[count_df['movieId'] == 5].index.values,
)

# Tag-genome matrix: one row per movieId, one column per tagId, values = relevance.
genome_scores_df = pd.read_csv(genome_scores).pivot(
    index='movieId', columns='tagId', values='relevance')
tag_genome_movies = genome_scores_df.index.values

# Keep only ratings made by the selected users on movies that have
# tag-genome scores.
ratings_df = pd.read_csv(training).loc[
    lambda frame: frame['userId'].isin(all_user_ids)
    & frame['movieId'].isin(tag_genome_movies)
]
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
269,112406,3930,3.5,1515208093000
506,108188,58025,4.0,1515204044000
509,108188,60072,4.0,1515204013000
511,108188,8665,4.5,1515203993000
512,108188,105593,5.0,1515203943000
515,123335,2918,4.0,1515203131000
516,123335,178061,1.0,1515203100000
655,199760,177651,3.5,1515200288000
767,115169,97938,4.0,1515196492000
946,199760,5378,2.0,1515194332000


In [4]:
# Hand-picked (user, recommendation list) pairs used as worked examples below.
# `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the builtin `int`
# is the exact replacement (np.int was just an alias for it).
user1 = 100200
recommendations1 = np.array([780.0, 47.0, 317.0, 909.0, 673.0, 158.0, 455.0, 608.0]).astype(int)

user_a = 100200
recommendations_a = np.array([8608, 65660, 77810, 100306, 5473, 5806, 1156, 95604]).astype(int)

user2 = 100269

recommendations2 = np.array([47.0, 16.0, 913.0, 942.0, 928.0, 608.0, 111.0, 930.0]).astype(int)

# Item unexpectedness

We adopt the idea of measuring an item’s unexpectedness (or surprise) as its
distance from the set of expected items. Furthermore, we follow the idea of
Nakatsuji et al. to measure an item’s surprise as the minimum distance from the
user’s profile items, and we hypothesize that, by contrast, averaging the
distances between items results in a loss of information, particularly for users
with diverse profiles [Kaminskas and Bridge 2014].

In [5]:
from sklearn.metrics import pairwise_distances

def calculate_unexpectedness(user_id, recommendation_list, round_decimals=6, users_movies=None, unexpected_ratio=0.001):
    """Return the recommended movies that are "unexpected" for a user.

    A recommended movie's unexpectedness is its minimum cosine distance (in
    tag-genome space) to any movie in the user's profile. Movies whose rounded
    minimum distance is at least ``unexpected_ratio`` are returned.

    Relies on the notebook-level ``genome_scores_df`` (movieId x tagId matrix)
    and, when ``users_movies`` is not given, on ``ratings_df``.

    Parameters
    ----------
    user_id : user whose profile is looked up in ``ratings_df``.
    recommendation_list : array-like of movie ids (list or ndarray).
    round_decimals : number of decimals the distances are rounded to before
        being compared with the threshold.
    users_movies : optional array-like of profile movie ids; when ``None`` the
        movies rated by ``user_id`` in ``ratings_df`` are used.
    unexpected_ratio : distance threshold (inclusive) above which a movie
        counts as unexpected.

    Returns
    -------
    numpy.ndarray with the subset of ``recommendation_list`` that is unexpected,
    in the original order.
    """
    # Accept plain lists too: the boolean-mask selection below needs an ndarray.
    recommendation_list = np.asarray(recommendation_list)
    if recommendation_list.size == 0:
        # Nothing recommended, so nothing can be unexpected.
        return recommendation_list

    # get the movies watched by the user
    if users_movies is None:
        users_movies = ratings_df.loc[ratings_df['userId'] == user_id, 'movieId'].values

    # term vectors for the profile movies and for the recommended movies
    user_term_vec = genome_scores_df.loc[users_movies, :].values
    recommendations_term_vec = genome_scores_df.loc[recommendation_list, :].values

    # rows = profile movies, columns = recommended movies
    distance_from_user_profile = pairwise_distances(
        user_term_vec, recommendations_term_vec, metric='cosine')

    # minimum distance from the user's profile for each recommended movie
    min_distances = distance_from_user_profile.min(axis=0)

    # round distances to N decimals - parameter round_decimals
    unexpectedness = np.around(min_distances, decimals=round_decimals)

    return recommendation_list[unexpectedness >= unexpected_ratio]

In [98]:
# Unexpected subset of user1's recommendations at a very low distance threshold.
unexpected_threshold = 0.001
unexpected1 = calculate_unexpectedness(
    user1,
    recommendations1,
    round_decimals=6,
    unexpected_ratio=unexpected_threshold,
)
unexpected1

array([909, 158, 455])

In [7]:
# Unexpected subset of user_a's recommendations at a stricter distance threshold.
unexpected_threshold = 0.1
unexpected_a = calculate_unexpectedness(
    user_a,
    recommendations_a,
    round_decimals=6,
    unexpected_ratio=unexpected_threshold,
)
unexpected_a

array([  8608,  65660,  77810, 100306,   5473,   5806,   1156,  95604])

In [99]:
# Unexpected subset of user2's recommendations at a very low distance threshold.
unexpected_threshold = 0.001
unexpected2 = calculate_unexpectedness(
    user2,
    recommendations2,
    round_decimals=6,
    unexpected_ratio=unexpected_threshold,
)
unexpected2

array([913, 942, 928, 111, 930])

## Usefulness of item
The usefulness of recommendations may be judged by the user or, in an offline setting, approximated by
the user’s ratings for the items [Adamopoulos and Tuzhilin 2014]. A limitation of this
comparative approach to serendipity measurement is its sensitivity to the choice of the
primitive baseline system.

In [8]:
from sklearn.metrics import pairwise_distances

class ContentBased_Recommender:
    """Content-based nearest-neighbour rating predictor.

    Movies are described by term vectors (one row per movie). A user's rating
    for a candidate movie is predicted from the ratings the user gave to the K
    profile movies most similar to the candidate, either as a
    similarity-weighted mean or as a plain mean.

    Parameters
    ----------
    term_vector_df : DataFrame indexed by movie id, one column per term.
        Missing values are treated as 0; the caller's frame is not modified.
    ratings_df : DataFrame with at least ``userId``, ``movieId`` and ``rating``
        columns.
    K : default number of neighbours used when a call does not specify one.
    metric : distance metric handed to ``pairwise_distances``.
    weighted : if True use a similarity-weighted mean, else a plain mean.
    """

    def __init__(self, term_vector_df, ratings_df, K=5, metric='cosine', weighted=True):
        # Fill gaps on a copy: filling in place would silently rewrite the
        # caller's DataFrame (a hidden side effect across notebook cells).
        term_vector_df = term_vector_df.fillna(0)

        self.term_vector_df = term_vector_df
        self.K = K
        self.ratings_df = ratings_df
        self.metric = metric
        self.weighted = weighted

        # movie x movie distance matrix, labelled by movie id on both axes
        self.movie_movie_distances = pd.DataFrame(
            pairwise_distances(term_vector_df, metric=metric),
            index=term_vector_df.index,
            columns=term_vector_df.index)

    def _distance_to_similarity(self, distances):
        """Turn distances into similarity weights (larger means more alike)."""
        distances = np.asarray(distances, dtype=float)
        if getattr(self, 'metric', 'cosine') == 'cosine':
            # cosine distance is defined as 1 - cosine similarity
            return 1.0 - distances
        # generic monotone mapping for other metrics: 0 -> 1, large -> ~0
        return 1.0 / (1.0 + distances)

    def get_predicted_rating(self, user_id, candidate_movie_id, user_movies, K=None):
        """Predict ``user_id``'s rating for ``candidate_movie_id``.

        ``user_movies`` is the user's profile (movie ids); the candidate itself
        is removed from it so that its own rating cannot leak into the
        prediction. ``K`` defaults to the value given at construction time.
        Returns a float, or NaN when no prediction can be made.
        """
        if K is None:
            K = self.K

        # hide candidate movie from the user
        user_movies = np.setdiff1d(user_movies, candidate_movie_id)

        # the user's ratings for the remaining profile movies
        ratings = self.ratings_df
        in_profile = (ratings['userId'] == user_id) & ratings['movieId'].isin(user_movies)
        profile_ratings = ratings.loc[in_profile]

        # similarity of every profile movie to the candidate movie;
        # .assign returns a new frame, so no column is written onto a slice
        distances = self.movie_movie_distances.loc[
            candidate_movie_id, profile_ratings['movieId'].to_numpy()].to_numpy()
        profile_ratings = profile_ratings.assign(
            sim_candidate_movie=self._distance_to_similarity(distances))

        return self.predict_rating(user_id, candidate_movie_id, profile_ratings, K)

    def predict_rating(self, user_id, candidate_movie_id,
                       users_all_ratings_df,
                       K):
        """Aggregate the ratings of the K most similar profile movies.

        ``users_all_ratings_df`` must carry ``rating`` and
        ``sim_candidate_movie`` columns. Returns NaN when there are no rows or,
        in weighted mode, when the similarities sum to zero.
        """
        # K nearest neighbours = the K rows with the highest similarity
        # (stable sort keeps the original order among ties)
        neighbours = users_all_ratings_df.sort_values(
            'sim_candidate_movie', ascending=False, kind='mergesort').iloc[:K]

        user_ratings = neighbours['rating'].to_numpy(dtype=float)
        similarities = neighbours['sim_candidate_movie'].to_numpy(dtype=float)

        if user_ratings.size == 0:
            return np.nan

        if not self.weighted:
            # non-weighted average
            return user_ratings.mean()

        # weighted average
        total_similarity = similarities.sum()
        if total_similarity == 0:
            return np.nan
        return np.sum(user_ratings * similarities) / total_similarity

In [9]:
# choose and initialize primitive recommender
# Baseline ("primitive") recommender used to estimate how useful a
# recommendation is: content-based, 50 neighbours, weighted average.
primitive_recommender = ContentBased_Recommender(
    term_vector_df=genome_scores_df,
    ratings_df=ratings_df,
    K=50,
    metric='cosine',
    weighted=True,
)

user_id = user1

def calculate_usefulness(user_id, recommendation_list, primitive_recommender, like_threshold=3, K=50):
    """Return the recommended movies that the baseline predicts the user will like.

    For every movie in ``recommendation_list`` the primitive recommender
    predicts a rating from the user's profile; movies whose predicted rating is
    at least ``like_threshold`` are considered useful.

    Parameters
    ----------
    user_id : user the recommendations were generated for.
    recommendation_list : array-like of movie ids (list or ndarray).
    primitive_recommender : object exposing ``ratings_df`` (with ``userId`` and
        ``movieId`` columns) and
        ``get_predicted_rating(user_id, movie_id, user_movies, K=...)``.
    like_threshold : minimum predicted rating (inclusive) to count as useful.
    K : number of neighbours forwarded to the recommender.

    Returns
    -------
    numpy.ndarray with the useful subset of ``recommendation_list``, in the
    original order. Movies with a NaN prediction are never useful.
    """
    # Accept plain lists too: the boolean-mask selection below needs an ndarray.
    recommendation_list = np.asarray(recommendation_list)
    if recommendation_list.size == 0:
        return recommendation_list

    # Build the profile from the same ratings the recommender predicts from,
    # rather than from whatever `ratings_df` happens to be in the notebook scope.
    profile_ratings = primitive_recommender.ratings_df
    users_movies = profile_ratings.loc[profile_ratings['userId'] == user_id, 'movieId'].values

    # predicted rating for each recommended movie using the primitive recommender
    predicted_ratings = np.array([
        primitive_recommender.get_predicted_rating(user_id, candidate_movie_id, users_movies, K=K)
        for candidate_movie_id in recommendation_list
    ], dtype=float)

    # useful if the predicted rating is above or equal to the like threshold
    return recommendation_list[predicted_ratings >= like_threshold]

## Serendipity Calculation
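$$\mathrm{Serendipity}(R, u) = \frac{|R_{\mathrm{unexp}} \cap R_{\mathrm{useful}}|}{|R|} \qquad (3)$$

where $R$ is the set of recommendations generated for user $u$, $R_{\mathrm{unexp}}$ is the subset of $R$ judged unexpected, and $R_{\mathrm{useful}}$ is the subset of $R$ judged useful.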

In [10]:
def calculate_serendipity(unexpected_movies, useful_movies, recommendation_list):
    """Fraction of the recommendations that are both unexpected and useful.

    Parameters
    ----------
    unexpected_movies : array-like of movie ids judged unexpected.
    useful_movies : array-like of movie ids judged useful.
    recommendation_list : array-like with all recommended movie ids.

    Returns
    -------
    float: ``|unexpected ∩ useful| / |recommendation_list|``, or 0.0 when the
    recommendation list is empty (instead of raising ZeroDivisionError).
    """
    total = len(recommendation_list)
    if total == 0:
        return 0.0

    serendipitous = np.intersect1d(unexpected_movies, useful_movies)
    return len(serendipitous) / total

In [11]:
# Serendipity of user_a's list: share of items that are unexpected AND useful.
useful_a = calculate_usefulness(
    user_a,
    recommendations_a,
    primitive_recommender,
)
calculate_serendipity(unexpected_a, useful_a, recommendations_a)


1.0

## Novelty
Zhang [2013] identified three qualities of a novel recommendation: being
unknown to the user, being relevant to the user, and being dissimilar to items in the
user’s profile.
We note that the quality of an item being different from the user’s profile is closely
related to the surprise of recommendations, which we identify as a core component of
serendipity