In [1]:
import pandas as pd

In [2]:
# Load the three comma-separated inputs: the training ratings, the test
# requests, and the movie table (re-indexed by its "movieId" column so that
# movies can be looked up by id with .loc).
ratings = pd.read_csv("train_ratings.csv", sep=",")
test = pd.read_csv("test_set_no_ratings.csv", sep=",")
movies = pd.read_csv("movies.csv", sep=",").set_index("movieId")

#### Test if there are multiple ratings for a pair (user, movie) in ratings

In [3]:
# For every (userId, movieId) pair, count the non-null entries of each
# remaining column. If each count equals 1, no pair was rated more than once.
number_of_ratings = ratings.groupby(["userId", "movieId"]).count()
number_of_ratings.eq(1).all()

rating       True
timestamp    True
dtype: bool

No duplicates !

#### Test if there are users (in test) without movie ratings

In [4]:
# True only if every userId appearing in `test` also appears in `ratings`.
test.userId.isin(ratings.userId).all()

True

No users without movie ratings !!

#### Define a function that retrieves the movies rated by a given userId

In [5]:
# Display the ratings frame to inspect its columns (userId, movieId, rating, timestamp).
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,509,7347,3.0,1435994597
1,326,71462,4.0,1322252335
2,57,2115,3.0,965798155
3,610,1127,4.0,1479544102
4,462,2409,2.0,1174438249
...,...,...,...,...
80663,42,4005,4.0,996259059
80664,364,141,4.0,869443367
80665,480,6867,4.0,1179163171
80666,6,981,3.0,845556567


In [6]:
def find_rated_movies_by_user(user_id, ratings):
    """Return the rows of ``ratings`` that belong to ``user_id``.

    Parameters
    ----------
    user_id
        Value compared for equality against the ``userId`` column.
    ratings : pandas.DataFrame
        Frame with (at least) a ``userId`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels and columns;
        empty when no row matches.
    """
    belongs_to_user = ratings["userId"] == user_id
    return ratings.loc[belongs_to_user]

In [7]:
# Example: all training ratings given by user 509.
find_rated_movies_by_user(509, ratings)

Unnamed: 0,userId,movieId,rating,timestamp
0,509,7347,3.0,1435994597
69,509,4993,5.0,1435992277
239,509,2028,3.5,1436393168
275,509,85510,3.5,1435999262
471,509,94780,2.5,1435999746
...,...,...,...,...
79126,509,7045,3.0,1436027388
79765,509,70183,3.5,1435997259
79853,509,45447,3.0,1435998371
80217,509,78105,4.0,1435996118


In [8]:
def find_genre_from_movie(movie_id, movies):
    """Return the genre labels of ``movie_id`` as a list of strings.

    Parameters
    ----------
    movie_id
        Label of the movie in the index of ``movies``.
    movies : pandas.DataFrame
        Frame indexed by movie id whose ``genres`` column stores the labels
        joined with ``"|"``.

    Returns
    -------
    list of str
        The ``genres`` string split on ``"|"``.
    """
    return movies.loc[movie_id, "genres"].split("|")


In [9]:
# Example: genre labels of the movie whose movieId is 1.
find_genre_from_movie(1,movies)

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']

Define the similarity function (Jaccard similarity between the genre sets of two movies)

Do we need to work with sets? Can a movie list the same genre more than once?

In [10]:
def compute_similarity(movie_id_1, movie_id_2, movies):
    """Return the Jaccard similarity between the genres of two movies.

    The genre labels of each movie (as returned by ``find_genre_from_movie``)
    are turned into sets; the result is the size of their intersection
    divided by the size of their union.

    Parameters
    ----------
    movie_id_1, movie_id_2
        Labels of the two movies in the index of ``movies``.
    movies : pandas.DataFrame
        Movie table handed to ``find_genre_from_movie``.

    Returns
    -------
    float
        0.0 when no genre label is shared, 1.0 when both sets are equal.
    """
    first_genres, second_genres = (
        set(find_genre_from_movie(movie_id, movies))
        for movie_id in (movie_id_1, movie_id_2)
    )
    shared = first_genres.intersection(second_genres)
    combined = first_genres.union(second_genres)
    return len(shared) / len(combined)

Define the aggregation function (similarity-weighted average of the user's ratings)

In [11]:
def estimate_rating(user_id, movie_id, ratings, movies=None):
    """Estimate the rating ``user_id`` would give to ``movie_id``.

    The estimate is the average of the user's known ratings, each weighted by
    the genre similarity (``compute_similarity``) between the rated movie and
    ``movie_id``. When the weights sum to zero, the plain mean of the user's
    ratings is returned instead.

    Parameters
    ----------
    user_id
        Value matched against ``ratings.userId``.
    movie_id
        Label of the target movie in the index of ``movies``.
    ratings : pandas.DataFrame
        Frame with ``userId``, ``movieId`` and ``rating`` columns.
    movies : pandas.DataFrame, optional
        Movie table passed on to ``compute_similarity``. When omitted, the
        notebook-level ``movies`` frame is used, which is what this function
        read implicitly before the parameter existed.

    Returns
    -------
    float
        The estimated rating. NaN when ``ratings`` holds no row for
        ``user_id`` (the mean of an empty selection).
    """
    if movies is None:
        # Backward compatibility: existing three-argument calls keep reading
        # the global ``movies`` frame.
        movies = globals()["movies"]

    user_ratings = find_rated_movies_by_user(user_id, ratings)
    similarities = [
        compute_similarity(movie_id, movie_id_rated, movies)
        for movie_id_rated in user_ratings.movieId
    ]
    sum_similarities = sum(similarities)

    if sum_similarities == 0:
        # No rated movie shares a genre with the target (or nothing is rated):
        # the weighted average is undefined, so fall back to the plain mean.
        return user_ratings.rating.mean()

    weighted_sum = sum(
        rating * similarity
        for rating, similarity in zip(user_ratings.rating, similarities)
    )
    return weighted_sum / sum_similarities

In [12]:
# Leave-one-out sanity check: ratings.iloc[1:] drops the first row, which is
# user 509's own rating of movie 7347, so the estimate cannot see the answer.
estimate_rating(509, 7347, ratings.iloc[1:])  # true value 3

3.2362271742494606

#### Estimate the rating for every test request

In [14]:
import csv
from tqdm import tqdm
import multiprocessing
resolution = 0.5  # estimates are snapped to the nearest multiple of this step

# The context manager closes submission.csv even if the loop raises or is
# interrupted, so the rows written so far are flushed and the handle is not
# leaked.
with open('submission.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "rating"])

    # One output row per test request; the row's index label in `test` is
    # written as the Id.
    for i, row in tqdm(test.iterrows(), total=test.shape[0]):
        movie_id = row["movieId"]
        user_id = row["userId"]
        estimated_rating = round(estimate_rating(user_id, movie_id, ratings) / resolution) * resolution
        writer.writerow([i, "{}".format(estimated_rating)])

  0%|          | 72/20168 [00:02<11:35, 28.90it/s]


KeyboardInterrupt: 