In [14]:
import pandas as pd

In [15]:
# Training ratings and the test requests (user/movie pairs) to be predicted.
ratings = pd.read_csv("train_ratings.csv")
test = pd.read_csv("test_set_no_ratings.csv")
# Movie metadata, indexed by movieId so rows can be looked up by label.
movies = pd.read_csv("movies.csv", index_col="movieId")

#### Test if there are multiple ratings for a pair (user, movie) in ratings

In [16]:
# Count the rows in every (userId, movieId) group; each count must be exactly 1
# if no user rated the same movie more than once.
number_of_ratings = ratings.groupby(["userId", "movieId"]).count()
(number_of_ratings == 1).all()

rating       True
timestamp    True
dtype: bool

No duplicates !

#### Test if there are users (in test) without movie ratings

In [17]:
# True only if every userId in the test set also appears in the training ratings.
test.userId.isin(ratings.userId).all()

True

No users without movie ratings !!

#### Define functions that retrieve the ratings of a given user or of a given movie

In [18]:
# Display the training ratings frame (the notebook shows a truncated view).
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,509,7347,3.0,1435994597
1,326,71462,4.0,1322252335
2,57,2115,3.0,965798155
3,610,1127,4.0,1479544102
4,462,2409,2.0,1174438249
...,...,...,...,...
80663,42,4005,4.0,996259059
80664,364,141,4.0,869443367
80665,480,6867,4.0,1179163171
80666,6,981,3.0,845556567


In [19]:
def find_rated_movies_by_user(user_id, ratings):
    """Return the rows of `ratings` whose userId equals `user_id`.

    The result keeps all columns and the original index labels; it is empty
    when the user has no rows in `ratings`.
    """
    belongs_to_user = ratings["userId"] == user_id
    return ratings.loc[belongs_to_user]

def find_ratings_by_movie(movie_id, ratings):
    """Return the rows of `ratings` whose movieId equals `movie_id`.

    The result keeps all columns and the original index labels; it is empty
    when nobody in `ratings` rated the movie.
    """
    is_for_movie = ratings["movieId"] == movie_id
    return ratings.loc[is_for_movie]

In [20]:
# Example: every training rating recorded for user 509.
find_rated_movies_by_user(509, ratings)

Unnamed: 0,userId,movieId,rating,timestamp
0,509,7347,3.0,1435994597
69,509,4993,5.0,1435992277
239,509,2028,3.5,1436393168
275,509,85510,3.5,1435999262
471,509,94780,2.5,1435999746
...,...,...,...,...
79126,509,7045,3.0,1436027388
79765,509,70183,3.5,1435997259
79853,509,45447,3.0,1435998371
80217,509,78105,4.0,1435996118


In [21]:
def find_genre_from_movie(movie_id, movies):
    """Return the "genres" entry of the movie labelled `movie_id`.

    Parameters
    ----------
    movie_id : label
        Index label of the movie in `movies`.
    movies : pandas.DataFrame
        Frame indexed by movie id that contains a "genres" column.

    Returns
    -------
    The value stored in the "genres" column for that movie.

    Raises
    ------
    KeyError
        If `movie_id` is not in the index of `movies`.
    """
    # Single (row, column) lookup: avoids materialising the whole row as an
    # intermediate Series, which matters because this runs once per movie pair.
    return movies.loc[movie_id, "genres"]


In [22]:
# Turn the pipe-separated genre string of each movie into a list of genre names.
# Only strings are split, so re-running this cell on already-split lists is a
# no-op instead of raising AttributeError.
movies["genres"] = movies["genres"].apply(
    lambda genres: genres.split("|") if isinstance(genres, str) else genres
)

In [24]:
# Example: genres stored for the movie with id 1.
find_genre_from_movie(1,movies)

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']

Define similarity function (Jaccard similarity between the genre sets of two movies)

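Open question: can a movie's genre list contain duplicates? The function below converts the lists to sets, so duplicates would not affect the result either way.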

In [25]:
def compute_similarity(movie_id_1, movie_id_2, movies):
    """Jaccard similarity between the genre sets of two movies.

    Computed as |A & B| / |A | B| where A and B are the sets of genres of the
    two movies, so the result lies in [0, 1].

    Parameters
    ----------
    movie_id_1, movie_id_2 : label
        Index labels of the two movies in `movies`.
    movies : pandas.DataFrame
        Movie frame passed through to `find_genre_from_movie`.

    Returns
    -------
    float
        Shared genres divided by total distinct genres; 0.0 when neither
        movie has any genre.
    """
    genres_1 = set(find_genre_from_movie(movie_id_1, movies))
    genres_2 = set(find_genre_from_movie(movie_id_2, movies))

    n_total = len(genres_1 | genres_2)
    if n_total == 0:
        # Both genre collections are empty: nothing in common, and dividing
        # would raise ZeroDivisionError.
        return 0.0

    n_common = len(genres_1 & genres_2)
    return n_common / n_total

Define aggregate function

In [26]:
def compute_user_average(user_id, ratings):
    """Mean of the "rating" column over the rows of `ratings` belonging to `user_id`."""
    return find_rated_movies_by_user(user_id, ratings)["rating"].mean()

In [27]:
def compute_user_average_with_weight(user_id, movie_id, ratings, movies):
    """Similarity-weighted mean of a user's ratings, relative to one target movie.

    Every movie the user rated is weighted by its genre similarity to
    `movie_id` (see `compute_similarity`). When all weights are zero, the
    plain user average from `compute_user_average` is returned instead.
    """
    rated = find_rated_movies_by_user(user_id, ratings)
    weights = [compute_similarity(movie_id, other_id, movies) for other_id in rated.movieId]
    total_weight = sum(weights)

    # No rated movie shares a genre with the target: fall back to the plain mean.
    if total_weight == 0:
        return compute_user_average(user_id, ratings)

    weighted_sum = sum([value * weight for (value, weight) in zip(rated.rating, weights)])
    return weighted_sum / total_weight

In [28]:
def scale(user_rating, average_user_rating, min_rating=1.0, max_rating=5.0):
    """Room available between a user's average and the end of the rating scale.

    Parameters
    ----------
    user_rating : float
        Rating being compared with the average.
    average_user_rating : float
        The user's average rating.
    min_rating, max_rating : float, optional
        Bounds of the rating scale. The defaults (1.0 and 5.0) reproduce the
        previously hard-coded values.
        NOTE(review): the ratings shown in this notebook include half-star
        values; if the data also contains ratings below 1.0, pass the real
        minimum here -- confirm against the dataset.

    Returns
    -------
    float
        `max_rating - average_user_rating` when the rating is above the
        average, `average_user_rating - min_rating` when it is below, and 1.0
        when they are equal or when that distance is not strictly positive.
    """
    if user_rating > average_user_rating:
        span = max_rating - average_user_rating
    elif user_rating < average_user_rating:
        span = average_user_rating - min_rating
    else:
        return 1.0
    # An average at or beyond a bound would give a zero or negative span, which
    # makes the caller divide by zero or flips the sign of the deviation.
    return span if span > 0 else 1.0


def compute_normalized_deviation_for_user_and_movie(rating, average_user_rating,
                                                    min_rating=1.0, max_rating=5.0):
    """Deviation of `rating` from the user's average, divided by `scale(...)`.

    With ratings inside [min_rating, max_rating] the result lies in [-1, 1]:
    positive when the rating is above the user's average, negative when below,
    and 0.0 when equal.
    """
    return (rating - average_user_rating) / scale(
        rating, average_user_rating, min_rating, max_rating
    )

In [29]:
def compute_deviation_for_movie(movie_ratings, ratings):
    """Average normalized deviation of a movie's ratings from its raters' averages.

    Parameters
    ----------
    movie_ratings : pandas.DataFrame
        Rows of `ratings` for a single movie; needs "userId" and "rating".
    ratings : pandas.DataFrame
        All ratings, used to compute each rater's own average.

    Returns
    -------
    float
        Mean of `compute_normalized_deviation_for_user_and_movie` over the rows
        of `movie_ratings`; 0.0 (no deviation) when `movie_ratings` is empty.
    """
    n_ratings = len(movie_ratings)
    if n_ratings == 0:
        # Nothing to average; avoids dividing by zero.
        return 0.0

    # Accumulate in `total` so the builtin `sum` is not shadowed.
    total = 0.0
    for _, row in movie_ratings.iterrows():
        user_average = compute_user_average(row.userId, ratings)
        total += compute_normalized_deviation_for_user_and_movie(row.rating, user_average)
    return total / n_ratings

In [30]:
def movie_not_rated(movie_ratings):
    """Return True when `movie_ratings` contains no rows, False otherwise."""
    return not len(movie_ratings)

In [31]:
def estimate_rating_alternate_algo(user_id, movie_id, ratings):
    """Estimate the rating `user_id` would give to `movie_id`.

    Starts from the user's similarity-weighted average for the target movie
    and, when the movie has ratings in `ratings`, shifts it by the movie's
    average normalized deviation rescaled with `scale`.

    Note: reads the notebook-level `movies` frame; it is not a parameter.
    """
    baseline = compute_user_average_with_weight(user_id, movie_id, ratings, movies)
    ratings_for_movie = find_ratings_by_movie(movie_id, ratings)

    # Nobody rated this movie: the user's baseline is all the information we have.
    if movie_not_rated(ratings_for_movie):
        return baseline

    movie_deviation = compute_deviation_for_movie(ratings_for_movie, ratings)
    stretch = scale(baseline + movie_deviation, baseline)
    return baseline + movie_deviation * stretch

In [32]:
def estimate_rating(user_id, movie_id, ratings):
    """Estimate a rating as the user's genre-similarity-weighted average.

    Thin wrapper around `compute_user_average_with_weight`, which implements
    exactly this computation (including the fallback to the plain user average
    when every similarity is zero).

    Note: reads the notebook-level `movies` frame; it is not a parameter.
    """
    return compute_user_average_with_weight(user_id, movie_id, ratings, movies)

In [33]:
# Sanity check: iloc[1:] drops the first training row (user 509, movie 7347,
# rating 3.0), and that held-out rating is then predicted from the rest.
estimate_rating_alternate_algo(509, 7347, ratings.iloc[1:])  # true value 3

3.261617109248885

#### Estimate the rating for all test requests

In [63]:
%%time

from tqdm import tqdm
import numpy as np
from concurrent.futures import ProcessPoolExecutor

resolution = 0.5

def process_row(row, resolution, ratings):
    i = row.name
    movie_id = row["movieId"]
    user_id = row["userId"]
    estimated_rating = round(estimate_rating_alternate_algo(user_id, movie_id, ratings) / resolution) * resolution
    return i, estimated_rating


def process_chunk(chunk, resolution, ratings):
    result = []
    for _, row in chunk.iterrows():
        result.extend(process_row(row, resolution, ratings))
    return result
    
num_processes = 4


chunks = np.array_split(test, num_processes)

with ProcessPoolExecutor(max_workers=num_processes) as executor:
    results = list(executor.map(process_chunk, chunks, [resolution] * num_processes, [ratings] * num_processes))

# Write the results to the output file
with open('submission.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "rating"])
    for chunk_result in results:
        for i in range(0, len(chunk_result), 2):
            writer.writerow([chunk_result[i], "{}".format(chunk_result[i+1])])

chunk processed
chunk processed
chunk processed
chunk processed
chunk processed
chunk processed
chunk processed
chunk processed
CPU times: user 49 ms, sys: 72.7 ms, total: 122 ms
Wall time: 4min 54s
