In [45]:
from csv import reader

# Parse movies.csv into the lookup tables the rest of the notebook relies on.
# Each data row is read as: column 0 = movie id, column 1 = title,
# column 2 = '|'-separated list of genres.
movies = {}        # movie_id (int) -> title
genres = {}        # genre -> set of titles tagged with that genre
movie_genres = {}  # title -> list of that title's genres
# `with` closes the file once parsing is done; newline='' is what the csv
# module expects, and the explicit encoding keeps non-ASCII titles intact
# regardless of the platform's default encoding.
with open('ml-latest-small/movies.csv', newline='', encoding='utf-8') as movies_csv:
    movies_file = reader(movies_csv)
    next(movies_file, None)  # skip the first row (the column header)
    for line in movies_file:
        movie_id = int(line[0])
        title = line[1]
        current_genres = line[2].split('|')
        movies[movie_id] = title
        for genre in current_genres:
            genres.setdefault(genre, set()).add(title)
        # NOTE: keyed by title, so if two rows share a title the later row's
        # genre list replaces the earlier one.
        movie_genres[title] = current_genres
all_movies = set(movies.values())

In [46]:
# Parse ratings.csv into per-user movie ratings and per-user genre ratings.
# Each data row is read as: column 0 = user id, column 1 = movie id,
# column 2 = rating.
user_ratings = {}        # user_id -> {movie title -> rating}
user_genre_ratings = {}  # user_id -> {genre -> list of ratings}; averaged below
# `with` closes the file once parsing is done; newline='' is what the csv
# module expects.
with open('ml-latest-small/ratings.csv', newline='', encoding='utf-8') as ratings_csv:
    ratings_file = reader(ratings_csv)
    next(ratings_file, None)  # skip the first row (the column header)
    for line in ratings_file:
        # get basic data from the line
        user_id = int(line[0])
        movie_id = int(line[1])
        movie_name = movies[movie_id]
        rating = float(line[2])

        # store the rating of this movie
        user_ratings.setdefault(user_id, {})[movie_name] = rating

        # store the rating once under every genre of the movie
        ratings_by_genre = user_genre_ratings.setdefault(user_id, {})
        for genre in movie_genres[movie_name]:
            ratings_by_genre.setdefault(genre, []).append(rating)

# Convert each user's per-genre list of ratings into a single average.
# The loop variable is deliberately NOT called `genres`: that name is the
# notebook-level genre -> titles index and must not be overwritten here.
# Only existing keys are reassigned, so mutating while iterating is safe.
for user, genre_ratings in user_genre_ratings.items():
    for genre, rating_list in genre_ratings.items():
        genre_ratings[genre] = sum(rating_list) / len(rating_list)

In [47]:
sorted_users = sorted(user_ratings.keys())
# The user with the smallest id is the test user; every other user is
# compared against them.
test_user = sorted_users[0]
# Read from `user_ratings` (the dict built while parsing ratings.csv); the
# notebook never defines a variable called `users`.
test_users_movies_with_ratings = user_ratings[test_user]
test_users_movies = set(test_users_movies_with_ratings.keys())
# shared movies is stored as user_id -> set of shared movies
shared_movies = {}
for user in sorted_users[1:]:
    shared_movies[user] = test_users_movies & set(user_ratings[user].keys())

In [48]:
from math import sqrt
def find_cosine_similarity(ratings1, ratings2):
    """Return the cosine similarity of two rating vectors.

    Args:
        ratings1: mapping of movie -> numeric rating. Its keys define the
            dimensions of both vectors.
        ratings2: mapping of movie -> numeric rating. It must contain every
            key of ratings1; keys that appear only in ratings2 are ignored.

    Returns:
        dot(r1, r2) / (|r1| * |r2|) computed over the keys of ratings1, or
        0.0 when either vector has zero magnitude (for example when ratings1
        is empty), since the cosine is undefined in that case.

    Raises:
        KeyError: if a key of ratings1 is missing from ratings2.
    """
    d1 = 0
    d2 = 0
    dot = 0
    for movie in ratings1:
        rating1 = ratings1[movie]
        rating2 = ratings2[movie]
        dot += rating1*rating2
        d1 += rating1*rating1
        d2 += rating2*rating2
    magnitude = sqrt(d1)*sqrt(d2)
    if magnitude == 0:
        # Avoid ZeroDivisionError on empty or all-zero vectors.
        return 0.0
    return dot / magnitude

# similarities is stored as user_id -> user_similarity
# Only users who share at least one rated movie with the test user get an
# entry; the similarity is computed over those shared movies alone.
similarities = {}
for user in sorted_users[1:]:
    common_movies = shared_movies[user]
    if not common_movies:
        continue
    user1_movies = {k: v for k, v in test_users_movies_with_ratings.items() if k in common_movies}
    # Read from `user_ratings`; the notebook never defines a variable called
    # `users`.
    user2_movies = {k: v for k, v in user_ratings[user].items() if k in common_movies}
    similarity = find_cosine_similarity(user1_movies, user2_movies)
    similarities[user] = similarity

In [51]:
# user_averages maps user_id -> arithmetic mean of every rating that user
# gave, built in ascending user-id order.
user_averages = {
    uid: sum(user_ratings[uid].values()) / len(user_ratings[uid])
    for uid in sorted_users
}

In [55]:
from operator import itemgetter
# predicted_ratings maps movie name -> rating predicted for the test user.
# A prediction is the test user's own average plus the similarity-weighted
# mean of how far each other user's rating of the movie sits from that
# user's own average. Movies with no usable signal fall back to the test
# user's plain average.
predicted_ratings = {}
test_user_average = user_averages[test_user]
unseen_movies = all_movies - test_users_movies
for movie in unseen_movies:
    weight_sum = 0
    weighted_offset_sum = 0
    for user in sorted_users[1:]:
        # only users with a similarity score who also rated this movie count
        if user in similarities and movie in user_ratings[user]:
            weight = similarities[user]
            weight_sum += weight
            weighted_offset_sum += weight*(user_ratings[user][movie] - user_averages[user])
    if weight_sum != 0 and weighted_offset_sum != 0:
        predicted_ratings[movie] = test_user_average + weighted_offset_sum/weight_sum
    else:
        predicted_ratings[movie] = test_user_average
sorted(predicted_ratings.items(), key=itemgetter(1), reverse=True)

[('Match Factory Girl, The (Tulitikkutehtaan tyttö) (1990)',
  7.120924764890281),
 ('Galaxy of Terror (Quest) (1981)', 6.930481874447391),
 ('Alien Contamination (1980)', 6.930481874447391),
 ('Bossa Nova (2000)', 6.726379310344827),
 ('Paterson', 6.620924764890281),
 ('Seve (2014)', 6.5190080657096345),
 ('The Big Bus (1976)', 6.5190080657096345),
 ('Villain (1971)', 6.496814092953523),
 ('Master of the Flying Guillotine (Du bi quan wang da po xue di zi) (1975)',
  6.430481874447391),
 ('Looker (1981)', 6.430481874447391),
 ('Jetée, La (1962)', 6.404657452386706),
 ('Unfaithfully Yours (1948)', 6.374898314276676),
 ('Come and See (Idi i smotri) (1985)', 6.261577711750164),
 ('On the Beach (1959)', 6.224329269989703),
 ('Strictly Sexual (2008)', 6.131669222973898),
 ('Cat Soup (Nekojiru-so) (2001)', 6.120924764890281),
 ('Babes in Toyland (1934)', 6.06480980361837),
 ('Tom Segura: Mostly Stories (2016)', 6.027619620422347),
 ('Tom Segura: Completely Normal (2014)', 6.027619620422347),

In [60]:
## TEST RATINGS
# Re-predict the movies the test user has already rated, using the same
# formula as for unseen movies, then report the mean absolute difference
# between the predicted and the actual ratings.
# test_predicted_ratings is movie name -> predicted rating
test_predicted_ratings = {}
for movie in test_users_movies:
    weight_sum = 0
    weighted_offset_sum = 0
    for user in sorted_users[1:]:
        # only users with a similarity score who also rated this movie count
        if user in similarities and movie in user_ratings[user]:
            weight = similarities[user]
            weight_sum += weight
            weighted_offset_sum += weight*(user_ratings[user][movie] - user_averages[user])
    if weight_sum != 0 and weighted_offset_sum != 0:
        test_predicted_ratings[movie] = test_user_average + weighted_offset_sum/weight_sum
    else:
        test_predicted_ratings[movie] = test_user_average
# absolute error of each prediction against the rating actually given
incorrectness = [
    abs(actual - test_predicted_ratings[title])
    for title, actual in test_users_movies_with_ratings.items()
]
print(sum(incorrectness)/len(incorrectness))
# Uncomment to compare the top predicted and top actual ratings side by side:
# print(sorted(test_predicted_ratings.items(), key=itemgetter(1), reverse=True)[:15])
# print(sorted(test_users_movies_with_ratings.items(), key=itemgetter(1), reverse=True)[:15])

0.5742085886078788
