In [1]:
import os
import time
import gc
import argparse

# data science imports
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# utils import
!pip install fuzzywuzzy
from fuzzywuzzy import fuzz
from scipy.stats import pearsonr





In [2]:
 # read data
df_movies = pd.read_csv(
                "movies.csv",
                usecols=['movieId', 'title'],
                dtype={'movieId': 'int32', 'title': 'str'})
df_ratings = pd.read_csv(
                "ratings.csv",
                usecols=['userId', 'movieId', 'rating'],
                dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [3]:
df_movies.tail()

Unnamed: 0,movieId,title
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)
9741,193609,Andrew Dice Clay: Dice Rules (1991)


In [4]:
movie_dict = df_movies.set_index('movieId')['title'].to_dict()
movie_dict

{1: 'Toy Story (1995)',
 2: 'Jumanji (1995)',
 3: 'Grumpier Old Men (1995)',
 4: 'Waiting to Exhale (1995)',
 5: 'Father of the Bride Part II (1995)',
 6: 'Heat (1995)',
 7: 'Sabrina (1995)',
 8: 'Tom and Huck (1995)',
 9: 'Sudden Death (1995)',
 10: 'GoldenEye (1995)',
 11: 'American President, The (1995)',
 12: 'Dracula: Dead and Loving It (1995)',
 13: 'Balto (1995)',
 14: 'Nixon (1995)',
 15: 'Cutthroat Island (1995)',
 16: 'Casino (1995)',
 17: 'Sense and Sensibility (1995)',
 18: 'Four Rooms (1995)',
 19: 'Ace Ventura: When Nature Calls (1995)',
 20: 'Money Train (1995)',
 21: 'Get Shorty (1995)',
 22: 'Copycat (1995)',
 23: 'Assassins (1995)',
 24: 'Powder (1995)',
 25: 'Leaving Las Vegas (1995)',
 26: 'Othello (1995)',
 27: 'Now and Then (1995)',
 28: 'Persuasion (1995)',
 29: 'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
 30: 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 31: 'Dangerous Minds (1995)',
 32: 'Twelve Monkeys (a.k.a. 12 Monkeys) (199

In [5]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
 # filter data
df_movies_cnt = pd.DataFrame(
                    df_ratings.groupby('movieId').size(),
                    columns=['count'])
popular_movies = list(set(df_movies_cnt.query('count >= 70').index))  # noqa
movies_filter = df_ratings.movieId.isin(popular_movies).values

df_users_cnt = pd.DataFrame(
                    df_ratings.groupby('userId').size(),
                    columns=['count'])
active_users = list(set(df_users_cnt.query('count >= 70').index))  # noqa
users_filter = df_ratings.userId.isin(active_users).values

df_ratings_filtered = df_ratings[movies_filter & users_filter]

In [7]:
df_ratings_filtered

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
7,1,110,4.0
...,...,...,...
100417,610,72998,4.5
100452,610,79132,4.0
100538,610,91529,4.5
100596,610,99114,4.5


In [8]:
# pivot and create movie-user matrix
movie_user_mat = df_ratings_filtered.pivot(
                        index='movieId', columns='userId', values='rating').fillna(0)

In [9]:
movie_user_mat

userId,1,4,6,7,10,15,16,17,18,19,...,600,601,602,603,604,605,606,607,608,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,4.5,0.0,2.5,0.0,4.5,3.5,4.0,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
6,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0
10,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
11,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,2.5,3.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72998,0.0,0.0,0.0,0.0,2.5,3.0,0.0,0.0,4.0,0.0,...,0.0,4.0,0.0,0.0,0.0,3.5,3.0,0.0,0.0,4.5
79132,0.0,0.0,0.0,0.0,0.0,3.5,3.0,4.5,4.5,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
91529,0.0,0.0,0.0,0.0,5.0,2.0,4.5,0.0,4.0,0.0,...,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
99114,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5


In [10]:
# create mapper from movie title to index
hashmap = {
            i: movie for i, movie in
            enumerate(list(df_movies.set_index('movieId').title)) # noqa
           }

In [11]:
hashmap

{0: 'Toy Story (1995)',
 1: 'Jumanji (1995)',
 2: 'Grumpier Old Men (1995)',
 3: 'Waiting to Exhale (1995)',
 4: 'Father of the Bride Part II (1995)',
 5: 'Heat (1995)',
 6: 'Sabrina (1995)',
 7: 'Tom and Huck (1995)',
 8: 'Sudden Death (1995)',
 9: 'GoldenEye (1995)',
 10: 'American President, The (1995)',
 11: 'Dracula: Dead and Loving It (1995)',
 12: 'Balto (1995)',
 13: 'Nixon (1995)',
 14: 'Cutthroat Island (1995)',
 15: 'Casino (1995)',
 16: 'Sense and Sensibility (1995)',
 17: 'Four Rooms (1995)',
 18: 'Ace Ventura: When Nature Calls (1995)',
 19: 'Money Train (1995)',
 20: 'Get Shorty (1995)',
 21: 'Copycat (1995)',
 22: 'Assassins (1995)',
 23: 'Powder (1995)',
 24: 'Leaving Las Vegas (1995)',
 25: 'Othello (1995)',
 26: 'Now and Then (1995)',
 27: 'Persuasion (1995)',
 28: 'City of Lost Children, The (Cité des enfants perdus, La) (1995)',
 29: 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 30: 'Dangerous Minds (1995)',
 31: 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995

In [12]:
def get_intersection_index_of_notNull_rating(U1, U2):
    '''
    U1 : id of user 1
    U2 : id of user 2
    '''
    dict_item_u1 = movie_user_mat[:][U1].to_dict()
    dict_item_u2 = movie_user_mat[:][U2].to_dict()
    indices_ratingU1_notNull = [key for key in dict_item_u1 if dict_item_u1[key] != 0]
    indices_ratingU2_notNull = [key for key in dict_item_u2 if dict_item_u2[key] != 0]
    return list(set(indices_ratingU1_notNull) & set(indices_ratingU2_notNull)) 

In [13]:
def calculate_pearson(user1ID, user2ID):
    corr = 0
    index_of_not_null_rating = get_intersection_index_of_notNull_rating(user1ID, user2ID)
    dict_rating_user1 = movie_user_mat[:][user1ID].to_dict()
    dict_rating_user2 = movie_user_mat[:][user2ID].to_dict()
    rating_user1 = [dict_rating_user1[key] for key in index_of_not_null_rating]
    rating_user2 = [dict_rating_user2[key] for key in index_of_not_null_rating]
    if len(rating_user1) > 1 :
        corr,pvalue = pearsonr(rating_user1, rating_user2)
    return corr

In [14]:
def get_neirest_users(v):
    '''
    '''
    neirest_users = []
    users = list(pd.array(movie_user_mat.columns))
    users.remove(v)
    for u in users:
        corr = calculate_pearson(v, u)
        if corr >= 0.5:
            neirest_users.append((u,corr))
    neirest_users.sort(key=lambda x:x[1])
    neirest_users.reverse()
    return [i for i,j in neirest_users]

In [15]:
def get_index_of_item_without_rating(userId):
    dict_all_item = movie_user_mat[:][userId].to_dict()
    indices_rating_null = [key for key in dict_all_item if dict_all_item[key] == 0]
    return indices_rating_null

In [16]:
def Average_ratings_of_user(userId):
    user_dict = movie_user_mat[:][1].to_dict()
    not_null_rating = [user_dict[key] for key in user_dict if  user_dict[key] != 0]
    sum_of_rating = 0
    for rating in not_null_rating:
        sum_of_rating += rating
    return sum_of_rating / len(not_null_rating)

In [17]:
Average_ratings_of_user(1)

4.375

In [20]:
import operator
def predict_all_null_rating(userId):
    prediction_dict = {}
    for null_rating_index in get_index_of_item_without_rating(userId):
        numerator = 0
        denominator = 1
        for neirest_userId in get_neirest_users(userId):
            r = movie_user_mat[:][neirest_userId].to_dict()[null_rating_index]
            if r != 0:
                numerator += calculate_pearson(userId, neirest_userId) * (r - Average_ratings_of_user(neirest_userId))
                denominator += calculate_pearson(userId, neirest_userId)
        prediction = Average_ratings_of_user(userId) + (numerator / denominator)
        prediction_dict[null_rating_index] = prediction
    sorted_prediction = sorted(prediction_dict.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_prediction

In [21]:
def display_top_10_movies(prediction):
    top_10 = prediction[:10]
    for movieId, rating in top_10:
        print(f"movie id : {movieId} , title : {movie_dict[movieId]} \n")

In [22]:
print('Enter User id:')
userId = input()
print('wait for prediction')
prediction = predict_all_null_rating(int(userId))
display_top_10_movies(prediction)

Enter User id:
1
wait for prediction
movie id : 5618 , title : Spirited Away (Sen to Chihiro no kamikakushi) (2001) 

movie id : 1221 , title : Godfather: Part II, The (1974) 

movie id : 7438 , title : Kill Bill: Vol. 2 (2004) 

movie id : 99114 , title : Django Unchained (2012) 

movie id : 91529 , title : Dark Knight Rises, The (2012) 

movie id : 79132 , title : Inception (2010) 

movie id : 4995 , title : Beautiful Mind, A (2001) 

movie id : 750 , title : Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) 

movie id : 1527 , title : Fifth Element, The (1997) 

movie id : 1704 , title : Good Will Hunting (1997) 

