In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker

#LOAD MOVIES
movies_df = pd.read_csv("../data/movies.csv")
display(movies_df.head())

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [73]:
#LOAD RATINGS
ratings_df = pd.read_csv("../data/ratings.csv")
#MERGE RATINGS WITH MOVIES
ratings_movies_df = ratings_df.merge(movies_df, how='left', on='movieId')
display(ratings_movies_df.head())
print(ratings_movies_df.shape)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,1112486027,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,1112484676,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,1112484819,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,1112484727,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,1112484580,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


(20000263, 6)


In [74]:
#calculating dimensions for our resulting matrix
n_users = ratings_movies_df['userId'].nunique()

n_movies = ratings_movies_df['title'].nunique()

print("Number of users: ", n_users)

print("Number of movies: ", n_movies)

Number of users:  138493
Number of movies:  26729


In [75]:
#prepping df for pivot
ratings_movies_df.drop(["timestamp", "genres", "movieId"],axis = 1, inplace = True)

In [80]:
#checking minimum value for "rating" to confirm there is no ratings "0", so we can then fill the NaN with 0 later.
min(ratings_movies_df["rating"])

0.5

In [82]:
# Create the ratings matrix associated with the DataFrame by taking the user identifiers as an index
# and the movie titles as columns,
#storing the matrix in a variable named mat_ratings 

# Use the 'pivot_table' method to create the ratings matrix.
# 'columns = 'title'' specifies that the columns of the matrix correspond to the titles of the books.
# 'index = 'user_id'' specifies that the rows correspond to user IDs.
# 'values = 'rating'' specifies that the values in the matrix are the ratings given by users.
mat_ratings1 = ratings_movies_df.loc[ratings_movies_df["userId"] < (n_users/2)].pivot_table(columns='title', index='userId', values='rating')
mat_ratings2 = ratings_movies_df.loc[ratings_movies_df["userId"] > (n_users/2)].pivot_table(columns='title', index='userId', values='rating')
mat_ratings = pd.concat([mat_ratings1, mat_ratings2])
mat_ratings.fillna(0, inplace=True)

In [95]:
# Show the shape and first 10 rows of the matrix.
print(mat_ratings.shape)
display(mat_ratings.iloc[265:].head(10))

(138493, 26729)


title,"""Great Performances"" Cats (1998)",#chicagoGirl: The Social Network Takes on a Dictator (2013),$ (Dollars) (1971),$5 a Day (2008),$9.99 (2008),$ellebrity (Sellebrity) (2012),'71 (2014),'Hellboy': The Seeds of Creation (2004),"'Human' Factor, The (Human Factor, The) (1975)",'Neath the Arizona Skies (1934),...,Zincirbozan (2007),Zodiac (2014),Zombies on Broadway (1945),Zulu (2013),alaskaLand (2013),i hate myself :) (2013),"¡Alambrista! (Illegal, The) (1977)",Åsa-Nisse - Wälkom to Knohult (2011),Üvegtigris (2001),貞子3D (2012)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
266,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
268,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
269,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
from scipy.sparse import csr_matrix

# Convert the 'mat_ratings' rating matrix to a 'sparse_ratings' sparse matrix.
sparse_ratings = csr_matrix(mat_ratings)

# Extract user IDs and moivie titles from the ratings matrix.
user_ids = mat_ratings.index.tolist()  
titles = mat_ratings.columns.tolist()  

# Show sparse matrix 'sparse_ratings'.
print(sparse_ratings)

  (0, 145)	3.5
  (0, 182)	3.5
  (0, 322)	4.0
  (0, 554)	4.0
  (0, 764)	4.0
  (0, 784)	4.0
  (0, 1022)	4.0
  (0, 1258)	3.5
  (0, 1339)	4.0
  (0, 1549)	3.5
  (0, 1995)	3.0
  (0, 2072)	4.0
  (0, 2409)	3.5
  (0, 2410)	4.0
  (0, 2462)	4.0
  (0, 2594)	4.0
  (0, 2938)	3.5
  (0, 3251)	4.0
  (0, 3283)	3.5
  (0, 3399)	3.0
  (0, 4083)	3.5
  (0, 4128)	3.5
  (0, 4240)	3.5
  (0, 4266)	4.0
  (0, 4304)	4.0
  :	:
  (138492, 21614)	4.5
  (138492, 21615)	4.5
  (138492, 21618)	4.5
  (138492, 21619)	4.5
  (138492, 21707)	4.5
  (138492, 21792)	5.0
  (138492, 21867)	4.5
  (138492, 21902)	4.0
  (138492, 22030)	4.0
  (138492, 22087)	4.5
  (138492, 22101)	4.5
  (138492, 22104)	4.5
  (138492, 22148)	5.0
  (138492, 22150)	4.5
  (138492, 22152)	4.0
  (138492, 22394)	4.5
  (138492, 22405)	4.0
  (138492, 22564)	4.5
  (138492, 22700)	5.0
  (138492, 22994)	4.5
  (138492, 22995)	4.5
  (138492, 22999)	4.5
  (138492, 23027)	4.0
  (138492, 23046)	5.0
  (138492, 23103)	5.0


In [100]:
import numpy as np

# Definition of a function 'sim_cos' to calculate the cosine similarity between two vectors 'x' and 'y'.
def sim_cos(x, y):
    # Calculation of the scalar product between the vectors 'x' and 'y'.
    dot_product = np.dot(x, y)
    
    # Calculation of Euclidean norms of 'x' and 'y'.
    norm_x = np.sqrt(np.sum(x ** 2))
    norm_y = np.sqrt(np.sum(y ** 2))
    
    # Checking if one of the standards is zero to avoid division by zero.
    if norm_x == 0 or norm_y == 0:
        return 0
    
    # Calculation of cosine similarity using the formula.
    similarity = dot_product / (norm_x * norm_y)
    return similarity


pref_1 = mat_ratings.loc[1, :].values
pref_2 = mat_ratings.loc[2, :].values
similarity = sim_cos(pref_1, pref_2)
print("The similarity between the two users is ", similarity)

The similarity between the two users is  0.10291643773488954


In [136]:
#Long compiling time, needs optimizing
sims = []
pref_1 = mat_ratings[mat_ratings.index == 1].values
for i in range(0, (len(mat_ratings)+1)):
    pref_2 = mat_ratings[mat_ratings.index == i].values
    if(len(pref_2) != 0):
        sims.append(sim_cos(pref_1[0], pref_2[0]))


In [137]:
df_cossim = pd.DataFrame(mat_ratings.index)
df_cossim["similiarity"] = sims

In [141]:
df_cossim.sort_values("similiarity", ascending = False)

Unnamed: 0,userId,similiarity
0,1,1.000000
81274,81275,0.442530
62234,62235,0.426172
110068,110069,0.417257
2594,2595,0.415359
...,...,...
116257,116258,0.000000
26349,26350,0.000000
62261,62262,0.000000
38564,38565,0.000000


In [90]:
import sklearn.metrics.pairwise as dist

# Using the 'cosine_similarity' function of the 'dist' module to calculate the cosine similarity between users.
user_similarity = dist.cosine_similarity(sparse_ratings)

# Creation of a pandas DataFrame from the similarity matrix between users.
# The indexes and columns of the DataFrame are the user identifiers.
user_similarity = pd.DataFrame(user_similarity, index=user_ids, columns=user_ids)

MemoryError: Unable to allocate 125. GiB for an array with shape (16800582983,) and data type int64

In [None]:

def pred_user(mat_ratings, user_similarity, k, user_id):

    # Select in mat_ratings the books that have not yet been read by the user
    to_predict = mat_ratings.loc[userId][mat_ratings.loc[userId]==0]

    # Select the k most similar users excluding the user itself
    similar_users = user_similarity.loc[user_id].sort_values(ascending=False)[1:k+1]
    
    # Calculation of the denominator
    norm = np.sum(np.abs(similar_users))

    for i in to_predict.index:
        # Retrieve similar user ratings associated with the movie i
        ratings = mat_ratings[i].loc[similar_users.index]
        
        # Calculate the dot product between ratings and similar_users
        scalar_prod = np.dot(ratings, similar_users)
        
        #Calculate predicted rating for movie i
        pred = scalar_prod / norm

        # Replace with prediction
        to_predict[i] = pred

    return to_predict

In [None]:
# Top ratings from user '1'
userId = '1'
user_preferences = ratings_movies_df[(ratings_movies_df['userId']==userId) & (df['rating']>=4)]
user_preferences.sort_values('rating', ascending=False).drop_duplicates().head(10)

In [None]:
# Solution

reco_user = pred_user(mat_ratings, user_similarity, 3,'1').sort_values(ascending=False).head(10)

print(reco_user)