In [121]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score

# Loading MovieLens rating dataset

In [123]:
ratings = pd.read_csv("/content/dataset/ratings.csv")
movies = pd.read_csv("/content/dataset/movies.csv")

In [124]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [125]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [126]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [127]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

# Merge datasets

In [128]:
data = pd.merge(ratings, movies, on='movieId', how='inner')
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [129]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
 4   title      100836 non-null  object 
 5   genres     100836 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 4.6+ MB


In [130]:
# Aggregate by movie
agg_ratings = data.groupby('title').agg(mean_rating = ('rating', 'mean'),ratings_count = ('rating', 'count')).reset_index()
agg_ratings.head()

Unnamed: 0,title,mean_rating,ratings_count
0,'71 (2014),4.0,1
1,'Hellboy': The Seeds of Creation (2004),4.0,1
2,'Round Midnight (1986),3.5,2
3,'Salem's Lot (2004),5.0,1
4,'Til There Was You (1997),4.0,2


In [131]:
# Create user-item matrix
matrix = data.pivot_table(index='title', columns='userId', values='rating')
matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,


In [132]:
# Normalize user-item matrix
matrix_norm = matrix.subtract(matrix.mean(axis=1), axis = 0)
matrix_norm.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,0.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,


In [133]:
# Item similarity matrix using Pearson correlation
item_similarity = matrix_norm.T.corr()
item_similarity.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,1.0,,,,,,...,,,,,,,,,,


In [134]:
# Item similarity matrix using cosine similarity
item_similarity_cosine = cosine_similarity(matrix_norm.fillna(0))
item_similarity_cosine

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [135]:
# Pick a user ID
picked_userid = 5

# Pick a movie
picked_movie = 'Toy Story (1995)'

# Movies that the target user has watched
picked_userid_watched = pd.DataFrame(matrix_norm[picked_userid].dropna(axis=0, how='all')\
                          .sort_values(ascending=False))\
                          .reset_index()\
                          .rename(columns={picked_userid:'rating'})

picked_userid_watched.head()

Unnamed: 0,title,rating
0,Pinocchio (1940),1.55
1,Snow White and the Seven Dwarfs (1937),1.383117
2,Beauty and the Beast (1991),1.229452
3,Dances with Wolves (1990),1.164634
4,Heavenly Creatures (1994),1.071429


In [136]:
# Similarity score of the movie American Pie with all the other movies
picked_movie_similarity_score = item_similarity[[picked_movie]].reset_index().rename(columns={'Toy Story (1995)':'similarity_score'})

# Rank the similarities between the movies user 1 rated and American Pie.
n = 5
picked_userid_watched_similarity = pd.merge(left=picked_userid_watched,
                                            right=picked_movie_similarity_score,
                                            on='title',
                                            how='inner')\
                                     .sort_values('similarity_score', ascending=False)[:5]

# Take a look at the User 1 watched movies with highest similarity
picked_userid_watched_similarity

Unnamed: 0,title,rating,similarity_score
20,Toy Story (1995),0.07907,1.0
10,"Secret Garden, The (1993)",0.75,0.763257
15,Aladdin (1992),0.20765,0.611892
0,Pinocchio (1940),1.55,0.42747
13,Babe (1995),0.347656,0.407493


In [137]:
# Calculate the predicted rating using weighted average of similarity scores and the ratings from user 1
predicted_rating = round(np.average(picked_userid_watched_similarity['rating'],
                                    weights=picked_userid_watched_similarity['similarity_score']), 6)

print(f'The predicted rating for {picked_movie} by user {picked_userid} is {predicted_rating}' )

The predicted rating for Toy Story (1995) by user 5 is 0.493073


In [141]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def item_based_rec(picked_userid=1, number_of_similar_items=5, number_of_recommendations=3):
    import operator
    # Movies that the target user has not watched
    picked_userid_unwatched = pd.DataFrame(matrix_norm[picked_userid].isna()).reset_index()
    picked_userid_unwatched = picked_userid_unwatched[picked_userid_unwatched[picked_userid] == True]['title'].values.tolist()

    # Movies that the target user has watched
    picked_userid_watched = pd.DataFrame(matrix_norm[picked_userid].dropna(axis=0, how='all')
                                         .sort_values(ascending=False)) \
        .reset_index() \
        .rename(columns={picked_userid: 'rating'})

    # Dictionary to save the unwatched movie and predicted rating pair
    rating_prediction = {}

    # Loop through unwatched movies
    for picked_movie in picked_userid_unwatched:
        # Calculate the similarity score of the picked movie with other movies
        picked_movie_similarity_score = item_similarity[[picked_movie]].reset_index().rename(columns={picked_movie: 'similarity_score'})
        # Rank the similarities between the picked user watched movie and the picked unwatched movie.
        picked_userid_watched_similarity = pd.merge(left=picked_userid_watched,
                                                    right=picked_movie_similarity_score,
                                                    on='title',
                                                    how='inner') \
            .sort_values('similarity_score', ascending=False)[:number_of_similar_items]
        # Check if the sum of similarity scores is not zero
        if picked_userid_watched_similarity['similarity_score'].sum() != 0:
            # Calculate the predicted rating using weighted average of similarity scores and the ratings from user 1
            predicted_rating = round(np.average(picked_userid_watched_similarity['rating'],
                                                weights=picked_userid_watched_similarity['similarity_score']), 6)
            # Save the predicted rating in the dictionary
            rating_prediction[picked_movie] = predicted_rating
        else:
            # Assign a default value if the sum of similarity scores is zero
            rating_prediction[picked_movie] = 0

    # Return the top recommended movies
    return sorted(rating_prediction.items(), key=operator.itemgetter(1), reverse=True)[:number_of_recommendations]

# Get recommendations
recommended_movie = item_based_rec(picked_userid=20, number_of_similar_items=5, number_of_recommendations=5)
recommended_movie


[('Kin-Dza-Dza! (1986)', 3.661683),
 ('20 Feet from Stardom (Twenty Feet from Stardom) (2013)', 3.425952),
 ('Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 1.757857),
 ('Diabolique (Les diaboliques) (1955)', 1.754604),
 ('Kitchen Stories (Salmer fra kjøkkenet) (2003)', 1.399006)]