In [1]:
from itertools import permutations

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.spatial.distance import pdist, squareform

from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


Pivoting your data

In this chapter, you will go one step further in generating personalized recommendations — you will find items that users, similar to the one you are making recommendations for, have liked.

The first step you will need to start with is formatting your data. You begin with a dataset containing users and their ratings as individual rows with the following columns:

user: User ID
title: Title of the movie
rating: Rating the user gave the movie
You will need to transform the DataFrame into a user rating matrix where each row represents a user, and each column represents the movies on the platform. This will allow you to easily compare users and their preferences.

In [2]:
user_ratings = pd.read_csv('user_ratings.csv')
user_ratings.reset_index(drop=True, inplace=True)
user_ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [3]:
# Check for duplicates based on 'userId', 'movieId', and 'title'
user_ratings_duplicates = user_ratings[user_ratings.duplicated(subset=['userId', 'title', 'rating'], keep=False)]

user_ratings_duplicates['len_genres'] = user_ratings_duplicates['genres'].apply(len)

user_ratings_duplicates.sort_values(['userId','len_genres'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_ratings_duplicates['len_genres'] = user_ratings_duplicates['genres'].apply(len)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres,len_genres
73911,28,64997,3.5,1234850075,War of the Worlds (2005),Action|Sci-Fi,13
39411,28,34048,3.5,1234516420,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller,32
88958,111,6003,4.0,1516468531,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller,27
89385,111,144606,4.0,1517441257,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller,35
99604,509,26958,3.5,1436031753,Emma (1996),Romance,7
33526,509,838,3.5,1436031723,Emma (1996),Comedy|Drama|Romance,20


In [4]:
# I want to remove the duplicated with incomplete genres description (lower len_genres)

to_keep = user_ratings_duplicates.groupby('userId')['len_genres'].max()
to_keep.index, to_keep.values

(Index([28, 111, 509], dtype='int64', name='userId'),
 array([32, 35, 20], dtype=int64))

In [5]:
to_keep_indices = user_ratings_duplicates[user_ratings_duplicates.userId.isin(to_keep.index) & user_ratings_duplicates.len_genres.isin(to_keep.values)].index
to_keep_indices

Index([33526, 39411, 89385], dtype='int64')

In [6]:
to_remove_indices = user_ratings_duplicates[~user_ratings_duplicates.index.isin(to_keep_indices)].index
to_remove_indices

Index([73911, 88958, 99604], dtype='int64')

In [7]:
user_ratings_no_dupl = user_ratings[~user_ratings.index.isin(to_remove_indices)]
user_ratings_no_dupl

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [8]:
user_ratings_table = user_ratings_no_dupl.pivot_table(index='userId', columns='title', values='rating')

user_ratings_table

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


Compensating for incomplete data

For most datasets, the majority of users will have rated only a small number of items. As you saw in the last exercise, how you deal with users who do not have ratings for an item can greatly influence the validity of your models.

In this exercise, you will fill in missing data with information that should not bias the data that you do have.

You'll get the average score each user has given across all their ratings, and then use this average to center the users' scores around zero. Finally, you'll be able to fill in the empty values with zeros, which is now a neutral score, minimizing the impact on their overall profile, but still allowing the comparison of users.