## Lab 6 Collaborative Filtering

In [2]:
# imports
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

import numpy as np

%matplotlib inline

#### Q1

- MovieLens is a movie rec service
- Dataset has 5 star ratings and free text activity
- 100,836 ratings (0.5-5 w/ 0.5 increments)
- 9742 movies
- 3683 tags
- Users represented by ID (consistent b/w rating and tags)
- Movie ID consistent accross ratings, tags, movies and links
- Genres pipe separated list

In [3]:
links = pd.read_csv("links.csv")
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")

In [4]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


## User-Base Similarity
#### Q2

In [9]:
pivot_movie_user_rating = pd.pivot_table(ratings, values="rating", index="userId", columns="movieId")
pivot_movie_user_rating.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [10]:
pivot_movie_user_rating.isnull().sum().sum()

5830804

In [11]:
pivot_movie_user_rating.fillna(value=0, inplace=True)
pivot_movie_user_rating.isnull().sum().sum()

0

#### Q3, Q4, Q5

In [12]:
users_dist_by_movie = pairwise_distances(pivot_movie_user_rating, metric='cosine')
print(users_dist_by_movie)

[[0.00000000e+00 9.72717135e-01 9.40279738e-01 ... 7.08902628e-01
  9.06428070e-01 8.54679193e-01]
 [9.72717135e-01 1.11022302e-16 1.00000000e+00 ... 9.53789046e-01
  9.72434599e-01 8.97573246e-01]
 [9.40279738e-01 1.00000000e+00 1.11022302e-16 ... 9.78871538e-01
  1.00000000e+00 9.67881252e-01]
 ...
 [7.08902628e-01 9.53789046e-01 9.78871538e-01 ... 0.00000000e+00
  8.78007286e-01 6.77945142e-01]
 [9.06428070e-01 9.72434599e-01 1.00000000e+00 ... 8.78007286e-01
  2.22044605e-16 9.46774537e-01]
 [8.54679193e-01 8.97573246e-01 9.67881252e-01 ... 6.77945142e-01
  9.46774537e-01 0.00000000e+00]]


#### Q6

In [63]:
user_10_similarity = users_dist_by_movie[10 - 1, :]
# to get the top 5 most similar users we can get the highest values' indices
five_most_similar_users_to_id_10 = user_10_similarity.argsort()
five_most_similar_users_to_id_10 = five_most_similar_users_to_id_10[:5]
print(f"The users that are most similar to user ID 10 : {[x+1 for x in five_most_similar_users_to_id_10]} in the same high to low order")

The users that are most similar to user ID 10 have the ID: [10, 159, 143, 563, 177] in the same high to low order


#### Q7

In [15]:
rating_movie = pd.DataFrame(pd.merge(ratings, movies, on="movieId"))
rating_movie

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [18]:
user2_movie = pd.DataFrame(rating_movie[rating_movie["userId"] == 2][["title","rating"]])
user2_movie.rename(columns = {'title':'title', 'rating':'rating_2'}, inplace = True)
user2_movie.head()

Unnamed: 0,title,rating_2
2267,Tommy Boy (1995),4.0
15657,Gladiator (2000),4.0
16296,"Shawshank Redemption, The (1994)",3.0
16613,Good Will Hunting (1997),4.5
16754,Kill Bill: Vol. 1 (2003),4.0


In [19]:
user338_movie = pd.DataFrame(rating_movie[rating_movie["userId"] == 338][["title","rating"]])
user338_movie.rename(columns = {'title':'title', 'rating':'rating_338'}, inplace = True)
user338_movie.head()

Unnamed: 0,title,rating_338
692,"Usual Suspects, The (1995)",4.5
1997,Pulp Fiction (1994),4.5
3684,Schindler's List (1993),5.0
4462,"Silence of the Lambs, The (1991)",4.0
14435,Fight Club (1999),4.5


In [20]:
merged_by_title_user_2_338 = pd.merge(user2_movie, user338_movie, on="title")
merged_by_title_user_2_338.head()

Unnamed: 0,title,rating_2,rating_338
0,"Shawshank Redemption, The (1994)",3.0,5.0
1,Kill Bill: Vol. 1 (2003),4.0,4.5


#### Q8

In [21]:
merged_by_title_user_2_338[(merged_by_title_user_2_338["rating_2"] >= 4) & ((merged_by_title_user_2_338["rating_338"] >= 4))]

Unnamed: 0,title,rating_2,rating_338
1,Kill Bill: Vol. 1 (2003),4.0,4.5


## Item Based Similarity
#### Q9

In [22]:
df_movie_user = pd.DataFrame(rating_movie.pivot_table(index='title',columns='userId',values='rating'))
df_movie_user

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),,,,,,,,,,,...,,,5.0,,,,,4.5,,
xXx (2002),,,,,,,,,1.0,,...,,,,,,,,3.5,,2.0
xXx: State of the Union (2005),,,,,,,,,,,...,,,,,,,,,,1.5
¡Three Amigos! (1986),4.0,,,,,,,,,,...,,,,,,,,,,


In [23]:
df_movie_user.fillna(value=0, inplace=True)
df_movie_user.isnull().sum().sum()

0

#### Question 10

In [24]:
movie_dist_by_user = pairwise_distances(df_movie_user, metric='cosine')
print(movie_dist_by_user.shape)

(9719, 9719)


In [25]:
# Finding what index Godfather is atmovie_dist_by_user
# movie_dist_by_user[movie_dist_by_user["title"] == "Godfather"]
df_movie_user[df_movie_user.index.str.contains('Godfather')]

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Godfather, The (1972)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,5.0,0.0,0.0,4.0,4.0,5.0,0.0,5.0
"Godfather: Part II, The (1974)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,5.0,0.0,0.0,4.0,0.0,4.5,0.0,5.0
"Godfather: Part III, The (1990)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
The Godfather Trilogy: 1972-1990 (1992),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Tokyo Godfathers (2003),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


THE END