# Movie Recommendation

## 1. Content-based filtering (TF-IDF over titles and genres)

In [39]:
import re
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the movie catalogue: one row per movie with its id, title and
# pipe-separated genre list.
movies_csv = 'data/movies.csv'
movies = pd.read_csv(movies_csv)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [40]:
def clean_title(title):
    """Strip four-digit release-year tags such as ``(1995)`` from a title.

    Every ``(dddd)`` group is removed together with the whitespace directly
    in front of it, so a year tag in the middle of a title does not leave a
    double space behind. Leading and trailing whitespace is trimmed.

    Args:
        title: Raw title string, e.g. ``"Toy Story (1995)"``.

    Returns:
        The title without year tags, e.g. ``"Toy Story"``.
    """
    return re.sub(r'\s*\(\d{4}\)', '', title).strip()
# Overwrite each raw title with its year-stripped form, then preview the frame.
movies["title"] = movies["title"].map(clean_title)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy


In [41]:
# Turn the pipe-separated genre list into space-separated words.
# The vectorized .str accessor replaces the per-row lambda and propagates
# missing values instead of raising AttributeError on them.
# regex=False treats "|" as a literal character, not as regex alternation.
movies["genres"] = movies["genres"].str.replace("|", " ", regex=False)
# Text that is later vectorized: cleaned title followed by the genre words.
movies["combine"] = movies["title"] + " " + movies["genres"]
movies.head()

Unnamed: 0,movieId,title,genres,combine
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,Toy Story Adventure Animation Children Comedy ...
1,2,Jumanji,Adventure Children Fantasy,Jumanji Adventure Children Fantasy
2,3,Grumpier Old Men,Comedy Romance,Grumpier Old Men Comedy Romance
3,4,Waiting to Exhale,Comedy Drama Romance,Waiting to Exhale Comedy Drama Romance
4,5,Father of the Bride Part II,Comedy,Father of the Bride Part II Comedy


In [42]:
# Fit a TF-IDF model on the combined title + genre text, using unigrams and
# bigrams as features. The resulting matrix has one row per movie.
corpus = movies["combine"]
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(corpus)
tfidf.shape

(62423, 154860)

In [43]:
# Example query: normalize it the same way as the catalogue titles, then
# project it onto the vocabulary learned by the fitted vectorizer.
title = clean_title("Old Men")
query_vector = vectorizer.transform([title])
query_vector.shape

(1, 154860)

In [44]:
# Score every movie against the query. linear_kernel is a plain dot product;
# it equals cosine similarity when the TF-IDF rows are L2-normalized
# (TfidfVectorizer's default norm).
cosine_sim = linear_kernel(query_vector, tfidf).flatten()

# Positions of the best-scoring movies, best match first.
top_n = 5
related_movie_indices = cosine_sim.argsort()[::-1][:top_n]
result = movies.iloc[related_movie_indices]
result

Unnamed: 0,movieId,title,genres,combine
3354,3450,Grumpy Old Men,Comedy,Grumpy Old Men Comedy
32828,141820,Old Men: Robbers,Comedy,Old Men: Robbers Comedy
2,3,Grumpier Old Men,Comedy Romance,Grumpier Old Men Comedy Romance
11932,55820,No Country for Old Men,Crime Drama,No Country for Old Men Crime Drama
47016,173637,Seven Old Men and One Girl,Comedy,Seven Old Men and One Girl Comedy


## 2. Collaborative filtering based on user ratings

In [45]:
# Load the user ratings: one row per (userId, movieId) rating with a timestamp.
ratings_csv = "data/ratings.csv"
ratings = pd.read_csv(ratings_csv)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [46]:
# Check the column dtypes before filtering on movieId and rating below.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

### Find users who rated the movie 4 or higher

In [47]:
# Seed movie for the recommendation (movieId 1 is "Toy Story" in movies.csv).
movie_id = 1
# Rows where the seed movie received a rating of 4 or higher.
liked_seed_movie = (ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)
similar_users = ratings.loc[liked_seed_movie, "userId"].unique()
similar_users

array([     3,      5,      8, ..., 162530, 162533, 162534], dtype=int64)

### Find other movies that similar users have rated 4 or higher

In [48]:
# Every rating of 4 or higher given by the users who liked the seed movie.
by_similar_user = ratings["userId"].isin(similar_users)
rated_highly = ratings["rating"] >= 4
similar_users_rec = ratings[by_similar_user & rated_highly]
similar_users_rec

Unnamed: 0,userId,movieId,rating,timestamp
254,3,1,4.0,1439472215
255,3,29,4.5,1484754967
256,3,32,4.5,1439474635
257,3,50,5.0,1439474391
258,3,111,4.0,1484753849
...,...,...,...,...
24999332,162534,166643,4.0,1526891765
24999342,162534,171763,4.0,1526717390
24999348,162534,177593,5.0,1526666314
24999351,162534,177765,4.0,1526666311


### Find the fraction of similar users who rated each movie 4 or higher (keeping movies above 20%)

In [49]:
# Number of high ratings per movie divided by the number of similar users,
# keeping only movies whose share exceeds 0.2.
# NOTE: this rebinds similar_users_rec from the filtered ratings DataFrame to
# a Series indexed by movieId, so re-running this cell without first
# re-running the previous one fails (a Series has no "movieId" label).
liked_share = similar_users_rec["movieId"].value_counts().div(len(similar_users))
similar_users_rec = liked_share[liked_share.gt(0.2)]
similar_users_rec

movieId
1       1.000000
318     0.549604
260     0.531518
356     0.517224
296     0.495744
          ...   
1258    0.203055
4963    0.201835
4973    0.201835
592     0.200642
6874    0.200615
Name: count, Length: 79, dtype: float64

### For the same movies, find the fraction of all users who rated each one 4 or higher

In [68]:
# Ratings of 4 or higher, from any user, for the candidate movies found above.
is_candidate = ratings["movieId"].isin(similar_users_rec.index)
rated_highly = ratings["rating"] >= 4
all_users = ratings[is_candidate & rated_highly]

# Number of high ratings per movie divided by the number of distinct users
# who gave at least one such rating.
all_users_rec = all_users["movieId"].value_counts() / all_users["userId"].nunique()
all_users_rec

movieId
318      0.450106
296      0.398414
356      0.375811
593      0.370028
2571     0.355813
260      0.332559
527      0.310735
2959     0.299092
50       0.291381
1196     0.282431
858      0.277102
1198     0.265152
4993     0.265088
110      0.263282
2858     0.256918
589      0.250809
1210     0.248926
47       0.242888
1        0.240704
7153     0.240481
5952     0.240430
608      0.232017
480      0.224185
457      0.221906
2028     0.216550
1270     0.215376
2762     0.209905
58559    0.205635
4226     0.203324
32       0.201390
Name: count, dtype: float64

In [69]:
# Put the two shares side by side, aligned on movieId. Movies missing from
# either series get 0 instead of NaN.
rec_percentage = (
    pd.concat([similar_users_rec, all_users_rec], axis=1)
    .set_axis(["similar_users", "all_users"], axis=1)
    .fillna(0)
)
rec_percentage

Unnamed: 0_level_0,similar_users,all_users
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.240704
318,0.549604,0.450106
260,0.531518,0.332559
356,0.517224,0.375811
296,0.495744,0.398414
...,...,...
1258,0.203055,0.000000
4963,0.201835,0.000000
4973,0.201835,0.000000
592,0.200642,0.000000
