# Recommendation Movie

## 1. Matrix Factorization, Content-based filtering

In [118]:
import re
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

movies = pd.read_csv('data/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [119]:
def clean_title(title):
    return re.sub(r'\(\d{4}\)', '', title).strip()
movies["title"] = movies["title"].apply(clean_title)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy


In [120]:
movies["genres"] = movies["genres"].apply(lambda x: x.replace("|", " "))
movies["combine"] = movies["title"] + " " + movies["genres"]
movies.head()

Unnamed: 0,movieId,title,genres,combine
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,Toy Story Adventure Animation Children Comedy ...
1,2,Jumanji,Adventure Children Fantasy,Jumanji Adventure Children Fantasy
2,3,Grumpier Old Men,Comedy Romance,Grumpier Old Men Comedy Romance
3,4,Waiting to Exhale,Comedy Drama Romance,Waiting to Exhale Comedy Drama Romance
4,5,Father of the Bride Part II,Comedy,Father of the Bride Part II Comedy


In [121]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(movies["combine"])
tfidf.shape

(62423, 154860)

In [122]:
title = "Toy Story"
title = clean_title(title)
query_vector = vectorizer.transform([title])
query_vector.shape

(1, 154860)

In [123]:
cosine_sim = linear_kernel(query_vector, tfidf)
similarity = cosine_sim.flatten()
similarity_df = pd.DataFrame(similarity, index = movies["title"])
similarity_df.columns = ["similarity"]
similarity_df = similarity_df.sort_values("similarity", ascending = False)
similarity_df

Unnamed: 0_level_0,similarity
title,Unnamed: 1_level_1
Toy Story 4,0.687338
Toy Story,0.652562
Toy Story 2,0.652562
Toy Story 3,0.584308
Toy Story of Terror,0.576664
...,...
Ten Little Indians (Ein Unbekannter rechnet ab) (And Then There Were None),0.000000
Mustalaishurmaaja,0.000000
"Trials of Muhammad Ali, The",0.000000
You Ain't Seen Nothin' Yet (Vous n'avez encore rien vu),0.000000


In [124]:
related_movie_indices = similarity.argsort()[:-6:-1]
result = movies.iloc[related_movie_indices]
result

Unnamed: 0,movieId,title,genres,combine
59767,201588,Toy Story 4,Adventure Animation Children Comedy,Toy Story 4 Adventure Animation Children Comedy
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,Toy Story Adventure Animation Children Comedy ...
3021,3114,Toy Story 2,Adventure Animation Children Comedy Fantasy,Toy Story 2 Adventure Animation Children Comed...
14813,78499,Toy Story 3,Adventure Animation Children Comedy Fantasy IMAX,Toy Story 3 Adventure Animation Children Comed...
20497,106022,Toy Story of Terror,Animation Children Comedy,Toy Story of Terror Animation Children Comedy


In [125]:
def recommend_movie(title):
    title = clean_title(title)
    query_vector = vectorizer.transform([title])
    cosine_sim = linear_kernel(query_vector, tfidf)
    similarity = cosine_sim.flatten()
    similarity_df = pd.DataFrame(similarity, index = movies["title"])
    similarity_df.columns = ["similarity"]
    related_movie_indices = similarity.argsort()[:-6:-1]
    result = movies.iloc[related_movie_indices]
    return result

## 2. Collaborative filtering based on user ratings

In [126]:
ratings = pd.read_csv("data/ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [127]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

### Find users who have rates above 4 for the movie

In [128]:
movie_id = movies[movies["title"] == title]["movieId"].values[0]
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"].unique()
similar_users

array([     3,      5,      8, ..., 162530, 162533, 162534], dtype=int64)

### Find other movies that similar users have rated above 4

In [129]:
similar_users_rec = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] >= 4)]
similar_users_rec

Unnamed: 0,userId,movieId,rating,timestamp
254,3,1,4.0,1439472215
255,3,29,4.5,1484754967
256,3,32,4.5,1439474635
257,3,50,5.0,1439474391
258,3,111,4.0,1484753849
...,...,...,...,...
24999332,162534,166643,4.0,1526891765
24999342,162534,171763,4.0,1526717390
24999348,162534,177593,5.0,1526666314
24999351,162534,177765,4.0,1526666311


### Find the percentage of ratings for each movie that similar users have rated above 4

In [130]:
similar_users_rec = similar_users_rec["movieId"].value_counts() / len(similar_users)
similar_users_rec = similar_users_rec[similar_users_rec > 0.2]
similar_users_rec

movieId
1       1.000000
318     0.549604
260     0.531518
356     0.517224
296     0.495744
          ...   
1258    0.203055
4963    0.201835
4973    0.201835
592     0.200642
6874    0.200615
Name: count, Length: 79, dtype: float64

### Find the average rating for each movie that all users have rated above 4

In [131]:
all_users = ratings[(ratings["movieId"].isin(similar_users_rec.index)) & (ratings["rating"] >= 4)]
all_users_rec = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
all_users_rec

movieId
318     0.450106
296     0.398414
356     0.375811
593     0.370028
2571    0.355813
          ...   
733     0.111374
34      0.108572
3114    0.104538
919     0.103529
1073    0.101340
Name: count, Length: 79, dtype: float64

In [132]:
rec_percentage = pd.concat([similar_users_rec, all_users_rec], axis = 1)
rec_percentage.columns = ["similar_users", "all_users"]
rec_percentage = rec_percentage.fillna(0)
rec_percentage

Unnamed: 0_level_0,similar_users,all_users
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.240704
318,0.549604,0.450106
260,0.531518,0.332559
356,0.517224,0.375811
296,0.495744,0.398414
...,...,...
1258,0.203055,0.137041
4963,0.201835,0.122296
4973,0.201835,0.164029
592,0.200642,0.112849


In [133]:
rec_percentage["score"] = (rec_percentage["similar_users"] / rec_percentage["all_users"]) / 10
rec = rec_percentage.sort_values("score", ascending = False)
rec

Unnamed: 0_level_0,similar_users,all_users,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.240704,0.415447
3114,0.328914,0.104538,0.314636
1073,0.231801,0.101340,0.228737
4886,0.309316,0.138611,0.223153
6377,0.294094,0.137137,0.214453
...,...,...,...
4973,0.201835,0.164029,0.123048
58559,0.252327,0.205635,0.122706
318,0.549604,0.450106,0.122105
2959,0.351826,0.299092,0.117631


In [134]:
rec = pd.DataFrame(rec.merge(movies, left_index = True, right_on = "movieId"))
rec

Unnamed: 0,similar_users,all_users,score,movieId,title,genres,combine
0,1.000000,0.240704,0.415447,1,Toy Story,Adventure Animation Children Comedy Fantasy,Toy Story Adventure Animation Children Comedy ...
3021,0.328914,0.104538,0.314636,3114,Toy Story 2,Adventure Animation Children Comedy Fantasy,Toy Story 2 Adventure Animation Children Comed...
1047,0.231801,0.101340,0.228737,1073,Willy Wonka & the Chocolate Factory,Children Comedy Fantasy Musical,Willy Wonka & the Chocolate Factory Children C...
4780,0.309316,0.138611,0.223153,4886,"Monsters, Inc.",Adventure Animation Children Comedy Fantasy,"Monsters, Inc. Adventure Animation Children Co..."
6258,0.294094,0.137137,0.214453,6377,Finding Nemo,Adventure Animation Children Comedy,Finding Nemo Adventure Animation Children Comedy
...,...,...,...,...,...,...,...
4867,0.201835,0.164029,0.123048,4973,"Amelie (Fabuleux destin d'Amélie Poulain, Le)",Comedy Romance,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ..."
12221,0.252327,0.205635,0.122706,58559,"Dark Knight, The",Action Crime Drama IMAX,"Dark Knight, The Action Crime Drama IMAX"
314,0.549604,0.450106,0.122105,318,"Shawshank Redemption, The",Crime Drama,"Shawshank Redemption, The Crime Drama"
2867,0.351826,0.299092,0.117631,2959,Fight Club,Action Crime Drama Thriller,Fight Club Action Crime Drama Thriller


## Combine two methods 

In [135]:
merged_df = pd.merge(similarity_df, rec, on="title", how="left")
merged_df.drop(columns=["movieId", "genres", "combine", "similar_users", "all_users"], inplace=True)
merged_df = merged_df.fillna(0)
merged_df["total_score"] = (merged_df["similarity"] + merged_df["score"] * 2) / 3
merged_df = merged_df.sort_values ("total_score", ascending = False)
merged_df = merged_df.drop_duplicates(subset = "title")
merged_df

Unnamed: 0,title,similarity,score,total_score
1,Toy Story,0.652562,0.415447,0.494486
2,Toy Story 2,0.652562,0.314636,0.427278
0,Toy Story 4,0.687338,0.000000,0.229113
3,Toy Story 3,0.584308,0.000000,0.194769
4,Toy Story of Terror,0.576664,0.000000,0.192221
...,...,...,...,...
21189,Fangoria: Blood Drive,0.000000,0.000000,0.000000
21190,One Hell of a Christmas,0.000000,0.000000,0.000000
21191,Scream - Because I Will Kill You!,0.000000,0.000000,0.000000
21192,Angel of the Night,0.000000,0.000000,0.000000


In [136]:
movie_input_name = widgets.Text(
    value='',
    placeholder='Type something',
    description='Movie:',
    disabled=False
)

display(movie_input_name)

Text(value='', description='Movie:', placeholder='Type something')