# Movie Recommendation System 

## Completed By Ashish Kothwal 

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display
import ipywidgets as widgets
import pandas as pd
import numpy as np
import re

In [2]:
# Load the movie catalogue; later cells use its movieId, title and genres columns.
movies = pd.read_csv("data/movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


#### The function "clean_title" below will clean movie titles using regex. It will search through each title and remove any characters that are not a space, digit, or letter (lowercase or uppercase).

In [3]:
def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Punctuation such as parentheses, commas and apostrophes is dropped, and so
    is any letter outside ``a-z``/``A-Z`` (accented characters, for example).
    Spaces are kept as they are, so word boundaries survive.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    cleaned = disallowed.sub("", title)
    return cleaned

#### A new column called 'Clean-title' will be created in the data frame to store the cleaned up titles.

In [4]:
# Keep the regex-cleaned titles next to the originals; the TF-IDF index is built from this column.
movies["Clean-title"] = movies["title"].apply(clean_title)
movies.head()

Unnamed: 0,movieId,title,genres,Clean-title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


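#### Creating a TF-IDF (Term Frequency–Inverse Document Frequency) matrix from the cleaned titles. TF-IDF weights each word by how often it appears in a title, scaled down by the logarithm of how many titles contain it, so a very common word such as "the" counts for little while rare, distinctive words count for more.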

#### The parameter "ngram_range=(1,2)" makes the vectorizer look not only at individual words in the title but also at consecutive pairs of words, which makes the search more accurate.

In [5]:
# ngram_range=(1,2): index single words and consecutive word pairs.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit on the cleaned titles and build one TF-IDF row per row of `movies`;
# search() maps similarity positions back to `movies` by row position.
tfidf = vectorizer.fit_transform(movies["Clean-title"])

#### Creating the search function. We will compute the similarity between the term entered and all of the movies in our list using cosine similarity.

In [6]:
def search(title, top_n=5):
    """Return the movies whose cleaned titles best match a free-text query.

    The query is cleaned with ``clean_title``, transformed with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` using cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text title to look for.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies`` ordered from most to least similar,
        so ``results.iloc[0]`` is the best match. Ties keep the row order of
        ``movies``.
    """
    # Clean the query the same way the indexed titles were cleaned.
    title = clean_title(title)

    # TF-IDF vector of the query, using the vocabulary learned from the titles.
    query_vec = vectorizer.transform([title])

    # Cosine similarity between the query and every indexed title.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # np.argpartition only guarantees *which* entries are the largest, not the
    # order among them, so rank with a stable argsort on the negated scores.
    # Slicing also copes with an index that holds fewer than top_n titles.
    k = max(0, min(top_n, len(similarity)))
    indices = np.argsort(-similarity, kind="stable")[:k]

    return movies.iloc[indices]

In [7]:
# Try the search with a single-word query.
search("Rocky")

Unnamed: 0,movieId,title,genres,Clean-title
1812,2412,Rocky V (1990),Action|Drama,Rocky V 1990
1431,1954,Rocky (1976),Drama,Rocky 1976
1809,2409,Rocky II (1979),Action|Drama,Rocky II 1979
1810,2410,Rocky III (1982),Action|Drama,Rocky III 1982
1811,2411,Rocky IV (1985),Action|Drama,Rocky IV 1985


In [8]:
# Try the search with a multi-word query.
search("The Girl on the Train")

Unnamed: 0,movieId,title,genres,Clean-title
9373,162602,The Girl on the Train (2016),Thriller,The Girl on the Train 2016
8743,127194,The D Train (2015),Comedy,The D Train 2015
4340,6339,"Man on the Train (Homme du train, L') (2002)",Comedy|Drama,Man on the Train Homme du train L 2002
2856,3822,"Girl on the Bridge, The (Fille sur le pont, La...",Drama|Romance,Girl on the Bridge The Fille sur le pont La 1999
3209,4333,Throw Momma from the Train (1987),Comedy|Crime,Throw Momma from the Train 1987


#### Read in the movie ratings data set, which we will use to find movies similar to a movie we liked.

In [9]:
# Load the per-user ratings (userId, movieId, rating, timestamp).
ratings = pd.read_csv("data/ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [10]:
# Inspect the column dtypes before filtering on movieId and rating.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

#### Find all the users who also liked the movie we typed in, then find the other movies they liked, because those are probably going to be good recommendations. In short: what else did the people who liked the same movie as us like?

In [11]:
# Example input: the movieId we typed in (1 is "Toy Story (1995)").

movie_id = 1

# Find the users who rated this movie and liked it.
# A rating strictly above 4.5 (out of 5) counts as "liked" in this cell.
# NOTE(review): the all-users cell and find_similar_movies use `> 4` instead - confirm which threshold is intended.

similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4.5)]["userId"].unique()
similar_users

array([ 31,  40,  43,  46,  57,  63,  71,  96, 145, 151, 166, 171, 177,
       201, 206, 220, 229, 234, 240, 247, 269, 270, 273, 275, 304, 328,
       341, 347, 353, 357, 364, 367, 380, 389, 396, 411, 448, 451, 453,
       456, 471, 533, 559, 573, 584, 587, 610], dtype=int64)

In [12]:
# Take the users who liked our movie (similar_users) and collect the movieId of
# every rating above 4.5 they gave; one entry is kept per matching rating row.
# NOTE(review): the all-users cell and find_similar_movies use `> 4` instead - confirm which threshold is intended.

similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4.5)]["movieId"]
similar_user_recs

4879           1
4890         377
4891         588
4894         648
4895         733
           ...  
100814    158238
100829    164179
100832    168248
100833    168250
100834    168252
Name: movieId, Length: 2067, dtype: int64

#### We only want the movies that more than 10% of the users similar to us liked, so first compute, for each movie, the share of similar users who liked it.

In [13]:
# value_counts() counts how many times each movie appears; dividing by the number
# of similar users turns the count into the share of similar users who liked it.
# NOTE: this cell overwrites similar_user_recs, so re-run the previous cell before re-running this one.

similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs

1         1.000000
296       0.404255
260       0.340426
593       0.276596
356       0.276596
            ...   
5241      0.021277
5339      0.021277
5466      0.021277
5603      0.021277
168250    0.021277
Name: movieId, Length: 1086, dtype: float64

#### Only keep the movies with a share greater than 10%

In [14]:
# Keep only the movies whose share among similar users is above 0.1 (10%).
similar_user_recs = similar_user_recs[similar_user_recs > .1]
similar_user_recs

1        1.000000
296      0.404255
260      0.340426
593      0.276596
356      0.276596
           ...   
2396     0.106383
58559    0.106383
2791     0.106383
25       0.106383
2918     0.106383
Name: movieId, Length: 76, dtype: float64

#### Finding how much all users like these candidate movies

In [15]:
# Ratings above 4, from any user, for the movies that passed the 10% filter.
# NOTE(review): the similar-user cells above use `> 4.5` while this cell uses `> 4` - confirm which is intended.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [16]:
# Inspect the filtered ratings.
all_users

Unnamed: 0,userId,movieId,rating,timestamp
3,1,47,5.0,964983815
4,1,50,5.0,964982931
15,1,260,5.0,964981680
25,1,457,5.0,964981909
28,1,527,5.0,964984002
...,...,...,...,...
99699,610,2959,5.0,1479541966
99709,610,3114,5.0,1479542923
99810,610,4226,5.0,1493844685
100310,610,58559,4.5,1493844688


#### Looking at all of the users who liked the movies that were recommended to us: what percentage of those users liked each of these movies?

In [17]:
# Share of users who liked each candidate movie. The denominator is the number of distinct
# users present in all_users (users who liked at least one candidate), not every user in ratings.
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [18]:
# Inspect the per-movie shares, indexed by movieId.
all_users_recs

318     0.372694
296     0.308118
356     0.285978
2571    0.276753
2959    0.239852
          ...   
553     0.036900
736     0.035055
500     0.031365
3175    0.025830
661     0.014760
Name: movieId, Length: 76, dtype: float64

#### Creating a recommendation score. Comparing the percentages.

In [19]:
# Put the two shares side by side; both Series are indexed by movieId, so concat aligns them per movie.
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [20]:
# Inspect the combined table: one row per candidate movieId.
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.119926
296,0.404255,0.308118
260,0.340426,0.236162
593,0.276596,0.236162
356,0.276596,0.285978
...,...,...
2396,0.106383,0.044280
58559,0.106383,0.145756
2791,0.106383,0.047970
25,0.106383,0.038745


#### We want movies where the share among similar users is much larger than the share among all users, so the score is the ratio of the two numbers.

In [21]:
# Ratio of the two shares: a score above 1 means the movie is liked more often
# by similar users than in the all-users baseline.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [22]:
# Highest scores first.
rec_percentages = rec_percentages.sort_values("score", ascending = False)

In [23]:
# Inspect the ranked candidates.
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.119926,8.338462
661,0.106383,0.014760,7.207447
3175,0.127660,0.025830,4.942249
500,0.148936,0.031365,4.748436
2716,0.212766,0.055351,3.843972
...,...,...,...
608,0.106383,0.149446,0.711847
2959,0.170213,0.239852,0.709656
527,0.148936,0.236162,0.630652
318,0.234043,0.372694,0.627976


In [24]:
# Attach title and genres to the ten highest-scoring movies; rec_percentages is
# indexed by movieId, so join its index against the movieId column of movies.
rec_percentages.head(10).merge(movies, left_index = True, right_on = "movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,Clean-title
0,1.0,0.119926,8.338462,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
551,0.106383,0.01476,7.207447,661,James and the Giant Peach (1996),Adventure|Animation|Children|Fantasy|Musical,James and the Giant Peach 1996
2393,0.12766,0.02583,4.942249,3175,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi,Galaxy Quest 1999
436,0.148936,0.031365,4.748436,500,Mrs. Doubtfire (1993),Comedy|Drama,Mrs Doubtfire 1993
2038,0.212766,0.055351,3.843972,2716,Ghostbusters (a.k.a. Ghost Busters) (1984),Action|Comedy|Sci-Fi,Ghostbusters aka Ghost Busters 1984
32,0.212766,0.055351,3.843972,34,Babe (1995),Children|Drama,Babe 1995
2355,0.212766,0.060886,3.49452,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
7355,0.12766,0.038745,3.294833,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
138,0.170213,0.053506,3.181218,165,Die Hard: With a Vengeance (1995),Action|Crime|Thriller,Die Hard With a Vengeance 1995
594,0.106383,0.035055,3.034714,736,Twister (1996),Action|Adventure|Romance|Thriller,Twister 1996


#### Building a recommendation function

In [25]:
def find_similar_movies(movie_id, min_rating=4, min_share=.1, top_n=10):
    """Recommend movies that people who liked ``movie_id`` like unusually often.

    Users who rated ``movie_id`` above ``min_rating`` are treated as "similar
    users". Every movie those users also rated above ``min_rating`` is a
    candidate, and two shares are computed for it:

    * ``similar``: the fraction of similar users who liked it;
    * ``all``: the number of users overall who liked it, divided by the
      number of distinct users who liked at least one surviving candidate.

    The score is ``similar / all``; a high score means the movie is liked
    disproportionately by the similar users.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to start from.
    min_rating : float, default 4
        A rating strictly greater than this counts as "liked".
    min_share : float, default 0.1
        Candidates must be liked by strictly more than this fraction of the
        similar users.
    top_n : int, default 10
        Number of highest-scoring candidates to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for the ``top_n``
        candidates, best score first. Empty when nobody rated ``movie_id``
        above ``min_rating``.
    """
    # Every step only looks at "liked" ratings, so filter once up front.
    liked = ratings[ratings["rating"] > min_rating]

    # Finding recommendations from users similar to us
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nothing to base a recommendation on; avoid dividing by zero below.
        return pd.DataFrame(columns=["score", "title", "genres"])
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]

    # Keep only the movies liked by more than min_share of the similar users
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Finding how common those recommendations are among all of the users.
    # The denominator is the number of distinct users who liked at least one
    # candidate, not the total number of users in `ratings`.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_users_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

    # Creating score: both Series are indexed by movieId, so concat aligns them
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    top = rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")
    return top[["score", "title", "genres"]]

In [26]:
# Treat the first search hit as the movie we meant (this relies on search()
# returning its hits best-first), then recommend from it.
# NOTE(review): every score in the recorded output is identical (134.75), which presumably
# means very few similar users were found for this title - treat the list with caution.
results = search("Spiderman")
movie_id = results.iloc[0]["movieId"]
display(find_similar_movies(movie_id))

Unnamed: 0,score,title,genres
6470,134.75,Spider-Man 3 (2007),Action|Adventure|Sci-Fi|Thriller|IMAX
2855,134.75,Nutty Professor II: The Klumps (2000),Comedy
8140,134.75,Wolf Children (Okami kodomo no ame to yuki) (2...,Animation|Fantasy
5328,134.75,Resident Evil: Apocalypse (2004),Action|Horror|Sci-Fi|Thriller
6681,134.75,Jumper (2008),Action|Adventure|Drama|Sci-Fi|Thriller
6629,134.75,"Golden Compass, The (2007)",Adventure|Children|Fantasy
8289,134.75,Thor: The Dark World (2013),Action|Adventure|Fantasy|IMAX
6770,134.75,"Incredible Hulk, The (2008)",Action|Sci-Fi
4422,134.75,Hulk (2003),Action|Adventure|Sci-Fi
8645,134.75,Into the Forest of Fireflies' Light (2011),Animation|Drama|Fantasy


In [27]:
# Treat the first search hit as the movie we meant (this relies on search()
# returning its hits best-first), then recommend from it.
results = search("Beautiful mind")
movie_id = results.iloc[0]["movieId"]
display(find_similar_movies(movie_id))

Unnamed: 0,score,title,genres
3640,12.883721,"Beautiful Mind, A (2001)",Drama|Romance
3138,11.043189,Enemy at the Gates (2001),Drama|War
7776,9.018605,"Girl with the Dragon Tattoo, The (2011)",Drama|Thriller
6992,7.157623,"Girl with the Dragon Tattoo, The (Män som hata...",Crime|Drama|Mystery|Thriller
4159,6.799742,Catch Me If You Can (2002),Crime|Drama
3832,6.44186,Star Wars: Episode II - Attack of the Clones (...,Action|Adventure|Sci-Fi|IMAX
6609,5.856237,American Gangster (2007),Crime|Drama|Thriller
4851,5.856237,The Butterfly Effect (2004),Drama|Sci-Fi|Thriller
1486,5.424725,Back to the Future Part II (1989),Adventure|Comedy|Sci-Fi
6276,5.368217,"Illusionist, The (2006)",Drama|Fantasy|Mystery|Romance
