# Portfolio Project - Build a Movie Recommendation System in Python

### Reading in Our Movie Data in Pandas

In [1]:
import pandas as pd

In [2]:
movies = pd.read_csv("movies.csv")

You can download the .csv file from [here](https://files.grouplens.org/datasets/movielens/ml-25m.zip)

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Cleaning Movie Titles Using Regex

In [4]:
import re

In [5]:
# This function takes in a title and returns the cleaned title. It should remove any character that isn't a letter, digit, or a space.
def cleaning_title(title):

    return re.sub(r"[^a-zA-Z0-9\s]*", "", title)

In [6]:
movies["clean_title"] = movies["title"].apply(cleaning_title)

In [7]:
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


### Creating a TFIDF Matrix

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

In [10]:
tfidf_matrix = vectorizer.fit_transform(movies["clean_title"])

### Creating a Search Function

In [11]:
import numpy as np

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
def search(term):
    
    cleaned_pattern = cleaning_title(term)

    pattern_vector = vectorizer.transform([cleaned_pattern]) 

    similarities = cosine_similarity(pattern_vector, tfidf_matrix)

    top5 = np.argsort(similarities[0])[-5:][::-1]
    
    return movies.iloc[top5, 1]

In [14]:
toy_story = search("Terminator") # Testing the function

In [15]:
toy_story

1207                Terminator, The (1984)
13334          Terminator Salvation (2009)
31990            Russian Terminator (1989)
24155            Terminator Genisys (2015)
581      Terminator 2: Judgment Day (1991)
Name: title, dtype: object

### Building an Interactive Search Box in Jupyter

In [16]:
import ipywidgets as widgets
from IPython.display import display

In [17]:
input_widget = widgets.Text(placeholder="Please type the title here")
search_button = widgets.Button(description="Search")
output_widget = widgets.HTML()

In [18]:
def on_search_clicked(e):

    recommendation = search(input_widget.value)

    result = "<ul>"
    for title in recommendation:
        result += f"<li>{title} </li>"
    result += "</ul>"
    
    output_widget.value = result

In [19]:
search_button.on_click(on_search_clicked)

#### The search function in action

Note: It usable in realtime, feel free to try it

In [20]:
display(input_widget, search_button, output_widget)

Text(value='', placeholder='Please type the title here')

Button(description='Search', style=ButtonStyle())

HTML(value='')

### Reading in Movie Ratings Data & Finding Users Who Liked the Same Movie


In this section finding movies that liked by users who liked the sample movie which is the Toy Story with `movieId` 1.

In [21]:
ratings = pd.read_csv("ratings.csv")

In [22]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [23]:
# Finding the users who also liked the same movie we liked. In this example the Toy Story (movieId: 1)
similar_users = ratings[(ratings["movieId"] == 1) & (ratings["rating"] > 4)]["userId"]

In [24]:
similar_users

5101            36
9939            75
11842           86
12232           90
12504           93
             ...  
24996419    162519
24997459    162524
24997758    162527
24998300    162530
24998525    162533
Name: userId, Length: 18835, dtype: int64

In [25]:
similar_movies = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)] # Finding the other movies that they liked.

In [67]:
similar_movies

Unnamed: 0,userId,movieId,rating,timestamp
5101,36,1,5.0,857131378
5105,36,34,5.0,834413787
5111,36,110,5.0,834412999
5114,36,150,5.0,839928587
5127,36,260,5.0,857131062
...,...,...,...,...
24998854,162533,60069,4.5,1280919889
24998861,162533,67997,4.5,1280920712
24998876,162533,78499,4.5,1281405901
24998884,162533,81591,4.5,1297289876


In [68]:
rate = similar_movies["movieId"].value_counts() / len(similar_movies) * 100

In [69]:
rate

movieId
1         1.386633
318       0.617893
260       0.559880
356       0.513352
296       0.509303
            ...   
128478    0.000074
125125    0.000074
119701    0.000074
107563    0.000074
7625      0.000074
Name: count, Length: 19282, dtype: float64

In [70]:
rate_over_10 = rate[rate > .1] # Finding only the movies that more than 10% of similar users liked.

In [77]:
rate_over_10

movieId
1        1.386633
318      0.617893
260      0.559880
356      0.513352
296      0.509303
           ...   
3897     0.101890
2542     0.101448
54286    0.101154
36       0.100418
2019     0.100050
Name: count, Length: 187, dtype: float64

In [80]:
same_movies = movies[movies["movieId"].isin(rate_over_10.index)]

In [81]:
same_movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
5,6,Heat (1995),Action|Crime|Thriller,Heat 1995
16,17,Sense and Sensibility (1995),Drama|Romance,Sense and Sensibility 1995
31,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,Twelve Monkeys aka 12 Monkeys 1995
33,34,Babe (1995),Children|Drama,Babe 1995
...,...,...,...,...
17464,91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX,Dark Knight Rises The 2012
19021,99114,Django Unchained (2012),Action|Drama|Western,Django Unchained 2012
21199,109487,Interstellar (2014),Sci-Fi|IMAX,Interstellar 2014
21936,112852,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi,Guardians of the Galaxy 2014


### Determining How Much Users Like Movies

In [82]:
# Finding all users who rated a movie highly that is in our set of recommended movies
users_highly_liked = ratings[(ratings["rating"] > 4) & (ratings["movieId"].isin(same_movies["movieId"]))]

In [83]:
users_highly_liked

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
23,1,3949,5.0,1147868678
29,1,4973,4.5,1147869080
37,1,6016,5.0,1147869090
48,1,7361,5.0,1147880055
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [85]:
# Finding what percentage of all users recommend each of these movies
movies_rec_percentage = users_highly_liked.groupby("movieId").count()["userId"] / len(users_highly_liked["userId"].unique())

In [87]:
movies_rec_percentage

movieId
1         0.122160
6         0.047807
17        0.044642
32        0.098227
34        0.051153
            ...   
91529     0.053761
99114     0.056018
109487    0.072518
112852    0.042060
134853    0.035263
Name: userId, Length: 187, dtype: float64

### Creating a Recommendation Score

In [88]:
recommendation_percentages = pd.concat([movies_rec_percentage, rate_over_10], axis=1) # Concatenating similar user recommendations and all user recommendations.

In [89]:
recommendation_percentages.columns = ["similar", "all"]

In [90]:
recommendation_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.122160,1.386633
6,0.047807,0.113669
17,0.044642,0.103804
32,0.098227,0.222848
34,0.051153,0.181032
...,...,...
91529,0.053761,0.123314
99114,0.056018,0.107632
109487,0.072518,0.123166
112852,0.042060,0.114185


In [91]:
recommendation_percentages["score"] = recommendation_percentages["similar"] / recommendation_percentages["all"]

In [92]:
recommendation_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.122160,1.386633,0.088098
6,0.047807,0.113669,0.420578
17,0.044642,0.103804,0.430057
32,0.098227,0.222848,0.440783
34,0.051153,0.181032,0.282567
...,...,...,...
91529,0.053761,0.123314,0.435968
99114,0.056018,0.107632,0.520455
109487,0.072518,0.123166,0.588779
112852,0.042060,0.114185,0.368354


In [93]:
top10_recommendations = recommendation_percentages.sort_values("score", ascending=False).iloc[:10, :]

In [94]:
top10_recommendations

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3949,0.062192,0.104025,0.597859
109487,0.072518,0.123166,0.588779
2959,0.212254,0.364198,0.582799
4973,0.11009,0.197596,0.557146
79132,0.128678,0.231314,0.556292
4878,0.072745,0.131412,0.553563
6016,0.068198,0.123387,0.552717
296,0.278812,0.509303,0.547437
2858,0.164182,0.300517,0.54633
858,0.20573,0.378922,0.542933


In [95]:
top10_recommendations = pd.merge(top10_recommendations, movies, on="movieId", how="left")

In [96]:
top10_recommendations

Unnamed: 0,movieId,similar,all,score,title,genres,clean_title
0,3949,0.062192,0.104025,0.597859,Requiem for a Dream (2000),Drama,Requiem for a Dream 2000
1,109487,0.072518,0.123166,0.588779,Interstellar (2014),Sci-Fi|IMAX,Interstellar 2014
2,2959,0.212254,0.364198,0.582799,Fight Club (1999),Action|Crime|Drama|Thriller,Fight Club 1999
3,4973,0.11009,0.197596,0.557146,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",Comedy|Romance,Amelie Fabuleux destin dAmlie Poulain Le 2001
4,79132,0.128678,0.231314,0.556292,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,Inception 2010
5,4878,0.072745,0.131412,0.553563,Donnie Darko (2001),Drama|Mystery|Sci-Fi|Thriller,Donnie Darko 2001
6,6016,0.068198,0.123387,0.552717,City of God (Cidade de Deus) (2002),Action|Adventure|Crime|Drama|Thriller,City of God Cidade de Deus 2002
7,296,0.278812,0.509303,0.547437,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,Pulp Fiction 1994
8,2858,0.164182,0.300517,0.54633,American Beauty (1999),Drama|Romance,American Beauty 1999
9,858,0.20573,0.378922,0.542933,"Godfather, The (1972)",Crime|Drama,Godfather The 1972
