# Portfolio Project - Build a Movie Recommendation System in Python

### Reading in Our Movie Data in Pandas

In [1]:
import pandas as pd

In [2]:
movies = pd.read_csv("movies.csv")

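The .csv files used in this notebook come from the MovieLens 25M dataset. You can download the archive from [here](https://files.grouplens.org/datasets/movielens/ml-25m.zip) and unzip it next to this notebook.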

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Cleaning Movie Titles Using Regex

In [4]:
import re

In [5]:
# This function takes in a title and returns the cleaned title. It should remove any character that isn't a letter, digit, or a space.
def cleaning_title(title):
    """Return ``title`` with every character that is not a letter, digit, or whitespace removed.

    Only ASCII letters (a-z, A-Z) and digits (0-9) are kept, together with
    whitespace; punctuation and any other symbol is dropped.

    Parameters
    ----------
    title : str
        Raw movie title, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    str
        The cleaned title, e.g. ``"Toy Story 1995"``.
    """
    # Use `+` rather than `*`: with `*` the pattern also matches the empty
    # string at every position, which does pointless work for the same result.
    return re.sub(r"[^a-zA-Z0-9\s]+", "", title)

In [6]:
movies["clean_title"] = movies["title"].apply(cleaning_title)

In [7]:
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


### Creating a TFIDF Matrix

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# ngram_range=(1, 2): build features from single words and from pairs of
# consecutive words, so titles can match on short phrases as well as on words.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

In [10]:
tfidf_matrix = vectorizer.fit_transform(movies["clean_title"])

### Creating a Search Function

In [11]:
import numpy as np

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
def search(term, top_n=5):
    """Return the titles of the movies whose cleaned titles best match ``term``.

    The query is cleaned with ``cleaning_title``, transformed with the fitted
    ``vectorizer``, and compared against every row of ``tfidf_matrix`` by
    cosine similarity.

    Parameters
    ----------
    term : str
        Free-text title (or part of a title) to look for.
    top_n : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` values of ``movies["title"]``, best match first.
        Movies with zero similarity are left out, so a query that matches
        nothing (including an empty query) yields an empty Series instead of
        arbitrary titles.
    """
    cleaned_term = cleaning_title(term)
    query_vector = vectorizer.transform([cleaned_term])

    # One query row was passed in, so keep only the first row of the result.
    scores = cosine_similarity(query_vector, tfidf_matrix)[0]

    # Best scores first, then discard rows that share no term with the query.
    best = np.argsort(scores)[::-1][:top_n]
    best = best[scores[best] > 0]

    # Select the column by name instead of by position so that a change in the
    # column order of `movies` cannot silently return the wrong column.
    return movies["title"].iloc[best]

In [14]:
toy_story = search("Terminator") # Testing the function

In [15]:
toy_story

1207                Terminator, The (1984)
13334          Terminator Salvation (2009)
31990            Russian Terminator (1989)
24155            Terminator Genisys (2015)
581      Terminator 2: Judgment Day (1991)
Name: title, dtype: object

### Building an Interactive Search Box in Jupyter

In [16]:
import ipywidgets as widgets
from IPython.display import display

In [17]:
# Free-text box in which the user types a movie title.
input_widget = widgets.Text(placeholder="Please type the title here")
# Button labelled "Search".
search_button = widgets.Button(description="Search")
# Empty HTML area; its `value` is set later to show the search results.
output_widget = widgets.HTML()

In [18]:
def on_search_clicked(e):
    """Click handler: search for the typed title and render the matches as an HTML list.

    Reads the query from ``input_widget.value``, runs ``search`` on it, and
    writes the result into ``output_widget.value``.

    Parameters
    ----------
    e
        Argument supplied by the button's ``on_click`` callback; not used here.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    recommendation = search(input_widget.value)

    # Titles are raw dataset text: escape them so characters such as `&` or
    # `<` are displayed literally instead of being interpreted as markup.
    items = "".join(f"<li>{escape(str(title))} </li>" for title in recommendation)

    if items:
        output_widget.value = f"<ul>{items}</ul>"
    else:
        # Give explicit feedback rather than rendering an empty list.
        output_widget.value = "<p>No matching titles found.</p>"

In [19]:
search_button.on_click(on_search_clicked)

#### The search function in action

Note: The search box works in real time, so feel free to try it.

In [20]:
display(input_widget, search_button, output_widget)

Text(value='', placeholder='Please type the title here')

Button(description='Search', style=ButtonStyle())

HTML(value='')

### Reading in Movie Ratings Data & Finding Users Who Liked the Same Movie


In this section, we find the movies that were liked by the users who also liked our sample movie, Toy Story (`movieId` 1).

In [21]:
ratings = pd.read_csv("ratings.csv")

In [22]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [23]:
# Users who rated the movie we liked above 4 - in this example Toy Story (movieId: 1).
similar_users = ratings.loc[
    (ratings["movieId"] == 1) & (ratings["rating"] > 4),
    "userId",
]

In [24]:
similar_users

5101            36
9939            75
11842           86
12232           90
12504           93
             ...  
24996419    162519
24997459    162524
24997758    162527
24998300    162530
24998525    162533
Name: userId, Length: 18835, dtype: int64

In [25]:
# All ratings above 4 given by those similar users, i.e. the other movies they liked.
similar_movies = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4)
]

In [26]:
similar_movies

Unnamed: 0,userId,movieId,rating,timestamp
5101,36,1,5.0,857131378
5105,36,34,5.0,834413787
5111,36,110,5.0,834412999
5114,36,150,5.0,839928587
5127,36,260,5.0,857131062
...,...,...,...,...
24998854,162533,60069,4.5,1280919889
24998861,162533,67997,4.5,1280920712
24998876,162533,78499,4.5,1281405901
24998884,162533,81591,4.5,1297289876


In [56]:
# Share of the similar users who liked each movie (1.0 means all of them did).
rate = similar_movies["movieId"].value_counts().div(len(similar_users))

In [57]:
rate

movieId
1         1.000000
318       0.445607
260       0.403770
356       0.370215
296       0.367295
            ...   
128478    0.000053
125125    0.000053
119701    0.000053
107563    0.000053
7625      0.000053
Name: count, Length: 19282, dtype: float64

In [58]:
# Keep only the movies that more than 10% of the similar users liked.
rate_over_10 = rate.loc[rate > 0.1]

In [59]:
rate_over_10

movieId
1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: count, Length: 113, dtype: float64

In [60]:
# Look up the movie rows for every movieId that passed the 10% threshold.
recommendations = movies.loc[movies["movieId"].isin(rate_over_10.index)]

In [61]:
recommendations

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
31,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,Twelve Monkeys aka 12 Monkeys 1995
33,34,Babe (1995),Children|Drama,Babe 1995
46,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,Seven aka Se7en 1995
49,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,Usual Suspects The 1995
...,...,...,...,...
12324,59315,Iron Man (2008),Action|Adventure|Sci-Fi,Iron Man 2008
12429,60069,WALL·E (2008),Adventure|Animation|Children|Romance|Sci-Fi,WALLE 2008
13362,68954,Up (2009),Adventure|Animation|Children|Drama,Up 2009
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
