Creating a movie recommendation system with Jupyter Notebook and Pandas

In [34]:
import pandas as pd
# https://files.grouplens.org/datasets/movielens/ml-25m.zip

In [37]:
# reading csv file of movies
movies = pd.read_csv("D:\MEGA\Data Science\Movie Recommendation System - Python\ml-25m\movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
# cleaning titles of the movie file to facilitate search
# a function will be written to deal with that using the re module (python module that works with regular expressions)
import re

def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)
    


In [5]:
# adding a new column while running the function using apply function
movies["clean_title"] = movies["title"].apply(clean_title)

In [6]:
# checking new column called clean_title
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [12]:
# building a search engine for the movies
# using a tfIDF - term frequency inverse document frequency utlizing ngram_range=(1,2) for pairs of words
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# def search(title)
# running without the use of a function to see how the code behaves
title = "Harry Potter" 
title = clean_title(title)
query_vec = vectorizer.transform([title])
similarity = cosine_similarity(query_vec, tfidf).flatten()
# the vectorizer transformed words in numbers so they can be compared via cosine similarity the more they are similar the higher the number in the array will be

In [19]:
query_vec

<1x170073 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [20]:
similarity
# the title in the example was Harry Potter and if you take a look in the first three entries in the clean_title column you'll notice there's no similarity between Harry Potter (our input) vs the data in the dataset (Toy Story,
# Jumanji, Grumpier Old Men) that's why the array is filled by zeroes

array([0., 0., 0., ..., 0., 0., 0.])

In [21]:
title = "Men 1995"
title = clean_title(title)
query_vec = vectorizer.transform([title])
similarity = cosine_similarity(query_vec, tfidf).flatten()

In [22]:
# trying a different input for title results in the following a number greater than zero, meaning some similarities
similarity

array([0.113662  , 0.12665159, 0.57464617, ..., 0.        , 0.        ,
       0.        ])

In [27]:
title = "Toy Story 1995"
title = clean_title(title)
query_vec = vectorizer.transform([title])
similarity = cosine_similarity(query_vec, tfidf).flatten()
# we need to find the titles with the highest similarity, for that we use numpy arg partition, in this case the 5 most similar titles
indices = np.argpartition(similarity, -5)[-5:]
# in order to find the actual items we have to use the iloc function
# the term [::-1] reverses the order in what the results are showed
results = movies.iloc[indices][::-1]

In [24]:
# this is showing us the most similar items but their indices
indices

array([20497, 14813, 59767,  3021,     0], dtype=int64)

In [28]:
results

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [30]:
# writing a function so it's more organized
def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices][::-1]
    return results

In [None]:
# the cell below is setting up a search bar using widgets in Python

In [77]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value = '',
    description = 'Movie Title:',
    disabled = False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data['new']
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names = 'value')

display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

In [None]:
# building the recommendation movie system
# it'll be based in movies other users also liked the same movie as us

In [40]:
ratings = pd.read_csv("D:/MEGA/Data Science/Movie Recommendation System - Python/ml-25m/ratings.csv")
# I had to change '\' for this '/' so the code could work after searching for it on stack overflow


In [41]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [42]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [48]:
# finding the users who liked the same movies as us
movie_id = 1
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4.5)]["userId"].unique()

In [49]:
similar_users
# array of people who likes the same movies as us

array([    36,     75,     86, ..., 162527, 162530, 162533], dtype=int64)

In [59]:
# recommendations from people who likes the same movies as us giving  a rating greater than or equal to 4.5
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] >= 4.5)]["movieId"]

In [60]:
similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [63]:
# filtering the movies based on 10% or more of the users similar to us liked. we're narrowing it down to get better results
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs = similar_user_recs[similar_user_recs > 0.1]

In [64]:
similar_user_recs

1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: movieId, Length: 113, dtype: float64

In [65]:
# finding how much all users like movies
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] >= 4.5)]

In [66]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [67]:
# finding what percentage of all users recommended these movies
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique()) 

In [68]:
all_user_recs

318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: movieId, Length: 113, dtype: float64

In [None]:
# creating a recommendation score
# we want a big differential between the percentages of similar users and all users so we can have movies good recommendations (movies similar users might not have seen)

In [69]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [70]:
rec_percentages
# similar contains the data of how much 'similar' users like the movie and 'all' users like the movie in general

Unnamed: 0,similar,all
1,1.000000,0.124728
32,0.160711,0.100293
34,0.130555,0.052229
47,0.225909,0.144469
50,0.275604,0.200513
...,...,...
59315,0.104593,0.054269
60069,0.170640,0.076307
68954,0.159172,0.064944
78499,0.152960,0.035131


In [71]:
# finding the ratio or big differential between columns using sort_values() panda method
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [88]:
rec_percentages.sort_values(by="score", ascending=False)

Unnamed: 0,similar,all,score
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [74]:
# using merge function so we can attach a name to the movie - merging rec_percentages df to the movies df
# in this case the parameters left index = True and it'll be a right join on movieId
# we'll display the first 10 results (higher differential) as a recommendation
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")


Unnamed: 0,similar,all,score,movieId,title,genres
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
31,0.160711,0.100293,1.602424,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
33,0.130555,0.052229,2.49966,34,Babe (1995),Children|Drama
46,0.225909,0.144469,1.563719,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
49,0.275604,0.200513,1.374497,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
108,0.239873,0.160872,1.491075,110,Braveheart (1995),Action|Drama|War
109,0.11553,0.081704,1.413997,111,Taxi Driver (1976),Crime|Drama|Thriller
148,0.190231,0.090578,2.10019,150,Apollo 13 (1995),Adventure|Drama|IMAX
257,0.40377,0.222207,1.817089,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
289,0.134962,0.095346,1.415493,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller


In [91]:
# building the recommendation function
# we're gonna add all the code snippets from before into a single function

def find_similar_movies(movie_id):
    # finding recommendations from users similar to us
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4.5)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] >= 4.5)]["movieId"]
    
    # adjusting so we only get recommendations over 10% of the users recommended to us
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > 0.1]

    # finding how common the recommendations were among all of the users
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] >= 4.5)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # creating the score of the recommendation system by concatenating the columns
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    # sorting our score in descending order
    rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

    # returning the top 10 results in a merged version with the movies dataframe while selecting only the three columns needed
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId") [["score", "title", "genres"]]

In [92]:
# creating the widget for the find_similar_movies function
movie_name_input = widgets.Text(
    value="",
    description="Movie Title:",
    disabled=False
)

recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))
            
movie_name_input.observe(on_type, names="value")

display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title:')

Output()