In [1]:
import os
import re

import ipywidgets as widgets
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv(os.path.join(os.getcwd(), 'datasets', 'movies.csv'))

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
def clean_title(title: str) -> str:
    return re.sub('[^a-zA-Z0-9 ]', '', title)

In [5]:
movies['clean_title'] = movies['title'].apply(clean_title)

In [6]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [7]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(movies['clean_title'])

In [8]:
def search(title):
    title = clean_title(title)

    # Transform the query into a vector
    query_vec = vectorizer.transform([title])

    # Compute the cosine similarity
    cosine_similarities = cosine_similarity(query_vec, tfidf).flatten()

    # Find the top 5 similar movies
    indices = np.argpartition(cosine_similarities, -5)[-5:]

    # Print the results
    results = movies.iloc[indices][::-1]

    return results

In [9]:
ratings = pd.read_csv(os.path.join(os.getcwd(), 'datasets', 'ratings.csv'))

In [10]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [11]:
def find_similar_movies(movie_id):
    # Find the other movies that the user has rated
    similar_users = ratings[(ratings['movieId'] == movie_id) & (ratings['userId'] >= 5)]['userId'].unique()
    similar_user_recs = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating'] > 4)]['movieId']
    
    # Find the percentage of similar users that rated the movie highly
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .1]

    # Find all users that rated the movie highly
    all_users = ratings[(ratings['movieId'].isin(similar_user_recs.index)) & (ratings['rating'] > 4)]
    all_users_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

    # Score the movies based on the percentage of similar users that rated the movie highly
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ['similar', 'all']
    rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']
    rec_percentages = rec_percentages.sort_values('score', ascending=False)

    return rec_percentages.head(10).merge(movies, left_index=True, right_on='movieId')[['movieId', 'title', 'score']]


In [12]:
find_similar_movies(1)

Unnamed: 0,movieId,title,score
0,1,Toy Story (1995),2.589624
3021,3114,Toy Story 2 (1999),2.109378
8246,8961,"Incredibles, The (2004)",1.757397
4780,4886,"Monsters, Inc. (2001)",1.756527
6258,6377,Finding Nemo (2003),1.753626
580,588,Aladdin (1992),1.679763
1232,1265,Groundhog Day (1993),1.651709
359,364,"Lion King, The (1994)",1.646634
2826,2918,Ferris Bueller's Day Off (1986),1.634527
1070,1097,E.T. the Extra-Terrestrial (1982),1.62534


In [13]:
movie_input_name = widgets.Text(
    value='Toy Story',
    description='Movie title:',
    disabled=False
)

recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data['new']
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]['movieId']
            display(find_similar_movies(movie_id))

movie_input_name.observe(on_type, names='value')
display(movie_input_name, recommendation_list)

Text(value='Toy Story', description='Movie title:')

Output()