In [117]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import ipywidgets as widgets
from IPython.display import display

# Load the movie catalogue and the user ratings from CSV files in the
# working directory (movies first, then ratings).
movies, ratings = map(pd.read_csv, ("movies.csv", "ratings.csv"))

In [118]:
# cleaning functions for titles and genre strings
def clean_title(title):
    """Strip everything except ASCII letters, digits and spaces from `title`.

    Parameters
    ----------
    title : str
        Raw title text.

    Returns
    -------
    str
        `title` with all other characters removed (nothing is substituted
        in their place, so surrounding spaces are kept as they were).
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

def clean_genres(genres):
    """Split a pipe-delimited genre string into a list of its parts.

    Parameters
    ----------
    genres : str
        Genre labels joined by ``|``.

    Returns
    -------
    list of str
        The individual labels, in their original order. A string without
        a ``|`` yields a single-element list.
    """
    separator = '|'
    parts = genres.split(separator)
    return parts

In [119]:
# Apply the cleaning functions: add a punctuation-free title column and a
# list-of-genres column alongside the raw "title" and "genres" columns.
movies["clean_title"] = movies["title"].map(clean_title)
movies["clean_genres"] = movies["genres"].map(clean_genres)

In [120]:
# Build a TF-IDF vectorizer for the cleaned titles; ngram_range=(1,2) makes
# it extract both single words and adjacent word pairs as features.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from every cleaned title and keep the resulting
# matrix (one row per row of `movies`) so that search queries can later be
# compared against it.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [142]:
def search(title, genres, top_n=5):
    """Find the movies whose titles best match a free-text query.

    The query is cleaned with `clean_title`, vectorised with the fitted
    module-level `vectorizer`, and compared against the `tfidf` matrix by
    cosine similarity.

    Parameters
    ----------
    title : str
        Title text typed by the user.
    genres : iterable of str
        Genre labels to keep. When empty (or otherwise falsy) no genre
        filtering is applied; otherwise a movie is kept if it carries at
        least one of the labels.
    top_n : int, default 5
        Maximum number of title matches considered before genre filtering.

    Returns
    -------
    pandas.DataFrame
        Rows of `movies` (plus a ``clean_genres`` list column) ordered from
        the most to the least similar title. May be empty.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask argpartition for more elements than there are rows.
    k = max(0, min(top_n, similarity.size))
    if k:
        # argpartition only guarantees that the last k positions hold the k
        # largest scores; their relative order is unspecified. Sort those k
        # explicitly so that row 0 really is the best match.
        candidates = np.argpartition(similarity, -k)[-k:]
        ranking = np.argsort(-similarity[candidates], kind="stable")
        indices = candidates[ranking]
        # A score of 0 means the query shares no term with the title, so
        # such rows are not matches at all.
        indices = indices[similarity[indices] > 0]
    else:
        indices = np.array([], dtype=int)

    # Work on an explicit copy so adding a column never writes through to
    # (or warns about) the shared `movies` frame.
    results = movies.iloc[indices].copy()
    results["clean_genres"] = results["genres"].apply(clean_genres)

    # genre filtering
    if genres:
        # astype(bool) keeps this a row mask even when `results` is empty.
        keep = results["clean_genres"].apply(
            lambda x: any(genre in x for genre in genres)
        ).astype(bool)
        results = results[keep]

    return results

In [143]:
def find_similar_movies(movie_id, rating, genres, min_share=0.1, top_n=10):
    """Recommend movies liked by the users who also liked `movie_id`.

    "Liked" means rated at least `rating`. For each candidate movie the
    score is the share of similar users who liked it divided by the share
    of all users (who liked any candidate) who liked it, so higher scores
    mark movies that are specifically popular with similar users.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the seed movie.
    rating : float
        Minimum rating that counts as liking a movie.
    genres : iterable of str
        Genre labels to keep. When empty (or otherwise falsy) no genre
        filtering is applied; otherwise a movie is kept if it carries at
        least one of the labels.
    min_share : float, default 0.1
        A candidate must be liked by more than this fraction of the
        similar users.
    top_n : int, default 10
        Maximum number of recommendations returned.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title``, ``genres`` and ``clean_genres``,
        best score first. May be empty.
    """
    liked = ratings[ratings["rating"] >= rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody rated the seed movie highly enough; avoid dividing by zero.
        return pd.DataFrame(columns=["score", "title", "genres", "clean_genres"])

    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    # Attach titles/genres to every candidate first; truncating to top_n
    # happens only after the genre filter, so a genre selection still
    # yields up to top_n rows instead of whatever survives from ten.
    recs = rec_percentages.merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]
    recs = recs.sort_values("score", ascending=False, kind="stable")
    # assign() returns a new frame, so no chained-assignment warning.
    recs = recs.assign(clean_genres=recs["genres"].apply(clean_genres))

    # filtering recommendations by genre
    if genres:
        # astype(bool) keeps this a row mask even when `recs` is empty.
        keep = recs["clean_genres"].apply(
            lambda x: any(genre in x for genre in genres)
        ).astype(bool)
        recs = recs[keep]

    return recs.head(top_n)

In [145]:
# Interactive controls: a free-text title box, a rating slider and a
# multi-select listing every genre that occurs in the movies table.
title_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)

rating_input = widgets.FloatSlider(description="Rating:", min=0, max=5, step=0.5)

# Distinct genre labels across all movies, in sorted order.
genre_options = sorted({genre for movie_genres in movies['clean_genres'] for genre in movie_genres})
genre_input = widgets.SelectMultiple(options=genre_options, description="Genres:", disabled=False)

# Output area in which the recommendations are rendered.
rec_list = widgets.Output()

def on_type(data):
    """Refresh the recommendation list from the current widget values.

    Clears `rec_list`, then - if the typed title is longer than five
    characters - searches for it and displays recommendations based on the
    first search hit, or prints a notice when the search returns nothing.

    Parameters
    ----------
    data
        Change notification supplied by the widget observer; not used,
        the current values are read from the widgets directly.
    """
    with rec_list:
        rec_list.clear_output()
        title, rating, genres = title_input.value, rating_input.value, genre_input.value

        # Too short to be worth searching yet: leave the output empty.
        if len(title) <= 5:
            return

        results = search(title, genres)
        if results.empty:
            print("No movies of matching criteria")
            return

        # The first search hit seeds the recommendations.
        movie_id = results.iloc[0]["movieId"]
        display(find_similar_movies(movie_id, rating, genres))

# Re-run the handler whenever any control's value changes.
for control in (title_input, rating_input, genre_input):
    control.observe(on_type, names="value")

# Show the controls followed by the output area.
display(title_input, rating_input, genre_input, rec_list)

Text(value='Toy Story', description='Movie Title:')

FloatSlider(value=0.0, description='Rating:', max=5.0, step=0.5)

SelectMultiple(description='Genres:', options=('(no genres listed)', 'Action', 'Adventure', 'Animation', 'Chil…

Output()