In [1]:
import pandas as pd
import re

In [2]:
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")
def clean_title (title):
  return re.sub ("[^a-zA-Z0-9 ]", "", title)



In [3]:
movies["clean_title"] = movies["title"].apply(clean_title)

movies["genres_clean"] = movies["genres"].str.replace("|", " ", regex=False)

movies["features"] = movies["clean_title"] + " " + (movies["genres_clean"] + "") * 3

In [4]:
movies

Unnamed: 0,movieId,title,genres,clean_title,genres_clean,features
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995,Adventure Animation Children Comedy Fantasy,Toy Story 1995 Adventure Animation Children Co...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995,Adventure Children Fantasy,Jumanji 1995 Adventure Children FantasyAdventu...
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995,Comedy Romance,Grumpier Old Men 1995 Comedy RomanceComedy Rom...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995,Comedy Drama Romance,Waiting to Exhale 1995 Comedy Drama RomanceCom...
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995,Comedy,Father of the Bride Part II 1995 ComedyComedyC...
...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Black Butler Book of the Atlantic 2017,Action Animation Comedy Fantasy,Black Butler Book of the Atlantic 2017 Action ...
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,No Game No Life Zero 2017,Animation Comedy Fantasy,No Game No Life Zero 2017 Animation Comedy Fan...
9739,193585,Flint (2017),Drama,Flint 2017,Drama,Flint 2017 DramaDramaDrama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Bungo Stray Dogs Dead Apple 2018,Action Animation,Bungo Stray Dogs Dead Apple 2018 Action Animat...


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))


movies["genres_clean"] = movies["genres"].str.replace("|", " ", regex=False)
movies["features"] = movies["clean_title"] + " " + movies["genres_clean"]

tfidf = vectorizer.fit_transform(movies["features"])




In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def title_search(title):
#title = " Toy Story (1995) "
  title = clean_title(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()
  indices = np.argpartition(similarity, -3)[-3:]
  results = movies.iloc[indices][::-1]
  #print(indices)
  return results




In [7]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value=' ',
    description='Movie Title:',
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        display(title_search(title))

movie_input.observe(on_type, names='value')


display(movie_input, movie_list)

Text(value=' ', description='Movie Title:')

Output()

In [8]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [9]:
movie_id = 1

In [10]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] == 5)]["userId"].unique()


In [11]:
similar_user_reccs = ratings[(ratings["userId"]. isin(similar_users)) & (ratings["rating"] == 5)]["movieId"]

In [12]:
similar_users

array([ 31,  40,  43,  46,  57,  63,  71,  96, 145, 151, 166, 171, 177,
       201, 206, 220, 229, 234, 240, 247, 269, 270, 273, 275, 304, 328,
       341, 347, 353, 357, 364, 367, 380, 389, 396, 411, 448, 451, 453,
       456, 471, 533, 559, 573, 584, 587, 610])

In [13]:
similar_user_reccs

Unnamed: 0,movieId
4879,1
4890,377
4891,588
4894,648
4895,733
...,...
100814,158238
100829,164179
100832,168248
100833,168250


In [14]:
similar_user_reccs = similar_user_reccs.value_counts() / len(similar_users)

similar_user_reccs = similar_user_reccs[similar_user_reccs > 0.1]

In [15]:
similar_user_reccs

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,1.000000
296,0.404255
260,0.340426
1198,0.276596
593,0.276596
...,...
380,0.106383
62,0.106383
17,0.106383
1201,0.106383


In [16]:
all_users = ratings[(ratings["movieId"].isin(similar_user_reccs.index)) & (ratings["rating"] == 5)]
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())


In [17]:
rec_percentages = pd.concat([similar_user_reccs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.096708
296,0.404255,0.253086
260,0.340426,0.213992
1198,0.276596,0.141975
593,0.276596,0.189300
...,...,...
380,0.106383,0.049383
62,0.106383,0.041152
17,0.106383,0.039095
1201,0.106383,0.049383


In [18]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [19]:
rec_percentages = rec_percentages.sort_values("score", ascending = False)
rec_percentages


Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.096708,10.340426
661,0.106383,0.016461,6.462766
3114,0.212766,0.039095,5.442329
3175,0.127660,0.024691,5.170213
2716,0.212766,0.043210,4.924012
...,...,...,...
2858,0.127660,0.133745,0.954501
608,0.106383,0.117284,0.907055
527,0.148936,0.189300,0.786772
2571,0.170213,0.224280,0.758930


In [20]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")


Unnamed: 0,similar,all,score,movieId,title,genres,clean_title,genres_clean,features
0,1.0,0.096708,10.340426,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995,Adventure Animation Children Comedy Fantasy,Toy Story 1995 Adventure Animation Children Co...
551,0.106383,0.016461,6.462766,661,James and the Giant Peach (1996),Adventure|Animation|Children|Fantasy|Musical,James and the Giant Peach 1996,Adventure Animation Children Fantasy Musical,James and the Giant Peach 1996 Adventure Anima...
2355,0.212766,0.039095,5.442329,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999,Adventure Animation Children Comedy Fantasy,Toy Story 2 1999 Adventure Animation Children ...
2393,0.12766,0.024691,5.170213,3175,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi,Galaxy Quest 1999,Adventure Comedy Sci-Fi,Galaxy Quest 1999 Adventure Comedy Sci-Fi
2038,0.212766,0.04321,4.924012,2716,Ghostbusters (a.k.a. Ghost Busters) (1984),Action|Comedy|Sci-Fi,Ghostbusters aka Ghost Busters 1984,Action Comedy Sci-Fi,Ghostbusters aka Ghost Busters 1984 Action Com...
436,0.148936,0.030864,4.825532,500,Mrs. Doubtfire (1993),Comedy|Drama,Mrs Doubtfire 1993,Comedy Drama,Mrs Doubtfire 1993 Comedy Drama
7355,0.12766,0.028807,4.431611,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010,Adventure Animation Children Comedy Fantasy IMAX,Toy Story 3 2010 Adventure Animation Children ...
32,0.212766,0.055556,3.829787,34,Babe (1995),Children|Drama,Babe 1995,Children Drama,Babe 1995 Children Drama
681,0.106383,0.028807,3.693009,899,Singin' in the Rain (1952),Comedy|Musical|Romance,Singin in the Rain 1952,Comedy Musical Romance,Singin in the Rain 1952 Comedy Musical Romance
138,0.170213,0.047325,3.59667,165,Die Hard: With a Vengeance (1995),Action|Crime|Thriller,Die Hard With a Vengeance 1995,Action Crime Thriller,Die Hard With a Vengeance 1995 Action Crime Th...


In [21]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [22]:
movie_name_input = widgets.Text(
    value='',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if title.strip() != "":   # avoid running on empty input
            results = title_search(title)
            if not results.empty:
                movie_id = results.iloc[0]["movieId"]
                display(find_similar_movies(movie_id))
            else:
                display("No results found.")

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title:')

Output()