In [59]:
import pandas as pd
import numpy as np
# reg expression library
import re

# python machine learning library

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

# python widget
import ipywidgets as widgets

# display cells
from IPython.display import display

In [3]:
# Load the movie metadata table (movieId, title, genres) from the CSV next to the notebook.
movies = pd.read_csv('movies.csv')

In [44]:
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


### function

In [13]:
def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Punctuation such as parentheses, commas and apostrophes is dropped, so
    "Toy Story (1995)" becomes "Toy Story 1995".
    """
    # negated character class: matches any single character we do NOT want to keep
    unwanted = '[^a-zA-Z0-9 ]'
    stripped = re.sub(unwanted, '', title)
    return stripped
    

In [14]:
# run every title through clean_title and keep the result as a new column
movies['clean_title'] = movies['title'].map(clean_title)

In [162]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


# building a TFIDF matrix

In [17]:
# term frequency matrix

#from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# ngram_range=(1, 2): build TF-IDF features from single words and from pairs of adjacent words
vectorizer = TfidfVectorizer(ngram_range = (1,2))

In [25]:
# fit the vectorizer on the cleaned titles and encode every title as one row of the TF-IDF matrix

tfidf = vectorizer.fit_transform(movies['clean_title'])

# building a search function

In [32]:
# cosine similarity
# from sklearn.metrics.pairwise import cosine_similarity

### function

In [55]:
def search(title, top_n=5):
    """Return the rows of ``movies`` whose titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    ``vectorizer`` and compared to every row of ``tfidf`` by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text search term.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from most to least similar,
        so ``.iloc[0]`` is the best match.
    """
    query_vector = vectorizer.transform([clean_title(title)])

    # one similarity value per row of the TF-IDF matrix
    similarity = cosine_similarity(query_vector, tfidf).flatten()

    # never ask for more rows than exist (argpartition raises if k is out of range)
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition finds the k largest values cheaply but leaves them in
    # arbitrary order, so order those k candidates explicitly, best first
    candidates = np.argpartition(similarity, -k)[-k:]
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]

    return movies.iloc[ranked]

# interactive search box 

In [56]:
# widget
# import ipywidgets as widgets

In [None]:
#input widget

In [65]:
# text box the user types a movie title into; starts pre-filled with "Toy Story"
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled=False,  # keep the box editable
)

In [67]:
movie_input

Text(value='Toy Story', description='Movie Title:')

In [None]:
#output widget

In [63]:
# output area that the search callback clears and renders its results into
movie_list = widgets.Output()

###  function

In [68]:
def on_type(data):
    """Widget callback: show search results for the text currently in the box.

    ``data`` is the change dictionary passed by ``observe``; its ``'new'``
    entry holds the updated text. Everything displayed here is captured by
    the ``movie_list`` output widget.
    """
    with movie_list:
        # wipe whatever the previous keystroke rendered
        movie_list.clear_output()
        query = data['new']
        # titles of five characters or fewer are skipped, nothing is shown
        if len(query) <= 5:
            return
        display(search(query))
                
# call on_type every time the text box's value changes
movie_input.observe(on_type,names = 'value') 

# show the text box together with the results area
display(movie_input,movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

# movie ratings

In [111]:
# Load the per-user movie ratings (userId, movieId, rating, timestamp).
ratings = pd.read_csv('ratings.csv')

In [112]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [113]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [75]:
# base the recommendations on other users who rated the same movie highly
# i.e. "users who liked the same movie"

In [114]:
# movieId 1 is "Toy Story (1995)" in the movies table; used as the worked example below
movie_id = 1

In [237]:
# ids of the users who gave the chosen movie a rating above 4
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()


In [238]:
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533])

# similar users found, recommender

In [135]:
# subset the ratings to the user ids in the similar_users array, keeping only the other movies they rated highly

In [136]:
# all of the movies that users who are similar to us liked

In [239]:
# movieId of every rating above 4 given by one of the similar users (one entry per rating)
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [145]:
# count how many times each movie shows up, as a share of the similar users

In [240]:
# occurrences per movie divided by the number of similar users
# NOTE: this overwrites similar_user_recs, so the cell is only correct when run once
# directly after the cell that builds the raw movieId series
similar_user_recs = similar_user_recs.value_counts().div(len(similar_users))
 

In [241]:
# keep only the movies whose share among the similar users is above 10%
similar_user_recs = similar_user_recs.loc[similar_user_recs.gt(.10)]

In [242]:
similar_user_recs

1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: movieId, Length: 113, dtype: float64

# find out how much all users like the movies

In [243]:
# every rating above 4, from any user, for one of the candidate movies
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings['rating'] > 4)
]

In [244]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [245]:
# share of these users (anyone who rated a candidate movie above 4) that liked each movie
all_user_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

### compare percentages

In [246]:
# put both shares side by side, aligned on movieId, and label the two columns
rec_percentages = pd.concat(
    [similar_user_recs, all_user_recs], axis=1
).set_axis(["similar", "all"], axis=1)




In [247]:
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [None]:
# looking for movies where the difference is significant

In [250]:
# score = how much more popular a movie is among similar users than among users overall
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])


In [251]:
# best-scoring movies first
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)


In [253]:
# the higher the score, the better the recommendation
# we want movies that stand out among similar users, not movies that everyone rates highly
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [255]:
# attach title and genres to the ten best-scoring movies by matching the index to movieId
pd.merge(rec_percentages.head(10), movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


# recommendation function

In [258]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies liked by the users who liked ``movie_id``.

    A rating counts as "liked" when it is strictly greater than ``min_rating``.
    Each candidate movie is scored as

        share of similar users who liked it / share of comparison users who liked it

    where similar users are those who liked ``movie_id`` and comparison users
    are all users who liked at least one candidate movie.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating must be strictly above this value to count as liked.
    min_share : float, default 0.10
        A movie is only a candidate when more than this fraction of the
        similar users liked it.
    top_n : int, default 10
        Number of best-scoring movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, best score first. Empty
        when nobody liked ``movie_id``.
    """
    # filter the ratings table once; every step below only looks at liked ratings
    liked = ratings[ratings["rating"] > min_rating]

    # users similar to us: they liked the movie we are asking about
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # no similar users means every share would be 0/0; return an empty result instead
        return pd.DataFrame(columns=["score", "title", "genres"])

    # share of the similar users who liked each movie, keeping the common ones
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # share of everyone who liked a candidate movie that liked each one
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # align both shares on movieId
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # a high ratio means the movie is specifically popular with similar users
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # take the best ones and look up their titles via movieId
    top = rec_percentages.head(top_n)
    return top.merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]



# interactive widget

In [276]:
# input widget: text box for the movie title, pre-filled with "Toy Story"
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)

# output widget: area the recommendation table is rendered into
recommendation_list = widgets.Output()


def on_type(data):
    """Widget callback: show recommendations for the title currently typed.

    ``data`` is the change dictionary passed by ``observe``; ``data["new"]`` is
    the updated text. The first row returned by ``search`` is taken as the
    movie, and the output of ``find_similar_movies`` for its ``movieId`` is
    displayed inside ``recommendation_list``.
    """
    with recommendation_list:
        # drop the table rendered for the previous keystroke
        recommendation_list.clear_output()
        typed = data["new"]
        # text of five characters or fewer is skipped, nothing is shown
        if len(typed) <= 5:
            return
        first_hit = search(typed).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))

# re-run on_type whenever the text in the box changes
movie_name_input.observe(on_type, names='value')

# render the input box together with the recommendation area
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

Unnamed: 0,movieId,title,genres,clean_title
13822,71535,Zombieland (2009),Action|Comedy|Horror,Zombieland 2009
61142,205072,Zombieland: Double Tap (2019),Action|Comedy|Horror,Zombieland Double Tap 2019
