In [32]:
import pandas as pd
# Raw string: the path uses Windows-style backslashes, and "\m" is not a valid
# escape sequence, so a normal string literal triggers an invalid-escape warning.
# The resulting path is exactly the same text as before.
movies = pd.read_csv(r"movieData1\ml-25m\movies.csv")
movies

  movies = pd.read_csv("movieData1\ml-25m\movies.csv")


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [33]:
import re

def clean_title(title):
    """Strip every character that is not an ASCII letter, a digit or a space.

    Parameters
    ----------
    title : str
        Raw movie title, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    str
        The title with all other characters removed, e.g. ``"Toy Story 1995"``.
        Removed characters are dropped, not replaced, so existing spaces stay.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

    

In [34]:
#.apply runs clean_title on every movie title and stores the result in a new "clean_title" column
movies["clean_title"] = movies["title"].apply(clean_title)

#sanity check: print the cleaned titles
print(movies["clean_title"])

#check the container type and the element dtype of column "clean_title"
print(type(movies["clean_title"]))
print(movies["clean_title"].dtype)


0                          Toy Story 1995
1                            Jumanji 1995
2                   Grumpier Old Men 1995
3                  Waiting to Exhale 1995
4        Father of the Bride Part II 1995
                       ...               
62418                             We 2018
62419             Window of the Soul 2001
62420                      Bad Poems 2018
62421                   A Girl Thing 2001
62422         Women of Devils Island 1962
Name: clean_title, Length: 62423, dtype: object
<class 'pandas.core.series.Series'>
object


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer as tfid

#ngram_range=(1,2): use both single words and two-word sequences from the titles as features
vectorizer = tfid(ngram_range=(1,2)) 

#fit_transform learns the vocabulary and weights from the cleaned titles, then returns
#their TF-IDF matrix (one row per entry of movies["clean_title"])
tfidF = vectorizer.fit_transform(movies["clean_title"])
tfidF


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 446566 stored elements and shape (62423, 170073)>

In [36]:
#search function
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose titles best match ``title``.

    The query is cleaned with ``clean_title``, projected with the fitted
    ``vectorizer`` and compared against every row of ``tfidF`` using cosine
    similarity. ``tfidF`` was built from ``movies["clean_title"]``, so row
    positions in the similarity vector are row positions in ``movies``.

    Parameters
    ----------
    title : str
        Free-text title entered by the user.
    top_n : int, default 5
        Maximum number of matches to return. The default reproduces the
        previous hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` for the best matches, most similar first. Fewer than
        ``top_n`` rows are returned only when the catalogue itself is smaller.
    """
    # Clean the query the same way the catalogue titles were cleaned.
    query_vec = vectorizer.transform([clean_title(title)])

    # One similarity value per catalogue title.
    similarity = cosine_similarity(query_vec, tfidF).flatten()

    # Clamp k: argpartition raises if asked for more elements than exist.
    k = min(top_n, similarity.shape[0])
    if k <= 0:
        return movies.iloc[0:0]

    # argpartition finds the k largest values without sorting the whole array ...
    top = np.argpartition(similarity, -k)[-k:]
    # ... so only those k are sorted, best match first (index array used to index an array).
    top = top[np.argsort(similarity[top])[::-1]]

    # NOTE(review): presumably a query with no known words scores 0 against every
    # title, in which case the rows returned are arbitrary - confirm if that matters.

    # Positional lookup: `top` holds row positions, not index labels.
    return movies.iloc[top]


In [37]:
#importing modules for visual elements
import ipywidgets as widgets 
from IPython.display import display


#text box the user types a movie title into (pre-filled with "Toy Story")
movie_input = widgets.Text(
    value="Toy Story",
    description="movie Title",
    disabled = False
)

#output widget that the search results are rendered into
movie_list = widgets.Output()

def on_type(data):
    """Observer callback for the search box: show matches for the typed title.

    ``data`` is the change notification passed in by ``observe``; its ``"new"``
    entry holds the text currently in the box. The previous results are always
    cleared; a new search runs only when the text is longer than 5 characters.
    """
    with movie_list:
        # Drop whatever the previous change rendered.
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))


#call on_type every time the text box's value changes, then show the text box and the results area
movie_input.observe(on_type,names='value')
display(movie_input,movie_list)


Text(value='Toy Story', description='movie Title')

Output()

Search functionality ends here.
The recommendation system is built from here on.

In [38]:

#load the user ratings; the raw string keeps the backslashes in the Windows-style path literal
ratings = pd.read_csv(r"movieData1\ml-25m\ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [39]:
# Seed movie for the walkthrough: movieId 1.
movie_id = 1

# Users who rated the seed movie 4 or higher.
similar_users = ratings[
    (ratings["rating"] >= 4) & (ratings["movieId"] == movie_id)
]["userId"].unique()

# Every movie those same users rated 4 or higher (one entry per rating row).
similar_user_rec = ratings[
    (ratings["rating"] >= 4) & (ratings["userId"].isin(similar_users))
]["movieId"]

similar_users


array([     3,      5,      8, ..., 162530, 162533, 162534],
      shape=(37709,))

In [40]:
#share of the similar users who liked each movie; keep only movies liked by more than 10% of them
#NOTE: this cell overwrites similar_user_rec with a value computed from itself, so re-running it
#without first re-running the cell that builds similar_user_rec gives different results
similar_user_rec = similar_user_rec.value_counts()/len(similar_users)
similar_user_rec = similar_user_rec[similar_user_rec > .1]
similar_user_rec

movieId
1       1.000000
318     0.549604
260     0.531518
356     0.517224
296     0.495744
          ...   
235     0.101249
1242    0.100931
1907    0.100772
3527    0.100613
2761    0.100135
Name: count, Length: 273, dtype: float64

In [41]:
# Ratings of 4 or higher, from any user, for the candidate movies found above.
all_users = ratings[
    (ratings["rating"] >= 4) & (ratings["movieId"].isin(similar_user_rec.index))
]

# Fraction of those users who liked each candidate movie.
all_users_rec = (
    all_users["movieId"].value_counts() / len(all_users["userId"].unique())
)
all_users_rec

movieId
318     0.440215
296     0.389659
356     0.367553
593     0.361897
2571    0.347994
          ...   
3175    0.049325
2081    0.047128
1282    0.044712
2761    0.039855
1907    0.039805
Name: count, Length: 273, dtype: float64

In [42]:
#put both percentages side by side, one row per movieId, so a recommendation score can be computed
#"similar" = share among users who liked the chosen movie, "all" = share among users in general
rec_percentages = pd.concat([similar_user_rec,all_users_rec],axis=1)
rec_percentages.columns = ["similar","all"]
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.235415
318,0.549604,0.440215
260,0.531518,0.325251
356,0.517224,0.367553
296,0.495744,0.389659
...,...,...
235,0.101249,0.055281
1242,0.100931,0.050805
1907,0.100772,0.039805
3527,0.100613,0.056879


In [43]:
# Score: how much more often the similar users liked a movie than users in general did.
rec_percentages["score"] = (
    rec_percentages["similar"] / rec_percentages["all"]
)

# Highest score first, then attach the movie details to the ten best rows.
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)
rec_percentages.head(10).merge(
    movies,
    left_index=True,
    right_on="movieId",
)

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.235415,4.247819,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.328914,0.102241,3.217054,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14813,0.161924,0.05771,2.80584,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
2264,0.191095,0.068978,2.770367,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
1992,0.120714,0.047128,2.561408,2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance,Little Mermaid The 1989
1818,0.100772,0.039805,2.531636,1907,Mulan (1998),Adventure|Animation|Children|Comedy|Drama|Musi...,Mulan 1998
2669,0.100135,0.039855,2.512494,2761,"Iron Giant, The (1999)",Adventure|Animation|Children|Drama|Sci-Fi,Iron Giant The 1999
1005,0.12806,0.054719,2.340299,1028,Mary Poppins (1964),Children|Comedy|Fantasy|Musical,Mary Poppins 1964
1047,0.231801,0.099113,2.338762,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
1249,0.103636,0.044712,2.317855,1282,Fantasia (1940),Animation|Children|Fantasy|Musical,Fantasia 1940


In [None]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.1, top_n=10):
    """Recommend movies liked by the users who also liked ``movie_id``.

    Steps:
      1. Find the "similar users": everyone who rated ``movie_id`` at least
         ``min_rating``.
      2. For every movie, compute the share of similar users who liked it and
         keep the movies whose share exceeds ``min_share``.
      3. For those candidates, compute the share of users in general who liked
         them (relative to all users who liked at least one candidate).
      4. Score each candidate as ``similar / all`` and return the best ones.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        Lowest rating that counts as "liked".
    min_share : float, default 0.1
        A candidate must be liked by more than this fraction of similar users.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows, highest score first, with the columns
        ``similar``, ``all`` and ``score`` followed by the matching columns of
        ``movies``. The defaults reproduce the previous hard-coded behaviour.
    """
    # Evaluate the "liked" mask once; it was previously recomputed three times
    # over the full ratings table.
    liked = ratings["rating"] >= min_rating

    # Users who liked the chosen movie, and every movie those users liked.
    similar_users = ratings[(ratings["movieId"] == movie_id) & liked]["userId"].unique()
    similar_user_rec = ratings[ratings["userId"].isin(similar_users) & liked]["movieId"]

    # Share of similar users who liked each movie; keep the popular candidates.
    similar_user_rec = similar_user_rec.value_counts() / len(similar_users)
    similar_user_rec = similar_user_rec[similar_user_rec > min_share]

    # Share of users in general who liked each candidate.
    all_users = ratings[ratings["movieId"].isin(similar_user_rec.index) & liked]
    all_users_rec = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Both shares side by side, one row per movieId.
    rec_percentages = pd.concat([similar_user_rec, all_users_rec], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A score above 1 means the similar users like the movie more than users in general do.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")



In [46]:
#widget to recommend movies

# Text box the user types a movie title into (pre-filled with "Toy Story").
movie_input_name = widgets.Text(
    value="Toy Story",
    # Keyword spelling fixed: it was misspelled, so the label never reached the widget.
    description="movie title : ",
    disabled=False
)

# Output widget that the recommendations are rendered into.
recommendation_list = widgets.Output()

def on_type(data):
    """Observer callback for the recommendation box.

    ``data`` is the change notification passed in by ``observe``; its ``"new"``
    entry holds the text currently in the box. The previous output is always
    cleared. When the text is longer than 5 characters, the best title match
    from ``search`` is used as the seed for ``find_similar_movies``.

    NOTE: this reuses the name ``on_type`` from the search-widget cell, so it
    replaces that name in the notebook namespace. The search box was already
    given the earlier function object, so it keeps calling that one.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        # search() returns the best match first; use its movieId as the seed.
        best_match = search(typed).iloc[0]
        display(find_similar_movies(best_match["movieId"]))

#call on_type every time the text box's value changes, then show the text box and the recommendations area
movie_input_name.observe(on_type,names="value")
display(movie_input_name,recommendation_list)

Text(value='Toy Story')

Output()