In [24]:
import pandas as pd

movies = pd.read_csv(r"C:\Users\vs786\OneDrive\Desktop\DATA\Assignment Codiis\ml-latest\ml-latest\movies.csv")

In [25]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [26]:
import re #Cleaning Movies title with REGEX
def CT(title):
    title=re.sub("[^a-zA-Z0-9]","",title) #structured title by removing extra format 
    return title

In [27]:
 #Creating a new culomn clean title (CT) by assigning the value from function CT
movies["CT"]=movies["title"].apply(CT)

In [28]:
movies

Unnamed: 0,movieId,title,genres,CT
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,ToyStory1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji1995
2,3,Grumpier Old Men (1995),Comedy|Romance,GrumpierOldMen1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,WaitingtoExhale1995
4,5,Father of the Bride Part II (1995),Comedy,FatheroftheBridePartII1995
...,...,...,...,...
58093,193876,The Great Glinka (1946),(no genres listed),TheGreatGlinka1946
58094,193878,Les tribulations d'une caissière (2011),Comedy,Lestribulationsdunecaissire2011
58095,193880,Her Name Was Mumu (2016),Drama,HerNameWasMumu2016
58096,193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi,Flora2017


In [29]:
#Creating a TFIDF Matrix
#As we know computer understand data in numeric format so here we converted data in numeric form
#ML Libraries
from sklearn.feature_extraction.text import TfidfVectorizer  
#Instead of looking 1 individual words its look for 2 consecutive words
vectorizer = TfidfVectorizer(ngram_range=(1,2)) 
#Turns set of titles into set of numbers
tfidf = vectorizer.fit_transform(movies["CT"])

In [30]:
#Creating a serach function
#
from sklearn.metrics.pairwise import cosine_similarity 
import numpy as np

def search(title):
    #Clean the title
    title = CT(title)
    #Turn the query terms into a set of numbers
    query_vec = vectorizer.transform([title])
    #cosine_similarity:compare the query terms to each of the titles(clean titles) that we have in dataset
                      #and returns how similar our title is to each of those titles in dataset
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    #Indices returns 5 most similar titles through query term
    indices = np.argpartition(similarity, -5)[-5:]
    #index movie data by these indices
    results = movies.iloc[indices].iloc[::-1] #Reverse the result
    
    return results

In [31]:
#Search box with jupyter
import ipywidgets as widgets
from IPython.display import display

#Taking input here
movie_input = widgets.Text(
    value=' ',
    description='Movie Title:',
    disabled=False
)
#output widgets
movie_list = widgets.Output()
#this function is going to be called whenever we type something in search box
def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')


display(movie_input, movie_list)

Text(value=' ', description='Movie Title:')

Output()

In [32]:
movie_id= 89745
movie=movies[movies["movieId"]==movie_id]

In [33]:
ratings=pd.read_csv(r"C:\Users\vs786\OneDrive\Desktop\DATA\Assignment Codiis\ml-latest\ml-latest\ratings.csv")

In [34]:
#Finding Users who liked the same movies
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()


In [35]:
#Find the other movies they(similar_users) liked
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [36]:
#Converting into percentage
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
#Only consider those movies which has greater than 10% likes
similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [37]:
#It finds what percentage of just regular people outside of the set of people similar to us
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [38]:
#Percentage of all users who recommended these movies that are in similar_user_rec
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())


In [39]:
#Combinig two series, Compare the percentages
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [40]:
#Creating a recommendation score,
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]


In [41]:
rec_percentages = rec_percentages.sort_values("score", ascending=False)


In [42]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")


Unnamed: 0,similar,all,score,movieId,title,genres,CT
17997,1.0,0.030533,32.751846,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,AvengersThe2012
22251,0.101926,0.003921,25.996778,106072,Thor: The Dark World (2013),Action|Adventure|Fantasy|IMAX,ThorTheDarkWorld2013
27550,0.223756,0.00891,25.113352,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi,AvengersAgeofUltron2015
21101,0.211075,0.008983,23.496278,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,IronMan32013
17615,0.202408,0.008684,23.307042,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War,CaptainAmericaTheFirstAvenger2011
23452,0.265008,0.011679,22.69127,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,CaptainAmericaTheWinterSoldier2014
17138,0.168379,0.007523,22.382206,86332,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,Thor2011
27562,0.176404,0.008317,21.210535,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller,CaptainAmericaCivilWar2016
27554,0.120064,0.005915,20.296919,122900,Ant-Man (2015),Action|Adventure|Sci-Fi,AntMan2015
15271,0.233387,0.011669,20.000497,77561,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX,IronMan22010


In [43]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]


In [47]:
import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value=' ',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value=' ', description='Movie Title:')

Output()