In [87]:
import pandas as pd
import numpy as np
import requests
from sklearn.feature_extraction.text import TfidfVectorizer         ## To vectorize the movie names
import re                                                           ## To clean text
from sklearn.metrics.pairwise import cosine_similarity              ## To calculate similarity
from ipywidgets import  widgets                                     ## To create interactive interface
from IPython.display import display                                   ## To create interactive interface
from zipfile import ZipFile

In [88]:
## Extract every member of the archive into the current working directory and
## record the path each one was written to, keyed 'data_0', 'data_1', ... in archive order.
zip_path = 'data.zip'                 ## Named zip_path so the builtin zip() is not shadowed
with ZipFile(zip_path) as zip_data:   ## The archive is closed as soon as the extraction is done
    et = {'data_' + str(idx): zip_data.extract(member)
          for idx, member in enumerate(zip_data.namelist())}
et

{'data_0': 'C:\\Users\\yusuf\\GitHub\\Movie_Recommend\\movies.parquet',
 'data_1': 'C:\\Users\\yusuf\\GitHub\\Movie_Recommend\\ratings1.parquet'}

In [89]:
## Load the two extracted parquet files: the movie table and the user ratings table.
df = pd.read_parquet(et['data_0'])
ratings = pd.read_parquet(et['data_1'])
## Drop the movies whose genres text contains an opening parenthesis
## (presumably placeholder values such as "(no genres listed)" -- TODO confirm).
## The pattern is a raw string so that the regex escape \( is not read as an invalid
## Python string escape, and .copy() makes df an independent frame so that adding the
## clean_title column afterwards does not trigger SettingWithCopyWarning.
df = df[~df.genres.str.contains(r'\(')].copy()

In [90]:
def clean_title(x):
    """Strip punctuation from a title so that it can be vectorized.

    Every character that is neither a word character (letter, digit or
    underscore) nor a space is removed; the remaining characters are kept
    in their original order.

    Parameters
    ----------
    x : str
        Raw title text.

    Returns
    -------
    str
        The title without punctuation.
    """
    ## Raw string: '\w' inside a plain literal is an invalid escape sequence and
    ## triggers a warning on recent Python versions.
    return re.sub(r'[^\w ]', '', x)

In [91]:
## Add the cleaned title column: trim the surrounding whitespace of every raw title, then strip its punctuation.
df['clean_title'] = df['title'].apply(lambda raw_title: clean_title(raw_title.strip()))

In [92]:
## TF-IDF model that considers single words as well as pairs of adjacent words.
vec = TfidfVectorizer(ngram_range=(1, 2))
## Learn the vocabulary from the cleaned titles and encode every title as one row of TF-IDF weights.
vec_data = vec.fit_transform(df['clean_title'])

In [93]:
def search(query, top_n=10):
    """Return the movies whose cleaned titles are most similar to ``query``.

    The query is cleaned with ``clean_title``, encoded with the already fitted
    vectorizer ``vec`` and compared against every row of ``vec_data`` using
    cosine similarity.

    Parameters
    ----------
    query : str
        Free-text movie title typed by the user.
    top_n : int, default 10
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` ordered from the most to the least similar title, with
        exact duplicate rows removed.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.

    Notes
    -----
    NOTE(review): the best ``top_n`` rows are returned regardless of how low
    their similarity is, so unrelated titles can show up after the real
    matches (as in the sample 'Inception' output). Callers that only need the
    best match should read the first row.
    """
    if top_n < 1:
        raise ValueError('top_n must be a positive integer')
    ## Only transform here: the vocabulary was learned once, when vec was fitted.
    query_vec = vec.transform([clean_title(query)])
    similarity = cosine_similarity(query_vec, vec_data).flatten()
    ## argsort is ascending, so reverse it and keep the first top_n positions.
    best_first = np.argsort(similarity)[::-1][:top_n]
    return df.iloc[best_first].drop_duplicates()

In [94]:
search('Inception')                       ## Quick check: look up a known title and inspect the returned matches

Unnamed: 0,movieId,title,genres,clean_title
14937,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,Inception 2010
62422,209171,Women of Devil's Island (1962),Action|Adventure|Drama,Women of Devils Island 1962
19279,100253,Karlsson Brothers (Bröderna Karlsson) (2010),Comedy|Drama,Karlsson Brothers Bröderna Karlsson 2010
19137,99677,Dr. Bronner's Magic Soapbox (2006),Documentary,Dr Bronners Magic Soapbox 2006
19136,99675,Eat Sleep Die (Äta sova dö) (2012),Drama,Eat Sleep Die Äta sova dö 2012
19135,99673,Manuel on the Island of Wonders (Manoel dans l...,Drama,Manuel on the Island of Wonders Manoel dans lî...
19134,99671,Fragments of an Alms-Film (Fragmentos de um Fi...,Comedy,Fragments of an AlmsFilm Fragmentos de um Film...
19133,99669,Aftermath (1994),Horror,Aftermath 1994
19132,99667,Excuse Me for Living (2012),Comedy|Romance,Excuse Me for Living 2012
19131,99665,L'homme qui rit (2012),Drama|Fantasy|Romance,Lhomme qui rit 2012


In [95]:
## Interactive search interface: a text box pre-filled with a sample title
## and an output area in which the matching movies are shown.
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title',
    disabled=False,
)
movie_list = widgets.Output()

def on_type(data):
    """Refresh the search results shown in ``movie_list``.

    ``data`` is the change notification handed over by the widget; only its
    ``'new'`` entry (the current text) is read. The output area is always
    cleared, and a new search is displayed only for texts longer than three
    characters.
    """
    with movie_list:
        movie_list.clear_output()
        title = data['new']
        if len(title) <= 3:
            return
        display(search(title))


## Call on_type whenever the 'value' trait of the text box changes, then show the text box and its output area.
movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title')

Output()

In [97]:
def recommendation(movie_id, rating_threshold=4, min_share=0.1, top_n=20):
    """Recommend movies liked by the users who liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``rating_threshold``. The steps are:

    1. Similar users: everybody who liked ``movie_id``.
    2. Candidates: movies liked by more than ``min_share`` of the similar users
       (column ``similar`` holds that share).
    3. For every candidate, the number of likes divided by the number of
       distinct users who liked at least one candidate (column ``all``).
    4. ``score = 2 * similar + all``; candidates are sorted by descending score.
    5. Only candidates sharing at least one genre with the top-scored candidate
       are kept.

    NOTE(review): the original comments described the score as the ratio
    ``similar / all`` (which favours movies that are popular among similar
    users only), whereas the code computes a weighted sum that also rewards
    overall popularity. The formula is left untouched here; confirm which one
    is intended.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    rating_threshold : float, default 4
        Ratings strictly above this value count as a like.
    min_share : float, default 0.1
        A candidate must be liked by more than this share of the similar users.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``genres`` and ``score``, best score first. The
        frame is empty when nobody liked ``movie_id`` or when none of the
        candidates is present in ``df``.
    """
    ## Every step only looks at likes, so filter the ratings once.
    liked = ratings[ratings.rating > rating_threshold]

    ## Users who liked the requested movie.
    similar_users = liked.loc[liked.movieId == movie_id, 'userId'].unique()
    if len(similar_users) == 0:
        ## Nothing to base a recommendation on (this also avoids dividing by zero below).
        return pd.DataFrame(columns=['title', 'genres', 'score'])

    ## Share of the similar users that liked each of the other movies.
    recs = liked.loc[liked.userId.isin(similar_users), 'movieId']
    recs = recs.value_counts() / len(similar_users)
    recs = recs[recs > min_share]

    ## Likes of the candidate movies, relative to the number of distinct
    ## users who liked at least one of the candidates.
    all_ = liked[liked.movieId.isin(recs.index)]
    all_recs = all_['movieId'].value_counts() / len(all_['userId'].unique())

    ## Put both shares side by side and compute the score.
    combined_recs = pd.concat([recs, all_recs], axis=1)
    combined_recs.columns = ['similar', 'all']
    combined_recs['score'] = 2 * combined_recs['similar'] + combined_recs['all']
    combined_recs = combined_recs.sort_values('score', ascending=False)

    ## Attach title and genres; candidates that are missing from df are dropped by the inner merge.
    results = combined_recs.merge(df, left_index=True, right_on='movieId')[['title', 'genres', 'score']]
    if results.empty:
        ## Previously this case raised IndexError on the .iloc[0] below.
        return results

    ## Keep only the movies that share at least one genre with the top-scored row.
    genr = set(results.genres.iloc[0].split('|'))
    shares_genre = results.genres.apply(lambda genres: not genr.isdisjoint(genres.split('|')))

    return results[shares_genre].head(top_n)

In [98]:
## Recommendation interface: a text box pre-filled with a sample title
## and an output area in which the recommended movies are shown.
movie_input_name = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False,
)

recommendation_list = widgets.Output()
def on_type(data):
    """Refresh the recommendations shown in ``recommendation_list``.

    ``data`` is the change notification handed over by the widget; only its
    ``'new'`` entry (the current text) is read. The output area is always
    cleared. For texts longer than three characters, the best search match is
    looked up and the recommendations for its ``movieId`` are displayed.

    Note: this handler shares its name with the one defined for the search
    box, so this definition replaces that name in the notebook namespace.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data['new']
        if len(title) <= 3:
            return
        best_match = search(title).iloc[0]
        display(recommendation(best_match['movieId']))

## Call on_type whenever the 'value' trait of the text box changes, then show the text box and its output area.
movie_input_name.observe(on_type,names='value')
display(movie_input_name,recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()