In [167]:
import pandas as pd
import numpy as np
import requests
from sklearn.feature_extraction.text import TfidfVectorizer         ## To vectorize the movie names
import re                                                           ## To clean text
from sklearn.metrics.pairwise import cosine_similarity              ## To calculate similarity
from ipywidgets import  widgets                                     ## To create interactive interface
from IPython.display import display                                   ## To create interactive interface
from zipfile import ZipFile

In [168]:
archive_path = 'data.zip'                                          ## Named archive_path so the built-in zip() is not shadowed
et = {}                                                            ## Maps 'data_<n>' -> path of the n-th extracted archive member
with ZipFile(archive_path) as zip_data:                            ## Open inside the with-block so the archive is always closed
    for idx, member in enumerate(zip_data.namelist()):
        et['data_' + str(idx)] = zip_data.extract(member)          ## extract() writes into the working directory and returns the path
et

{'data_0': 'C:\\Users\\yusuf\\GitHub\\Movie_Recommend\\movies.parquet',
 'data_1': 'C:\\Users\\yusuf\\GitHub\\Movie_Recommend\\ratings1.parquet'}

In [169]:
## Load the two extracted parquet files: 'data_0' becomes the movies frame, 'data_1' the ratings frame.
df, ratings = (pd.read_parquet(et[key]) for key in ('data_0', 'data_1'))

In [170]:
def clean_title(x):
    """Strip punctuation from a title so it can be matched as plain text.

    Parameters
    ----------
    x : str
        Raw movie title or search query.

    Returns
    -------
    str
        ``x`` without any character that is neither a word character
        (letter, digit or underscore) nor a space. Remaining spaces are
        kept as they are, so removing a symbol that sat between two spaces
        leaves a double space behind.
    """
    ## Raw string: in a plain literal '\w' is an invalid escape sequence and triggers a warning on modern Python
    return re.sub(r'[^\w ]', '', x)

In [171]:
## Store a punctuation-free copy of every title (outer whitespace trimmed first) for text matching.
df['clean_title'] = df['title'].map(lambda raw_title: clean_title(raw_title.strip()))

In [172]:
## TF-IDF vectorizer that considers single words as well as adjacent word pairs (ngram_range=(1, 2)).
vec = TfidfVectorizer(ngram_range=(1, 2))
## Learn the vocabulary from the cleaned titles and encode every title as a TF-IDF vector.
vec_data = vec.fit_transform(df['clean_title'])

In [173]:
def search(query, top_n=10):
    """Return the movies whose cleaned titles are most similar to ``query``.

    Parameters
    ----------
    query : str
        Free-text title; it is cleaned with ``clean_title`` before matching.
    top_n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` ordered from the highest to the lowest cosine
        similarity, with fully duplicated rows dropped.

    Notes
    -----
    A query that shares no term with the fitted vocabulary scores 0 against
    every title, so the rows returned in that case are arbitrary.
    """
    cleaned = clean_title(query)                                          ## Same cleaning as applied to the stored titles
    query_vec = vec.transform([cleaned])                                  ## Only transform: the vocabulary is already fitted
    similarity = cosine_similarity(query_vec, vec_data).flatten()         ## One similarity score per movie
    locs = np.argsort(similarity)[::-1][:top_n]                           ## Positions of the top_n scores, best first
    return df.iloc[locs].drop_duplicates()                                ## Look the positions up in the movie frame

In [174]:
## Smoke test of the title search; the closest title matches are listed first.
search('Game of Thrones')

Unnamed: 0,movieId,title,genres,clean_title
60130,202533,Game of Thrones: The Last Watch (2019),Documentary,Game of Thrones The Last Watch 2019
59481,200950,Game of Thrones - Conquest & Rebellion (2017),Animation|Fantasy,Game of Thrones Conquest Rebellion 2017
52157,184691,Game of Death (2017),Horror|Thriller,Game of Death 2017
19334,100457,Game of Death (2010),Action|Adventure|Thriller,Game of Death 2010
4336,4441,Game of Death (1978),Action,Game of Death 1978
40248,158984,A Game of Death (1945),Action|Adventure|Thriller,A Game of Death 1945
53051,186625,Game of Aces (2016),Action|Adventure|War,Game of Aces 2016
46545,172665,The Game of Truth (2013),Comedy|Drama,The Game of Truth 2013
23517,118324,Game of Werewolves (2011),Comedy|Horror,Game of Werewolves 2011
49587,179193,Enter the Game of Death (1978),Action,Enter the Game of Death 1978


In [175]:
movie_input = widgets.Text(value = 'Toy Story',                                 ## Text box the user types a movie title into
                           description = 'Movie Title',
                           disabled = False)
movie_list = widgets.Output()                                                   ## Area the search results are rendered into

def on_search_type(data):
    """Refresh ``movie_list`` with search results whenever the text box changes.

    Named ``on_search_type`` (not ``on_type``) because the recommendation
    cell defines its own ``on_type``; sharing the name made the later
    definition silently shadow this one.

    Parameters
    ----------
    data : dict-like
        Change notification delivered by ``observe``; ``data['new']`` is the
        text currently in the box.
    """
    with movie_list:
        movie_list.clear_output()                   ## Drop the results of the previous keystroke
        title = data['new']
        if len(title)>3:                            ## Only search once more than 3 characters are typed
            display(search(title))


movie_input.observe(on_search_type, names='value')  ## Fire the handler on every change of the box's value
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title')

Output()

In [176]:
def recommendation(movie_id, like_threshold=4, min_share=0.1, top_n=10):
    """Recommend movies liked by the users who also liked ``movie_id``.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendation on.
    like_threshold : float, default 4
        A rating counts as a "like" when it is strictly greater than this.
    min_share : float, default 0.1
        A candidate is kept only if more than this fraction of the similar
        users liked it.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``genres`` and ``score``, best score first.
        ``score`` is the share of similar users who liked the movie divided
        by its share among all users who liked at least one candidate, so a
        movie that is specific to the similar group outranks one that is
        liked by everybody.
    """
    ## Every rating that counts as a "like"; computed once and reused by all three filters below.
    liked = ratings[ratings.rating > like_threshold]
    ## Users who liked the given movie - referred to as the similar users.
    similar_users = liked.loc[liked.movieId == movie_id, 'userId'].unique()
    ## Ids of all movies the similar users liked (one entry per like).
    recs = liked.loc[liked.userId.isin(similar_users), 'movieId']
    ## Fraction of the similar users that liked each movie.
    recs = recs.value_counts() / len(similar_users)
    ## Keep the candidates liked by more than min_share of the similar users.
    recs = recs[recs > min_share]
    ## Likes that any user gave to one of the candidate movies.
    all_ = liked[liked.movieId.isin(recs.index)]
    ## Fraction of those users (everyone who liked at least one candidate) that liked each candidate.
    all_recs = all_['movieId'].value_counts() / len(all_['userId'].unique())
    ## Put both fractions side by side, aligned on movieId.
    combined_recs = pd.concat([recs, all_recs], axis=1)
    combined_recs.columns = ['similar', 'all']
    ## A high ratio means: popular among similar users but not among users in general.
    combined_recs['score'] = combined_recs['similar'] / combined_recs['all']
    combined_recs = combined_recs.sort_values('score', ascending=False)
    ## Attach title/genres via movieId and keep the top_n best-scoring rows.
    return combined_recs.merge(df, left_index=True, right_on='movieId').head(top_n)[['title', 'genres', 'score']]

In [177]:
## Text box for the movie title, pre-filled with an initial value.
movie_input_name = widgets.Text(value='Toy Story', description = 'Movie Title:', disabled = False)

## Area the recommendation table is rendered into.
recommendation_list = widgets.Output()

def on_type(data):
    """Show recommendations for the best title match of the typed text.

    ``data`` is the change notification delivered by ``observe``;
    ``data['new']`` holds the text currently in the box.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data['new']
        if len(typed) <= 3:                          ## Nothing is shown until more than 3 characters are typed
            return
        best_match = search(typed).iloc[0]           ## First row of the search result = closest title
        display(recommendation(best_match['movieId']))

movie_input_name.observe(on_type, names='value')
display(movie_input_name, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [178]:
## Scratch comparison: pick the 10 best matches with argpartition instead of a full argsort.
q = 'inception'
query = vec.transform([clean_title(q)])                               ## Clean, then vectorize  **  Only transform **
similarity = cosine_similarity(query, vec_data).flatten()             ## One similarity score per movie
locs = np.argpartition(similarity, -10)[-10:]                         ## Positions of the 10 highest scores, in NO particular order
locs = locs[np.argsort(similarity[locs])]                             ## Order just those 10 by score (ascending) so the ranking is meaningful
movies = df.iloc[locs][::-1].drop_duplicates()                        ## Reverse -> best match first
movies

Unnamed: 0,movieId,title,genres,clean_title
14937,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX,Inception 2010
62422,209171,Women of Devil's Island (1962),Action|Adventure|Drama,Women of Devils Island 1962
20805,107561,Someone Like Him (Einer wie Bruno) (2011),Comedy|Drama,Someone Like Him Einer wie Bruno 2011
20806,107563,"Princess for Christmas, A (2011)",Children|Comedy,Princess for Christmas A 2011
20807,107565,"Fuck You, Goethe (Fack Ju Göhte) (2013)",Comedy,Fuck You Goethe Fack Ju Göhte 2013
20808,107573,Apnea (Apnoia) (2010),Drama,Apnea Apnoia 2010
20804,107557,Fun Size (2012),Comedy,Fun Size 2012
20803,107548,Ice Quake (2010),Action|Sci-Fi|Thriller,Ice Quake 2010
20811,107603,Heiter bis wolkig (2012),Comedy|Drama|Romance,Heiter bis wolkig 2012
20809,107591,Open Up to Me (Kerron sinulle kaiken) (2013),Drama,Open Up to Me Kerron sinulle kaiken 2013


In [179]:
## `b` was not defined in any cell of this notebook, so this cell failed on a fresh kernel.
## The values are taken from the recorded outputs; their original order is unknown.
b = np.array([41, 10, 50, 23, 28])
## Sort ascending through argsort, keep the last three, reverse -> three largest values, biggest first.
b[b.argsort()][-3:][::-1]

array([50, 41, 28])

In [180]:
## Index b with its own argsort permutation -> the values of b in ascending order (relies on b from an earlier cell).
b[np.argsort(b)]

array([10, 23, 28, 41, 50])