In [120]:
import pandas as pd
import numpy as np
import requests
from sklearn.feature_extraction.text import TfidfVectorizer         ## To vectorize the movie names
import re                                                           ## To clean text
from sklearn.metrics.pairwise import cosine_similarity              ## To calculate similarity
from ipywidgets import  widgets                                     ## To create interactive interface
from IPython.display import display                                   ## To create interactive interface
from zipfile import ZipFile

In [121]:
ZIP_PATH = 'data.zip'                                      ## Archive that holds the parquet files
et = {}                                                    ## Maps 'data_<n>' -> path of the n-th extracted member
## The path used to be stored in a variable called `zip`, which shadowed the built-in zip().
## Opening the archive directly in the `with` statement closes the handle when the block ends.
with ZipFile(ZIP_PATH) as zip_data:
    for idx, member in enumerate(zip_data.namelist()):
        ## extract() writes the member to the working directory and returns the resulting path
        et['data_' + str(idx)] = zip_data.extract(member)
## NOTE(review): the keys follow the archive's member order; later cells assume 'data_0' is the
## movies file and 'data_1' the ratings file -- confirm if the archive is ever rebuilt.
et

{'data_0': 'C:\\Users\\yusuf\\GitHub\\Movie_Recommend\\movies.parquet',
 'data_1': 'C:\\Users\\yusuf\\GitHub\\Movie_Recommend\\ratings1.parquet'}

In [122]:
## Read the two extracted parquet files: 'data_0' becomes the movies frame, 'data_1' the ratings frame.
df, ratings = (pd.read_parquet(et[key]) for key in ('data_0', 'data_1'))
## Disabled experiment, kept for reference: drop rows whose genres text contains '('.
# df = df[~df.genres.str.contains('\(')]

In [123]:
# df[~df.genres.str.contains('\(')]

In [124]:
def clean_title(x):
    """Remove punctuation from a title.

    Every character that is not a word character (letters, digits, underscore)
    or a plain space is deleted; nothing is inserted in its place.

    Parameters
    ----------
    x : str
        Raw title text.

    Returns
    -------
    str
        The title with all other characters removed.
    """
    ## Raw string literal: in a plain literal '\w' is an invalid escape sequence,
    ## which recent Python versions report with a warning at compile time.
    return re.sub(r'[^\w ]', '', x)

In [125]:
## Trim surrounding whitespace from every title, then strip its punctuation with clean_title.
df['clean_title'] = df['title'].map(lambda raw: clean_title(raw.strip()))

In [126]:
## TF-IDF model over the cleaned titles; ngram_range=(1, 2) uses single words and two-word sequences as features.
vec = TfidfVectorizer(ngram_range=(1, 2))
## Learn the vocabulary and vectorize every title in one step; search() compares queries against this matrix.
vec_data = vec.fit_transform(df['clean_title'])

In [127]:
def search(query, top_n=10):
    """Return the movies whose titles are most similar to ``query``.

    The query is cleaned with ``clean_title``, vectorized with the already
    fitted ``vec`` and compared with every row of ``vec_data`` by cosine
    similarity.

    Parameters
    ----------
    query : str
        Free-text movie title.
    top_n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df``, best match first. Rows are picked by
        rank only, so rows with a similarity of zero can appear when fewer
        than ``top_n`` titles match.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative')
    ## Only transform here: the vocabulary was fitted on the movie titles already.
    query_vec = vec.transform([clean_title(query)])
    similarity = cosine_similarity(query_vec, vec_data).flatten()
    ## argsort is ascending, so reverse it first and then keep the leading top_n positions.
    ## Slicing after the reversal also makes top_n=0 yield an empty frame.
    locs = np.argsort(similarity)[::-1][:top_n]
    return df.iloc[locs].drop_duplicates()

In [128]:
search('Dark knight')                       ## Manual check: the best-matching rows for this query are displayed below

Unnamed: 0,movieId,title,genres,clean_title
27811,130219,The Dark Knight (2011),Action|Crime|Drama|Thriller,The Dark Knight 2011
12221,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,Dark Knight The 2008
17464,91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX,Dark Knight Rises The 2012
4793,4899,Black Knight (2001),Adventure|Comedy|Fantasy,Black Knight 2001
166,168,First Knight (1995),Action|Drama|Romance,First Knight 1995
28055,130820,Street Knight (1993),(no genres listed),Street Knight 1993
46381,172327,Knight Rider (2008),(no genres listed),Knight Rider 2008
33974,144392,Underdog Knight 2 (2011),Action,Underdog Knight 2 2011
6881,7006,Knight Moves (1992),Mystery|Thriller,Knight Moves 1992
26083,125147,The Black Knight (1954),Action|Adventure,The Black Knight 1954


In [129]:
## Interactive title search: an output area for the matches and a text box that drives it.
movie_list = widgets.Output()
movie_input = widgets.Text(description = 'Movie Title',
                           value = 'Toy Story',
                           disabled = False)

def on_type(data):
    """Refresh ``movie_list`` with the search results for the text box's new value.

    ``data`` is the change notification delivered by ``observe``; its ``'new'``
    entry is read as the current text.
    """
    with movie_list:
        movie_list.clear_output()
        title = data['new']
        if len(title) <= 3:                  ## Too short to search: leave the output cleared
            return
        display(search(title))


## Call on_type every time the widget's 'value' trait changes, then render both widgets.
movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title')

Output()

In [130]:
def recommendation(movie_id, ratings_df=None, movies_df=None, top_n=20):
    """Recommend movies liked by the users who liked ``movie_id``.

    Users who rated ``movie_id`` above 4 are treated as "similar users". Every
    movie that more than 10% of them also rated above 4 becomes a candidate and
    is scored as ``2 * similar + all``, where ``similar`` is the share of
    similar users who liked it and ``all`` is the share of all users (who liked
    at least one candidate) who liked it. Candidates are then restricted to
    those sharing at least one genre with the requested movie.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendation on.
    ratings_df : pandas.DataFrame, optional
        Frame with ``userId``, ``movieId`` and ``rating`` columns. Defaults to
        the notebook-level ``ratings`` frame.
    movies_df : pandas.DataFrame, optional
        Frame with ``movieId``, ``title`` and ``genres`` columns. Defaults to
        the notebook-level ``df`` frame.
    top_n : int, default 20
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``genres`` and ``score``, highest score first. The
        frame is empty when nobody rated the movie above 4.
    """
    ## Fall back to the notebook-level frames so existing one-argument calls keep working.
    if ratings_df is None:
        ratings_df = ratings
    if movies_df is None:
        movies_df = df

    ## Only ratings above 4 count as "liked"; every step below works on this subset.
    liked = ratings_df[ratings_df['rating'] > 4]

    ## Users who liked the requested movie.
    similar_users = liked.loc[liked['movieId'] == movie_id, 'userId'].unique()
    if len(similar_users) == 0:
        ## No similar users means no candidates; return an empty result
        ## instead of failing later on `.iloc[0]`.
        return pd.DataFrame(columns=['title', 'genres', 'score'])

    ## Share of the similar users who liked each movie; keep those above 10%.
    similar_share = liked.loc[liked['userId'].isin(similar_users), 'movieId'].value_counts() / len(similar_users)
    similar_share = similar_share[similar_share > 0.1]

    ## Share of all users (who liked at least one candidate) who liked each candidate.
    liked_candidates = liked[liked['movieId'].isin(similar_share.index)]
    overall_share = liked_candidates['movieId'].value_counts() / len(liked_candidates['userId'].unique())

    ## Put both shares side by side, indexed by movieId.
    combined_recs = pd.concat([similar_share, overall_share], axis=1)
    combined_recs.columns = ['similar', 'all']
    ## Additive score: the share among similar users counts double, the overall share is added on top.
    ## (Popular movies are rewarded by this formula rather than penalized.)
    combined_recs['score'] = 2 * combined_recs['similar'] + combined_recs['all']
    combined_recs = combined_recs.sort_values('score', ascending=False)

    ## Attach title and genres; the merged frame carries the movie frame's index.
    results = combined_recs.merge(movies_df, left_index=True, right_on='movieId')[['title', 'genres', 'score']]
    if results.empty:
        return results

    ## Anchor the genre filter on the requested movie itself. The top-scored row is only a
    ## fallback, because with the additive score another movie can outrank the requested one.
    own_genres = movies_df.loc[movies_df['movieId'] == movie_id, 'genres']
    anchor = own_genres.iloc[0] if len(own_genres) > 0 else results['genres'].iloc[0]
    wanted = set(anchor.split('|'))

    ## Keep the rows that share at least one genre with the anchor.
    shares_genre = results['genres'].map(lambda text: not wanted.isdisjoint(text.split('|')))
    return results[shares_genre.to_numpy()].head(top_n)

In [131]:
movie_input_name = widgets.Text(          ## Text box in which the movie title is typed
    value='Toy Story',                         ## initial value
    description = 'Movie Title:',
    disabled = False)

recommendation_list = widgets.Output()     ## Output area that holds the recommendation table

## This callback has its own name: the title-search cell already defines an `on_type`,
## and defining a second function with that name would silently shadow the first.
def on_recommendation_type(data):
    """Show recommendations for the best title match of the text box's new value.

    ``data`` is the change notification delivered by ``observe``; its ``'new'``
    entry is read as the current text.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data['new']
        if len(title) <= 3:                    ## Too short to search: leave the output cleared
            return
        results = search(title)
        if results.empty:                      ## Nothing to base a recommendation on
            return
        movie_id = results.iloc[0]['movieId']  ## First row of the search result, i.e. the best match
        display(recommendation(movie_id))

movie_input_name.observe(on_recommendation_type, names='value')
display(movie_input_name, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [132]:
def recommendation1(movie_id):
    """Score candidate movies by how much more the fans of ``movie_id`` like them than everyone else.

    Users who rated ``movie_id`` above 4 form the "similar users" group. Each
    movie that more than 10% of that group also rated above 4 is a candidate.
    Its score is ``similar / all``: the share of similar users who liked it
    divided by the share of all users (who liked at least one candidate) who
    liked it.

    Reads the notebook-level ``ratings`` and ``df`` frames.

    Returns a DataFrame with ``title``, ``genres`` and ``score``, highest score
    first. No genre filter and no row limit are applied here.
    """
    ## Ratings above 4 are what counts as "liked" throughout.
    high_rating = ratings.rating > 4

    ## Users who liked the requested movie.
    similar_users = ratings.loc[high_rating & (ratings.movieId == movie_id), 'userId'].unique()

    ## Share of the similar users who liked each movie, kept only when above 10%.
    liked_by_similar = ratings.loc[high_rating & ratings.userId.isin(similar_users), 'movieId']
    similar_share = liked_by_similar.value_counts() / len(similar_users)
    similar_share = similar_share[similar_share > 0.1]

    ## Share of all users (who liked at least one candidate) who liked each candidate.
    liked_candidates = ratings[high_rating & ratings.movieId.isin(similar_share.index)]
    overall_share = liked_candidates['movieId'].value_counts() / len(liked_candidates['userId'].unique())

    ## Both shares side by side, indexed by movieId.
    combined_recs = pd.concat([similar_share, overall_share], axis=1)
    combined_recs.columns = ['similar', 'all']

    ## Ratio score: large when the similar users like a movie far more than the overall population does.
    combined_recs['score'] = combined_recs['similar'] / combined_recs['all']
    combined_recs = combined_recs.sort_values('score', ascending=False)

    ## Attach the movie details and keep the three columns of interest.
    return combined_recs.merge(df, left_index=True, right_on='movieId')[['title', 'genres', 'score']]

In [133]:
## Build the table for movieId 2026 once and reuse it; it was previously recomputed for each line below.
recs_2026 = recommendation1(2026)
## Genres of the top-scored row (presumably the requested movie itself -- TODO confirm).
te = recs_2026.genres.iloc[0].split('|')
## 1 for rows that share at least one genre with `te`, 0 otherwise.
frr = recs_2026.genres.apply(lambda x: 1 if any(k in x.split('|') for k in te) else 0)
## Index labels of the rows that share a genre.
ids = frr[frr==1].index

In [134]:
## Keep only the rows whose index label is in `ids`. The callable indexer receives the freshly
## built table, so recommendation1 runs once here instead of twice.
recommendation1(2026).loc[lambda recs: recs.index.isin(ids)]

Unnamed: 0,title,genres,score
1937,Disturbing Behavior (1998),Horror|Thriller,3210.250000
2457,"Rage: Carrie 2, The (1999)",Horror,987.769231
2188,Urban Legend (1998),Horror|Thriller,428.033333
2224,Bride of Chucky (Child's Play 4) (1998),Comedy|Horror|Thriller,396.327160
2714,Teaching Mrs. Tingle (1999),Comedy|Thriller,391.493902
...,...,...,...
536,Blade Runner (1982),Action|Sci-Fi|Thriller,1.109150
49,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1.060223
600,Fargo (1996),Comedy|Crime|Drama|Thriller,1.031287
292,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,0.821456
