In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [46]:
# Load the movie metadata and keep only the columns used further down.
meta = pd.read_csv('/kaggle/input/the-movies-dataset/movies_metadata.csv')

meta = meta[['id', 'original_title', 'original_language', 'genres']]
meta = meta.rename(columns={
    'id': 'movieId',
    'original_title': 'title',
    'original_language': 'language',
})
# .copy() makes the English-only subset an independent frame, so the column
# assignments that follow modify it directly rather than a slice of the full
# table (this is what triggers pandas' SettingWithCopyWarning otherwise).
meta = meta.loc[meta['language'] == 'en', :].copy()

# Ids are converted to numbers so they can be matched against the numeric
# 'movieId' columns of the other tables.
meta['movieId'] = pd.to_numeric(meta['movieId'])

def str_to_set(x):
    """Parse a stringified list of ``{'name': ...}`` records into a set of names.

    Parameters
    ----------
    x : str or missing value
        Text of a Python literal list of dicts that each have a ``'name'``
        key, e.g. ``"[{'id': 28, 'name': 'Action'}]"``. ``None`` and float
        NaN (how pandas represents an empty CSV cell) are treated as a list
        with no entries.

    Returns
    -------
    set
        The ``'name'`` value of every entry; empty when there are none.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is not a valid Python literal.
    """
    # Function-scope imports keep this cell runnable on its own.
    import ast
    import math

    if x is None or (isinstance(x, float) and math.isnan(x)):
        return set()

    # literal_eval only accepts Python literals, so text read from the CSV can
    # never execute code the way eval() would.
    return {item['name'] for item in ast.literal_eval(x)}
    
# Replace each raw genre string with the set of genre names it lists, then
# show the converted column as this cell's output.
meta['genres'] = meta['genres'].apply(str_to_set)
meta['genres']


In [47]:
# Load the per-movie keywords and parse each cell into a set of keyword names.
keywords = pd.read_csv('/kaggle/input/the-movies-dataset/keywords.csv')
keywords['keywords'] = keywords['keywords'].apply(str_to_set)
keywords = keywords.rename(columns={'id': 'movieId'})
keywords['movieId'] = pd.to_numeric(keywords['movieId'])

# Attach keywords to the metadata; the inner join keeps only movies that
# appear in both tables.
meta = pd.merge(meta, keywords, on='movieId', how='inner')

# Take the first matching row for each of the two example titles.
dk = meta.loc[meta.title == 'The Dark Knight'].iloc[0]
dkr = meta.loc[meta.title == 'The Dark Knight Rises'].iloc[0]
# A bare expression is rendered only when it is the last statement of a cell.
# More code follows in this cell, so display() (provided by the Jupyter
# kernel) is needed for the side-by-side comparison to actually appear.
display(pd.concat([dk, dkr], axis=1).T)

def jaccard_similarity(s1, s2):
    """Return the Jaccard index of two sets: |s1 & s2| / |s1 | s2|.

    Parameters
    ----------
    s1, s2 : set
        The two collections to compare.

    Returns
    -------
    int or float
        ``0`` when both sets are empty (the ratio would be 0/0), otherwise
        the size of the intersection divided by the size of the union.
    """
    union = s1 | s2
    union_size = len(union)
    if union_size == 0:
        return 0
    shared = s1 & s2
    return len(shared) / union_size

# Content similarity of the two example movies over their combined genres and
# keywords. This is not the last expression of the cell, so it has to go
# through display() (provided by the Jupyter kernel) to be shown.
display(jaccard_similarity(dk.genres | dk.keywords, dkr.genres | dkr.keywords))

ratings = pd.read_csv('/kaggle/input/the-movies-dataset/ratings_small.csv')
ratings['movieId'] = pd.to_numeric(ratings['movieId'])
# The inner join keeps only ratings whose movieId is present in `meta` and
# attaches the movie title to each of them.
ratings = pd.merge(ratings, meta[['movieId', 'title']], on='movieId', how='inner')

# userId x title table of ratings. pivot_table aggregates with the mean by
# default when a (userId, title) pair occurs more than once; combinations
# without any rating are left as NaN.
matrix = ratings.pivot_table(index='userId', columns='title', values='rating')

def pearson_similarity(u1, u2):
    """Return the correlation-style similarity of two rating vectors.

    Each input is centred on its own mean; the result is the sum of the
    element-wise products of the centred vectors divided by the square root
    of the product of their sums of squares.

    Parameters
    ----------
    u1, u2 : array-like with ``.mean()`` and arithmetic support
        For example numpy arrays or pandas Series.

    Returns
    -------
    number
        ``0`` when the denominator is zero (at least one input has no
        variation), otherwise the ratio described above.

    NOTE(review): for pandas Series containing NaN, pandas' mean and sum skip
    missing values, so the centring and the norms use each input's own
    non-missing entries while the numerator only covers positions present in
    both -- confirm this is the intended definition.
    """
    centered_1 = u1 - u1.mean()
    centered_2 = u2 - u2.mean()
    norm = np.sqrt(np.sum(centered_1 ** 2) * np.sum(centered_2 ** 2))
    # Guard against dividing by zero for constant inputs.
    if norm == 0:
        return 0
    return np.sum(centered_1 * centered_2) / norm

# Pull the rating column of each title out of the user x title matrix, then
# show their similarity as this cell's output.
dk_rating = matrix.loc[:, 'The Dark Knight']
pk_rating = matrix.loc[:, 'Prom Night']
pearson_similarity(dk_rating, pk_rating)

In [53]:
# Join the metadata with the keywords table on movieId (rows present in both).
# NOTE(review): `meta` was already merged with `keywords` in an earlier cell,
# and find_similar_movies reads `meta`, not `merged` -- confirm this frame is
# still needed.
merged = meta.merge(keywords, on='movieId', how='inner')
def find_similar_movies(input_title, matrix, n, alpha):
    """Rank the other titles in ``matrix`` by blended similarity to ``input_title``.

    For every column of ``matrix`` other than ``input_title`` the score is
    ``alpha * pearson + (1 - alpha) * jaccard``, where ``pearson`` is
    ``pearson_similarity`` of the two rating columns and ``jaccard`` is
    ``jaccard_similarity`` of the two movies' ``genres | keywords`` sets.
    Genres and keywords are read from the module-level ``meta`` frame, using
    the first row found for each title.

    Parameters
    ----------
    input_title : str
        Title to compare against; must be in ``meta['title']`` and be a
        column of ``matrix``.
    matrix : pandas.DataFrame
        Rating table with one column per title.
    n : int
        Maximum number of results to return.
    alpha : float
        Weight of the rating-based similarity; ``1 - alpha`` is the weight of
        the content-based similarity.

    Returns
    -------
    list of tuple
        ``(title, pearson, jaccard, score)`` tuples ordered by descending
        score, truncated to the first ``n``.

    Raises
    ------
    IndexError
        If ``input_title`` has no row in ``meta``.
    KeyError
        If ``input_title`` is not a column of ``matrix``, or a column of
        ``matrix`` has no row in ``meta``.
    """
    input_meta = meta.loc[meta['title'] == input_title].iloc[0]
    input_set = input_meta.genres | input_meta.keywords
    input_ratings = matrix[input_title]

    # First row per title, indexed by title. Built once so each candidate is a
    # direct lookup instead of a full scan of `meta` on every iteration.
    first_by_title = meta.drop_duplicates(subset='title').set_index('title')

    result = []
    for this_title in matrix.columns:
        if this_title == input_title:
            continue
        this_meta = first_by_title.loc[this_title]
        this_set = this_meta.genres | this_meta.keywords

        pearson = pearson_similarity(matrix[this_title], input_ratings)
        jaccard = jaccard_similarity(this_set, input_set)

        score = alpha * pearson + (1 - alpha) * jaccard
        result.append((this_title, pearson, jaccard, score))

    # Sort once after all candidates are scored. The sort is stable, so ties
    # stay in column order exactly as when sorting after every append.
    result.sort(key=lambda r: r[3], reverse=True)

    return result[:n]

# Ten best matches for 'The Dark Knight', with weight 0.3 on the rating-based
# similarity and 0.7 on the genre/keyword similarity, shown as a table.
result = find_similar_movies('The Dark Knight', matrix, n=10, alpha=0.3)
pd.DataFrame(result, columns=['title', 'pearson', 'jaccard', 'score'])