In [1]:
import pandas as pd

movies = pd.read_csv(r'data\movies.csv', low_memory=False)
ratings = pd.read_csv(r'data\ratings.csv', low_memory=False)
movie_ratings = pd.merge(movies, ratings, left_on='movieId', right_on='movieId').drop_duplicates(['userId', 'title'])

In [2]:
movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [3]:
# make a list
movie_ratings2 = movie_ratings.copy()
movie_ratings2['genres'] = movie_ratings2.genres.str.split('|').tolist()
movie_ratings2 = movie_ratings2[['title', 'genres']] 

In [4]:
movie_ratings2.head()

Unnamed: 0,title,genres
0,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
2,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
3,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
4,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"


In [5]:
movie_ratings2 = movie_ratings2.explode('genres')
movie_ratings2.head()

Unnamed: 0,title,genres
0,Toy Story (1995),Adventure
0,Toy Story (1995),Animation
0,Toy Story (1995),Children
0,Toy Story (1995),Comedy
0,Toy Story (1995),Fantasy


In [6]:
# crosstab
movie_category = pd.crosstab(movie_ratings2['title'], movie_ratings2['genres'])

In [7]:
movie_category.head()

genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
'71 (2014),0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
'Hellboy': The Seeds of Creation (2004),0,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
'Round Midnight (1986),0,0,0,0,0,0,0,0,2,0,0,0,0,2,0,0,0,0,0,0
'Salem's Lot (2004),0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0
'Til There Was You (1997),0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,0,0


In [8]:
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform

# calculate the distances
jaccard_distances = pdist(movie_category.values, metric='jaccard')
square_jaccard_distances = squareform(jaccard_distances)
jaccard_distances

array([0.875     , 1.        , 0.66666667, ..., 1.        , 1.        ,
       1.        ])

In [9]:
# invert
jaccard_similarity_array = 1 - square_jaccard_distances
jaccard_similarity_array

array([[1.        , 0.125     , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.125     , 1.        , 0.        , ..., 0.        , 0.        ,
        0.16666667],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.16666667, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [10]:
# create a new dataframe with the scores and game titles
distance_df = pd.DataFrame(jaccard_similarity_array, index=movie_category.index, columns=movie_category.index)

distance_df

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.000000,0.125000,0.000000,0.333333,0.000000,0.0,0.0,0.25,0.0,0.0,...,0.4,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.000000
'Hellboy': The Seeds of Creation (2004),0.125000,1.000000,0.000000,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667
'Round Midnight (1986),0.000000,0.000000,1.000000,0.000000,0.333333,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
'Salem's Lot (2004),0.333333,0.000000,0.000000,1.000000,0.000000,0.0,0.0,0.25,0.0,0.0,...,0.4,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.000000
'Til There Was You (1997),0.000000,0.000000,0.333333,0.000000,1.000000,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.000000
xXx (2002),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000
xXx: State of the Union (2005),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000
¡Three Amigos! (1986),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.000000


In [20]:
# search for amovie
distance_df['Bungo Stray Dogs: Dead Apple (2018)'].sort_values(ascending=False)

title
Bungo Stray Dogs: Dead Apple (2018)                                            1.000000
Superman/Doomsday (2007)                                                       1.000000
Street Fighter II: The Animated Movie (Sutorîto Faitâ II gekijô-ban) (1994)    1.000000
Mortal Kombat: The Journey Begins (1995)                                       1.000000
Ghost in the Shell Arise - Border 1: Ghost Pain (2013)                         0.666667
                                                                                 ...   
Newton Boys, The (1998)                                                        0.000000
Next (2007)                                                                    0.000000
Next Best Thing, The (2000)                                                    0.000000
Next Friday (2000)                                                             0.000000
À nous la liberté (Freedom for Us) (1931)                                      0.000000
Name: Bungo Stray Dogs: De