In [59]:
import pandas as pd

import warnings

# Install a global 'ignore' filter so no warning of any category is shown by
# the cells that follow.
# NOTE(review): this is a blanket filter -- deprecation/future warnings raised
# by library calls later in the notebook are hidden as well; consider narrowing
# it to the specific category that was noisy.
warnings.simplefilter('ignore')

In [60]:
# Read the two CSV tables this notebook works from. Both paths are relative
# to the notebook's working directory.
ratings_path = './ml-latest-small/ratings.csv'
movies_path = './ml-latest-small/movies.csv'
df_ratings = pd.read_csv(ratings_path)
df_movies = pd.read_csv(movies_path)

# Report (rows, columns) of each table as a quick sanity check on the load.
ratings_rows, ratings_cols = df_ratings.shape
movies_rows, movies_cols = df_movies.shape
print('ratings rows {}, columns {}.'.format(ratings_rows, ratings_cols))
print('movies rows {}, columns {}.'.format(movies_rows, movies_cols))

ratings rows 100836, columns 4.
movies rows 9742, columns 3.


In [61]:
# Preview the first five ratings rows (five is head()'s default).
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [62]:
# Preview the first five movie rows (five is head()'s default).
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [63]:
# Attach the movie title to every rating. The left join keeps each ratings
# row even when its movieId has no match in df_movies (title is NaN then).
title_lookup = df_movies[['movieId', 'title']]
df_merge = pd.merge(df_ratings, title_lookup, how='left', on='movieId')
df_merge.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1,4.0,964982703,Toy Story (1995)
1,1,3,4.0,964981247,Grumpier Old Men (1995)
2,1,6,4.0,964982224,Heat (1995)
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995)
4,1,50,5.0,964982931,"Usual Suspects, The (1995)"


In [64]:
# Reshape to a user x title matrix of ratings; unrated cells are NaN.
# aggfunc='mean' is pivot_table's default and is spelled out here: if several
# ratings land in the same (userId, title) cell they are averaged.
df_user_item = pd.pivot_table(
    df_merge,
    values='rating',
    index=['userId'],
    columns=['title'],
    aggfunc='mean',
)
df_user_item.tail()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,
610,4.0,,,,,,,,3.5,,...,,4.0,3.5,3.0,,,2.0,1.5,,


In [65]:
# Size of the pivoted matrix: rows are users, columns are distinct titles.
user_count, title_count = df_user_item.shape
print('df_user_item rows {}, columns {}.'.format(user_count, title_count))

df_user_item rows 610, columns 9719.


In [66]:
# Title-to-title Pearson correlation computed over the user rows.
# A pair of titles only gets a value when at least this many users rated
# both; every other pair is left as NaN.
min_shared_ratings = 100
corr_matrix = df_user_item.corr(min_periods=min_shared_ratings, method='pearson')

In [67]:
# Display the title-by-title correlation matrix. Title pairs that did not
# reach the min_periods threshold passed to .corr() appear as NaN, which is
# why the visible corner of the table is empty.
corr_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),,,,,,,,,,,...,,,,,,,,,,
xXx (2002),,,,,,,,,,,...,,,,,,,,,,
xXx: State of the Union (2005),,,,,,,,,,,...,,,,,,,,,,
¡Three Amigos! (1986),,,,,,,,,,,...,,,,,,,,,,


In [68]:
# Full ratings row for userId 7: one entry per title, NaN where unrated.
df_user_item.loc[7, :]

title
'71 (2014)                                  NaN
'Hellboy': The Seeds of Creation (2004)     NaN
'Round Midnight (1986)                      NaN
'Salem's Lot (2004)                         NaN
'Til There Was You (1997)                   NaN
                                             ..
eXistenZ (1999)                             NaN
xXx (2002)                                  NaN
xXx: State of the Union (2005)              NaN
¡Three Amigos! (1986)                       NaN
À nous la liberté (Freedom for Us) (1931)   NaN
Name: 7, Length: 9719, dtype: float64

In [69]:
# Keep only the titles userId 7 actually rated (drop the NaN entries).
user_row = df_user_item.loc[7]
user_ratings = user_row[user_row.notna()]
user_ratings

title
2001: A Space Odyssey (1968)           4.0
A.I. Artificial Intelligence (2001)    4.5
Aladdin (1992)                         3.0
American Beauty (1999)                 4.0
Apocalypse Now (1979)                  4.0
                                      ... 
What Women Want (2000)                 4.0
Wild Wild West (1999)                  1.5
X-Men (2000)                           3.5
X-Men: The Last Stand (2006)           4.0
X2: X-Men United (2003)                4.0
Name: 7, Length: 152, dtype: float64

In [70]:
# Build the pool of candidate titles for the user.
#
# For every title the user rated, take that title's column of the correlation
# matrix (dropping pairs with no correlation value) and scale it by the
# user's rating, so titles correlated with highly rated movies score higher.
#
# Changes from the previous version of this cell:
#   * iterate (title, rating) pairs instead of `user_ratings[i]`; integer
#     positional lookup on a label-indexed Series is deprecated in pandas,
#   * collect the pieces in a list and concatenate once; `Series.append` no
#     longer exists in pandas 2.x and appending in a loop is quadratic,
#   * multiply vectorised instead of `map(lambda ...)`,
#   * no dtype-less `pd.Series()` and no `inplace=True`.
weighted_sims = []
for title, rating in user_ratings.items():
    sims = corr_matrix[title].dropna()
    if sims.empty:
        # Nothing correlates with this title above the min_periods cut-off.
        continue
    weighted_sims.append(sims * rating)

if weighted_sims:
    simCandidates = pd.concat(weighted_sims)
else:
    # The user has no rated title with any usable correlation.
    simCandidates = pd.Series(dtype='float64')

# Same presentation as before: an anonymous Series with an unnamed index,
# ordered from the strongest to the weakest candidate. A title may appear
# several times here (once per rated movie it correlates with).
simCandidates = simCandidates.rename_axis(None).sort_values(ascending=False)
simCandidates.name = None

In [71]:
# Inspect the weighted candidates, strongest first. The same title can show
# up more than once at this stage -- one entry per rated movie it correlates
# with -- and is collapsed into a single score in a later cell.
simCandidates

Silence of the Lambs, The (1991)             5.000000
Back to the Future (1985)                    5.000000
Terminator, The (1984)                       5.000000
Forrest Gump (1994)                          5.000000
Jurassic Park (1993)                         5.000000
                                               ...   
Pulp Fiction (1994)                         -0.376668
Pulp Fiction (1994)                         -0.431635
Twelve Monkeys (a.k.a. 12 Monkeys) (1995)   -0.474771
Pulp Fiction (1994)                         -0.719040
Fargo (1996)                                -0.801146
Length: 697, dtype: float64

In [72]:
# Collapse repeated titles: add up every weighted contribution a title
# received so each title ends up with one total score, then rank the totals
# from highest to lowest.
simCandidates = (
    simCandidates
    .groupby(level=0)
    .sum()
    .sort_values(ascending=False)
)
simCandidates

Matrix, The (1999)                                       30.707095
Jurassic Park (1993)                                     28.078650
Star Wars: Episode IV - A New Hope (1977)                27.955474
Star Wars: Episode V - The Empire Strikes Back (1980)    27.739345
Terminator 2: Judgment Day (1991)                        27.421780
                                                           ...    
Dumb & Dumber (Dumb and Dumber) (1994)                    1.096868
Departed, The (2006)                                      1.000000
Babe (1995)                                               0.472172
Eternal Sunshine of the Spotless Mind (2004)              0.467916
Kill Bill: Vol. 1 (2003)                                 -0.046993
Length: 91, dtype: float64

In [73]:
# Split the candidate titles into those the user already rated and those the
# user has not rated yet.
candidate_titles = set(simCandidates.index)
rated_titles = set(user_ratings.index)

intersection_set = candidate_titles & rated_titles
not_watched = list(candidate_titles - rated_titles)

print(f'intersection_set {len(intersection_set)}')
print(f'not_watched {len(not_watched)}')

intersection_set 47
not_watched 44


In [74]:
# Restrict the scores to titles the user has not rated, rank them from
# highest to lowest, and show the three best recommendations.
unrated_scores = simCandidates.loc[not_watched]
filteredSims = unrated_scores.sort_values(ascending=False)
filteredSims.head(3)

Matrix, The (1999)                                                                30.707095
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    22.002139
Shawshank Redemption, The (1994)                                                  18.925514
dtype: float64