In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

## Import data into df's

In [2]:
movies = pd.read_csv('ml-latest-small/movies.csv')

In [3]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')

In [4]:
df = ratings.merge(movies, on='movieId')

In [5]:
df['movieId'].nunique()

9724

In [6]:
df['title'].nunique()

9719

### new user query

In [7]:
user = {'title' : ["Fight Club (1999)", "Pretty Woman (1990)" , "The Butterfly Effect (2004)", 
                   "Inception (2010)", "Toy Story (1995)","Devil Wears Prada, The (2006)"],
        'rating' : [5,1,5,5,2,1]}

user1 = {'title' : ["Fight Club (1999)", "Pretty Woman (1990)" , "The Butterfly Effect (2004)", 
                   "Inception (2010)", "Toy Story (1995)","Devil Wears Prada, The (2006)"],
        'rating' : [1,5,1,1,4,5]}

In [8]:
user = pd.DataFrame(user)

In [9]:
user

Unnamed: 0,title,rating
0,Fight Club (1999),5
1,Pretty Woman (1990),1
2,The Butterfly Effect (2004),5
3,Inception (2010),5
4,Toy Story (1995),2
5,"Devil Wears Prada, The (2006)",1


In [10]:
mIDs = ratings['movieId'].unique()
mIDs = pd.DataFrame(mIDs)

In [11]:
movie_info = pd.merge(mIDs, movies, left_on = 0, right_on = 'movieId')

In [12]:
user_ratings = pd.merge(movie_info, user, left_on = 'title', right_on = 'title', how = 'left')

In [13]:
query = user_ratings['rating']
query = np.array(query)
len(query), type(query)

(9724, numpy.ndarray)

### Transform into a matrix... userID x movieID

In [14]:
mm = df.pivot_table(values='rating', index='userId', columns='movieId')

In [15]:
mm.loc['e'] = query

In [16]:
mm = mm.sub(mm.mean(axis=0), axis=1)

In [17]:
mm.fillna(0,inplace=True)

In [18]:
#mm.fillna(2.5,inplace=True)

In [19]:
mm.shape

(611, 9724)

In [20]:
mm.tail()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
607,0.087963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,-1.412037,-1.431818,-1.259615,0.0,0.0,0.0,0.0,0.0,0.0,0.503788,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,-0.912037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.503788,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
610,1.087963,0.0,0.0,0.0,0.0,1.053922,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
e,-1.912037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create cosim matrix

In [21]:
cosim = cosine_similarity(mm)[-1]

In [22]:
cosim = pd.DataFrame(cosim)
cosim

Unnamed: 0,0
0,-0.002411
1,0.000000
2,0.000000
3,0.000000
4,-0.006764
...,...
606,-0.002886
607,0.024193
608,0.131349
609,-0.022067


### pick the top 10 most similar users

In [23]:
top10 = cosim.sort_values(by=[0], ascending=[False]).head(11)  #order by most similar users
top10

Unnamed: 0,0
610,1.0
129,0.192018
192,0.143451
608,0.131349
213,0.11425
292,0.110773
133,0.105576
75,0.100965
53,0.100729
13,0.095791


In [24]:
similar_users = list(top10.index)
similar_users = similar_users[1:]
similar_users

[129, 192, 608, 213, 292, 133, 75, 53, 13, 411]

### With these subset of users, calculate the average rating
* Optionally: Use the distance to the active user as a weight when calculating the average.

In [25]:
users_r = mm.loc[similar_users, :]
users_r

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.503788,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,-1.412037,-1.431818,-1.259615,0.0,0.0,0.0,0.0,0.0,0.0,0.503788,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
213,-0.412037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
292,0.087963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003788,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
411,1.087963,0.568182,0.0,-0.357143,0.0,0.0,-0.185185,0.0,0.0,-0.496212,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
movie_ratings_avg = users_r.mean()
movie_ratings_avg = pd.DataFrame(movie_ratings_avg)

### Recommend movies that the similar users liked most and that the active user has not seen yet.

In [27]:
movie_ratings_avg.sort_values(by=[0], ascending=[False]).head(10)

Unnamed: 0_level_0,0
movieId,Unnamed: 1_level_1
1544,0.323881
53996,0.296154
33679,0.294068
31221,0.29
3623,0.285714
5219,0.283333
2616,0.273077
4148,0.262821
34048,0.255
185,0.229911


In [28]:
recommended_movies = movie_ratings_avg.sort_values(by=[0], ascending=[False]).head(10)

In [29]:
recommended_movies = pd.merge(recommended_movies, movies, left_on = 'movieId', right_on = 'movieId', how = 'left')

In [30]:
recommended_movies

Unnamed: 0,movieId,0,title,genres
0,1544,0.323881,"Lost World: Jurassic Park, The (1997)",Action|Adventure|Sci-Fi|Thriller
1,53996,0.296154,Transformers (2007),Action|Sci-Fi|Thriller|IMAX
2,33679,0.294068,Mr. & Mrs. Smith (2005),Action|Adventure|Comedy|Romance
3,31221,0.29,Elektra (2005),Action|Adventure|Crime|Drama
4,3623,0.285714,Mission: Impossible II (2000),Action|Adventure|Thriller
5,5219,0.283333,Resident Evil (2002),Action|Horror|Sci-Fi|Thriller
6,2616,0.273077,Dick Tracy (1990),Action|Crime
7,4148,0.262821,Hannibal (2001),Horror|Thriller
8,34048,0.255,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller
9,185,0.229911,"Net, The (1995)",Action|Crime|Thriller


In [31]:
recommended_movies = recommended_movies['title']
recommended_movies

0    Lost World: Jurassic Park, The (1997)
1                      Transformers (2007)
2                  Mr. & Mrs. Smith (2005)
3                           Elektra (2005)
4            Mission: Impossible II (2000)
5                     Resident Evil (2002)
6                        Dick Tracy (1990)
7                          Hannibal (2001)
8                 War of the Worlds (2005)
9                          Net, The (1995)
Name: title, dtype: object

## Test A: likes drama, Test B: likes action

#### test A, 2.5 fill method:
-   Lord of the Rings: The Return of the King, The...
-                                   Matrix, The (1999)
-    Lord of the Rings: The Fellowship of the Ring,...
-                              Dark Knight, The (2008)
-                                  Forrest Gump (1994)
-                     Shawshank Redemption, The (1994)
-        Lord of the Rings: The Two Towers, The (2002)
-    Raiders of the Lost Ark (Indiana Jones and the...
-                                     Gladiator (2000)
-                           Saving Private Ryan (1998)

#### test B, 2.5 fill method:
-                                  Forrest Gump (1994)
-    Lord of the Rings: The Return of the King, The...
-    Lord of the Rings: The Fellowship of the Ring,...
-                                   Matrix, The (1999)
-                     Shawshank Redemption, The (1994)
-                              Schindler's List (1993)
-        Lord of the Rings: The Two Towers, The (2002)
-                           Saving Private Ryan (1998)
-                                         Shrek (2001)
-                                    Gladiator (2000)

#### test A, df.sub fill method:
-                    Mask of Zorro, The (1998)
-    Star Wars: Episode IV - A New Hope (1977)
-                            Braveheart (1995)
-                             Gladiator (2000)
-          Monty Python's Life of Brian (1979)
-                       American Beauty (1999)
-                             Mask, The (1994)
-                           Patch Adams (1998)
-       Monty Python and the Holy Grail (1975)
-                   Princess Bride, The (1987)

#### test B, df.sub fill method:
-    Lost World: Jurassic Park, The (1997)
-                     Transformers (2007)
-                  Mr. & Mrs. Smith (2005)
-                           Elektra (2005)
-            Mission: Impossible II (2000)
-                     Resident Evil (2002)
-                        Dick Tracy (1990)
-                         Hannibal (2001)
-                 War of the Worlds (2005)
-                         Net, The (1995)