In [14]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

## Import data into df's

In [40]:
movies = pd.read_csv('ml-latest-small/movies.csv')

In [41]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')

In [42]:
df = ratings.merge(movies, on='movieId')

In [43]:
df['movieId'].nunique()

9724

In [44]:
df['title'].nunique()

9719

### new user query

In [63]:
user = {'title' : ["Fight Club (1999)", "Pretty Woman (1990)" , "The Butterfly Effect (2004)", 
                   "Inception (2010)", "(500) Days of Summer (2009)","Devil Wears Prada, The (2006)"],
        'rating' : [5,2,4,4,2,1]}

In [64]:
user = pd.DataFrame(user)

In [65]:
user

Unnamed: 0,title,rating
0,Fight Club (1999),5
1,Pretty Woman (1990),2
2,The Butterfly Effect (2004),4
3,Inception (2010),4
4,(500) Days of Summer (2009),2
5,"Devil Wears Prada, The (2006)",1


In [66]:
titles = df['title'].unique()
titles = pd.DataFrame(titles)

In [67]:
user_merge = pd.merge(titles, user, left_on = 0, right_on = 'title', how = 'left')

In [68]:
query = user_merge['rating']
query = np.array(query)
len(query), type(query)

(9719, numpy.ndarray)

In [69]:
query[5:15]

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])

### Transform into a matrix... userID x movieID

In [70]:
mm = df.pivot_table(values='rating', index='userId', columns='title')

In [71]:
mm.loc['e'] = query

In [72]:
def numberOfNonNans(data):
    count = 0
    for i in data:
        if not np.isnan(i):
            count += 1
    return count 

numberOfNonNans(query)

6

In [73]:
#avg = mm.sub(mm.mean(axis=0), axis=1)

In [74]:
#mm.fillna(2.5,inplace=True)

In [75]:
mm.shape

(611, 9719)

In [76]:
mm.tail()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,
610,4.0,,,,,,,,3.5,,...,,4.0,3.5,3.0,,,2.0,1.5,,
e,,,,,,,,,,,...,,,,,,,,,,


In [62]:
# here again the same issue, the new user "e" should have given a rating **2** for (500) Days of Summer (2009)
# but the rating is not inputed in the right place...

### Create cosim matrix

In [None]:
cosim = cosine_similarity(mm)[-1]

In [None]:
pd.DataFrame(cosim)

In [None]:
cosim.shape

In [None]:
#sns.heatmap(cosim)

### pick an active user and find the top 10 most similar users

In [None]:
u = cosim[e]    #select an user (id=4) from the cosim -- this is the row with correlation to all other users
u = pd.DataFrame(u) #transform into a DF -- indexes= userIDs, column= correlations

In [None]:
u = u.sort_values(by=[4], ascending=[False]).head(11)  #order by most similar users
u

In [None]:
similar_usersid = list(u.index)
similar_usersid = similar_usersid[1:]

### With these subset of users, calculate the average rating
* Optionally: Use the distance to the active user as a weight when calculating the average.

In [None]:
similar_users = mm.loc[similar_usersid, :]
similar_users

In [None]:
movie_ratings_avg = similar_users.mean()
movie_ratings_avg = pd.DataFrame(movie_ratings_avg)
movie_ratings_avg.head()

### Recommend movies that the similar users liked most and that the active user has not seen yet.

In [None]:
movie_ratings_avg.sort_values(by=[0], ascending=[False]).head(10)

In [None]:
recommended_movies = movie_ratings_avg.sort_values(by=[0], ascending=[False]).head(10)

In [None]:
recommended_movies = pd.merge(recommended_movies, movies, left_on = 'movieId', right_on = 'movieId', how = 'left')

In [None]:
recommended_movies['title']