## User-Item Collaborative Filtering

In [6]:
import pandas as pd

movies = pd.read_csv(r'C:\Users\Arne\Documents\DataScience\Personalisation-for-Media\assignment2\data\movies.csv', low_memory=False)
ratings = pd.read_csv(r'C:\Users\Arne\Documents\DataScience\Personalisation-for-Media\assignment2\data\ratings.csv', low_memory=False)
movie_ratings = pd.merge(movies, ratings, left_on='movieId', right_on='movieId').drop_duplicates(['userId', 'title'])

In [7]:
movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [8]:
# Additional columns (friends_id)
movie_ratings['friend_ids'] = ''
movie_ratings['friend_ids'] = movie_ratings[movie_ratings.userId==1].apply(lambda x: [5,7,9,10], axis=1) # Example encoding of data for friends of target userID=1

In [9]:
movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,friend_ids
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703,"[5, 7, 9, 10]"
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946,
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970,
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483,


In [10]:
# Subset for only users that are friends with the target user
target_user = 1
friends_target_user = movie_ratings[movie_ratings.userId==target_user]['friend_ids']

In [11]:
friends_list = friends_target_user.values[0]

In [12]:
friends_list

[5, 7, 9, 10]

In [13]:
friend_index = friends_list
smaller_selection = movie_ratings.loc[movie_ratings['userId'].isin(friend_index)]

In [14]:
smaller_selection.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,friend_ids
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962,
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946,
1146,21,Get Shorty (1995),Comedy|Crime|Thriller,5,4.0,847435238,
1679,34,Babe (1995),Children|Drama,5,4.0,847434881,
1807,36,Dead Man Walking (1995),Crime|Drama,5,4.0,847435292,


In [11]:
# count total reviews
avg_user = movie_ratings.groupby(['userId']).size().reset_index(name='counts')
avg_user.head()

Unnamed: 0,userId,counts
0,1,232
1,2,29
2,3,39
3,4,216
4,5,44


In [12]:
# set index for later
avg_user = avg_user.set_index('userId')

# select x-number of reviews
avg_user = avg_user[avg_user['counts'] > 20]

# user the index to create an list of ids that have more than x-number 
user_index = avg_user.index

# iloc (so look for the user id which is int) and make a smaller selection now including the review
#smaller_selection = user_ratings[user_ratings['user_id'] == users_index]

smaller_selection = movie_ratings.loc[movie_ratings['userId'].isin(user_index)]

# previous steps could be easily chained together requiring less lines of code. But that does not help the explainability of the code

In [13]:
smaller_selection.userId.values

array([  1,   5,   7, ..., 184, 184, 331], dtype=int64)

In [14]:
#user_item matrix
user_ratings_pivot = smaller_selection.pivot(index='userId', columns='title', values='rating')

In [15]:
# normalize the rating values around 0 as there are many NAN values
#
avg_ratings = user_ratings_pivot.mean(axis=1) # row average
#save for later
user_ratings_table = user_ratings_pivot
user_ratings_pivot = user_ratings_pivot.sub(avg_ratings, axis=0)

In [16]:
user_ratings_pivot.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,-0.366379,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [19]:
user_ratings_pivot = user_ratings_pivot.fillna(0)
user_ratings_pivot.loc[331].sort_values(ascending=False).head()

title
Best in Show (2000)                 1.372024
Pulp Fiction (1994)                 1.372024
Grand Budapest Hotel, The (2014)    1.372024
Lost in Translation (2003)          1.372024
Usual Suspects, The (1995)          1.372024
Name: 331, dtype: float64

In [20]:
user_ratings_pivot.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.366379,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
#if we repeat with very different users we might get negative values
#calcualting similarity matrix for the whole table
user_similarities = cosine_similarity(user_ratings_pivot)
#tiding it up in a dataframe with the row and column names as the Ids of the users
cosine_similarity_df = pd.DataFrame(user_similarities, index=user_ratings_pivot.index,columns=user_ratings_pivot.index).fillna(0)

In [22]:
cosine_similarity_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.001265,0.000553,0.048419,0.021847,-0.045497,-0.0062,0.047013,0.01951,-0.008754,...,0.018127,-0.017172,-0.015221,-0.037059,-0.029121,0.012016,0.055261,0.075224,-0.025713,0.010932
2,0.001265,1.0,0.0,-0.017164,0.021796,-0.021051,-0.011114,-0.048085,0.0,0.003012,...,-0.050551,-0.031581,-0.001688,0.0,0.0,0.006226,-0.020504,-0.006001,-0.060091,0.024999
3,0.000553,0.0,1.0,-0.01126,-0.031539,0.0048,0.0,-0.032471,0.0,0.0,...,-0.004904,-0.016117,0.017749,0.0,-0.001431,-0.037289,-0.007789,-0.013001,0.0,0.01955
4,0.048419,-0.017164,-0.01126,1.0,-0.02962,0.013956,0.058091,0.002065,-0.005874,0.05159,...,-0.037687,0.063122,0.02764,-0.013782,0.040037,0.02059,0.014628,-0.037569,-0.017884,-0.000995
5,0.021847,0.021796,-0.031539,-0.02962,1.0,0.009111,0.010117,-0.012284,0.0,-0.033165,...,0.015964,0.012427,0.027076,0.012461,-0.036272,0.026319,0.031896,-0.001751,0.093829,-0.000278


In [23]:
cosine_similarity_series = cosine_similarity_df.loc[187]
#we order it
ordered_similarities = cosine_similarity_series.sort_values(ascending=False)

In [24]:
ordered_similarities.head()

userId
187    1.000000
434    0.201040
296    0.152105
391    0.140172
137    0.135574
Name: 187, dtype: float64

In [25]:
#pick the first 3
#KNN: find the most similar k neighbors to the usre in question, (here say k= 3) and then averages 
#the ratings those usres gave to the item we are trying to get a rating for..
#this gives us a predicted rating or how a user might feel about an item they haven't seen before
nearest_neighbors = ordered_similarities[1:4]

In [26]:
def get_movies(user):
    ''' 
    Return the top recommended movies for a user given their K-nearest neighbors

    :param input: UserId
    :return top_movies_sorted: ordered list of movies with predicted rating
    '''
    all_movies = []
    for title in list(movie_ratings.title.unique()):
        cosine_similarity_series = cosine_similarity_df.loc[user]
        #we order it
        ordered_similarities = cosine_similarity_series.sort_values(ascending=False)
        #pick the first 3
        #KNN: find the most similar k neighbors to the users in question, (here say k= 3) and then averages 
        #the ratings those users gave to the item we are trying to get a rating for..
        #this gives us a predicted rating or how a user might feel about an item they haven't seen before
        nearest_neighbors = ordered_similarities[0:1]
        neighbor_ratings = user_ratings_table.reindex(nearest_neighbors.index)
        all_movies.append(neighbor_ratings[title].mean())

    top_movies = pd.Series(all_movies, index=neighbor_ratings.columns)
    top_movies_sorted = top_movies.sort_values(ascending=False)
    return top_movies_sorted

In [27]:
x = get_movies(187)

In [28]:
x[:100]

title
Mothra (Mosura) (1961)                        5.0
Beautiful Thing (1996)                        5.0
Indignation (2016)                            5.0
Mrs. Dalloway (1997)                          5.0
Monkey Trouble (1994)                         5.0
                                             ... 
Amos & Andrew (1993)                          4.0
Fiendish Plot of Dr. Fu Manchu, The (1980)    4.0
Agony and the Ecstasy, The (1965)             4.0
Aelita: The Queen of Mars (Aelita) (1924)     4.0
Dangerous Lives of Altar Boys, The (2002)     4.0
Length: 100, dtype: float64

In [None]:
import numpy as np
def get_recommendations(user,title):
    cosine_similarity_series = cosine_similarity_df.loc[user]
    #we order it
    ordered_similarities = cosine_similarity_series.sort_values(ascending=False)
    #pick the first 3
    #KNN: find the most similar k neighbors to the users in question, (here say k= 3) and then averages 
    #the ratings those users gave to the item we are trying to get a rating for..
    #this gives us a predicted rating or how a user might feel about an item they haven't seen before
    nearest_neighbors = ordered_similarities[1:4]
    neighbor_ratings = user_ratings_table.reindex(nearest_neighbors.index)
    return round(neighbor_ratings[title].mean()*2)/2 

In [None]:
user_ratings_table.loc[187,'Dark Knight, The (2008)']   # No official rating

In [None]:
get_recommendations(187,'Dark Knight, The (2008)')

In [None]:
user_ratings_table.loc[187,'Hangover, The (2009)'] # No official rating