# Week 11: User-based recommender system (using Mr. Bagus' method)

In this implementation, I've used hotel rating dataset I found on Kaggle. The dataset can be accesed at the following link: https://www.kaggle.com/datasets/manohar676/hotel-reviews-segmentation-recommended-system?resource=download

In [1]:
# data processing
import pandas as pd
import numpy as np
import scipy.stats
from math import sqrt

# visualization
import seaborn as sns

#similarity (using sklearn)
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
ratings_df = pd.read_csv("user_hotel_rating.csv")
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271935 entries, 0 to 271934
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   userid         271935 non-null  object
 1   Hotelid        271935 non-null  object
 2   OverallRating  271935 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 6.2+ MB


In [3]:
ratings_df.head()

Unnamed: 0,userid,Hotelid,OverallRating
0,user_78131,hotel_558,3
1,user_78131,hotel_603,2
2,user_78131,hotel_610,3
3,user_78131,hotel_574,3
4,user_78131,hotel_570,3


In [4]:
ratings_df

Unnamed: 0,userid,Hotelid,OverallRating
0,user_78131,hotel_558,3
1,user_78131,hotel_603,2
2,user_78131,hotel_610,3
3,user_78131,hotel_574,3
4,user_78131,hotel_570,3
...,...,...,...
271930,user_68476,hotel_639,3
271931,user_68476,hotel_628,3
271932,user_68476,hotel_542,2
271933,user_68476,hotel_510,3


In [5]:
user_input = [{'Hotelid':'hotel_558', 'OverallRating': 5},
             {'Hotelid':'hotel_639', 'OverallRating': 3},
             {'Hotelid':'hotel_563', 'OverallRating': 2},
             {'Hotelid':'hotel_610', 'OverallRating': 4},
             {'Hotelid':'hotel_628', 'OverallRating': 2}]
input_df = pd.DataFrame(user_input)
input_df

Unnamed: 0,Hotelid,OverallRating
0,hotel_558,5
1,hotel_639,3
2,hotel_563,2
3,hotel_610,4
4,hotel_628,2


In [6]:
user_subset = ratings_df[ratings_df['Hotelid'].isin(input_df['Hotelid'].tolist())]
user_subset.groupby('Hotelid').count()

Unnamed: 0_level_0,userid,OverallRating
Hotelid,Unnamed: 1_level_1,Unnamed: 2_level_1
hotel_558,62,62
hotel_563,2320,2320
hotel_610,2420,2420
hotel_628,2337,2337
hotel_639,2318,2318


In [7]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = user_subset.groupby(['userid'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
userSubsetGroup[0:5]

[('user_53090',           userid    Hotelid  OverallRating
  3448  user_53090  hotel_558              1
  3454  user_53090  hotel_563              3
  3467  user_53090  hotel_628              3
  3468  user_53090  hotel_610              3
  3480  user_53090  hotel_639              3),
 ('user_60914',           userid    Hotelid  OverallRating
  4418  user_60914  hotel_558              2
  4430  user_60914  hotel_563              4
  4441  user_60914  hotel_628              4
  4454  user_60914  hotel_610              3
  4456  user_60914  hotel_639              3),
 ('user_75024',           userid    Hotelid  OverallRating
  5545  user_75024  hotel_558              4
  5551  user_75024  hotel_610              4
  5567  user_75024  hotel_628              3
  5571  user_75024  hotel_639              3
  5578  user_75024  hotel_563              3),
 ('user_75658',           userid    Hotelid  OverallRating
  5647  user_75658  hotel_558              1
  5668  user_75658  hotel_628         

In [8]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='Hotelid')
    input_df = input_df.sort_values(by='Hotelid')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = input_df[input_df['Hotelid'].isin(group['Hotelid'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['OverallRating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['OverallRating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [9]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userid'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userid
0,-0.771744,user_53090
1,-0.962533,user_60914
2,0.910182,user_75024
3,-0.78278,user_75658
4,0.0,user_21042


In [10]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userid
41,1.0,user_30675
2,0.910182,user_75024
25,0.904534,user_27348
30,0.894427,user_27463
50,0.870388,user_34929


In [11]:
topUsersRating=topUsers.merge(ratings_df, left_on='userid', right_on='userid', how='inner')
topUsersRating.head(100)

Unnamed: 0,similarityIndex,userid,Hotelid,OverallRating
0,1.000000,user_30675,hotel_515,4
1,1.000000,user_30675,hotel_628,2
2,1.000000,user_30675,hotel_624,4
3,1.000000,user_30675,hotel_612,4
4,1.000000,user_30675,hotel_509,4
...,...,...,...,...
95,0.910182,user_75024,hotel_623,3
96,0.910182,user_75024,hotel_633,3
97,0.910182,user_75024,hotel_508,3
98,0.910182,user_75024,hotel_591,3


In [12]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['OverallRating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userid,Hotelid,OverallRating,weightedRating
0,1.0,user_30675,hotel_515,4,4.0
1,1.0,user_30675,hotel_628,2,2.0
2,1.0,user_30675,hotel_624,4,4.0
3,1.0,user_30675,hotel_612,4,4.0
4,1.0,user_30675,hotel_509,4,4.0


In [13]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('Hotelid').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
Hotelid,Unnamed: 1_level_1,Unnamed: 2_level_1
hotel_501,0.894427,3.577709
hotel_502,7.354862,20.054614
hotel_503,7.692657,23.14254
hotel_504,6.754256,19.600084
hotel_506,16.489646,55.975575


In [14]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['Hotelid'] = tempTopUsersRating.index
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,Hotelid
Hotelid,Unnamed: 1_level_1,Unnamed: 2_level_1
hotel_501,4.0,hotel_501
hotel_502,2.726715,hotel_502
hotel_503,3.008394,hotel_503
hotel_504,2.901887,hotel_504
hotel_506,3.394589,hotel_506
hotel_507,3.601248,hotel_507
hotel_508,3.843631,hotel_508
hotel_509,3.547435,hotel_509
hotel_510,3.771159,hotel_510
hotel_511,3.339224,hotel_511


In [15]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df

Unnamed: 0_level_0,weighted average recommendation score,Hotelid
Hotelid,Unnamed: 1_level_1,Unnamed: 2_level_1
hotel_601,4.281064,hotel_601
hotel_559,4.015557,hotel_559
hotel_501,4.000000,hotel_501
hotel_517,4.000000,hotel_517
hotel_558,4.000000,hotel_558
...,...,...
hotel_562,2.509477,hotel_562
hotel_543,2.175411,hotel_543
hotel_525,2.146979,hotel_525
hotel_565,2.000000,hotel_565


In [16]:
recommended_hotel=ratings_df.loc[ratings_df['Hotelid'].isin(recommendation_df['Hotelid'])]

#we don't want to recommend the same movie
recommended_hotel=recommended_hotel.loc[~recommended_hotel.Hotelid.isin(user_subset['Hotelid'])]

recommended_hotel

Unnamed: 0,userid,Hotelid,OverallRating
1,user_78131,hotel_603,2
3,user_78131,hotel_574,3
4,user_78131,hotel_570,3
5,user_78131,hotel_587,3
6,user_78131,hotel_593,4
...,...,...,...
271927,user_68476,hotel_524,2
271928,user_68476,hotel_543,2
271929,user_68476,hotel_526,3
271932,user_68476,hotel_542,2
