**User Based Recommender System**

Steps in a user-based recommendation system:

1. Select a user with the movies the user has watched
2. Based on his rating to movies, find the top x neighbours
3. Get the watched movie record of the user for each neighbour.
4. Calculate a similarity score using some formula
5. Recommend the items with the highest score

In [6]:
import pandas as pd
from math import sqrt
import numpy as np

In [7]:
# Read the data from the CSV file
cellphones_df = pd.read_csv('cellphones_data.csv')
ratings_df = pd.read_csv('cellphones_ratings.csv')

print(cellphones_df.info())
# Source code: https://www.kaggle.com/code/takkimsncn/cellphones-recommendations/input?

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cellphone_id      33 non-null     int64  
 1   brand             33 non-null     object 
 2   model             33 non-null     object 
 3   operating system  33 non-null     object 
 4   internal memory   33 non-null     int64  
 5   RAM               33 non-null     int64  
 6   performance       33 non-null     float64
 7   main camera       33 non-null     int64  
 8   selfie camera     33 non-null     int64  
 9   battery size      33 non-null     int64  
 10  screen size       33 non-null     float64
 11  weight            33 non-null     int64  
 12  price             33 non-null     int64  
 13  release date      33 non-null     object 
dtypes: float64(2), int64(8), object(4)
memory usage: 3.7+ KB
None


In [8]:
# Creating a list of dictionaries representing user input data for cellphone models and ratings
userInput = [{'model':'iPhone XR', 'rating':5},
             {'model':'iPhone 13 Mini', 'rating':1},
             {'model':'Galaxy A53', 'rating':1},
             {'model':'Galaxy A32', 'rating':5},
             {'model':'iPhone 13 Pro', 'rating':4.5}]

inputCellphones = pd.DataFrame(userInput)
print(inputCellphones)

            model  rating
0       iPhone XR     5.0
1  iPhone 13 Mini     1.0
2      Galaxy A53     1.0
3      Galaxy A32     5.0
4   iPhone 13 Pro     4.5


In [9]:
# Filtering the original cellphones DataFrame to get only the rows corresponding to the models in the user input
inputId = cellphones_df[cellphones_df['model'].isin(inputCellphones['model'].tolist())]

# Merging the filtered cellphones DataFrame with the user input DataFrame based on the 'model' column
inputCellphones = pd.merge(inputId, inputCellphones)

# Selecting specific columns ('cellphone_id', 'model', 'rating') from the merged DataFrame
inputCellphones = inputCellphones[['cellphone_id','model','rating']]

print(inputCellphones)

   cellphone_id           model  rating
0             1  iPhone 13 Mini     1.0
1             3   iPhone 13 Pro     4.5
2             5       iPhone XR     5.0
3             8      Galaxy A32     5.0
4             9      Galaxy A53     1.0


In [10]:
# Filtering the ratings DataFrame to get only the rows corresponding to the cellphone IDs in the user input
userSubset = ratings_df[ratings_df['cellphone_id'].isin(inputCellphones['cellphone_id'].tolist())]

print(userSubset.groupby('cellphone_id').count())

              user_id  rating
cellphone_id                 
1                  24      24
3                  29      29
5                  31      31
8                  39      39
9                  29      29


In [11]:
# Grouping the filtered ratings DataFrame by 'user_id'
userSubsetGroup = userSubset.groupby(['user_id'])

# Defining a function 'take_5_elem' that takes a group of data and returns the number of elements in that group
def take_5_elem(x):
    return len(x[1])

# Sorting the groups based on the number of elements in each group in descending order
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

# Taking the top 100 groups (users) based on the number of elements (ratings) in each group
userSubsetGroup = userSubsetGroup[0:100]

print(userSubsetGroup[0:5])

[(0,    user_id  cellphone_id  rating
1        0             5       3
3        0             9       3
5        0             8       2
9        0             3      10), (110,      user_id  cellphone_id  rating
370      110             8       2
375      110             5       2
376      110             3       3
378      110             9       7), (169,      user_id  cellphone_id  rating
641      169             1       5
642      169             5       7
644      169             3      10
648      169             9       8), (194,      user_id  cellphone_id  rating
671      194             8       8
672      194             1       9
676      194             9       3
678      194             3       6), (28,      user_id  cellphone_id  rating
112       28             5      10
113       28             8       1
118       28             3      10)]


  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [12]:
# Dictionary to store Pearson Correlation, where key is user id and value is the coefficient
pearsonCorrelationDict = {}

# For each user group in the subset
for name, group in userSubsetGroup:

    # Sort input and current user group for aligned values
    group = group.sort_values(by='cellphone_id')
    inputCellphones = inputCellphones.sort_values(by='cellphone_id')

    # Calculate N for the formula
    nRatings = len(group)

    # Get review scores for common cellphones
    temp_df = inputCellphones[inputCellphones['cellphone_id'].isin(group['cellphone_id'].tolist())]

    # Store review scores in lists for future calculations
    tempRatingList = temp_df['rating'].tolist()

    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()

    # Calculate Pearson correlation manually
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    # If denominator is not zero, then calculate correlation, else, assign 0 correlation
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [13]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')

# Rename the column to 'similarityIndex' for clarity
pearsonDF.columns = ['similarityIndex']

# Add 'user_id' column and set its values to be the user ids from the index
pearsonDF['user_id'] = pearsonDF.index

# Reset the index for sequential numbering and display the first few rows of the
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  user_id
0         0.151749        0
1        -0.997038      110
2         0.497026      169
3         0.246222      194
4        -0.500000       28


In [14]:
# Select the top 50 users based on Pearson correlation similarity index
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  user_id
23              1.0       38
19              1.0        6
37              1.0      203
32              1.0      126
29              1.0      113


In [15]:
# Merge the topUsers DataFrame with the ratings DataFrame based on 'user_id' using inner join
topUsersRating=topUsers.merge(ratings_df, left_on='user_id', right_on='user_id', how='inner')
print(topUsersRating.head(100))

    similarityIndex  user_id  cellphone_id  rating
0               1.0       38             1       7
1               1.0       38            30       4
2               1.0       38             5       8
3               1.0       38            24       7
4               1.0       38            10       7
..              ...      ...           ...     ...
95              0.5      162            32       7
96              0.5      162            10       6
97              0.5      162             5       8
98              0.5      162             0       7
99              0.5      162             3       8

[100 rows x 4 columns]


In [16]:
# Calculate weighted ratings by multiplying user similarity index with their ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  user_id  cellphone_id  rating  weightedRating
0              1.0       38             1       7             7.0
1              1.0       38            30       4             4.0
2              1.0       38             5       8             8.0
3              1.0       38            24       7             7.0
4              1.0       38            10       7             7.0


In [17]:
# Group and sum the topUsersRating DataFrame by cellphone_id to calculate total similarity and weighted ratings
tempTopUsersRating = topUsersRating.groupby('cellphone_id').sum()[['similarityIndex','weightedRating']]

# Rename the columns
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

              sum_similarityIndex  sum_weightedRating
cellphone_id                                         
0                        2.500000           21.500000
1                        9.213973           53.996203
2                        2.997026           26.970261
3                        3.394996           25.965078
4                        3.500000           30.000000


In [18]:
# Create an empty DataFrame for recommendations
recommendation_df = pd.DataFrame()

# Calculate weighted average recommendation scores by dividing sum_weightedRating by sum_similarityIndex
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']

# Set 'cellphone_id' column using the index values of tempTopUsersRating
recommendation_df['cellphone_id'] = tempTopUsersRating.index

print(recommendation_df.head(10))

              weighted average recommendation score  cellphone_id
cellphone_id                                                     
0                                          8.600000             0
1                                          5.860252             1
2                                          8.999008             2
3                                          7.648043             3
4                                          8.571429             4
5                                          8.574325             5
6                                          3.666667             6
7                                          8.575049             7
8                                          8.083370             8
9                                          6.465667             9


In [19]:
# Sort recommendation_df based on 'weighted average recommendation score' in descending order
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)

# Filter for recommendations with a score greater than 4.9
recommendation_df_sorted = recommendation_df[recommendation_df['weighted average recommendation score'] > 4.9]

print(recommendation_df_sorted)

              weighted average recommendation score  cellphone_id
cellphone_id                                                     
2                                          8.999008             2
12                                         8.856171            12
0                                          8.600000             0
7                                          8.575049             7
5                                          8.574325             5
4                                          8.571429             4
8                                          8.083370             8
26                                         7.857021            26
20                                         7.663382            20
3                                          7.648043             3
24                                         7.600000            24
28                                         7.333333            28
10                                         7.296386            10
32        

In [24]:
# Filter cellphones_df to include only those with cellphone_id in recommendation_df_sorted
recommended_cellphone = cellphones_df.loc[cellphones_df['cellphone_id'].isin(recommendation_df_sorted['cellphone_id'])]

# Exclude cellphones that the user has already rated from the recommendations
recommended_cellphone = recommended_cellphone.loc[~recommended_cellphone.cellphone_id.isin(userSubset['cellphone_id'])]

# Display the top 5 recommended cellphones
print(recommended_cellphone.head(5))

    cellphone_id    brand              model operating system  \
0              0    Apple   iPhone SE (2022)              iOS   
2              2    Apple          iPhone 13              iOS   
4              4    Apple  iPhone 13 Pro Max              iOS   
7              7  Samsung         Galaxy A13          Android   
10            10  Samsung         Galaxy S22          Android   

    internal memory  RAM  performance  main camera  selfie camera  \
0               128    4         7.23           12              7   
2               128    4         7.75           12             12   
4               256    6         8.01           12             12   
7                32    3         1.36           50              8   
10              128    8         8.81           50             10   

    battery size  screen size  weight  price release date  
0           2018          4.7     144    429   18/03/2022  
2           3240          6.1     174    699   24/09/2021  
4           43