# Recommender System Project (User-Based Collaborative Filtering)

## Imports

In [78]:
import pandas as pd
import numpy as np

## Reading Data Item Data

In [79]:
items_df = pd.read_csv("items.csv")

print(items_df.shape)
items_df.head(3)

(720, 10)


Unnamed: 0,asin,brand,title,url,image,rating,reviewUrl,totalReviews,price,originalPrice
0,B0000SX2UC,,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,0.0,0.0
1,B0009N5L7K,Motorola,Motorola I265 phone,https://www.amazon.com/Motorola-i265-I265-phon...,https://m.media-amazon.com/images/I/419WBAVDAR...,3.0,https://www.amazon.com/product-reviews/B0009N5L7K,7,49.95,0.0
2,B000SKTZ0S,Motorola,MOTOROLA C168i AT&T CINGULAR PREPAID GOPHONE C...,https://www.amazon.com/MOTOROLA-C168i-CINGULAR...,https://m.media-amazon.com/images/I/71b+q3ydkI...,2.7,https://www.amazon.com/product-reviews/B000SKTZ0S,22,99.99,0.0


## Preprocessing Item Data

In [80]:
items_df = items_df[["asin", "rating"]]

items_df.head(3)

Unnamed: 0,asin,rating
0,B0000SX2UC,3.0
1,B0009N5L7K,3.0
2,B000SKTZ0S,2.7


## Reading Review Data

In [81]:
reviews_df = pd.read_csv("reviews.csv")

print(reviews_df.shape)
reviews_df.head(3)

(67986, 8)


Unnamed: 0,asin,name,rating,date,verified,title,body,helpfulVotes
0,B0000SX2UC,Janet,3,"October 11, 2005",False,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0
1,B0000SX2UC,Luke Wyatt,1,"January 7, 2004",False,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...,17.0
2,B0000SX2UC,Brooke,5,"December 30, 2003",False,Love This Phone,"This is a great, reliable phone. I also purcha...",5.0


## Preprocessing Review Data

In [82]:
reviews_df = reviews_df[["asin", "name", "rating"]]
reviews_df = reviews_df.groupby(['name', 'asin'])['rating'].mean().reset_index() # When a user scores a product multiple times, only save their means

reviews_df.head(3)

Unnamed: 0,name,asin,rating
0,"""I am"" Bradley",B00HPP3QD6,4.0
1,"""MASTERPIECE"" AND PEACE SHALL BE THE REWARD!",B07H8Q3C9T,5.0
2,"""monarca5""",B01N4E0RF1,5.0


## User Data

In [83]:
userInput = [
    {"asin": "B00HPP3QD6", "rating": 3.5},
    {"asin": "B07H8Q3C9T", "rating": 4.8},
    {"asin": "B01N4E0RF1", "rating": 2.1},
    {"asin": "B00MWI4KKE", "rating": 4.2},
    {"asin": "B00NKR9MJA", "rating": 3.9},
    {"asin": "B0096QYH80", "rating": 5.0},
    {"asin": "B07P8MQHSH", "rating": 2.8}
]

input_df = pd.DataFrame(userInput)
input_df.head()

Unnamed: 0,asin,rating
0,B00HPP3QD6,3.5
1,B07H8Q3C9T,4.8
2,B01N4E0RF1,2.1
3,B00MWI4KKE,4.2
4,B00NKR9MJA,3.9


## Finding Similar Users Bought the Same Products

In [84]:
reviews_subset = reviews_df[reviews_df.asin.isin(input_df.asin.to_list())]
reviews_subset = reviews_subset.dropna(subset=["asin", "name", "rating"])

print(reviews_subset.shape)

(2495, 3)


## Grouping Reviews Subset by User Names

In [85]:
grouped_reviews_subset = reviews_subset.groupby(("name"))

# grouped_reviews_subset.head()

sample_user_name = "Amazon Customer"
grouped_reviews_subset.get_group(sample_user_name)

Unnamed: 0,name,asin,rating
2539,Amazon Customer,B0096QYH80,3.03125
2562,Amazon Customer,B00HPP3QD6,3.8
2573,Amazon Customer,B00MWI4KKE,4.065217
2576,Amazon Customer,B00NKR9MJA,3.0
2674,Amazon Customer,B01N4E0RF1,3.254902
2873,Amazon Customer,B07H8Q3C9T,4.266667
2942,Amazon Customer,B07P8MQHSH,4.25


## Sorting The Groups (Users With The Same Bought Products Will Be at Top)

In [86]:
sorted_grouped_reviews_subset = sorted(grouped_reviews_subset, key=(lambda g: len(g[1])), reverse=True)
sorted_grouped_reviews_subset = sorted_grouped_reviews_subset[:100]

sorted_grouped_reviews_subset[0]

('Amazon Customer',
                  name        asin    rating
 2539  Amazon Customer  B0096QYH80  3.031250
 2562  Amazon Customer  B00HPP3QD6  3.800000
 2573  Amazon Customer  B00MWI4KKE  4.065217
 2576  Amazon Customer  B00NKR9MJA  3.000000
 2674  Amazon Customer  B01N4E0RF1  3.254902
 2873  Amazon Customer  B07H8Q3C9T  4.266667
 2942  Amazon Customer  B07P8MQHSH  4.250000)

## Calculating Pearson Correlation

In [87]:
from math import sqrt

pearson_corelation = {}
input_df = input_df.sort_values(by="asin")

for username, data_df in sorted_grouped_reviews_subset:
    data_df = data_df.sort_values(by="asin")

    input_same_products = input_df[input_df.asin.isin(data_df.asin.to_list())].sort_values(by="asin")
    input_same_products_count = len(input_same_products)

    if input_same_products_count == 0: continue
    data_ratings = data_df.rating.to_list()
    input_same_products_ratings = input_same_products.rating.to_list()

    # Computing Pearson
    Sxy = sum([i*j for i, j in zip(data_ratings, input_same_products_ratings)]) - (sum(data_ratings)*sum(input_same_products_ratings))/float(input_same_products_count)
    Sxx = sum([r**2 for r in input_same_products_ratings]) - (sum(input_same_products_ratings)**2)/float(input_same_products_count)
    Syy = sum([r**2 for r in data_ratings]) - (sum(data_ratings)**2)/float(input_same_products_count)

    if (Sxx != 0 and Syy != 0):
        pearson_corelation[username] = Sxy / sqrt(Sxx * Syy)
    else:
        pearson_corelation[username] = 0

    # pearson_corelation[username] = np.corrcoef(data_ratings, input_same_products_ratings)[0]
    
pearson_corelation.values()

dict_values([0.001977010363319115, 0.10601950080823827, -0.6546536707079765, 0, 0, -0.6025490242570914, 0.9773555548504415, -0.8846153846153815, 0, 0.9999999999999979, 0, -1.0000000000000004, 0, 1.0000000000000013, 1.0000000000000004, 0.9999999999999902, 0, -1.0000000000000038, 0, -1.0, 0, -0.9999999999999992, -0.9999999999999849, 0, -1.0, 0, 0, -1.0000000000000093, -0.9999999999999979, 1.000000000000001, 0.9999999999999973, 0, 1.000000000000001, 0.9999999999999979, -1.0000000000000056, 0, 0, 0, 0, 1.0000000000000042, 0.9999999999999998, -1.0, -1.0000000000000093, 0.9999999999999913, 0, 0, 0, 1.0000000000000042, 0, 0, -0.999999999999997, 1.0000000000000042, 0.9999999999999996, 0.9999999999999986, 1.0000000000002736, 0.9999999999999992, 1.0000000000000004, 0, 0, -1.0000000000000038, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])