In [4]:
import gzip
from collections import defaultdict
import random
import numpy as np
import scipy.optimize
import os

In [13]:
path = "/Users/spartan/Downloads/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"
f = gzip.open(path, 'rt', encoding="utf8")

In [14]:
header = f.readline()
print(header)
header = header.strip().split('\t')

marketplace	customer_id	review_id	product_id	product_parent	product_title	product_category	star_rating	helpful_votes	total_votes	vine	verified_purchase	review_headline	review_body	review_date



In [15]:
header

['marketplace',
 'customer_id',
 'review_id',
 'product_id',
 'product_parent',
 'product_title',
 'product_category',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'verified_purchase',
 'review_headline',
 'review_body',
 'review_date']

In [17]:
dataset = []
for line in f:
    fields = line.strip().split('\t')
    d = dict(zip(header, fields))
    d['star_rating'] = int(d['star_rating'])
    d['helpful_votes'] = int(d['helpful_votes'])
    d['total_votes'] = int(d['total_votes'])
    dataset.append(d)

In [18]:
dataset[0]

{'marketplace': 'US',
 'customer_id': '45610553',
 'review_id': 'RMDCHWD0Y5OZ9',
 'product_id': 'B00HH62VB6',
 'product_parent': '618218723',
 'product_title': 'AGPtek® 10 Isolated Output 9V 12V 18V Guitar Pedal Board Power Supply Effect Pedals with Isolated Short Cricuit / Overcurrent Protection',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'N',
 'review_headline': 'Three Stars',
 'review_body': 'Works very good, but induces ALOT of noise.',
 'review_date': '2015-08-31'}

In [24]:
usersPerItem = defaultdict(set) # Maps an item to the users who rated it (item
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated (user

In [25]:
itemNames = {}
ratingDict = {}  # To retrieve a rating for a specific user/item pair

In [26]:
for d in dataset:
 user, item = d['customer_id'], d['product_id']
 usersPerItem[item].add(user)
 itemsPerUser[user].add(item)
 ratingDict[(user, item)] = d['star_rating']
 itemNames[item] = d['product_title']

In [27]:
# Extract per-user and per-item averages (useful later for rating prediction)
userAverages = {}
itemAverages = {}

for u in itemsPerUser:
 rs = [ratingDict[(u, i)] for i in itemsPerUser[u]]
 userAverages[u] = sum(rs) / len(rs)
for i in usersPerItem:
 rs = [ratingDict[(u, i)] for u in usersPerItem[i]]
 itemAverages[i] = sum(rs) / len(rs)

### Similarity Metrics

In [28]:
import math
## Jaccard Similarity
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer/denom

## Cosine Similarity
def cosine(s1, s2):
    # Not a proper implementation, operates on sets so correct for interaction
    numer = len(s1.intersection(s2))
    denom = math.sqrt(len(s1)) * math.sqrt(len(s2))
    if denom == 0:
        return 0
    return numer / denom

def Cosine(i1, i2):
    # Between two items
    inter = usersPerItem[i1].intersection(usersPerItem[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += ratingDict[(u, i1)]*ratingDict[(u, i2)]
    for u in usersPerItem[i1]:
        denom1 += ratingDict[(u, i1)]**2
    for u in usersPerItem[i2]:
        denom2 += ratingDict[(u, i2)]**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0:
        return 0
    return numer / denom

def Pearson(i1, i2):
    # Between two items
    iBar1 = itemAverages[i1]
    iBar2 = itemAverages[i2]
    inter = usersPerItem[i1].intersection(usersPerItem[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += (ratingDict[(u, i1)] - iBar1)*(ratingDict[(u, i2)] - iBar2)
    for u in inter: # usersPerItem[i1]:
        denom1 += (ratingDict[(u, i1)] - iBar1)**2
    # for u in usersPerItem[i2]:
    denom2 += (ratingDict[(u, i2)] - iBar2)**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0:
        return 0
    return numer / denom

In [29]:
#retrieve N items most similar to a candidate item i
def mostSimilar(i, N):
    similarities = []
    users = usersPerItem[i] #find all users who purchased i
    for i2 in usersPerItem: #iterate over all other items (profiles) and com
        if i2 == i:
            continue
        sim = Jaccard(users, usersPerItem[i2])
        #sim = Pearson(i, i2) # Could use alternate similarity metrics straigh
        similarities.append((sim, i2))
    similarities.sort(reverse=True)
    return similarities[:N]

In [30]:
dataset[2]

{'marketplace': 'US',
 'customer_id': '6111003',
 'review_id': 'RIZR67JKUDBI0',
 'product_id': 'B0006VMBHI',
 'product_parent': '603261968',
 'product_title': 'AudioQuest LP record clean brush',
 'product_category': 'Musical Instruments',
 'star_rating': 3,
 'helpful_votes': 0,
 'total_votes': 1,
 'vine': 'N',
 'verified_purchase': 'Y',
 'review_headline': 'Three Stars',
 'review_body': 'removes dust. does not clean',
 'review_date': '2015-08-31'}

In [31]:
query = dataset[2]['product_id']

In [32]:
ms = mostSimilar(query, 10)

In [33]:
ms
#prints top-10 list: (sim, product_id)

[(0.028446389496717725, 'B00006I5SD'),
 (0.01694915254237288, 'B00006I5SB'),
 (0.015065913370998116, 'B000AJR482'),
 (0.014204545454545454, 'B00E7MVP3S'),
 (0.008955223880597015, 'B001255YL2'),
 (0.008849557522123894, 'B003EIRVO8'),
 (0.008333333333333333, 'B0015VEZ22'),
 (0.00821917808219178, 'B00006I5UH'),
 (0.008021390374331552, 'B00008BWM7'),
 (0.007656967840735069, 'B000H2BC4E')]

In [34]:
print("Input query: ", itemNames[query])
print("")
print("Recommended items: ")
[itemNames[x[1]] for x in ms]

Input query:  AudioQuest LP record clean brush

Recommended items: 


['Shure SFG-2 Stylus Tracking Force Gauge',
 'Shure M97xE High-Performance Magnetic Phono Cartridge',
 'ART Pro Audio DJPRE II Phono Turntable Preamplifier',
 'Signstek Blue LCD Backlight Digital Long-Playing LP Turntable Stylus Force Scale Gauge Tester',
 'Audio Technica AT120E/T Standard Mount Phono Cartridge',
 'Technics: 45 Adaptor for Technics 1200 (SFWE010)',
 'GruvGlide GRUVGLIDE DJ Package',
 'STANTON MAGNETICS Record Cleaner Kit',
 'Shure M97xE High-Performance Magnetic Phono Cartridge',
 'Behringer PP400 Ultra Compact Phono Preamplifier']

In [35]:
def mostSimilarFast(i, N):
 similarities = []
 users = usersPerItem[i]
 candidateItems = set()
 for u in users:
    candidateItems = candidateItems.union(itemsPerUser[u]) #iterate over i
 for i2 in candidateItems:
    if i2 == i:
        continue
 sim = Jaccard(users, usersPerItem[i2])
 similarities.append((sim, i2))
 similarities.sort(reverse=True)
 return similarities[:N]

In [36]:
mostSimilarFast(query, 10)

[(0.002967359050445104, 'B00IMPJV7S')]

Similarity-based rating estimation

In [37]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

In [38]:
for d in dataset:
 user, item = d['customer_id'], d['product_id']
 reviewsPerUser[user].append(d)
 reviewsPerItem[item].append(d)

In [39]:
ratingMean = sum([d['star_rating'] for d in dataset]) / len(dataset)
ratingMean  #avg rating of the entire dataset

4.251102772543146

In [40]:
def predictRating(user, item):
 ratings = []
 similarities = []
 for d in reviewsPerUser[user]:
    i2 = d['product_id']
    if i2 == item:
        continue
 ratings.append(d['star_rating']) #rating of user for item i2
 similarities.append(Jaccard(usersPerItem[item], usersPerItem[i2])) #si
 if (sum(similarities) > 0):
    weightedRatings = [(x*y) for x, y in zip(ratings, similarities)] #weig
    return sum(weightedRatings) / sum(similarities) #weighted average
 else:
    # User hasn't rated any similar items
    return ratingMean

In [52]:
# Exercise 1:
def predictRating(user ,item):
    ratings = [] # Collect ratings over which to average
    sims = [] # and similarity scores
    for d in reviewsPerUser[user]:
        j = d['product_id']
        if j == item: 
            continue # Skip the query item
        ratings.append(d['star_rating'] - itemAverages[j])
        sims.append(Jaccard(usersPerItem[item], usersPerItem[j]))
    if (sum(sims) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings ,sims)]
        return itemAverages[item] + sum( weightedRatings ) / sum(sims)
    else:
        # User hasn't rated any similar items
        return ratingMean

In [51]:
#Exercise 2
def predictRating(user, item):
 ratings = []
 similarities = []
 for d in reviewsPerItem[item]:
    u2 = d['customer_id']
    if u2 == user:
        continue
 ratings.append(d['star_rating']) #rating of item for user u2
 similarities.append(Jaccard(itemsPerUser[user], itemsPerUser[u2])) #si
 if (sum(similarities) > 0):
    weightedRatings = [(x*y) for x, y in zip(ratings, similarities)] #weig
    return sum(weightedRatings) / sum(similarities) #weighted average
 else:
    return ratingMean

In [None]:
# Exercise 3
'''Question: Would the modification above work for metrics other than
Jaccard? Why/Why not?'''

# For the modification above will work for CosineSet, however, won't work for Cosine and Pearson. This is because the Cosine and Pearson metrics are 
# for user-to-user similarity and would need to rely on 'itemPerUser', whereas the current implimention is for item-based similarities, and 
# uses 'usersPerItems' and 'itemAverages'

'''If not, then which parts of the code need to be updated as well?'''
# The function definition of the Cosine and Pearson metrics need to be updated to correctly reflect user-to-user similarity using 'itemsPerUser'

In [53]:
dataset[1]

{'marketplace': 'US',
 'customer_id': '14640079',
 'review_id': 'RZSL0BALIYUNU',
 'product_id': 'B003LRN53I',
 'product_parent': '986692292',
 'product_title': 'Sennheiser HD203 Closed-Back DJ Headphones',
 'product_category': 'Musical Instruments',
 'star_rating': 5,
 'helpful_votes': 0,
 'total_votes': 0,
 'vine': 'N',
 'verified_purchase': 'Y',
 'review_headline': 'Five Stars',
 'review_body': 'Nice headphones at a reasonable price.',
 'review_date': '2015-08-31'}

In [54]:
u, i = dataset[1]['customer_id'], dataset[1]['product_id']

In [55]:
predictRating(u, i)

4.509357030989021

Evaluate across the entire corpus

In [56]:
# Compute the MSE for a model based on this heuristic
def MSE(predictions, labels):
 differences = [(x-y)**2 for x, y in zip(predictions, labels)]
 return sum(differences) / len(differences)

In [57]:
alwaysPredictMean = [ratingMean for d in dataset]

In [58]:
simPredictions = [predictRating(
 d['customer_id'], d['product_id']) for d in dataset]

In [59]:
labels = [d['star_rating'] for d in dataset]

In [60]:
MSE(alwaysPredictMean, labels)

1.4796142779712909

In [61]:
MSE(simPredictions, labels)

1.446725779491265