In [1]:
# Sample data: maps each user name to a dict of {artist name: rating}.
# Users have not all rated the same artists, so the inner dicts differ in keys.
users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,
                      "Norah Jones": 4.5, "Phoenix": 5.0,
                      "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},
         "Bill": {"Blues Traveler": 2.0, "Broken Bells": 3.5,
                  "Deadmau5": 4.0, "Phoenix": 2.0,
                  "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
         "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,
                  "Deadmau5": 1.0, "Norah Jones": 3.0,
                  "Phoenix": 5, "Slightly Stoopid": 1.0},
         "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,
                 "Deadmau5": 4.5, "Phoenix": 3.0,
                 "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                 "Vampire Weekend": 2.0},
         "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,
                    "Norah Jones": 4.0, "The Strokes": 4.0,
                    "Vampire Weekend": 1.0},
         "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0,
                    "Phoenix": 5.0, "Slightly Stoopid": 4.5,
                    "The Strokes": 4.0, "Vampire Weekend": 4.0},
         "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,
                 "Norah Jones": 3.0, "Phoenix": 5.0,
                 "Slightly Stoopid": 4.0, "The Strokes": 5.0},
         "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0,
                      "Phoenix": 4.0, "Slightly Stoopid": 2.5,"The Strokes": 3.0}
         }


In [5]:
def manhattan(rating1, rating2):
    """Return the Manhattan distance between two rating dictionaries.

    Only keys present in both dictionaries contribute, each adding the
    absolute difference of its two ratings. The result is 0 when the
    dictionaries share no key.
    """
    total = 0
    for item, score in rating1.items():
        if item not in rating2:
            # Rated by only one side: contributes nothing.
            continue
        total += abs(score - rating2[item])
    return total

def computeNearestNeighbor(username, users):
    """Rank every other user by Manhattan distance to `username`.

    Returns a list of (distance, user) tuples sorted in ascending order,
    so the closest user comes first; ties fall back to the user name
    through ordinary tuple comparison.
    """
    ranked = [(manhattan(users[other], users[username]), other)
              for other in users
              if other != username]
    return sorted(ranked)

In [6]:
# Manhattan distance from Hailey to Veronica, then from Hailey to Jordyn.
print(manhattan(users['Hailey'], users['Veronica']))

print(manhattan(users['Hailey'], users['Jordyn']))

2.0
7.5


In [7]:
# All other users as (distance, name) pairs, smallest Manhattan distance to Hailey first.
computeNearestNeighbor("Hailey", users)

[(2.0, 'Veronica'),
 (4.0, 'Chan'),
 (4.0, 'Sam'),
 (4.5, 'Dan'),
 (5.0, 'Angelica'),
 (5.5, 'Bill'),
 (7.5, 'Jordyn')]

In [17]:
def recommend(username, users):
    """Recommend artists to `username` from their single nearest neighbor.

    The nearest neighbor is the first entry returned by
    computeNearestNeighbor. Every artist that neighbor rated and
    `username` has not is returned as an (artist, rating) tuple, ordered
    from the neighbor's highest rating to the lowest.
    """
    closest = computeNearestNeighbor(username, users)[0][1]
    already_rated = users[username]
    candidates = [(artist, score)
                  for artist, score in users[closest].items()
                  if artist not in already_rated]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates

In [18]:
# Artists Hailey has not rated, taken from her nearest neighbor by Manhattan distance.
recommend('Hailey', users)

[('Phoenix', 4.0), ('Blues Traveler', 3.0), ('Slightly Stoopid', 2.5)]

## Using Minkowski distance to make recommendations

In [19]:
def minkowski(rating1, rating2, r):
    """Return the Minkowski distance of order `r` between two rating dicts.

    Only keys present in both dictionaries contribute. With r=1 this is
    the Manhattan distance and with r=2 the Euclidean distance. Returns
    the integer 0 when the dictionaries have no key in common.
    """
    total = 0
    shared = 0
    for item, score in rating1.items():
        if item in rating2:
            total += abs(score - rating2[item]) ** r
            shared += 1
    # Take the r-th root only when at least one rating was compared.
    return total ** (1 / r) if shared else 0


In [26]:
# Minkowski distance with r=2 from Hailey to Veronica, then from Hailey to Jordyn.
print(minkowski(users['Hailey'], users['Veronica'],2))

print(minkowski(users['Hailey'], users['Jordyn'],2))

1.4142135623730951
4.387482193696061


In [29]:
def computeNearestNeighbor2(username, users):
    """Rank every other user by Minkowski distance (r=2) to `username`.

    Returns a list of (distance, user) tuples in ascending order, so the
    closest user is first.
    """
    scored = []
    for other, ratings in users.items():
        if other == username:
            continue
        scored.append((minkowski(ratings, users[username], 2), other))
    return sorted(scored)

In [30]:
# All other users as (distance, name) pairs, smallest r=2 Minkowski distance to Hailey first.
computeNearestNeighbor2("Hailey", users)

[(1.4142135623730951, 'Veronica'),
 (2.449489742783178, 'Sam'),
 (2.7386127875258306, 'Angelica'),
 (3.1622776601683795, 'Chan'),
 (3.640054944640259, 'Bill'),
 (3.640054944640259, 'Dan'),
 (4.387482193696061, 'Jordyn')]

In [32]:
def recommend2(username, users):
    """Recommend artists to `username` from their nearest neighbor.

    Same procedure as a single-neighbor recommendation, but the neighbor
    is chosen with computeNearestNeighbor2 (Minkowski distance, r=2).
    Returns (artist, rating) tuples for artists the neighbor rated and
    `username` did not, highest neighbor rating first.
    """
    ranked = computeNearestNeighbor2(username, users)
    _, closest = ranked[0]
    seen = users[username]
    unseen = []
    for artist, score in users[closest].items():
        if artist in seen:
            continue
        unseen.append((artist, score))
    unseen.sort(key=lambda entry: entry[1], reverse=True)
    return unseen

In [33]:
# Artists Hailey has not rated, taken from her nearest neighbor by r=2 Minkowski distance.
recommend2('Hailey', users)

[('Phoenix', 4.0), ('Blues Traveler', 3.0), ('Slightly Stoopid', 2.5)]

In [38]:
from math import sqrt
def pearson(rating1, rating2):
    """Return the Pearson correlation of the ratings two users share.

    Uses the single-pass (sum-based) approximation formula over the keys
    present in both dictionaries.

    Args:
        rating1: dict mapping item -> numeric rating.
        rating2: dict mapping item -> numeric rating.

    Returns:
        The correlation coefficient, or 0 when the dictionaries share no
        key or when either side has no spread in its shared ratings
        (the denominator is zero).
    """
    sum_xy, sum_x, sum_y, sum_x2, sum_y2 = 0, 0, 0, 0, 0
    n = 0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x*y
            sum_x += x
            sum_y += y
            sum_x2 += x**2
            sum_y2 += y**2
    if n == 0:
        return 0
    # Floating-point cancellation can push these terms a hair below zero
    # (e.g. three shared ratings of 0.1), which would make sqrt() raise
    # ValueError. Mathematically they are never negative, so clamp at 0.
    spread_x = max(sum_x2 - (sum_x**2)/n, 0.0)
    spread_y = max(sum_y2 - (sum_y**2)/n, 0.0)
    denominator = sqrt(spread_x) * sqrt(spread_y)
    if denominator == 0:
        return 0
    return (sum_xy - (sum_x * sum_y)/n)/denominator


In [39]:
# Pearson correlation between Angelica's and Bill's ratings of the artists both have rated.
pearson(users['Angelica'], users['Bill'])

-0.9040534990682699

## Recommending with k-nearest neighbors
   Implement the recommender system as a class
   

In [2]:
from math import sqrt
# Sample data for the recommender class: maps each user name to a dict of
# {artist name: rating}. Same contents as the `users` dict defined in the first cell.
users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0,
                      "Norah Jones": 4.5, "Phoenix": 5.0,
                      "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},
         "Bill": {"Blues Traveler": 2.0, "Broken Bells": 3.5,
                  "Deadmau5": 4.0, "Phoenix": 2.0,
                  "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
         "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0,
                  "Deadmau5": 1.0, "Norah Jones": 3.0,
                  "Phoenix": 5, "Slightly Stoopid": 1.0},
         "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0,
                 "Deadmau5": 4.5, "Phoenix": 3.0,
                 "Slightly Stoopid": 4.5, "The Strokes": 4.0,
                 "Vampire Weekend": 2.0},
         "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0,
                    "Norah Jones": 4.0, "The Strokes": 4.0,
                    "Vampire Weekend": 1.0},
         "Jordyn": {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0,
                    "Phoenix": 5.0, "Slightly Stoopid": 4.5,
                    "The Strokes": 4.0, "Vampire Weekend": 4.0},
         "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0,
                 "Norah Jones": 3.0, "Phoenix": 5.0,
                 "Slightly Stoopid": 4.0, "The Strokes": 5.0},
         "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0,
                      "Phoenix": 4.0, "Slightly Stoopid": 2.5,"The Strokes": 3.0}
         }


In [9]:
import codecs
class recommender:
    """k-nearest-neighbor recommender driven by Pearson correlation.

    Ratings live in `self.data` as {user id: {product id: rating}}. They are
    either passed to the constructor as a dict or loaded from the BX book
    CSV files with loadBookDB. The lookup tables `userid2name`,
    `username2id` and `productid2name` are only filled by loadBookDB.
    """

    def pearson(self, rating1, rating2):
        """Return the Pearson correlation of the ratings two users share.

        Uses the single-pass (sum-based) approximation formula over the
        keys present in both dictionaries. Returns 0 when there is no
        shared key or when the denominator is zero.
        """
        sum_xy, sum_x, sum_y, sum_x2, sum_y2 = 0, 0, 0, 0, 0
        n = 0
        for key in rating1:
            if key in rating2:
                n += 1
                x = rating1[key]
                y = rating2[key]
                sum_xy += x*y
                sum_x += x
                sum_y += y
                sum_x2 += x**2
                sum_y2 += y**2
        if n == 0:
            return 0
        # Floating-point cancellation can push these terms a hair below
        # zero (e.g. three shared ratings of 0.1), which would make sqrt()
        # raise ValueError. They are never negative mathematically.
        spread_x = max(sum_x2 - (sum_x**2)/n, 0.0)
        spread_y = max(sum_y2 - (sum_y**2)/n, 0.0)
        denominator = sqrt(spread_x) * sqrt(spread_y)
        if denominator == 0:
            return 0
        return (sum_xy - (sum_x * sum_y)/n)/denominator

    def __init__(self, data, k=1, metric='pearson', n=5):
        '''Initialize the recommender.

        data   -- if it is a dictionary it becomes the rating data;
                  any other value leaves the rating data empty
                  (use loadBookDB to fill it)
        k      -- the k value for k nearest neighbor
        metric -- which similarity formula to use; only 'pearson' exists
        n      -- the maximum number of recommendations to make

        Raises ValueError for an unsupported metric instead of leaving the
        instance without a similarity function.
        '''
        self.k = k
        self.n = n
        self.username2id = {}
        self.userid2name = {}
        self.productid2name = {}
        # save the name of the metric
        self.metric = metric
        if self.metric == 'pearson':
            self.fn = self.pearson
        else:
            raise ValueError("unsupported metric: %r (only 'pearson' is implemented)"
                             % (metric,))
        # Always define self.data so later calls fail with a KeyError for
        # an unknown user rather than an AttributeError.
        self.data = data if isinstance(data, dict) else {}

    def convertProductID2name(self, id):
        """Given a product id, return the product name (or the id if unknown)."""
        return self.productid2name.get(id, id)

    def userRatings(self, id, n):
        """Print the n top ratings for the user with the given id."""
        # userid2name is only filled by loadBookDB; fall back to the raw id
        # so this also works for data passed in as a plain dictionary.
        print("Ratings for " + str(self.userid2name.get(id, id)))
        ratings = self.data[id]
        print(len(ratings))
        ratings = [(self.convertProductID2name(k), v)
                   for (k, v) in ratings.items()]
        # sort by rating, highest first, and keep the top n
        ratings.sort(key=lambda artistTuple: artistTuple[1], reverse=True)
        for rating in ratings[:n]:
            print("%s\t%i" % (rating[0], rating[1]))

    def loadBookDB(self, path=''):
        """Load the BX book dataset, replacing any current rating data.

        `path` is prepended verbatim to the three file names
        BX-Book-Ratings.csv, BX-Books.csv and BX-Users.csv, so it should
        end with a path separator. Lines are split on ';' and the first
        line of each file is skipped as a header. Prints the total number
        of data lines read across the three files.
        """
        self.data = {}
        i = 0
        # `with` guarantees each file is closed even if a line fails to parse.
        with codecs.open(path + "BX-Book-Ratings.csv", 'r',
                         encoding="ISO-8859-1") as f:
            head = False
            for line in f:
                if not head:
                    head = True
                    continue
                i += 1
                # separate line into fields
                fields = line.split(';')
                user = fields[0].strip('"')
                book = fields[1].strip('"')
                rating = int(fields[2].strip().strip('"'))
                self.data.setdefault(user, {})[book] = rating
        #
        # Now load books into self.productid2name
        # Books contains isbn, title, and author among other fields
        #
        with codecs.open(path + "BX-Books.csv", 'r',
                         encoding="ISO-8859-1") as f:
            head = False
            for line in f:
                if not head:
                    head = True
                    continue
                i += 1
                # separate line into fields
                fields = line.split(';')
                isbn = fields[0].strip('"')
                title = fields[1].strip('"')
                author = fields[2].strip().strip('"')
                self.productid2name[isbn] = title + ' by ' + author
        #
        # Finally load users into self.userid2name / self.username2id
        #
        with codecs.open(path + "BX-Users.csv", 'r',
                         encoding="ISO-8859-1") as f:
            head = False
            for line in f:
                if not head:
                    head = True
                    continue
                i += 1
                # separate line into fields
                fields = line.split(';')
                userid = fields[0].strip('"')
                location = fields[1].strip('"')
                # The age is read from the third field, so three fields are
                # enough; the former `> 3` test skipped it for every row
                # that has exactly id, location and age.
                if len(fields) >= 3:
                    age = fields[2].strip().strip('"')
                else:
                    age = 'NULL'
                if age != 'NULL':
                    value = location + ' (age:' + age + ')'
                else:
                    value = location
                self.userid2name[userid] = value
                self.username2id[location] = userid
        print(i)

    def computeNearestNeighbor(self, username):
        """Return (user, similarity) tuples for every other user.

        The list is sorted by the value of self.fn in descending order,
        so with Pearson correlation the most similar user comes first.
        """
        distances = []
        for instance in self.data:
            if instance != username:
                distance = self.fn(self.data[username], self.data[instance])
                distances.append((instance, distance))
        # highest similarity first
        distances.sort(key=lambda artistTuple: artistTuple[1], reverse=True)
        return distances

    def recommend(self, user):
        """Return up to self.n (product name, score) recommendations for user.

        The k most similar users each contribute their ratings for products
        `user` has not rated, weighted by their share of the summed
        similarity. Returns an empty list when there are no neighbors or
        when the similarities of the neighbors used sum to zero, because no
        weights can be derived in that case. Raises KeyError if `user` is
        not in the rating data.
        """
        # get list of users ordered by nearness
        nearest = self.computeNearestNeighbor(user)
        # now get the ratings for the user
        userRatings = self.data[user]
        # never look at more neighbors than actually exist
        k = min(self.k, len(nearest))
        # determine the total similarity of the neighbors used
        totalDistance = 0.0
        for i in range(k):
            totalDistance += nearest[i][1]
        if totalDistance == 0:
            # Dividing by zero below would crash; there is no usable signal.
            return []
        recommendations = {}
        # now iterate through the k nearest neighbors
        # accumulating their ratings
        for name, distance in nearest[:k]:
            # compute slice of pie
            weight = distance / totalDistance
            neighborRatings = self.data[name]
            # now find products the neighbor rated that user didn't
            for artist in neighborRatings:
                if artist not in userRatings:
                    if artist not in recommendations:
                        recommendations[artist] = neighborRatings[artist] * weight
                    else:
                        recommendations[artist] = (recommendations[artist] +
                                                   neighborRatings[artist] * weight)
        recommendations = [(self.convertProductID2name(k), v)
                           for (k, v) in recommendations.items()]
        # finally sort and return the first n items
        recommendations.sort(key=lambda x: x[1], reverse=True)
        return recommendations[:self.n]

In [10]:
# Build a recommender over the sample ratings with the default settings
# (k=1, metric='pearson', n=5) and ask for recommendations for Jordyn.
r = recommender(users)
r.recommend('Jordyn')

[('Blues Traveler', 5.0)]

In [11]:
# Replace r's rating data with the BX book dataset read from the BX-CSV/ directory;
# the printed number is the count of data lines read.
r.loadBookDB('BX-CSV/')

1700018


In [15]:
# Book recommendations for the user with id '171118' in the loaded dataset.
r.recommend('171118')

[("A Swiftly Tilting Planet by Madeleine L'Engle", 10.0),
 ("The Godmother's Apprentice by Elizabeth Ann Scarborough", 10.0),
 ("The Godmother's Web by Elizabeth Ann Scarborough", 10.0),
 ("The Irrational Season (The Crosswicks Journal, Book 3) by Madeleine L'Engle",
  10.0),
 ('The Girl Who Loved Tom Gordon by Stephen King', 9.0)]

In [14]:
# Print the five highest ratings given by user '171118'.
r.userRatings('171118',5)

Ratings for toronto, ontario, canada
2421
0006479502	10
Time Power: The Revolutionary Time Management System That Can Change Your Professional and Personal by Charles Hobbs	10
Pilgrim at Tinker Creek by Annie Dillard	10
0099430967	10
Just So Stories (Penguin Twentieth-Century Classics) by Rudyard Kipling	10
