In [1]:
import csv
import math
import codecs
import pandas as pd
import numpy as np
from pandas import DataFrame

In [2]:
users_df = pd.read_csv('BX-Dump/BX-Users.csv', sep=';', 
                       names=['userid', 'location', 'age'],
                       dtype={'userid':str, 'location':str, 'age':str},
                       quotechar='"',
                       quoting=csv.QUOTE_ALL,
                       escapechar='\\', 
                       encoding='utf-8')

In [3]:
users_df["age"] = users_df["age"].apply(lambda x: int(x) if x.isnumeric() else np.nan)

In [4]:
users_df.head(3)

Unnamed: 0,userid,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",


In [5]:
book_ratings_df = pd.read_csv('BX-Dump/BX-Book-Ratings.csv', sep=';', 
                       names=['userid', 'bookid', 'rating'],
                       dtype={'userid':str, 'bookid':str, 'rating':int},
                       quotechar='"',
                       quoting=csv.QUOTE_ALL,
                       escapechar='\\', 
                       encoding='utf-8')

In [6]:
book_ratings_df.head(3)

Unnamed: 0,userid,bookid,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0


In [7]:
books_df = pd.read_csv('BX-Dump/BX-Books.csv', sep=';', 
                       names=['id', 'title', 'author', 'year', 'publisher', 'imgurl1', 'imgurl2', 'imgurl3'],
                       dtype={'id':str, 'title':str, 'author':str, 'year':int, 'publisher':str, 
                              'imgurl1':str, 'imgurl2':str, 'imgurl3':str},
                       quotechar='"',
                       quoting=csv.QUOTE_ALL,
                       escapechar='\\', 
                       encoding='utf-8')

In [8]:
books_df = books_df.drop(['imgurl1', 'imgurl2', 'imgurl3'], axis=1)

In [9]:
books_df.head(3)

Unnamed: 0,id,title,author,year,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial


In [26]:
books = books_df.set_index("id").to_dict()['title']
books

{u'0449139026': u'Dennis the Menace: Make Believe Angel (Fawcett Gold Medal Book)',
 u'0439110246': u'Losers, Inc.',
 u'0802139310': u'All That Counts',
 u'0140431357': u"Tess of the D'Urbervilles (Penguin Classics)",
 u'0486220125': u'How the Other Half Lives: Studies Among the Tenements of New York',
 u'0451150937': u'Cassidy',
 u'0965916480': u'Journal for Grief and Healing: When Someone You Love Dies',
 u'0520006534': u'Diaries of Paul Klee, 1898-1918',
 u'0373262140': u'Writers Of The Purple Sage',
 u'0312325614': u'The Caddie',
 u'1400041287': u"Haiku (Everyman's Library Pocket Poets)",
 u'0749321172': u'Travels with Charley (Mandarin Classic)',
 u'0399200525': u'Everyday Life in Ancient Rome',
 u'0764912623': u'The Library of Congress: An Architectural Alphabet',
 u'3548236820': u'The Commitments / Snapper',
 u'0449502147': u'In My Ladys Chamber',
 u'3548244467': u'Hochspannung.',
 u'0887299253': u'Insight Pocket Guides Paris (Insight Pocket Guides)',
 u'0394825942': u'My Cat Be

In [11]:
book_ratings = {}
for row in book_ratings_df.values:
    br = {}
    if book_ratings.has_key(row[0]):
        br = book_ratings[row[0]]
    br[row[1]] = row[2]
    book_ratings[row[0]] = br        

In [12]:
book_ratings['7676']

{u'0806516542': 8}

In [13]:
br = book_ratings_df[book_ratings_df.userid == '276725']
br["bookid"].tolist()

[u'034545104X']

In [16]:
def getRatingsForUser(user):
    if not users.has_key(user):
        br = book_ratings_df[book_ratings_df.userid == user]
        booklist = br["bookid"].tolist()
        ratinglist = br["rating"].tolist()
        brmap = {}
        i = 0
        for book in booklist:
            brmap[book] = ratinglist[i]
            i = i + 1
        users[user] = brmap
    return users[user]

In [168]:
class recommender:
    def __init__(self, ratings_data, books, metric='pearson', k=5, n=10):
        self.ratings_data = ratings_data
        self.books = books
        self.k = k
        self.n = n
        self.metric = metric
        if metric == 'manhattan':
            self.fn = self.manhattan
        elif metric == 'euclidean':
            self.fn = self.euclidean
        else:
            self.fn = self.pearson
    def getTemp(self):
        return self.temp
    def pearson(self, rating1, rating2):
        sumxy = 0.0
        sumx = 0.0
        sumy = 0.0
        sumx2 = 0.0
        sumy2 = 0.0
        n = 0
        for key in rating1:
            if key in rating2:
                sumx += rating1[key]
                sumy += rating2[key]
                sumxy += rating1[key] * rating2[key]
                sumx2 += pow(rating1[key], 2)
                sumy2 += pow(rating2[key], 2)
                n += 1
        if n == 0:
            return 0
        denominator = math.sqrt(sumx2 - pow(sumx, 2) / n) * math.sqrt(sumy2 - pow(sumy, 2) / n)
        if denominator == 0:
            return 0;
        return (sumxy - ((sumx * sumy) / n)) / denominator
    def manhattan(self, rating1, rating2):
        distance = 0.0;
        for key in rating1:
            if key in rating2:
                distance += abs(rating1[key] - rating2[key])
#             else:
#                 distance += rating1[key]
#         for key in rating2:
#             if key not in rating1:
#                 distance += rating2[key]
        return distance
    def euclidean(self, rating1, rating2):
        distance = 0.0;
        for key in rating1:
            if key in rating2:
                distance += pow(rating1[key] - rating2[key], 2)
#             else:
#                 distance += pow(rating1[key], 2)
#         for key in rating2:
#             if key not in rating1:
#                 distance += pow(rating2[key], 2)
        return math.sqrt(distance)
    def computeNearestNeighbor(self, username):
        distances = []
        for user in self.ratings_data:
            if user != username:
                distance = self.fn(self.ratings_data[username], self.ratings_data[user])
                distances.append((user, distance))
        if self.metric == 'pearson':        
            # for pearson, higher correlation value means closer
            distances.sort(key = lambda x: x[1], reverse = True)
        else:
            distances.sort(key = lambda x: x[1])
        return distances
    def recommend(self, username):
        recommendations = {}
        total_distance = 0.0
        nearest_neighbors = self.computeNearestNeighbor(username)
        user_ratings = self.ratings_data[username]
        for i in range(self.k):
            total_distance += nearest_neighbors[i][1]            
        for i in range(self.k):
            if total_distance == 0:
                weight = 1
            else:
                weight = nearest_neighbors[i][1] / total_distance
            name = nearest_neighbors[i][0]
            neighborRatings = self.ratings_data[name]            
            for bookid in neighborRatings:
                recommendations[bookid] = 0.0
                if bookid not in user_ratings:
                    recommendations[bookid] += neighborRatings[bookid] * weight
        recommendations = list(recommendations.items())
#         recommendations = [(self.books[k], v) for (k, v) in recommendations]
        if self.metric == 'pearson':
            recommendations.sort(key = lambda x: x[1], reverse=True)
        else:
            recommendations.sort(key = lambda x: x[1])
        recommendations = recommendations[:self.n]
        return [(self.books[k] if k in self.books else k, v) for (k, v) in recommendations]

In [169]:
obj = recommender(book_ratings, books);
obj.pearson({'a': 2, 'b': 3, 'c':4}, {'a': 3, 'b': 2, 'c':5})

0.6546536707079773

In [170]:
obj.manhattan({'a': 2, 'b': 3, 'c':4}, {'a': 3, 'b': 2})

2.0

In [171]:
obj.euclidean({'a': 2, 'b': 3, 'c':4}, {'a': 3, 'b': 2})    

1.4142135623730951

In [172]:
book_ratings['7676']

{u'0806516542': 8}

In [173]:
obj.computeNearestNeighbor('7676')
# books['0140280391']
len(books)

271379

In [174]:
obj.recommend('7676')

[(u'After Rain', 9.0),
 (u'Housekeeping', 7.0),
 (u'Cheese', 7.0),
 (u'Fortune', 6.0),
 (u'Spasm: A Memoir with Lies', 6.0),
 (u'0140280391', 2.0),
 (u"The Fortune Teller's Daughter", 0.0),
 (u'Molly Takes Flight (American Girls Short Stories)', 0.0),
 (u'Haunting Rachel', 0.0),
 (u'The Accursed Mountains; Journeys in Albania', 0.0)]

In [98]:
g = [5, 7, 0, 2, 3, 4]
g[-2:]

[4]

In [23]:
list({'a': 2, 'b': 3, 'c':4}.items())

[('a', 2), ('c', 4), ('b', 3)]

In [90]:
book_ratings.keys()

[u'228054',
 u'228050',
 u'228053',
 u'228052',
 u'89372',
 u'89374',
 u'89375',
 u'89376',
 u'89377',
 u'243522',
 u'5988',
 u'256901',
 u'256904',
 u'83892',
 u'5982',
 u'5983',
 u'256908',
 u'5981',
 u'5984',
 u'103548',
 u'127479',
 u'83890',
 u'103541',
 u'103540',
 u'103542',
 u'103545',
 u'82446',
 u'82445',
 u'82444',
 u'120129',
 u'243524',
 u'254376',
 u'120122',
 u'36601',
 u'120125',
 u'252560',
 u'235389',
 u'235381',
 u'232173',
 u'40136',
 u'173155',
 u'173154',
 u'243529',
 u'254370',
 u'173159',
 u'97153',
 u'97152',
 u'252567',
 u'97154',
 u'260299',
 u'254813',
 u'260297',
 u'272',
 u'273',
 u'195515',
 u'16708',
 u'278',
 u'16704',
 u'16701',
 u'88023',
 u'88022',
 u'88021',
 u'245823',
 u'88029',
 u'88028',
 u'245829',
 u'46706',
 u'100017',
 u'100015',
 u'119869',
 u'100010',
 u'52461',
 u'119867',
 u'52463',
 u'17254',
 u'52465',
 u'100018',
 u'63024',
 u'10709',
 u'10708',
 u'46702',
 u'95799',
 u'63023',
 u'235966',
 u'53731',
 u'10700',
 u'53736',
 u'176062',
