In [2]:
#import required modules
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [3]:
books = pd.read_csv('BX-Books.csv', sep=';', error_bad_lines=False, encoding="latin-1")
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
users = pd.read_csv('BX-Users.csv', sep=';', error_bad_lines=False, encoding="latin-1")
users.columns = ['userID', 'Location', 'Age']
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
ratings.columns = ['userID', 'ISBN', 'bookRating']



  books = pd.read_csv('BX-Books.csv', sep=';', error_bad_lines=False, encoding="latin-1")
b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8

In [4]:
#combining books and users via isbn
combine_book_rating = pd.merge(ratings, books, on='ISBN')
columns = ['yearOfPublication', 'publisher', 'bookAuthor', 'imageUrlS', 'imageUrlM', 'imageUrlL']
combine_book_rating = combine_book_rating.drop(columns, axis=1)
print(combine_book_rating.head())

   userID        ISBN  bookRating             bookTitle
0  276725  034545104X           0  Flesh Tones: A Novel
1    2313  034545104X           5  Flesh Tones: A Novel
2    6543  034545104X           0  Flesh Tones: A Novel
3    8680  034545104X           5  Flesh Tones: A Novel
4   10314  034545104X           9  Flesh Tones: A Novel


In [5]:
#count the ratings
combine_book_rating = combine_book_rating.dropna(axis = 0, subset = ['bookTitle'])

book_ratingCount = (combine_book_rating.
     groupby(by = ['bookTitle'])['bookRating'].
     count().
     reset_index().
     rename(columns = {'bookRating': 'totalRatingCount'})
     [['bookTitle', 'totalRatingCount']]
    )
print(book_ratingCount.head())

                                           bookTitle  totalRatingCount
0   A Light in the Storm: The Civil War Diary of ...                 4
1                              Always Have Popsicles                 1
2               Apple Magic (The Collector's series)                 1
3   Ask Lily (Young Women of Faith: Lily Series, ...                 1
4   Beyond IBM: Leadership Marketing and Finance ...                 1


In [6]:
#merge rate count
rating_with_totalRatingCount = combine_book_rating.merge(book_ratingCount, left_on = 'bookTitle', right_on = 'bookTitle', how = 'left')
rating_with_totalRatingCount.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount
0,276725,034545104X,0,Flesh Tones: A Novel,60
1,2313,034545104X,5,Flesh Tones: A Novel,60
2,6543,034545104X,0,Flesh Tones: A Novel,60
3,8680,034545104X,5,Flesh Tones: A Novel,60
4,10314,034545104X,9,Flesh Tones: A Novel,60


In [7]:

book_ratingCount['totalRatingCount'].describe()

count    241071.000000
mean          4.277312
std          16.738685
min           1.000000
25%           1.000000
50%           1.000000
75%           3.000000
max        2502.000000
Name: totalRatingCount, dtype: float64

In [8]:
#only taking books with rate count > 50
popularity_threshold = 50
rating_popular_book = rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
print(rating_popular_book.head())

   userID        ISBN  bookRating             bookTitle  totalRatingCount
0  276725  034545104X           0  Flesh Tones: A Novel                60
1    2313  034545104X           5  Flesh Tones: A Novel                60
2    6543  034545104X           0  Flesh Tones: A Novel                60
3    8680  034545104X           5  Flesh Tones: A Novel                60
4   10314  034545104X           9  Flesh Tones: A Novel                60


In [46]:
#getting rating based on country
combined = rating_popular_book.merge(users, left_on = 'userID', right_on = 'userID', how = 'left')

country_user_rating = combined[combined['Location'].str.contains("canada")]
country_user_rating=country_user_rating.drop('Age', axis=1)
country_user_rating.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount,Location
73,24878,446520802,7,The Notebook,650,"ottawa, ontario, canada"
75,27617,446520802,9,The Notebook,650,"beaverton, ontario, canada"
76,28204,446520802,0,The Notebook,650,"south ohio, nova scotia, canada"
121,157823,446520802,10,The Notebook,650,"guelph, ontario, canada"
133,188010,446520802,0,The Notebook,650,"edmonton, alberta, canada"


In [47]:
country_user_rating = country_user_rating.drop_duplicates(['userID', 'bookTitle'])
country_user_rating_pivot = country_user_rating.pivot(index = 'bookTitle', columns = 'userID', values = 'bookRating').fillna(0)
country_user_rating_matrix = csr_matrix(country_user_rating_pivot.values)
country_user_rating_matrix

<2402x4949 sparse matrix of type '<class 'numpy.float64'>'
	with 10834 stored elements in Compressed Sparse Row format>

In [48]:
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(country_user_rating_matrix)
model_knn

In [49]:
query_index = np.random.choice(country_user_rating_pivot.shape[0])
query_index

1859

In [50]:
distances, indices = model_knn.kneighbors(country_user_rating_pivot.iloc[query_index].values.reshape(1, -1), n_neighbors = 5)
country_user_rating_pivot.index[query_index]

"The Girls' Guide to Hunting and Fishing"

In [51]:
for i in range(0, len(distances.flatten())):
    if i == 0.0:
        print('Recommendations for {0}:\n'.format(country_user_rating_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, country_user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for The Girls' Guide to Hunting and Fishing:

1: The Weight of Water : A Novel Tag - Author of Resistance and Strange Fits of Passion, with distance of 0.7916767766445126:
2: Flight Lessons, with distance of 0.7959807759496249:
3: Nerd in Shining Armor, with distance of 0.7959807759496249:
4: To Trust a Stranger, with distance of 0.7959807759496249:


In [22]:
country_user_rating_pivot.iloc[query_index].values.reshape(1,-1)

array([[0., 0., 0., ..., 0., 0., 0.]])

numpy.ndarray