In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import TruncatedSVD
import warnings
warnings.filterwarnings('ignore')

In [6]:
book = pd.read_csv('BX-Books.csv', sep=';', error_bad_lines=False, encoding='latin-1')
book.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'Publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']

user = pd.read_csv('BX-Users.csv', sep=';', error_bad_lines=False, encoding='latin-1')
user.columns = ['userID', 'Location', 'Age']

rating = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding='latin-1')
rating.columns = ['userID', 'ISBN', 'bookRating']

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


In [7]:
book.drop(['imageUrlS', 'imageUrlM', 'imageUrlL'], axis=1, inplace=True)
book.head()

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [9]:
combine_book_rating = pd.merge(rating, book, on='ISBN')
columns = ['yearOfPublication', 'Publisher', 'bookAuthor']
combine_book_rating = combine_book_rating.drop(columns, axis=1)
combine_book_rating.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle
0,276725,034545104X,0,Flesh Tones: A Novel
1,2313,034545104X,5,Flesh Tones: A Novel
2,6543,034545104X,0,Flesh Tones: A Novel
3,8680,034545104X,5,Flesh Tones: A Novel
4,10314,034545104X,9,Flesh Tones: A Novel


In [10]:
combine_book_rating = combine_book_rating.dropna(axis=0, subset= ['bookTitle'])

In [12]:
book_ratingCount = (combine_book_rating.groupby(by = ['bookTitle'])['bookRating'].count().reset_index().rename(columns = {'bookRating' : 'totalRatingCount'})[['bookTitle', 'totalRatingCount']])
book_ratingCount.head()

Unnamed: 0,bookTitle,totalRatingCount
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [13]:
rating_with_totalRatingCount = combine_book_rating.merge(book_ratingCount, left_on = 'bookTitle', right_on = 'bookTitle', how ='left')
rating_with_totalRatingCount.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount
0,276725,034545104X,0,Flesh Tones: A Novel,60
1,2313,034545104X,5,Flesh Tones: A Novel,60
2,6543,034545104X,0,Flesh Tones: A Novel,60
3,8680,034545104X,5,Flesh Tones: A Novel,60
4,10314,034545104X,9,Flesh Tones: A Novel,60


In [14]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(book_ratingCount['totalRatingCount'].describe())

count   241071.000
mean         4.277
std         16.739
min          1.000
25%          1.000
50%          1.000
75%          3.000
max       2502.000
Name: totalRatingCount, dtype: float64


In [15]:
print(book_ratingCount['totalRatingCount'].quantile(np.arange(.9, 1, .01)))

0.900    7.000
0.910    8.000
0.920    9.000
0.930   10.000
0.940   11.000
0.950   13.000
0.960   16.000
0.970   20.000
0.980   29.000
0.990   50.000
Name: totalRatingCount, dtype: float64


In [16]:
popularity_threshold = 100
rating_popular_book = rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
rating_popular_book.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount
62,276727,446520802,0,The Notebook,650
63,278418,446520802,0,The Notebook,650
64,638,446520802,0,The Notebook,650
65,3363,446520802,0,The Notebook,650
66,7158,446520802,10,The Notebook,650


### FILTER TO USERS IN US & CANADA ONLY

In [17]:
combined = rating_popular_book.merge(user, left_on ='userID', right_on= 'userID', how='left')

us_canada_user_rating = combined[combined['Location'].str.contains("usa|canada")]
us_canada_user_rating = us_canada_user_rating.drop('Age', axis = 1)

us_canada_user_rating.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,totalRatingCount,Location
1,278418,446520802,0,The Notebook,650,"omaha, nebraska, usa"
2,638,446520802,0,The Notebook,650,"san diego, california, usa"
3,3363,446520802,0,The Notebook,650,"knoxville, tennessee, usa"
4,7158,446520802,10,The Notebook,650,"omaha, nebraska, usa"
5,8253,446520802,10,The Notebook,650,"tulsa, oklahoma, usa"


### IMPLEMENTING kNN

In [19]:
us_canada_user_rating = us_canada_user_rating.drop_duplicates(['userID', 'bookTitle'])
us_canada_user_rating_pivot = us_canada_user_rating.pivot_table(index = 'bookTitle', columns= 'userID', values = 'bookRating').fillna(0)
us_canada_user_rating_matrix = csr_matrix(us_canada_user_rating_pivot.values)

In [20]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(us_canada_user_rating_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

### Test our model & make some recommendations:

In [33]:
query_index = np.random.choice(us_canada_user_rating_pivot.shape[0])
distances, indices = model_knn.kneighbors(us_canada_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors=6)

for i in range(0, len(distances.flatten())):
    if i==0:
        print('Recommendations for', format(us_canada_user_rating_pivot.index[query_index]), ':')
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, us_canada_user_rating_pivot.index[indices.flatten()[i]],distances.flatten()[i]))

Recommendations for 4 Blondes :
1: Memoirs of a Geisha Uk, with distance of 0.9182904847856824:
2: One Hundred Years of Solitude, with distance of 0.9367029403575525:
3: Simple Abundance:  A Daybook of Comfort and Joy, with distance of 0.9370710607695715:
4: The Beach, with distance of 0.9371893960185701:
5: The Crimson Petal and the White, with distance of 0.9409455414495981:


### Collaborative Fitering Using Matrix Factorization

In [35]:
us_canada_user_rating_pivot2 = us_canada_user_rating.pivot_table(index = 'userID', columns= 'bookTitle', values = 'bookRating').fillna(0)
us_canada_user_rating_pivot2.head()

bookTitle,1984,1st to Die: A Novel,24 Hours,2nd Chance,4 Blondes,84 Charing Cross Road,A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash,A Bend in the Road,A Case of Need,"A Child Called \It\"": One Child's Courage to Survive""",...,Without Remorse,"Wizard and Glass (The Dark Tower, Book 4)",Women Who Run with the Wolves,"Word Freak: Heartbreak, Triumph, Genius, and Obsession in the World of Competitive Scrabble Players",Wuthering Heights,Year of Wonders,You Belong To Me,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw"""
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
us_canada_user_rating_pivot2.shape

(32184, 914)

In [37]:
x = us_canada_user_rating_pivot2.values.T
x.shape

(914, 32184)

In [38]:
SVD = TruncatedSVD(n_components=12, random_state=17)
matrix = SVD.fit_transform(x)
matrix.shape

(914, 12)

In [39]:
corr = np.corrcoef(matrix)
corr.shape

(914, 914)

In [40]:
us_canada_book_title = us_canada_user_rating_pivot2.columns
us_canada_book_list = list(us_canada_book_title)
recommended_book = us_canada_book_list.index("1984")
print(recommended_book)

0


In [163]:
corr_recommended_book = corr[recommended_book]
recommendations = list(us_canada_book_title[(corr_recommended_book<1.0) & (corr_recommended_book>=0.9)])
print(*recommendations, sep="\n")

1984
American Gods
American Psycho (Vintage Contemporaries)
Animal Farm
Atlas Shrugged
Brave New World
Catch 22
Cry to Heaven
Dune (Remembering Tomorrow)
Ender's Game (Ender Wiggins Saga (Paperback))
Fast Food Nation: The Dark Side of the All-American Meal
Good Omens
Lord of the Flies
Neverwhere
Slaughterhouse Five or the Children's Crusade: A Duty Dance With Death
Stardust
The Angel of Darkness
The Color Purple
The Fountainhead
The Great Gatsby
The Hitchhiker's Guide to the Galaxy
The Princess Bride: S Morgenstern's Classic Tale of True Love and High Adventure
Watership Down
Wuthering Heights
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values
