In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import correlation, cosine
import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
import sys, os
from contextlib import contextmanager
import sklearn.metrics as metrics

In [2]:
# Load the three Book-Crossing CSV exports (semicolon-separated, Latin-1 encoded).
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is the replacement that keeps the old behaviour of
# dropping malformed rows while still reporting each one.
DATA_DIR = 'Book_reviews'  # folder containing the BX-*.csv files; change here if the data lives elsewhere
BX_READ_OPTIONS = dict(sep=';', on_bad_lines='warn', encoding='latin-1')

books = pd.read_csv(os.path.join(DATA_DIR, 'BX-Books.csv'), **BX_READ_OPTIONS)
users = pd.read_csv(os.path.join(DATA_DIR, 'BX-Users.csv'), **BX_READ_OPTIONS)
ratings = pd.read_csv(os.path.join(DATA_DIR, 'BX-Book-Ratings.csv'), **BX_READ_OPTIONS)

FileNotFoundError: [Errno 2] No such file or directory: 'Book_reviews/BX-Books.csv'

In [None]:
print(books.shape)
print(users.shape)
print(ratings.shape)

In [None]:
books.head()

In [None]:
books.columns

In [None]:
# Remove the three image-URL columns. Reassigning (rather than inplace=True) and
# ignoring columns that are already gone keeps this cell safe to run more than once.
books = books.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], errors='ignore')

In [None]:
books.head()

In [None]:
books.dtypes

In [None]:
# Show full cell contents when displaying frames. `None` means "no limit"; the legacy
# value -1 has been deprecated since pandas 1.0 and is no longer accepted by current releases.
pd.set_option('display.max_colwidth', None)

In [None]:
books['Year-Of-Publication'].unique()

In [None]:
books.loc[books['Year-Of-Publication']=='DK Publishing Inc']

In [None]:
# Overwrite the four fields of ISBN 0789466953 with their intended values
# (presumably one of the rows whose year column held 'DK Publishing Inc' - see the lookup above).
_row = books['ISBN'] == '0789466953'
for _column, _value in [
    ('Year-Of-Publication', 2000),
    ('Book-Author', 'James Buckley'),
    ('Publisher', 'DK Publishing Inc'),
    ('Book-Title', 'DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)'),
]:
    books.loc[_row, _column] = _value

In [None]:
books['Year-Of-Publication'].unique()

In [None]:
# Overwrite the four fields of ISBN 078946697X with their intended values,
# one column at a time so each value keeps its own type.
_row = books['ISBN'] == '078946697X'
_corrections = {
    'Year-Of-Publication': 2000,
    'Book-Author': 'Michael Teitelbaum',
    'Publisher': 'DK Publishing Inc',
    'Book-Title': 'DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)',
}
for _column, _value in _corrections.items():
    books.loc[_row, _column] = _value

In [None]:
books.loc[books['Year-Of-Publication']=='Gallimard']

In [None]:
# Overwrite the four fields of ISBN 2070426769 (a row whose year column held 'Gallimard').
# The author's name is written with a real 'é': the previous literal contained the
# doubly mis-decoded byte sequence 'Ã?Â©' in place of that character.
books.loc[books['ISBN']== '2070426769', 'Year-Of-Publication'] = 2003
books.loc[books['ISBN']== '2070426769', 'Book-Author'] = 'Jean-Marie Gustave Le Clézio'
books.loc[books['ISBN']== '2070426769', 'Publisher'] = 'Gallimard'
books.loc[books['ISBN']== '2070426769', 'Book-Title'] = "Peuple du ciel, suivi de 'Les Bergers'"

In [None]:
books['Year-Of-Publication'] = pd.to_numeric(books['Year-Of-Publication'], errors = 'coerce')

In [None]:
(books[books['Year-Of-Publication'] == 0].shape[0]/books.shape[0])*100

In [None]:
books.loc[(books['Year-Of-Publication'] == 0 ) | (books['Year-Of-Publication'] >2006)].shape[0]

In [None]:
# Treat a publication year of 0 or later than 2006 as missing.
# Series.mask() replaces the flagged entries with NaN and upcasts the column to float
# when needed; it also avoids the `np.NAN` alias, which NumPy 2.0 removed.
invalid_year = (books['Year-Of-Publication'] > 2006) | (books['Year-Of-Publication'] == 0)
books['Year-Of-Publication'] = books['Year-Of-Publication'].mask(invalid_year)

In [None]:
# Impute missing years with the rounded mean year. The result is assigned back because a
# chained `books[col].fillna(..., inplace=True)` operates on a temporary under pandas
# Copy-on-Write and would leave `books` unchanged.
books['Year-Of-Publication'] = books['Year-Of-Publication'].fillna(round(books['Year-Of-Publication'].mean()))

In [None]:
books['Year-Of-Publication'] = books['Year-Of-Publication'].astype(np.int32)

In [None]:
books.loc[books['Publisher'].isna(), 'Publisher'] = 'other'

In [None]:
books.loc[books['Publisher']=='other']

In [None]:
books.loc[128890,'Year-Of-Publication']

In [None]:
users.shape

In [None]:
users.head()

In [None]:
(users.Age.isna().sum())/users.Age.shape[0]

In [None]:
users.Age.unique()

In [None]:
users['User-ID'].values

In [None]:
# Ages above 90 or below 5 are treated as invalid: blank them, then impute every missing
# age with the mean of the remaining ones. Series.mask() is used instead of assigning the
# `np.NAN` alias, which NumPy 2.0 removed.
users['Age'] = users['Age'].mask((users['Age'] > 90) | (users['Age'] < 5))
users['Age'] = users['Age'].fillna(users['Age'].mean())

In [None]:
users.Age = users.Age.astype(np.int32)

In [None]:
users.Age.dtype

In [None]:
users.Location.unique()

In [None]:
ratings.shape

In [None]:
sparse=(users.shape[0])*(books.shape[0])

In [None]:
ratings.head()

In [None]:
ratings_new = ratings[ratings.ISBN.isin(books.ISBN)]

In [None]:
ratings_new.shape

In [None]:
ratings_new = ratings_new[ratings_new['User-ID'].isin(users['User-ID'])]

In [None]:
print(ratings.shape)
ratings_new.shape

In [None]:
ratings_new

In [None]:
# `sparse` is the number of cells in the full users x books grid; the share of
# those cells that carries no rating is reported as a percentage.
filled_fraction = len(ratings_new) / float(sparse)
sparsity = 1.0 - filled_fraction
print('The sparsity level of Book Crossing dataset is {0} %'.format(sparsity * 100))

In [None]:
ratings.head()

In [None]:
ratings['Book-Rating'].unique()

In [None]:
ratings_explicit = ratings_new[ratings_new['Book-Rating']!=0]
ratings_implicit = ratings_new[ratings_new['Book-Rating'] == 0]

In [None]:
users_exp_ratings = users[users['User-ID'].isin(ratings_explicit['User-ID'])]
users_imp_ratings = users[users['User-ID'].isin(ratings_implicit['User-ID'])]

In [None]:
ratings_implicit.shape

In [None]:
sns.countplot(data=ratings_explicit, x='Book-Rating')

Popularity Based Recommendation

In [None]:
ratings_count = pd.DataFrame(ratings_explicit.groupby(['ISBN'])['Book-Rating'].sum())

In [None]:
ratings_count

In [None]:
top10 = ratings_count.sort_values('Book-Rating', ascending=False).head(10)
top10

In [None]:
print("Following books are recommended")
top10.merge(books, left_index = True, right_on = 'ISBN')

Collaborative Filtering based Recommendation System

In [None]:
# Shrink the explicit ratings before building the user x item matrix.
# NOTE: this cell overwrites `ratings_explicit`, so re-running it filters the
# already-filtered frame again.

# 1) Keep only users who have given at least 100 explicit ratings.
counts1 = ratings_explicit['User-ID'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['User-ID'].isin(counts1[counts1>=100].index)]
# 2) Keep only rating *values* that occur at least 100 times in the remaining rows.
# NOTE(review): this filters on the rating value, not on the book. If the intent was to
# keep frequently rated books, the key should presumably be 'ISBN' - confirm.
counts = ratings_explicit['Book-Rating'].value_counts()
ratings_explicit = ratings_explicit[ratings_explicit['Book-Rating'].isin(counts[counts>=100].index)]


In [None]:
ratings_explicit

In [None]:
# Reshape the long ratings table into a user x item matrix: one row per User-ID,
# one column per ISBN, cell value = Book-Rating. Pairs without a rating come out
# as NaN (they are replaced with 0 in the next cell). pivot() raises if the same
# (User-ID, ISBN) pair occurs more than once.
ratings_matrix = ratings_explicit.pivot(index = 'User-ID', columns='ISBN', values = 'Book-Rating')
# Row and column labels of the matrix, kept as separate names.
userID = ratings_matrix.index
ISBN = ratings_matrix.columns
print(ratings_matrix.shape)
ratings_matrix.head()

In [None]:
ratings_matrix = ratings_matrix.fillna(0)

In [None]:
ratings_matrix

In [None]:
# Default neighbourhood size and distance metric for the KNN helpers defined below.
# They are plain module-level assignments (a `global` statement is a no-op at module level).
# The helpers capture these values as default arguments when their `def` cells run,
# so re-run those cells after changing a value here.
k = 5
metric = 'correlation'

1. Find the most similar users with the KNN algorithm

In [None]:
def findksimilarusers(user_id,ratings,metric = metric, k = k):
    """Find the users whose rating rows are closest to that of `user_id`.

    A brute-force ``NearestNeighbors`` model is fitted on `ratings` on every call.

    Parameters
    ----------
    user_id : label of the query user in ``ratings.index``.
    ratings : DataFrame with one row per user and one column per item.
    metric : distance metric name passed to ``NearestNeighbors``.
    k : number of neighbours wanted in addition to the query user.

    Returns
    -------
    similarities : 1-D array, ``1 - distance`` for each returned neighbour.
    indices : 2-D array of shape (1, n) with the neighbours' row positions in
        `ratings`, as returned by ``kneighbors``. The query row is part of the
        fitted data, so it is normally among the returned neighbours.

    Raises
    ------
    KeyError : if `user_id` is not in ``ratings.index``.
    """
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')
    model_knn.fit(ratings)
    user_loc = ratings.index.get_loc(user_id)
    # k + 1 because the query user comes back as its own neighbour; never request
    # more neighbours than there are rows, which kneighbors() rejects.
    n_neighbors = min(k + 1, ratings.shape[0])
    query = ratings.iloc[user_loc, :].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(query, n_neighbors=n_neighbors)
    similarities = 1 - distances.flatten()
    return similarities, indices

2. Predict the rating the user would give a particular item, using the ratings of the similar users found in step 1

In [None]:
def predict_userbased(user_id, item_id, ratings, metric = metric, k = k):
    """Predict the rating `user_id` would give `item_id` from similar users.

    The prediction is the user's mean rating plus the similarity-weighted average
    of the neighbours' deviations from that mean, rounded to an integer and
    clamped to the range 1..10.

    Parameters
    ----------
    user_id : label of the user in ``ratings.index``.
    item_id : label of the item in ``ratings.columns``.
    ratings : user x item DataFrame of ratings.
    metric, k : forwarded to ``findksimilarusers``.

    Returns
    -------
    int : predicted rating between 1 and 10.

    NOTE(review): as before, the mean is taken over the whole row of `ratings`
    (including any 0 placeholders it contains) and neighbours' ratings are
    compared with the *target* user's mean rather than their own - confirm that
    this is the intended variant of the formula.
    """
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices = findksimilarusers(user_id, ratings, metric = metric, k = k)
    mean_rating = ratings.iloc[user_loc].mean()

    # Leave the user itself out of both the numerator and the denominator.
    # (Subtracting a fixed 1 from the similarity sum is only right when the user is
    # returned with a similarity of exactly 1.)
    neighbour_locs = indices.flatten()
    is_other_user = neighbour_locs != user_loc
    neighbour_sims = similarities[is_other_user]
    rating_diffs = ratings.iloc[neighbour_locs[is_other_user], item_loc].values - mean_rating

    wtd_sum = np.sum(rating_diffs * neighbour_sims)
    # Normalise by the absolute weights so negative similarities cannot cancel the
    # denominator or flip the sign of the adjustment.
    sum_wtd = np.sum(np.abs(neighbour_sims))

    if np.isfinite(wtd_sum) and sum_wtd > 0:
        prediction = int(round(mean_rating + (wtd_sum / sum_wtd)))
    else:
        # No usable neighbour information (no neighbours, zero or NaN similarities):
        # fall back to the user's mean instead of failing on int(round(nan)).
        prediction = int(round(mean_rating))

    # Clamp to the 1..10 rating scale.
    return max(1, min(10, prediction))

3. Get the top 10 recommended items for the user, based on their predicted ratings

In [None]:
def recommendItem(user_id, ratings):
    """Predict user-based ratings for every item `user_id` has not rated yet.

    Parameters
    ----------
    user_id : integer label of the user in ``ratings.index`` (Python or numpy integer).
    ratings : user x item DataFrame in which 0 marks "not rated".

    Returns
    -------
    list : one entry per column of `ratings`, in column order. Items the user has
        already rated get the sentinel -1; every other item gets the value
        returned by ``predict_userbased``. An empty list is returned (after
        printing a message) when the user is unknown.

    Each prediction refits the KNN model inside ``predict_userbased``, so this is
    slow on a wide matrix.
    """
    # numpy integer labels (e.g. taken from the index) are valid IDs too; bool is excluded.
    is_integer_id = isinstance(user_id, (int, np.integer)) and not isinstance(user_id, bool)
    if not is_integer_id or user_id not in ratings.index.values:
        print('The User doesnot exist in our System. Please choose valid user.')
        return []

    predictions = []
    for item_id, current_rating in ratings.loc[user_id].items():
        if current_rating == 0:
            # Not rated yet: this is a candidate for recommendation.
            predictions.append(predict_userbased(user_id, item_id, ratings, metric))
        else:
            # Already rated: never recommend it again.
            predictions.append(-1)
    return predictions

In [None]:
# recommendItem(274301, ratings_matrix)

Item Based

In [None]:
def findksimilaritems(item_id,ratings,metric = metric, k = k):
    """Find the items whose rating columns are closest to that of `item_id`.

    `ratings` is transposed so that each item becomes one sample, and a
    brute-force ``NearestNeighbors`` model is fitted on it on every call.

    Parameters
    ----------
    item_id : label of the query item in ``ratings.columns``.
    ratings : DataFrame with one row per user and one column per item.
    metric : distance metric name passed to ``NearestNeighbors``.
    k : number of neighbours wanted in addition to the query item.

    Returns
    -------
    similarities : 1-D array, ``1 - distance`` for each returned neighbour.
    indices : 2-D array of shape (1, n) with the neighbours' column positions in
        `ratings`, as returned by ``kneighbors``. The query item is part of the
        fitted data, so it is normally among the returned neighbours.

    Raises
    ------
    KeyError : if `item_id` is not in ``ratings.columns``.
    """
    item_vectors = ratings.T
    model_knn = NearestNeighbors(metric = metric, algorithm = 'brute')
    model_knn.fit(item_vectors)
    item_loc = item_vectors.index.get_loc(item_id)
    # k + 1 because the query item comes back as its own neighbour; never request
    # more neighbours than there are items, which kneighbors() rejects.
    n_neighbors = min(k + 1, item_vectors.shape[0])
    query = item_vectors.iloc[item_loc, :].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(query, n_neighbors=n_neighbors)
    similarities = 1 - distances.flatten()
    return similarities, indices

In [None]:
def predict_itembased(user_id, item_id, ratings, metric = metric, k = k):
    """Predict the rating `user_id` would give `item_id` from similar items.

    The prediction is the similarity-weighted average of the user's ratings for
    the neighbouring items, rounded to an integer and clamped to 1..10.

    Parameters
    ----------
    user_id : label of the user in ``ratings.index``.
    item_id : label of the item in ``ratings.columns``.
    ratings : user x item DataFrame of ratings.
    metric, k : forwarded to ``findksimilaritems``.

    Returns
    -------
    int : predicted rating between 1 and 10 (1 when no usable neighbour exists).
    """
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    # Forward metric and k so the caller's choice is honoured rather than the defaults.
    similarities, indices = findksimilaritems(item_id, ratings, metric = metric, k = k)

    # Leave the item itself out of both the numerator and the denominator.
    # (Subtracting a fixed 1 from the similarity sum is only right when the item is
    # returned with a similarity of exactly 1.)
    neighbour_locs = indices.flatten()
    is_other_item = neighbour_locs != item_loc
    neighbour_sims = similarities[is_other_item]
    neighbour_ratings = ratings.iloc[user_loc, neighbour_locs[is_other_item]].values

    wtd_sum = np.sum(neighbour_ratings * neighbour_sims)
    # Normalise by the absolute weights so negative similarities cannot cancel the denominator.
    sum_wtd = np.sum(np.abs(neighbour_sims))

    if np.isfinite(wtd_sum) and sum_wtd > 0:
        prediction = int(round(wtd_sum / sum_wtd))
    else:
        # Zero or NaN similarities: nothing to average, so the clamp below yields 1
        # instead of int(round(nan)) raising.
        prediction = 0

    # Clamp to the 1..10 rating scale.
    return max(1, min(10, prediction))

In [None]:
# prediction = predict_itembased(11676,'0001056107', ratings_matrix)

In [None]:
def recommendItembased(user_id, ratings, metric = metric):
    """Print the ten unrated books with the highest item-based predicted rating.

    Parameters
    ----------
    user_id : integer label of the user in ``ratings.index`` (Python or numpy integer).
    ratings : user x item DataFrame in which 0 marks "not rated"; its column
        labels are ISBNs that are looked up in the module-level `books` frame.
    metric : distance metric forwarded to ``predict_itembased``.

    Returns
    -------
    None. The recommendations (or an error message for an unknown user) are printed.

    Each prediction refits the KNN model inside ``predict_itembased``, so this is
    slow on a wide matrix.
    """
    # numpy integer labels (e.g. taken from the index) are valid IDs too; bool is excluded.
    is_integer_id = isinstance(user_id, (int, np.integer)) and not isinstance(user_id, bool)
    if not is_integer_id or user_id not in ratings.index.values:
        print('The User doesnot exist in our System. Please choose valid user.')
        # Stop here: there is nothing to rank for an unknown user.
        return

    # Predict only for items the user has not rated yet, keyed by ISBN so the result
    # can be mapped back to a title. Already-rated items are simply not candidates.
    predicted = {}
    for item_id, current_rating in ratings.loc[user_id].items():
        if current_rating == 0:
            predicted[item_id] = predict_itembased(user_id, item_id, ratings, metric)

    recommended = pd.Series(predicted, dtype='int64').sort_values(ascending = False)[:10]

    # Look titles up by ISBN: a column position in `ratings` is unrelated to the row labels of `books`.
    titles = books.drop_duplicates(subset='ISBN').set_index('ISBN')['Book-Title']

    print('Following books are recommended...')
    for rank, isbn in enumerate(recommended.index, start=1):
        # Fall back to the ISBN when the book table has no title for it.
        print('{0}. {1}'.format(rank, titles.get(isbn, isbn)))

In [None]:
# recommendItembased(4385, ratings_matrix)