In [1]:
import pandas as pd
import numpy as np

from IPython.display import Image, HTML, Markdown
from IPython.core.display import HTML 
import warnings
warnings.filterwarnings('ignore')

import ipywidgets as widgets

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# import matplotlib.pyplot as plt
# %matplotlib inline

from surprise import Reader, accuracy, Dataset
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [156]:
import itables.interactive

In [126]:
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to show/hide code"></form>''')

In [2]:
display(HTML("<style>.container { width:90% !important; }</style>"))

# Intro
**Author** Albina Gallyavova  
**Date** May 21, 2020  
**Objective**  

Build a KNN-based interactive recommender tool allowing users to find top-n similar items without creating profiles/sharing data  

*Advantages*
- suitable for new users and addressing 'cold start' issue
- search inputs could be used to build profile for new users
- could be further enhanced by including metadata allowing for more specific search

*Disadvantages*
- similar items solely depend on ratings as opposed to items characteristics (such as genre, author, etc.)

**Data** Book-Crossings  
**Tools**  Python, `surprise`  

# Data
The Book-Crossing dataset is a collection of user ratings of books. It comes with both explicit ratings (1-10 stars) and implicit ratings (user interacted with the book). It consists of 3 separate datasets: BX-Users, BX-Books, BX-Ratings

In [171]:
books = pd.read_csv('BX-Books.csv',sep=';', error_bad_lines=False, warn_bad_lines=False, encoding='latin-1',low_memory=False)
users = pd.read_csv('BX-Users.csv',sep=';', error_bad_lines=False, warn_bad_lines=False, encoding='latin-1')
ratings = pd.read_csv('BX-Book-Ratings.csv',sep=';', error_bad_lines=False, warn_bad_lines=False, encoding='latin-1')

# assign data type to reduce memory
ratings['User-ID'] = ratings['User-ID'].astype('int32')
ratings['ISBN'] = ratings['ISBN'].astype('category')
ratings['Book-Rating'] = ratings['Book-Rating'].astype('int8')
# ratings.columns = ['user','item','rating']

### BX-Users  
Contains the users. Note that user IDs (`User-ID`) have been anonymized and map to integers. Demographic data is provided (`Location`, `Age`) if available. Otherwise, these fields contain NULL-values.

In [162]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [163]:
users.head(3)

User-ID,Location,Age


Approximately 40% of `Age` information is missing. 
`Location` appears to include city, state and country. We will try to split information and see if we can examine books ratings by geographical location.

In [164]:
users['Loc_split'] = users['Location'].str.split(',')
users['len'] = users['Loc_split'].str.len()
print("There are {} users that don't have standard location format".format(len(users[users['len']>3])))

There are 1508 users that don't have standard location format


It appears the location column is not standard and more cleaning would require to extract information. Examples below:

In [165]:
users[users['len']>3].head(3)

Unnamed: 0,User-ID,Location,Age,Loc_split,len


Despite non-standard format, the last item in location seems to include country but a quick look indicates that here is no standard naming convention, i.e., same country spelled in different ways or languages. It also appears to allow free text, and some interesting locations specified by users include ''n/a - on the road', 'universe','fredonia - land of the brave and free', 'the world tomorrow', and ''doodedoo'.

In [166]:
users['country'] = users['Loc_split'].str[-1].str.strip()
users['country'].unique()

array(['usa', 'russia', 'portugal', 'united kingdom', 'canada', 'spain',
       'australia', 'brazil', '', 'germany', 'mexico', 'china', 'italy',
       'distrito federal', 'france', 'netherlands', 'iraq', 'finland',
       'new zealand', 'india', 'ghana', 'switzerland', 'slovenia', 'iran',
       'peru', 'bosnia and herzegovina', 'vietnam', 'sri lanka',
       'pakistan', 'denmark', 'belgium', 'malaysia', 'u.a.e', 'turkey',
       'philippines', 'greece', 'colombia', 'norway', 'kuwait', 'chile',
       'quit', 'lj', 'taiwan', 'denmark"', 'españa', 'sweden',
       'argentina', 'nigeria', 'london', 'bulgaria', 'austria', 'romania',
       'singapore', 'albania', 'egypt', 'tunisia', 'uzbekistan', 'qatar',
       'syria', 'saudi arabia', 'indonesia', 'sudan', 'somewherein space',
       'poland', 'thailand', 'ireland', 'malaysia"', 'venezuela',
       'paraguay', 'mozambique', 'morocco', 'afghanistan', 'estonia',
       'clackamas', 'spain"', '"', 'lesotho', 'yugoslavia', 'slovakia',
   

### BX-Books
Books are identified by their respective ISBN. Invalid ISBNs have already been removed from the dataset. Moreover, some content-based information is given (`Book-Title`, `Book-Author`, `Year-Of-Publication`, `Publisher`), obtained from Amazon Web Services. Note that in case of several authors, only the first is provided. URLs linking to cover images are also given, appearing in three different flavours (`Image-URL-S`, `Image-URL-M`, `Image-URL-L`), i.e., small, medium, large. These URLs point to the Amazon web site.

In [167]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


We will keep column with medium picture of a book for displaying in recommender, but get rid of the other two. The resulting examples below:

In [172]:
# drop columns with S and L image links
books.drop(['Image-URL-S', 'Image-URL-L'],axis=1,inplace=True)
# books['ISBN'] = books['ISBN'].astype('category')
books.head(3)

ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M


At a first glance, we have 271360 unique books in the dataset but a closer examination reveals that there are numerous books with the same title and by the same author that in some cases have as many as 21 different ISBN numbers. For example:

In [173]:
books.groupby(['Book-Title','Book-Author'])['ISBN'].nunique().reset_index().sort_values('ISBN',ascending=False).head(5)

Unnamed: 0,Book-Title,Book-Author,ISBN


For consistency, we will reassign a single ISBN to the books with multiple ISBN, and replace those ISBN in ratings table as well in later steps.

In [174]:
# group books by title and author to count unique ISBNs, filter teh ones that have more than one ISBN
dupl = books.groupby(['Book-Title','Book-Author'])['ISBN'].nunique().reset_index().sort_values('ISBN',ascending=False)
dupl = dupl[dupl['ISBN']>1]
print('There are {} books that have multiple ISBNs'.format(len(dupl)))

# assign a single ISBN to books with multiple ISBNs
dupl_isbn = books[books['Book-Title'].isin(dupl['Book-Title'])].groupby(['Book-Title','Book-Author'])['ISBN'].unique().reset_index()
display(dupl_isbn.head())

There are 15746 books that have multiple ISBNs


Book-Title,Book-Author,ISBN


In [175]:
dupl_isbn = books[books['Book-Title'].isin(dupl['Book-Title'])].groupby(['Book-Title','Book-Author']).agg({'ISBN':'unique',
                                                                                       'Image-URL-M':'unique'}).reset_index()

one_isbn = pd.DataFrame(dupl_isbn['ISBN'].values.tolist()).add_prefix('isbn_')['isbn_0']
one_link = pd.DataFrame(dupl_isbn['Image-URL-M'].values.tolist()).add_prefix('link_')['link_0']

dupl_isbn = pd.concat([dupl_isbn,one_isbn],axis=1,sort=False)
dupl_isbn = pd.concat([dupl_isbn,one_link],axis=1,sort=False)

books = books.merge(dupl_isbn,how='left',left_on=['Book-Title','Book-Author'],right_on=['Book-Title','Book-Author'])
books['one_isbn'] = np.where(books['isbn_0'].notnull(),books['isbn_0'],books['ISBN_x'])
books['one_link'] = np.where(books['link_0'].notnull(),books['link_0'],books['Image-URL-M_x'])

Resulting books table is depicted below:

In [176]:
books.head(3)

Unnamed: 0,ISBN_x,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M_x,ISBN_y,Image-URL-M_y,isbn_0,link_0,one_isbn,one_link


### BX-Ratings
Contains the book rating information. Ratings (`Book-Rating`) are either explicit, expressed on a scale from 1-10 (higher values denoting higher appreciation), or implicit, expressed by 0.

In [140]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype   
---  ------       --------------    -----   
 0   User-ID      1149780 non-null  int32   
 1   ISBN         1149780 non-null  category
 2   Book-Rating  1149780 non-null  int8    
dtypes: category(1), int32(1), int8(1)
memory usage: 22.5 MB


The initial ratings table includes over 1M ratings. We only include ratings for the books that are present in books table, which reduces rating by ~120k.

In [177]:
ratings = ratings.merge(books[['ISBN_x','one_isbn']], how='left',left_on='ISBN',right_on='ISBN_x').dropna()
ratings_new = ratings[['User-ID','one_isbn','Book-Rating']]
ratings_new.columns = ['user','item','rating']
ratings_new.head()
print("Ratings after removing books not present in BX-Books: {}".format(len(ratings_new)))

Ratings after removing books not present in BX-Books: 1031136


As mentioned in data description, the ratings include implicit ratings expressed by 0, which represent a bigger portion of the dataset. 

In [178]:
tmp = ratings_new.groupby('rating')['item'].count().reset_index()
fig = px.bar(tmp,x='rating',y='item')
fig.update_layout(title= 'Ratings distribution',
                  template = 'plotly_dark',
                  height=300)
fig.show()

After removing multiple ISBNs for the same books, we can explore the top 20 books by average rating and number of users who have rated the book. 

In [179]:
tmp = ratings_new.groupby('item').agg({'user':'sum',
                                 'rating':'mean'}).reset_index().sort_values(['rating','user'],ascending=False).head(20)
top_20 = tmp.merge(books[['one_isbn','Book-Title','Book-Author']],how='left',left_on='item', right_on='one_isbn').drop_duplicates()
top_20.rename(columns = {'Book-Title':'Title',
                        'Book-Author':'Author', 'rating':'Avg rating','user':'Users rated'},inplace=True)
top_20.drop(['item','one_isbn'],axis=1,inplace=True)
top_20

Unnamed: 0,Users rated,Avg rating,Title,Author


In [None]:
# ratings_new.groupby(['item'])['rating'].count().reset_index().sort_values('rating',ascending=False)[:10000]

As the purpose of the system to recommend books that have similar ratings, we will remove books that have fewer than 15 ratings (the number is selectedin such a way that we have more than 10k books in the final dataset). 

In [181]:
min_book_ratings_new = 14
filter_books = ratings_new['item'].value_counts() > min_book_ratings_new
filter_books = filter_books[filter_books].index.tolist()

min_user_ratings_new = 50
filter_users = ratings_new['user'].value_counts() > min_user_ratings_new
filter_users = filter_users[filter_users].index.tolist()


short_ratings_new = ratings_new[(ratings_new['item'].isin(filter_books)) & (ratings_new['user'].isin(filter_users))]
short_ratings_new.reset_index(inplace=True,drop=True)
# print('The original data frame shape:\t{}'.format(ratings_new.shape))
# print('The new data frame shape:\t{}'.format(short_ratings_new.shape))

print('The resulting rating matrix include')
print('Books: {}'.format(pd.Series(filter_books).nunique()))
print('Users: {}'.format(pd.Series(filter_users).nunique()))

The resulting rating matrix include
Books: 10470
Users: 3012


# Model

Because the objective is to identify top-n similar books, we have to use nearest-neighbor-based methods that are based on the similarity between pairs of items or users. We will be using Python's surprise package to train and test 3 k-NN based models.
We will first perform a Grid Search to find optimal parameters for k-neighbors and similarity measures.

### Grid Search

In [25]:
# A reader is needed but only the rating_scale param is required when reading data from df 
reader = Reader(rating_scale=(1, 10))

# The columns must correspond to user id, item id and ratings (in that order)
data = Dataset.load_from_df(short_ratings_new[['user', 'item', 'rating']], reader)

In [26]:
%%time
param_grid = {'k': [10,15,20,25,30],
              'sim_options': {'name': ['msd', 'cosine'],
                              'user_based': [False]}
              }

knnbasic_gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3)
knnbasic_gs.fit(data)

knnmeans_gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=3)
knnmeans_gs.fit(data)

knnz_gs = GridSearchCV(KNNWithZScore, param_grid, measures=['rmse', 'mae'], cv=3)
knnz_gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matr

After Grid Search, it is evident that models that use MSD, Mean Squared Difference similarity between all pairs of items, as similarity measure perform better than models using Cosine similarity. Increasing k parameter also seems to improve model performance. 

In [151]:
knnbasic_res_df = pd.DataFrame(knnbasic_gs.cv_results)
knnbasic_res_df['sim'] = knnbasic_res_df['param_sim_options'].str['name']
knnbasic_res_df['algo'] = 'knn_basic'

knnmeans_res_df = pd.DataFrame(knnmeans_gs.cv_results)
knnmeans_res_df['sim'] = knnmeans_res_df['param_sim_options'].str['name']
knnmeans_res_df['algo'] = 'knn_means'

knnz_res_df = pd.DataFrame(knnz_gs.cv_results)
knnz_res_df['sim'] = knnz_res_df['param_sim_options'].str['name']
knnz_res_df['algo'] = 'knn_zscore'

cv_res = pd.concat([knnbasic_res_df,knnmeans_res_df,knnz_res_df], axis=0)

colors =  px.colors.qualitative.Safe #px.colors.sequential.Burg[3:6] + px.colors.sequential.Mint[3:6]

fig = go.Figure()

tmp = cv_res[(cv_res['sim']=='msd')].copy()
for algo in cv_res['algo'].unique():
    fig.add_trace(go.Scatter(x = tmp[(tmp['algo'] == algo)]['param_k'], 
                             y = tmp[(tmp['algo'] == algo)]['mean_test_rmse'], 
                             name = algo + '_msd'))

tmp = cv_res[(cv_res['sim']=='cosine')].copy()
for algo in cv_res['algo'].unique():
    fig.add_trace(go.Scatter(x = tmp[(tmp['algo'] == algo)]['param_k'], 
                             y = tmp[(tmp['algo'] == algo)]['mean_test_rmse'], 
                             name = algo + '_cosine'))

fig.update_layout(title = '<b>Knn Grid Search results</b>',
                  title_font={'size':24},
                  xaxis_title = 'k param',
                  yaxis_title = 'Mean Test RMSE',
                  template='plotly_dark',
                  height=350,
                  margin = {'l':30,'r':20,'t':50,'b':50},
                 legend_orientation='h',
                 colorway = colors)
fig.show()

We extract `best.estimator()` for each model to compare the results.

In [120]:
knnbasic_best = knnbasic_gs.best_estimator['rmse']
knnmeans_best = knnmeans_gs.best_estimator['rmse']
knnz_best = knnz_gs.best_estimator['rmse']

knnbasic_cv = cross_validate(knnbasic_best, data, cv=3, verbose=False)
knnmeans_cv = cross_validate(knnmeans_best, data, cv=3, verbose=False)
knnz_cv = cross_validate(knnz_best, data, cv=3, verbose=False)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


According to the RMSE accuracy measure, K-nn basic model with 30 k using MSD similarity performs best across all permulations of selected parameters.

In [122]:
knnbasic_best_res = pd.DataFrame(knnbasic_cv)
knnbasic_best_res['algo'] = 'knn_basic'

knnmeans_best_res = pd.DataFrame(knnmeans_cv)
knnmeans_best_res['algo'] = 'knn_means'

knnz_best_res = pd.DataFrame(knnz_cv)
knnz_best_res['algo'] = 'knn_zscore'

best_res_res = pd.concat([knnbasic_best_res,knnmeans_best_res,knnz_best_res], axis=0)

best_res_gr = round(best_res.groupby('algo').mean(),4)
best_res_gr

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
algo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
knn_basic,3.4124,2.6981,6.0759,18.8501
knn_means,3.4467,2.7837,6.3209,19.9522
knn_zscore,3.4759,2.7749,6.6506,21.0509


In [123]:
fig = make_subplots(rows=1, cols=2)

fig.add_trace(go.Scatter(x = best_res_gr.index,
                         y = best_res_gr['test_rmse'], 
                         name = 'RMSE'),row=1,col=1)
fig.add_trace(go.Scatter(x = best_res_gr.index,
                         y = best_res_gr['test_mae'],
                         name = 'MAE'),row=1,col=1)

fig.add_trace(go.Scatter(x = best_res_gr.index,
                         y = best_res_gr['fit_time'],
                         name = 'Fit Time'),row=1,col=2)
fig.add_trace(go.Scatter(x = best_res_gr.index,
                         y = best_res_gr['test_time'],
                         name = 'Test Time'),row=1,col=2)

fig.update_layout(title = '<b>Best Models Comparison </b>',
                  title_font={'size':24},
                  template='plotly_dark',
                  height=350,
                  margin = {'l':30,'r':20,'t':50,'b':50},
                  legend_orientation='h', legend= {'x':.35,'y':-.1},
                 colorway = colors)

fig.show()

We will use the best model based on RMSE to train and fit on all of the data.

In [183]:
%%time

## train model on all data
trainset = data.build_full_trainset()

# To use item-based cosine similarity
# sim_options = {
#     "name": "cosine",
#     "user_based": False,  # Compute  similarities between items
# }
algo = knnbasic_best
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.
CPU times: user 8.54 s, sys: 1.28 s, total: 9.82 s
Wall time: 10.4 s


<surprise.prediction_algorithms.knns.KNNBasic at 0x12acf15c0>

## Recommendations

In [184]:
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to show/hide code"></form>''')

In [194]:
short_books = books[books['one_isbn'].isin(short_ratings_new['item'].unique())].copy().reset_index()
name_to_rid = pd.Series(short_books['one_isbn'].values,index = short_books['Book-Title'].values).to_dict()
rid_to_name = pd.Series(short_books['Book-Title'].values,index = short_books['one_isbn'].values).to_dict()

sh_bks_gr = short_books.groupby(['Book-Title','Book-Author','one_isbn','one_link'])['Year-Of-Publication'].unique().reset_index()

select_book = widgets.Dropdown(
    options=[i for i in short_books.sort_values(by='Book-Title',ascending=True)['Book-Title'].unique()],
    description='Select book:',
    disabled=False,
)

out = widgets.Output()

def response(change):
    with out:
    
        out.clear_output()

        book = select_book.value
        raw_id = name_to_rid[book]
        inner_id = algo.trainset.to_inner_iid(raw_id)

        # Retrieve inner ids of the nearest neighbors
        neighbors = algo.get_neighbors(inner_id, k=10)

        # Convert inner ids of the neighbors into names.
        neighbors = (algo.trainset.to_raw_iid(inner_id) 
                     for inner_id in neighbors)

        neighbors = (rid_to_name[rid] for rid in neighbors)

        t = tuple(neighbors)

        res = pd.DataFrame(t)
        res.columns = ['Title']

        res = res.merge(sh_bks_gr, how='left',left_on = 'Title', right_on = 'Book-Title')
        res['image'] = '<img src=' + res['one_link'] + '/>'
        res = res[['image','Book-Title','one_isbn','Book-Author']]
        res.index = res.index + 1
        
    #     out.append_display_data(HTML(res.to_html(escape=False ,formatters=dict(Image=res['image']))))
#         display(HTML(res.to_html(escape=False ,formatters=dict(Image=res['image']))))
        display(res)
#         show(res.to_frame().T, columnDefs=[{"width": "120px", "targets": "_all"}])

select_book.observe(response,names='value')

display(select_book)
display(out)

Dropdown(description='Select book:', options=(' Q-Space (Star Trek The Next Generation, Book 47)', "'Salem's L…

Output()

## Resources

https://github.com/ashwanidv100/Recommendation-System---Book-Crossing-Dataset  
https://surprise.readthedocs.io/en/stable/FAQ.html?highlight=inner#how-to-get-the-k-nearest-neighbors-of-a-user-or-item  
https://towardsdatascience.com/recommender-systems-in-practice-cef9033bb23a  
https://realpython.com/build-recommendation-engine-collaborative-filtering/#algorithms-based-on-k-nearest-neighbours-k-nn  
https://surprise.readthedocs.io/en/stable/getting_started.html#use-a-custom-dataset  
https://towardsdatascience.com/my-journey-to-building-book-recommendation-system-5ec959c41847  
https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b  
https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise

In [None]:
# book = select_book.value
# raw_id = name_to_rid[book]
# inner_id = algo.trainset.to_inner_iid(raw_id)

# # Retrieve inner ids of the nearest neighbors
# neighbors = algo.get_neighbors(inner_id, k=10)

# # Convert inner ids of the neighbors into names.
# neighbors = (algo.trainset.to_raw_iid(inner_id) 
#              for inner_id in neighbors)

# neighbors = (rid_to_name[rid] for rid in neighbors)

# t = tuple(neighbors)

# res = pd.DataFrame(t)
# res.columns = ['Title']

# sh_bks_gr = short_books.groupby(['Book-Title','Book-Author','one_isbn','one_link'])['Year-Of-Publication'].unique().reset_index()
# res = res.merge(sh_bks_gr, how='left',left_on = 'Title', right_on = 'Book-Title')
# res['image'] = '<img src=' + res['one_link'] + '/>'
# res = res[['image','Book-Title','one_isbn','Book-Author']]
# res.index = res.index + 1

In [None]:
# min_book_ratings = 50
# filter_books = ratings['item'].value_counts() > min_book_ratings
# filter_books = filter_books[filter_books].index.tolist()

# min_user_ratings = 50
# filter_users = ratings['user'].value_counts() > min_user_ratings
# filter_users = filter_users[filter_users].index.tolist()

# short_ratings = ratings[(ratings['item'].isin(filter_books)) & (ratings['user'].isin(filter_users))]
# short_ratings.reset_index(inplace=True,drop=True)
# print('The original data frame shape:\t{}'.format(ratings.shape))
# print('The new data frame shape:\t{}'.format(short_ratings.shape))

# # books['title'] = books['Book-Title'] + '_' + books['Year-Of-Publication']

# # short_ratings = short_ratings[short_ratings['item'].isin(books['ISBN'].unique())]

# short_books = books[books['ISBN'].isin(short_ratings['item'].unique())].copy().reset_index()
# name_to_rid = pd.Series(short_books['ISBN'].values,index = short_books['title'].values).to_dict()
# rid_to_name = pd.Series(short_books['title'].values,index = short_books['ISBN'].values).to_dict()

# # A reader is needed but only the rating_scale param is required when reading data from df 
# reader = Reader(rating_scale=(0, 10))

# # The columns must correspond to user id, item id and ratings (in that order)
# data = Dataset.load_from_df(short_ratings[['user', 'item', 'rating']], reader)

# ## train model on all data
# trainset = data.build_full_trainset()

# # To use item-based cosine similarity
# sim_options = {
#     "name": "cosine",
#     "user_based": False,  # Compute  similarities between items
# }
# algo = KNNWithMeans(sim_options=sim_options)
# algo.fit(trainset)