# Bestselling Amazon Books

In [90]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import missingno as msno
import autoreload
import csv
import warnings
import os
import sys
import re

from collections import defaultdict, Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)

sb.set()
sb.set_style('darkgrid')

plt.style.use('seaborn')
plt.tight_layout()

%matplotlib inline
%reload_ext autoreload
%autoreload 2

warnings.filterwarnings('ignore')

In [91]:
df = pd.read_csv(r"D:\Open Classroom\Datasets\Bestselling Books Amazon Dataset\resultbook2.csv")
df.head()

Unnamed: 0,Description,Price,Rating,ReviewCount,Url
0,The 5 AM Club,140.0,4.5,23597,https://www.amazon.in//sspa/click?ie=UTF8&spc=...
1,The Monk Who Sold His Ferrari,140.0,4.5,24949,https://www.amazon.in//sspa/click?ie=UTF8&spc=...
2,The Power of Your Subconscious Mind,105.0,4.5,63010,https://www.amazon.in//Power-Your-Subconscious...
3,Life's Amazing Secrets: How to Find Balance an...,160.0,4.6,18379,https://www.amazon.in//Lifes-Amazing-Secrets-B...
4,"Do It Today: Overcome Procrastination, Improve...",98.0,4.5,3206,https://www.amazon.in//Do-Today-procrastinatio...


In [92]:
df.shape

(880, 5)

In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 880 entries, 0 to 879
Data columns (total 5 columns):
Description    880 non-null object
Price          880 non-null float64
Rating         880 non-null float64
ReviewCount    880 non-null int64
Url            880 non-null object
dtypes: float64(2), int64(1), object(2)
memory usage: 34.5+ KB


In [94]:
df.isnull().sum()

Description    0
Price          0
Rating         0
ReviewCount    0
Url            0
dtype: int64

## Feature Engineering

In [95]:
df.head()

Unnamed: 0,Description,Price,Rating,ReviewCount,Url
0,The 5 AM Club,140.0,4.5,23597,https://www.amazon.in//sspa/click?ie=UTF8&spc=...
1,The Monk Who Sold His Ferrari,140.0,4.5,24949,https://www.amazon.in//sspa/click?ie=UTF8&spc=...
2,The Power of Your Subconscious Mind,105.0,4.5,63010,https://www.amazon.in//Power-Your-Subconscious...
3,Life's Amazing Secrets: How to Find Balance an...,160.0,4.6,18379,https://www.amazon.in//Lifes-Amazing-Secrets-B...
4,"Do It Today: Overcome Procrastination, Improve...",98.0,4.5,3206,https://www.amazon.in//Do-Today-procrastinatio...


In [96]:
def word_drop(text):
    text = text.replace('/[^a-zA-Z0-9 ]/g', "")
    return text

In [109]:
df['Description'] = df['Description'].apply(word_drop)

df['Description'] = df['Description'].apply(lambda x: x.lower())

In [110]:
tf_vect = TfidfVectorizer(min_df=3, max_features=None, ngram_range=(1,3), 
                          stop_words='english', strip_accents='unicode')

In [111]:
df['Description'] = df['Description'].fillna('')

In [112]:
desc_matrix = tf_vect.fit_transform(df['Description'])

In [113]:
sig = sigmoid_kernel(desc_matrix, desc_matrix)

In [114]:
indices = pd.Series(df.index, df['Description']).drop_duplicates()
indices.head()

Description
the 5 am club                                                                                                                                     0
the monk who sold his ferrari                                                                                                                     1
the power of your subconscious mind                                                                                                               2
life's amazing secrets: how to find balance and purpose in your life | inspirational zen book on motivation, self-development & healthy living    3
do it today: overcome procrastination, improve productivity                                                                                       4
dtype: int64

In [117]:
indices['the monk who sold his ferrari'][0:3]

the monk who sold his ferrari     1
the monk who sold his ferrari    23
the monk who sold his ferrari    45
dtype: int64

In [118]:
sorted(list(enumerate(sig[1]))[:10])

[(0, 0.7615941559557649),
 (1, 0.762639486019134),
 (2, 0.7615941559557649),
 (3, 0.7615941559557649),
 (4, 0.7615941559557649),
 (5, 0.7615941559557649),
 (6, 0.7615941559557649),
 (7, 0.7615941559557649),
 (8, 0.7615941559557649),
 (9, 0.7615941559557649)]

In [127]:
def run_experiment(text, sig=sig):
    
    bk_list = [x for x in df['Description']]
    
    if text in bk_list:
    
        idx = indices[text]

        idx_list = list(enumerate(idx))

        sort_list = sorted(idx_list, key = lambda x: x[1], reverse = True)

        top_ten = sort_list[0:10]

        book_rec = [x[0] for x in top_ten]

        return df[['Description', 'Price', 'Rating', 'ReviewCount']].iloc[book_rec].sort_values(by = ['Rating', 'ReviewCount'], ascending = False)
    
    else:
        print('__DATABASE ERROR___ During handling of the above exception, another exception occurred: InvalidIndexError(key)')
        print('BOOK DESCRIPTION NOT FOUND')

In [129]:
run_experiment("grandparents' bag of stories")

Unnamed: 0,Description,Price,Rating,ReviewCount
33,atomic habits: the life-changing million copy ...,414.0,4.7,59382
35,grandparents' bag of stories,152.0,4.7,4000
32,ikigai: the japanese secret to a long and happ...,257.0,4.6,39026
38,rich dad poor dad : what the rich teach their ...,395.0,4.6,13899
31,wise and otherwise: a salute to life,185.0,4.6,5250
30,the subtle art of not giving a f*ck: a counter...,151.05,4.5,79543
36,the richest man in babylon,99.0,4.5,32538
39,brain activity book for kids - 200+ activities...,95.0,4.5,2149
34,the power of a positive attitude: your road to...,99.0,4.4,2739
37,"think straight: change your thoughts, change y...",198.0,4.3,4833


In [130]:
run_experiment('the 5 am club')

Unnamed: 0,Description,Price,Rating,ReviewCount
33,atomic habits: the life-changing million copy ...,414.0,4.7,59382
35,grandparents' bag of stories,152.0,4.7,4000
32,ikigai: the japanese secret to a long and happ...,257.0,4.6,39026
38,rich dad poor dad : what the rich teach their ...,395.0,4.6,13899
31,wise and otherwise: a salute to life,185.0,4.6,5250
30,the subtle art of not giving a f*ck: a counter...,151.05,4.5,79543
36,the richest man in babylon,99.0,4.5,32538
39,brain activity book for kids - 200+ activities...,95.0,4.5,2149
34,the power of a positive attitude: your road to...,99.0,4.4,2739
37,"think straight: change your thoughts, change y...",198.0,4.3,4833


In [131]:
run_experiment('5 am club')

__DATABASE ERROR___ During handling of the above exception, another exception occurred: InvalidIndexError(key)
BOOK DESCRIPTION NOT FOUND
