In [1]:
import datetime
import os

from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# Column names used in the products table.
TITLE = 'Title'
# Name of the product-code column, which the products table uses as its index.
PRODUCT_CODE = 'ASIN/ISBN (Product Code)'
CATEGORY = 'Category'

In [2]:
# Load the preprocessed products table, indexed by product code.
products_table_fpath = './preprocessed/products_table.csv'
# Product codes are identifiers, not numbers: read them as strings so that
# ISBNs keep their leading zeros and can be concatenated into request URLs.
products_table = pd.read_csv(
    products_table_fpath,
    index_col=0,
    dtype={'ASIN/ISBN (Product Code)': str},
)
products_table.head()

Unnamed: 0_level_0,purchasers,Category,Title
ASIN/ISBN (Product Code),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B00IX1I3G6,1157,GIFT_CARD,Amazon.com Gift Card Balance Reload
B086KKT3RX,875,ABIS_GIFT_CARD,Amazon Reload
B07PCMWTSG,543,GIFT_CARD,Amazon.com eGift Card
B004LLIKVU,467,GIFT_CARD,Amazon.com eGift Card
B07FZ8S74R,377,DIGITAL_DEVICE_3,"Echo Dot (3rd Gen, 2018 release) - Smart speak..."


## Handle the books

criteria:
- Category is ABIS_BOOK and

- have ISBNs: https://en.wikipedia.org/wiki/ISBN

I verified that products with ISBNs that are not books had fewer than 50 purchasers. There are some very badly categorized items.

In [3]:
def is_isbn(pcode):
    """Return True when a product code looks like an ISBN.

    A code is accepted when its string form is 9 to 13 characters long and
    consists only of ASCII digits, optionally followed by a single trailing
    'X' (either case). The ISBN check digit itself is not validated.

    Parameters
    ----------
    pcode : object
        Product code; it is converted with ``str`` before being checked.

    Returns
    -------
    bool
        True if the code has the shape of an ISBN, False otherwise.
    """
    pcode = str(pcode)
    # NOTE(review): real ISBNs have 10 or 13 characters; the wider 9-13 range
    # is kept from the original check (presumably to tolerate codes that lost
    # a leading zero) -- confirm before tightening.
    if not 9 <= len(pcode) <= 13:
        return False
    # Sometimes ISBNs end in X
    if pcode[-1] in ('x', 'X'):
        pcode = pcode[:-1]
    # Require plain ASCII digits. int() is too permissive for this purpose:
    # it also accepts signs, surrounding whitespace, '_' separators and
    # non-ASCII digit characters.
    return pcode.isascii() and pcode.isdigit()

In [4]:
# Keep only the rows whose product code (the index) passes the ISBN check.
isbn_mask = np.array([is_isbn(pcode) for pcode in products_table.index], dtype=bool)
books_table = products_table[isbn_mask]
print('%s products in the books table' % len(books_table))
print('%s categories in the books table' % books_table[CATEGORY].nunique())

59113 products in the books table
184 categories in the books table


In [5]:
# Preview the first rows of the ISBN-filtered table.
books_table.head()

Unnamed: 0_level_0,purchasers,Category,Title
ASIN/ISBN (Product Code),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
143127748,65,ABIS_BOOK,"The Body Keeps the Score: Brain, Mind, and Bod..."
1524763136,63,ABIS_BOOK,Becoming
735211299,59,ABIS_BOOK,Atomic Habits: An Easy & Proven Way to Build G...
786965606,52,TABLETOP_GAME,D&D Player’s Handbook (Dungeons & Dragons Core...
1501110365,51,ABIS_BOOK,It Ends with Us: A Novel (1)


## Rate limiting

- 1000 total requests allowed per day
- 100 requests per minute

In [6]:
# Read the Google Books API key from the GOOGLE_BOOKS_API_KEY environment
# variable instead of hardcoding it in the notebook. When the variable is not
# set, fall back to the original placeholder value.
API_KEY_PLACEHOLDER = 'NEVER MIND'
API_KEY = os.environ.get('GOOGLE_BOOKS_API_KEY', API_KEY_PLACEHOLDER)

# Cell outputs are saved with the notebook, so never echo the key itself:
# only report whether a real key was provided.
print('API KEY:')
print(API_KEY_PLACEHOLDER if API_KEY == API_KEY_PLACEHOLDER else '<set from environment>')

API KEY:
NEVER MIND


In [7]:
# what if we made 100 requests per second?
isbns = books_table.index.tolist()
print('%s ISBNs' % len(isbns))
print('expect to take ~%0.2f hours' % (len(isbns)/100/60))

59113 ISBNs
expect to take ~9.85 hours


In [8]:
"""
Using: https://pypi.org/project/ratelimit/
"""

from ratelimit import limits, sleep_and_retry

# Allow at most CALLS calls per RATE_LIMIT seconds, i.e. 90 calls per 60 s.
# The documented API limit is 100 requests per minute; 90 presumably leaves a
# safety margin -- TODO confirm.
CALLS = 90
RATE_LIMIT = 60

@sleep_and_retry
@limits(calls=CALLS, period=RATE_LIMIT)
def check_limit():
    """Empty function just to check for calls to API.

    Call it once before every API request. It does no work itself; the
    decorators enforce the CALLS-per-RATE_LIMIT budget. Per the ratelimit
    package documentation, `limits` raises once the budget for the current
    period is used up and `sleep_and_retry` sleeps until the period is over
    and retries, so callers are delayed rather than seeing an exception.
    """
    return

In [9]:
# With the API key -- rate limiting and errors occur
BASE_URL = 'https://www.googleapis.com/books/v1/volumes?key={API_KEY}&q=isbn:'.format(API_KEY=API_KEY)
# Without the API key -- rate limiting does not seem to occur...
BASE_URL = 'https://www.googleapis.com/books/v1/volumes?q=isbn:'

isbn = '0143127748'
url = BASE_URL+isbn
print(url)

https://www.googleapis.com/books/v1/volumes?q=isbn:0143127748


In [10]:
import requests

# Record in the output which endpoint variant the helpers below will use.
print('Using BASE_URL', BASE_URL)

def get_isbn_url(isbn):
    """Build the query URL for one ISBN.

    Parameters
    ----------
    isbn : str
        ISBN to look up; it is appended to BASE_URL unchanged.

    Returns
    -------
    str
        BASE_URL followed by the ISBN.
    """
    query_url = BASE_URL + isbn
    return query_url


def get_google_books_category(isbn, timeout=10):
    """Look up the Google Books category for an ISBN.

    Waits on the rate limiter (check_limit), requests the URL built by
    get_isbn_url and returns the first category of the first returned volume.

    Parameters
    ----------
    isbn : str
        ISBN to look up.
    timeout : float, optional
        Seconds to wait for the server before giving up on the request.
        Without a timeout a single stalled connection would block the whole
        fetch loop indefinitely.

    Returns
    -------
    str or float
        The category name, or np.nan when the lookup fails for any reason
        (network or HTTP error, invalid JSON, no 'items' in the response, or
        no 'categories' on the first item). Failures are printed, not raised.
    """
    check_limit()
    try:
        response = requests.get(get_isbn_url(isbn), timeout=timeout)
        # Report HTTP errors (e.g. quota / rate-limit responses) as such,
        # instead of letting them show up as a missing 'items' key below.
        response.raise_for_status()
        volumes = response.json()
        # Author's note: there is only ever 1 category
        return volumes['items'][0]['volumeInfo']['categories'][0]
    except Exception as e:
        # Deliberately best-effort: one failed lookup must not stop the loop.
        print('Caught exception for %s: %s' % (isbn, e))
    return np.nan

Using BASE_URL https://www.googleapis.com/books/v1/volumes?q=isbn:


In [11]:
# Double-check which endpoint is currently in use.
print(BASE_URL)

https://www.googleapis.com/books/v1/volumes?q=isbn:


In [12]:
# Smoke test: look up the sample ISBN defined earlier (makes a live request).
get_google_books_category(isbn)

'Medical'

### About possible limitations

Sometimes this call for categories fails:

With a sample of the 90 most-purchased books, the failure rate is ~10%.
When less popular ISBNs are included, the failure rate climbs to ~25%.


Sometimes the response has no categories:
- often for non-books: coloring books, blank books, etc.

- but sometimes for best sellers,
    - like this Dog Man book (by the creator of Captain Underpants): 1338236598: https://www.amazon.com/Dog-Man-Creator-Captain-Underpants/dp/1338236598

Sometimes the response has no items:
- sometimes the product is not a book (planners, etc.)
- sometimes the book is relatively new

Pick up where we left off...

In [None]:
# Resume from the results saved by an earlier run.
# The path is defined in this cell so that the cell runs on a fresh kernel
# without relying on any cell below it having been executed first.
google_books_api_isbn_category_fpath = './preprocessed/google_books_api_isbn_category.csv'

try:
    isbn_cat_df = pd.read_csv(google_books_api_isbn_category_fpath, dtype={'ISBN':'str'}, index_col=0)
except FileNotFoundError:
    # No saved results yet: start from an empty table so that
    # book_category begins as an empty dict.
    isbn_cat_df = pd.DataFrame(columns=[CATEGORY]).rename_axis('ISBN')

# Keep only ISBNs whose category was found; failed lookups (NaN) are dropped
# so that they are not treated as already done.
book_category = isbn_cat_df.dropna()[CATEGORY].to_dict()

In [225]:
# ISBNs that already have a category in the saved results.
found_isbns = isbn_cat_df.dropna().index
for i, isbn in enumerate(isbns):
    # Progress line every 100 ISBNs (printed for skipped ISBNs as well).
    if i % 100 == 0:
        print('%s: %s: ISBN %s ----' % (datetime.datetime.now(), i, isbn))
    # Skip ISBNs that are already categorized, and everything before position
    # 52300 (hardcoded; presumably where an earlier run stopped -- TODO confirm).
    needs_lookup = not ((isbn in found_isbns) or (i < 52300))
    if needs_lookup:
        book_category[isbn] = get_google_books_category(isbn)

2024-06-22 15:45:42.813446: 0: ISBN 0143127748 ----
2024-06-22 15:45:42.822520: 100: ISBN 0440414806 ----
2024-06-22 15:45:42.822628: 200: ISBN 1979601739 ----
2024-06-22 15:45:42.822717: 300: ISBN 0345538374 ----
2024-06-22 15:45:42.822814: 400: ISBN 1101934743 ----
2024-06-22 15:45:42.822928: 500: ISBN 0312367546 ----
2024-06-22 15:45:42.823008: 600: ISBN 1595586431 ----
2024-06-22 15:45:42.823099: 700: ISBN 1635570301 ----
2024-06-22 15:45:42.823183: 800: ISBN 076790592X ----
2024-06-22 15:45:42.823284: 900: ISBN 1623157900 ----
2024-06-22 15:45:42.823366: 1000: ISBN 0448454513 ----
2024-06-22 15:45:42.823459: 1100: ISBN 0062073486 ----
2024-06-22 15:45:42.823537: 1200: ISBN 0451469801 ----
2024-06-22 15:45:42.823629: 1300: ISBN 0399180192 ----
2024-06-22 15:45:42.823721: 1400: ISBN 1561636150 ----
2024-06-22 15:45:42.823822: 1500: ISBN 1455525251 ----
2024-06-22 15:45:42.823907: 1600: ISBN 141974450X ----
2024-06-22 15:45:42.824031: 1700: ISBN 1974725103 ----
2024-06-22 15:45:42.82

Caught exception for 1412799538: 'categories'
Caught exception for 1408881683: 'categories'
Caught exception for 1408884534: 'categories'
Caught exception for 140888934X: 'categories'
Caught exception for 1408889528: 'categories'
Caught exception for 1409168506: 'categories'
Caught exception for 1409303187: 'categories'
Caught exception for 1406359823: 'categories'
Caught exception for 140713518X: 'items'
Caught exception for 1407163280: 'categories'
Caught exception for 1407196766: 'categories'
Caught exception for 1407197088: 'items'
Caught exception for 1407199870: 'categories'
Caught exception for 1407585924: 'categories'
Caught exception for 1408313057: 'categories'
Caught exception for 1412798086: 'categories'
Caught exception for 1411434919: 'items'
Caught exception for 1411434951: 'items'
Caught exception for 1411469445: 'items'
Caught exception for 1411469631: 'items'
Caught exception for 1411470400: 'categories'
Caught exception for 1411498739: 'items'
Caught exception for 14

Caught exception for 0793553113: 'items'
Caught exception for 0794513905: 'items'
Caught exception for 0793535204: 'items'
Caught exception for 0793535859: 'items'
Caught exception for 0793535867: 'items'
Caught exception for 0793535875: 'items'
Caught exception for 0793536596: 'items'
Caught exception for 0793536650: 'items'
Caught exception for 0793543843: 'items'
Caught exception for 079355294X: 'items'
Caught exception for 079358115X: 'items'
Caught exception for 079358129X: 'items'
Caught exception for 0793582253: 'items'
Caught exception for 0794443923: 'items'
Caught exception for 0794446663: 'items'
Caught exception for 0794448828: 'items'
Caught exception for 0793583926: 'items'
Caught exception for 0793599245: 'items'
Caught exception for 0793517222: 'items'
Caught exception for 0793515149: 'items'
2024-06-22 15:50:06.790092: 53700: ISBN 0789453916 ----
Caught exception for 0789756749: 'items'
Caught exception for 0789342189: 'items'
Caught exception for 0789340828: 'items'
C

Caught exception for 0735354308: 'categories'
Caught exception for 0735355339: 'items'
Caught exception for 0735355533: 'items'
Caught exception for 0735355657: 'items'
Caught exception for 0735355681: 'items'
Caught exception for 0735355746: 'items'
Caught exception for 0735355800: 'items'
Caught exception for 0735356211: 'categories'
Caught exception for 0735364842: 'items'
Caught exception for 0735364907: 'items'
Caught exception for 0735365237: 'categories'
Caught exception for 0735599017: 'categories'
Caught exception for 0735368635: 'items'
Caught exception for 073536642X: 'items'
Caught exception for 0735366489: 'items'
Caught exception for 0735366543: 'items'
Caught exception for 0735367078: 'items'
Caught exception for 0735367094: 'items'
Caught exception for 0735367299: 'items'
Caught exception for 0735367310: 'items'
Caught exception for 0735367884: 'items'
Caught exception for 0735368015: 'categories'
Caught exception for 0735368856: 'items'
Caught exception for 0735369305:

Caught exception for 0736439145: 'items'
Caught exception for 0736431497: 'items'
2024-06-22 15:56:10.264113: 55700: ISBN 0736434070 ----
Caught exception for 0736434070: 'items'
Caught exception for 0736751580: 'items'
Caught exception for 0738726702: 'items'
Caught exception for 0739020412: 'items'
2024-06-22 15:56:19.041181: 55800: ISBN 073901045X ----
Caught exception for 0739044311: 'items'
Caught exception for 0739083023: 'items'
Caught exception for 0739089285: 'items'
Caught exception for 0739089293: 'items'
Caught exception for 0739092464: 'items'
Caught exception for 0739095366: 'items'
Caught exception for 0739095374: 'items'
Caught exception for 0739410385: 'items'
Caught exception for 0739444646: 'items'
Caught exception for 0740303139: 'items'
Caught exception for 0740400460: 'categories'
Caught exception for 0740435590: 'categories'
Caught exception for 0739082426: 'items'
Caught exception for 0739080512: 'items'
Caught exception for 0739076108: 'items'
Caught exception 

Caught exception for 0977021599: 'items'
Caught exception for 097709667X: 'categories'
Caught exception for 0977248011: 'items'
Caught exception for 0977304582: 'items'
Caught exception for 0975533037: 'items'
Caught exception for 0976271877: 'categories'
Caught exception for 0976288826: 'items'
Caught exception for 0976480689: 'items'
Caught exception for 0977476081: 'categories'
2024-06-22 15:59:12.944967: 56900: ISBN 097889751X ----
Caught exception for 097889751X: 'categories'
Caught exception for 0979245524: 'items'
Caught exception for 0979348722: 'categories'
Caught exception for 0977977382: 'categories'
Caught exception for 0978573277: 'categories'
Caught exception for 0978573285: 'categories'
Caught exception for 0978768701: 'items'
Caught exception for 097876871X: 'items'
Caught exception for 0983141169: 'categories'
Caught exception for 0987271415: 'categories'
Caught exception for 0988159309: 'items'
Caught exception for 0988245159: 'categories'
Caught exception for 0988315

2024-06-22 16:06:23.547229: 58300: ISBN 0804852375 ----
Caught exception for 0804842019: 'items'
Caught exception for 0804843074: 'items'
Caught exception for 0804844887: 'items'
Caught exception for 0804846987: 'items'
Caught exception for 0835606481: 'items'
Caught exception for 0838823491: 'items'
Caught exception for 0838877044: 'items'
Caught exception for 0838896715: 'categories'
2024-06-22 16:06:56.150949: 58400: ISBN 083730136X ----
Caught exception for 0835820076: 'categories'
Caught exception for 0826604005: 'items'
Caught exception for 0826185738: 'categories'
Caught exception for 0825890578: 'items'
Caught exception for 0826690092: 'categories'
Caught exception for 0830781854: 'categories'
2024-06-22 16:07:13.053272: 58500: ISBN 082885114X ----
Caught exception for 082885114X: 'categories'
Caught exception for 0825871263: 'items'
Caught exception for 0849760089: 'items'
Caught exception for 0849760194: 'items'
Caught exception for 0849761956: 'items'
Caught exception for 08

In [228]:
# Count lookups that produced no category. pd.isna is used rather than an
# identity test against np.nan, because a NaN that has passed through pandas
# (e.g. loaded from the saved CSV) is a different float object and would not
# be counted by `cat is np.nan`.
failures = sum(1 for cat in book_category.values() if pd.isna(cat))

# Avoid a ZeroDivisionError when nothing has been looked up yet.
failure_rate = failures/len(book_category) if book_category else float('nan')
print('%0.2f=%s/%s failure rate' % (failure_rate, failures,len(book_category)))

0.23=12570/55246 failure rate


In [229]:
# Turn the {ISBN: category} dict into a one-column table indexed by ISBN.
isbn_index = pd.Index(list(book_category.keys()), name='ISBN')
isbn_cat_df = pd.DataFrame({CATEGORY: list(book_category.values())}, index=isbn_index)
isbn_cat_df

Unnamed: 0_level_0,Category
ISBN,Unnamed: 1_level_1
0143127748,Medical
1524763136,Biography & Autobiography
0735211299,Business & Economics
0786965606,Games & Activities
1501110365,Fiction
...,...
0822310732,History
0822316153,Literary Criticism
0822316404,Literary Criticism
0822332493,History


Write these results to file

In [13]:
# Where the ISBN -> category lookup results are stored.
google_books_api_isbn_category_fpath = './preprocessed/google_books_api_isbn_category.csv'

# Set to True to overwrite the saved file with the in-memory results.
SAVE_API_RESULTS = False

if SAVE_API_RESULTS:
    # Rebuild the table from the {ISBN: category} dict before writing it out.
    isbn_column = list(book_category.keys())
    category_column = list(book_category.values())
    isbn_cat_df = pd.DataFrame({'ISBN': isbn_column, CATEGORY: category_column}).set_index('ISBN')
    # Save the data
    print('saving to %s...' % google_books_api_isbn_category_fpath)
    isbn_cat_df.to_csv(google_books_api_isbn_category_fpath)
    print('...saved\n')

# Always reload from disk so that later cells work from the saved results.
isbn_cat_df = pd.read_csv(google_books_api_isbn_category_fpath, index_col=0, dtype={'ISBN':'str'})
print(isbn_cat_df.shape)
isbn_cat_df.head()

(55246, 1)


Unnamed: 0_level_0,Category
ISBN,Unnamed: 1_level_1
143127748,Medical
1524763136,Biography & Autobiography
735211299,Business & Economics
786965606,Games & Activities
1501110365,Fiction


In [14]:
print('Top categories:')
# Upper-case every label before counting so that case variants are merged;
# missing categories are stringified too and show up as 'NAN'.
upper_categories = isbn_cat_df[CATEGORY].map(lambda label: str(label).upper())
upper_categories.value_counts().head(30)

Top categories:


Category
NAN                            12570
FICTION                         5687
JUVENILE FICTION                5595
JUVENILE NONFICTION             2576
COMICS & GRAPHIC NOVELS         2281
BIOGRAPHY & AUTOBIOGRAPHY       1609
COOKING                         1303
RELIGION                        1288
HISTORY                         1271
BUSINESS & ECONOMICS            1239
EDUCATION                        964
YOUNG ADULT FICTION              847
ART                              624
GAMES & ACTIVITIES               618
PSYCHOLOGY                       617
BODY, MIND & SPIRIT              600
HEALTH & FITNESS                 598
SOCIAL SCIENCE                   595
MUSIC                            587
SELF-HELP                        566
FAMILY & RELATIONSHIPS           556
SCIENCE                          496
MEDICAL                          444
POLITICAL SCIENCE                405
COMPUTERS                        398
HUMOR                            332
LANGUAGE ARTS & DISCIPLINES  