# Imports

In [1]:
import googlebooksconfig
import requests
import pymongo
from pprint import pprint
import urllib.parse
import time
import pickle

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

**Hardcover Nonfiction** - DONE

In [4]:
with open('hardcover_nonfiction_isbn13_1766.pkl', 'rb') as f:
    hardcover_nonfiction_isbn13 = pickle.load(f)
len(hardcover_nonfiction_isbn13)

1766

**Paperback Nonfiction** - DONE

In [4]:
with open('paperback_nonfiction_isbn13_563.pkl', 'rb') as f:
    paperback_nonfiction_isbn13 = pickle.load(f)
len(paperback_nonfiction_isbn13)

563

**Advice** - DONE

In [3]:
with open('advice_isbn13_1524.pkl', 'rb') as f:
    advice_isbn13 = pickle.load(f)
len(advice_isbn13)

1524

**Hardcover Fiction, Trade Fiction Paperback, Mass Market Paperback** - DONE

In [3]:
with open('fiction_isbn13_4976.pkl', 'rb') as f:
    fiction_isbn13 = pickle.load(f)
len(fiction_isbn13)

4976

**remaining non-young adult / children's books** - IN PROGRESS

In [10]:
with open('adult_isbn13_8322.pkl', 'rb') as f:
    adult_isbn13 = pickle.load(f)
len(adult_isbn13)

8322

# Google Book Descriptions by ISBN

In [56]:
isbn13 = '9780307269621'
isbn = 'isbn:' + isbn13
base = 'https://www.googleapis.com/books/v1/volumes?'
params = {'key': googlebooksconfig.key, 'q': isbn}
url = base + urllib.parse.urlencode(params)
resp = requests.get(url)

In [57]:
status = resp.status_code
headers = resp.headers

In [58]:
status

200

In [59]:
headers

{'X-XSS-Protection': '1; mode=block', 'Alt-Svc': 'hq=":443"; ma=2592000; quic=51303431; quic=51303339; quic=51303338; quic=51303337; quic=51303335,quic=":443"; ma=2592000; v="41,39,38,37,35"', 'Cache-Control': 'private, max-age=0, must-revalidate, no-transform', 'X-Frame-Options': 'SAMEORIGIN', 'Content-Encoding': 'gzip', 'Date': 'Thu, 01 Mar 2018 23:40:15 GMT', 'Transfer-Encoding': 'chunked', 'Content-Type': 'application/json; charset=UTF-8', 'Server': 'GSE', 'ETag': '"f6QLsd8inUgRKJVNBqJ8xHnPb0w/FvLL0UAf-t2jthvdgUfvAzXlOMQ"', 'Vary': 'Origin, X-Origin', 'Expires': 'Thu, 01 Mar 2018 23:40:15 GMT', 'X-Content-Type-Options': 'nosniff'}

In [60]:
body = resp.json()
body.keys()

dict_keys(['kind', 'items', 'totalItems'])

In [63]:
pprint(body)

{'items': [{'accessInfo': {'accessViewStatus': 'NONE',
                           'country': 'US',
                           'embeddable': False,
                           'epub': {'isAvailable': False},
                           'pdf': {'isAvailable': True},
                           'publicDomain': False,
                           'quoteSharingAllowed': False,
                           'textToSpeechPermission': 'ALLOWED',
                           'viewability': 'NO_PAGES',
                           'webReaderLink': 'http://play.google.com/books/reader?id=SPBPyujwygIC&hl=&printsec=frontcover&source=gbs_api'},
            'etag': 'syIE/r1RS8A',
            'id': 'SPBPyujwygIC',
            'kind': 'books#volume',
            'saleInfo': {'country': 'US',
                         'isEbook': False,
                         'saleability': 'NOT_FOR_SALE'},
            'searchInfo': {'textSnippet': 'Presents a narrative profile of the '
                                          'se

In [61]:
body['items'][0]['volumeInfo']

{'allowAnonLogging': False,
 'authors': ['Joseph J. Ellis'],
 'averageRating': 4.0,
 'canonicalVolumeLink': 'https://books.google.com/books/about/First_Family.html?hl=&id=SPBPyujwygIC',
 'categories': ['Biography & Autobiography'],
 'contentVersion': '1.1.0.0.preview.0',
 'description': 'Presents a narrative profile of the second president and his wife that traces their more than fifty-year partnership in such areas as civic and foreign affairs.',
 'imageLinks': {'smallThumbnail': 'http://books.google.com/books/content?id=SPBPyujwygIC&printsec=frontcover&img=1&zoom=5&source=gbs_api',
  'thumbnail': 'http://books.google.com/books/content?id=SPBPyujwygIC&printsec=frontcover&img=1&zoom=1&source=gbs_api'},
 'industryIdentifiers': [{'identifier': '9780307269621', 'type': 'ISBN_13'},
  {'identifier': '0307269620', 'type': 'ISBN_10'}],
 'infoLink': 'http://books.google.com/books?id=SPBPyujwygIC&dq=isbn:9780307269621&hl=&source=gbs_api',
 'language': 'en',
 'maturityRating': 'NOT_MATURE',
 'pa

## MongoDB Connection

In [4]:
client = pymongo.MongoClient()
db = client.bestsellers_lists
descriptions = db.descriptions

In [5]:
client.database_names()

['admin', 'bestsellers_lists', 'config', 'local']

In [6]:
db.collection_names()

['weekly_publications', 'descriptions', 'lists']

## Query API

In [11]:
def get_descriptions(isbn13_list,start=0,stop=1000,counter=1000):
    base = 'https://www.googleapis.com/books/v1/volumes?'
    for isbn13 in log_progress(isbn13_list[start:stop], every=1):
        isbn = 'isbn:' + isbn13
        params = {'key': googlebooksconfig.key, 'q': isbn}
        url = base + urllib.parse.urlencode(params)
        resp = requests.get(url)
        status = resp.status_code
        counter -= 1
        if status == 200:
            volume_info = resp.json()
            if volume_info['totalItems'] > 0:
                for volume in volume_info['items']:
                    book_result = db.descriptions.insert_one(volume['volumeInfo'])
                    book_id = book_result.inserted_id
                    isbn13_result = db.descriptions.update_one({'_id': book_id }, {'$set': {'nyt_isbn13': isbn13}})
                    print('Successfully Retrieved: {} Rate Left: {}'.format(isbn13,counter))
            else:
                print('No Volume Found: {} Rate Left: {}'.format(isbn13,counter))
        else:
            print('FAILED Status: {} Tried Retrieving: {}'.format(status,isbn13))
        #time.sleep(0.5)
    return 'All done!'

In [17]:
get_descriptions(adult_isbn13,142,1142,1000) # <-- values are already set for running on March 9th

No Volume Found: 9780785153061 Rate Left: 90
FAILED Status: 403 Tried Retrieving: 9780316504836
FAILED Status: 403 Tried Retrieving: 9781451673302
FAILED Status: 403 Tried Retrieving: 9781101874288
FAILED Status: 403 Tried Retrieving: 9780385531399
FAILED Status: 403 Tried Retrieving: 9780062390806
FAILED Status: 403 Tried Retrieving: 9781401233594
FAILED Status: 403 Tried Retrieving: 9780525953098
FAILED Status: 403 Tried Retrieving: 9781401237554
FAILED Status: 403 Tried Retrieving: 9780983987536
FAILED Status: 403 Tried Retrieving: 9781606902042
FAILED Status: 403 Tried Retrieving: 9781439177327
FAILED Status: 403 Tried Retrieving: 9780316261128
FAILED Status: 403 Tried Retrieving: 9781616555504
FAILED Status: 403 Tried Retrieving: 9781400068418
FAILED Status: 403 Tried Retrieving: 9780698163454
FAILED Status: 403 Tried Retrieving: 9780385537353
FAILED Status: 403 Tried Retrieving: A00B01FR7DBL0
FAILED Status: 403 Tried Retrieving: 9780062081926
FAILED Status: 403 Tried Retrieving: 

KeyboardInterrupt: 

In [16]:
len(adult_isbn13)

8322

In [18]:
# total books loaded into mongo collection
descriptions.count()

7203

In [19]:
# books with ISBNs (industry identifiers)
descriptions.find({'industryIdentifiers': {'$nin': [None,'']}}).count()

7063

In [20]:
# books without ISBNs
descriptions.find({'industryIdentifiers': {'$in': [None,'']}}).count()

140

In [19]:
# let's see one book without ISBN
count = 0
for doc in descriptions.find({'industryIdentifiers': {'$in': [None,'']}}).limit(1):
    count += 1
    print('\n-----[ {} ]-----\n'.format(count))
    pprint(doc)


-----[ 1 ]-----

{'_id': ObjectId('5a97a1da4f61b6966baa0888'),
 'allowAnonLogging': False,
 'authors': ['Bruce Schneier'],
 'averageRating': 4.0,
 'canonicalVolumeLink': 'https://books.google.com/books/about/Data_and_Goliath.html?hl=&id=_grPBgAAQBAJ',
 'categories': ['Computers'],
 'contentVersion': 'preview-1.0.0',
 'description': 'You are under surveillance right now. Your cell phone '
                'provider tracks your location and knows who’s with you. Your '
                'online and in-store purchasing patterns are recorded, and '
                "reveal if you're unemployed, sick, or pregnant. Your e-mails "
                'and texts expose your intimate and casual friends. Google '
                'knows what you’re thinking because it saves your private '
                'searches. Facebook can determine your sexual orientation '
                'without you ever mentioning it. The powers that surveil us do '
                'more than simply store this information. Cor

In [13]:
# books with descriptions
descriptions.find({'description': {'$nin': [None,'']}}).count()

4065

In [17]:
# let's see one book with description
count = 0
for doc in descriptions.find({'description': {'$nin': [None,'']}}).limit(1):
    count += 1
    print('\n-----[ {} ]-----\n'.format(count))
    pprint(doc)


-----[ 1 ]-----

{'_id': ObjectId('5a97a1d84f61b6966baa0886'),
 'allowAnonLogging': False,
 'authors': ['Joseph J. Ellis'],
 'averageRating': 4.0,
 'canonicalVolumeLink': 'https://books.google.com/books/about/First_Family.html?hl=&id=SPBPyujwygIC',
 'categories': ['Biography & Autobiography'],
 'contentVersion': '1.1.0.0.preview.0',
 'description': 'Presents a narrative profile of the second president and his '
                'wife that traces their more than fifty-year partnership in '
                'such areas as civic and foreign affairs.',
 'imageLinks': {'smallThumbnail': 'http://books.google.com/books/content?id=SPBPyujwygIC&printsec=frontcover&img=1&zoom=5&source=gbs_api',
                'thumbnail': 'http://books.google.com/books/content?id=SPBPyujwygIC&printsec=frontcover&img=1&zoom=1&source=gbs_api'},
 'industryIdentifiers': [{'identifier': '9780307269621', 'type': 'ISBN_13'},
                         {'identifier': '0307269620', 'type': 'ISBN_10'}],
 'infoLink': 'http:/

In [16]:
# get ISBNs and descriptions only from mongo documents
count = 0
for desc in descriptions.find({'description': {'$nin': [None,'']}},
                              {'industryIdentifiers': 1, 'description': 1, '_id': 0}).limit(5):
    count += 1
    print('-----[ {} ]-----'.format(count))
    if 'industryIdentifiers' in desc:
        print(desc['industryIdentifiers'],'\n',desc['description'])
    else:
        print('NO ISBN\n',desc['description'])

-----[ 1 ]-----
[{'type': 'ISBN_13', 'identifier': '9780307269621'}, {'type': 'ISBN_10', 'identifier': '0307269620'}] 
 Presents a narrative profile of the second president and his wife that traces their more than fifty-year partnership in such areas as civic and foreign affairs.
-----[ 2 ]-----
[{'type': 'ISBN_10', 'identifier': '1591845505'}, {'type': 'ISBN_13', 'identifier': '9781591845508'}] 
 Presents a counterintuitive assessment of the financial crisis to identify what the author believes were its actual causes, outlining recommended changes for strengthening the nation's economy.
-----[ 3 ]-----
NO ISBN
 You are under surveillance right now. Your cell phone provider tracks your location and knows who’s with you. Your online and in-store purchasing patterns are recorded, and reveal if you're unemployed, sick, or pregnant. Your e-mails and texts expose your intimate and casual friends. Google knows what you’re thinking because it saves your private searches. Facebook can determin

In [15]:
# check if ISBNs from NYTimes API used to query Google Books API
# are actually inside the returned book metadata from Google
confirmed_isbn13 = []
for desc in descriptions.find({'industryIdentifiers': {'$nin': [None,'']}},
                              {'industryIdentifiers': 1, '_id': 0}):
    if desc['industryIdentifiers'][0]['identifier'] in fiction_isbn13:
        confirmed_isbn13.append(desc['industryIdentifiers'][0]['identifier'])
print('google volumes with nyt isbn13:',len(confirmed_isbn13))

google volumes with nyt isbn13: 439


- It appears that only half the ISBNs used to query Google Books API are actually inside the returned volume metadata
- A workaround will be matching up the title/author from NYT API with Google Books API title/author
- The more robust solution is to insert the ISBN used for querying Google inside the associated Mongo document

In [16]:
fiction_isbn13[0]

'9780316407021'

In [14]:
for desc in descriptions.find({'industryIdentifiers': {'$nin': [None,'']}}):
    if desc['industryIdentifiers'][0]['identifier'] == hardcover_nonfiction_isbn13[0]:
        book_id = desc['_id']
        pprint(desc)

{'_id': ObjectId('5a97a1d84f61b6966baa0886'),
 'allowAnonLogging': False,
 'authors': ['Joseph J. Ellis'],
 'averageRating': 4.0,
 'canonicalVolumeLink': 'https://books.google.com/books/about/First_Family.html?hl=&id=SPBPyujwygIC',
 'categories': ['Biography & Autobiography'],
 'contentVersion': '1.1.0.0.preview.0',
 'description': 'Presents a narrative profile of the second president and his '
                'wife that traces their more than fifty-year partnership in '
                'such areas as civic and foreign affairs.',
 'imageLinks': {'smallThumbnail': 'http://books.google.com/books/content?id=SPBPyujwygIC&printsec=frontcover&img=1&zoom=5&source=gbs_api',
                'thumbnail': 'http://books.google.com/books/content?id=SPBPyujwygIC&printsec=frontcover&img=1&zoom=1&source=gbs_api'},
 'industryIdentifiers': [{'identifier': '9780307269621', 'type': 'ISBN_13'},
                         {'identifier': '0307269620', 'type': 'ISBN_10'}],
 'infoLink': 'http://books.google.com/

## Upsert Mongo books with NYT ISBN
For all books loaded into Mongo, upsert the original ISBN13 (or alternative below) into the record to facilitate matching of NYT and Google API metadata in Pandas.

In [17]:
print('docs w/   nyt isbn13:',descriptions.find({'nyt_isbn13': {'$nin': [None,'']}}).count())
print('unique    nyt isbn13:',len(descriptions.distinct('nyt_isbn13')))
print('\ndocs w/o  nyt isbn13:',descriptions.find({'nyt_isbn13': {'$in': [None,'']}}).count())

docs w/   nyt isbn13: 4112
unique    nyt isbn13: 4101

docs w/o  nyt isbn13: 3


In [55]:
for doc in log_progress(descriptions.find({'nyt_isbn13': {'$in': [None,'']}}), every=1):
    book_id = doc['_id']
    url = [item for item in doc['infoLink'].split('?')[1].split('&') if '=isbn:' in item]
    if len(url) > 0:
        isbn = url[0].split(':')[1]
        isbn13_result = descriptions.update_one({'_id': book_id }, {'$set': {'nyt_isbn13': isbn}})

## NYT Bestsellers "Missing" from Google Books API
About 16% of the books from NYT's bestsellers lists are not found in Google Books API, using NYT's primary ISBN13 field for lookup. Alternatives include the primary ISBN10 or the list of other ISBNs for various print versions of the same book title. Identify the missing books by comparing records loaded into Mongo with the original primary ISBNs used to query Google, then pull alternative ISBNs from NYT weekly lists to try querying Google API again.