# Imports

In [1]:
import nytconfig
import requests
import pymongo
from pprint import pprint
import urllib.parse
import time
import datetime as dt
import pickle
import pandas as pd

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

# Books on NYT Bestsellers Lists

### Manual API request

In [3]:
url_endpoint = 'https://api.nytimes.com/svc/books/v3/lists/names.json'
mydict = {'api-key': nytconfig.key}
resp = requests.get(url_endpoint, params=mydict)
status = resp.status_code
headers = resp.headers

In [4]:
print(status)
print(headers)

200
{'Access-Control-Allow-Headers': 'Accept, Content-Type, X-Forwarded-For, X-Prototype-Version, X-Requested-With', 'X-RateLimit-Limit-day': '1000', 'X-Varnish': '137423651', 'Access-Control-Allow-Origin': '*', 'Age': '0', 'Date': 'Sun, 04 Mar 2018 01:00:49 GMT', 'X-Kong-Proxy-Latency': '902', 'Accept-Ranges': 'bytes', 'Server': 'Apache/2.2.15 (CentOS)', 'X-RateLimit-Remaining-second': '4', 'Vary': 'Origin', 'Via': 'kong/0.9.5', 'Content-Length': '11208', 'X-Kong-Upstream-Latency': '152', 'Connection': 'keep-alive', 'Access-Control-Expose-Headers': 'Content-Length, X-JSON', 'X-RateLimit-Remaining-day': '994', 'Access-Control-Allow-Methods': 'GET, OPTIONS', 'Content-Type': 'application/json; charset=UTF-8', 'X-Cache': 'MISS', 'X-RateLimit-Limit-second': '5'}


In [3]:
list_names = resp.json()

In [6]:
list_names

{'copyright': 'Copyright (c) 2018 The New York Times Company.  All Rights Reserved.',
 'num_results': 53,
 'results': [{'display_name': 'Combined Print & E-Book Fiction',
   'list_name': 'Combined Print and E-Book Fiction',
   'list_name_encoded': 'combined-print-and-e-book-fiction',
   'newest_published_date': '2018-03-04',
   'oldest_published_date': '2011-02-13',
   'updated': 'WEEKLY'},
  {'display_name': 'Combined Print & E-Book Nonfiction',
   'list_name': 'Combined Print and E-Book Nonfiction',
   'list_name_encoded': 'combined-print-and-e-book-nonfiction',
   'newest_published_date': '2018-03-04',
   'oldest_published_date': '2011-02-13',
   'updated': 'WEEKLY'},
  {'display_name': 'Hardcover Fiction',
   'list_name': 'Hardcover Fiction',
   'list_name_encoded': 'hardcover-fiction',
   'newest_published_date': '2018-03-04',
   'oldest_published_date': '2008-06-08',
   'updated': 'WEEKLY'},
  {'display_name': 'Hardcover Nonfiction',
   'list_name': 'Hardcover Nonfiction',
   'li

In [7]:
len(list_names['results'])

53

In [8]:
list_names['results'][0]

{'display_name': 'Combined Print & E-Book Fiction',
 'list_name': 'Combined Print and E-Book Fiction',
 'list_name_encoded': 'combined-print-and-e-book-fiction',
 'newest_published_date': '2018-03-04',
 'oldest_published_date': '2011-02-13',
 'updated': 'WEEKLY'}

### MongoDB Connection

In [7]:
# create database and collection for list overviews
client = pymongo.MongoClient()
db = client.bestsellers_lists
lists = db.list_names

In [19]:
client.database_names()

['admin', 'bestsellers_lists', 'config', 'local']

In [20]:
db.collection_names()

['weekly_publications', 'descriptions', 'lists']

### Add lists overview to Mongo

In [177]:
# store json response of list names in new collection
result = db.lists.insert_many([i for i in list_names['results']])

In [9]:
db.lists.count()

53

In [8]:
db.lists.find_one()

{'_id': ObjectId('5a9505434f61b659b444fc01'),
 'display_name': 'Combined Print & E-Book Fiction',
 'list_name': 'Combined Print and E-Book Fiction',
 'list_name_encoded': 'combined-print-and-e-book-fiction',
 'newest_published_date': '2018-03-04',
 'oldest_published_date': '2011-02-13',
 'updated': 'WEEKLY'}

### Generate weekly lists to query by category

In [8]:
# create a dictionary of all bestseller lists by category, from oldest to newest date
d = dt.timedelta(days=7)
weekly_lists = {}

for document in db.lists.find({'updated': 'WEEKLY'}):
    weekly_lists[document['list_name_encoded']] = [document['oldest_published_date']]
    current_dt = dt.datetime.strptime(document['oldest_published_date'],'%Y-%m-%d')
    newest_dt = dt.datetime.strptime(document['newest_published_date'],'%Y-%m-%d')
    diff_dt = newest_dt - current_dt
    diff_weeks = diff_dt.days//7
    while current_dt < newest_dt:
        current_dt += dt.timedelta(days=7)
        weekly_lists[document['list_name_encoded']].append(current_dt.strftime('%Y-%m-%d'))
print('number of publications for combined-print-and-e-book-fiction:',len(weekly_lists['combined-print-and-e-book-fiction']))

number of publications for combined-print-and-e-book-fiction: 369


In [6]:
# number of list categories published weekly
len(weekly_lists)

29

In [9]:
# total number of weeks across all list categories
sum(len(v) for v in weekly_lists.values())

8422

In [13]:
# last 30 weeks only
num_queries = 0
dates = []
for key in weekly_lists:
    for value in weekly_lists[key][-30:]:
        if value:
            num_queries += 1
print(num_queries)

870


### Query a specific category of weekly lists across all dates

In [9]:
weekly_publications = db.weekly_publications

In [24]:
def get_weekly_publications_category(weekly_lists,category):
    for value in log_progress(weekly_lists[category][:65], every=1): # already pulled last 35 weeks so stop before that
        base = 'https://api.nytimes.com/svc/books/v3/lists.json?'
        parameters = {'api-key': nytconfig.key, 'list': category, 'date': value}
        url = base + urllib.parse.urlencode(parameters)
        resp = requests.get(url)
        status = resp.status_code
        rate_left = int(resp.headers['X-RateLimit-Remaining-day'])
        attempted = [i for i in resp.url.split('?')[1:][0].split('&') if 'api-key' not in i]
        if status == 200:
            publication_data = resp.json()
            if publication_data['num_results'] > 0:
                result = db.weekly_publications.insert_many([i for i in publication_data['results']])
                print('Successfully Retrieved: {} Rate Left: {}'.format(attempted,rate_left))
            else:
                print('No Results, Skipped: {} Rate Left: {}'.format(attempted,rate_left))
        else:
            print('FAILED Status: {} Tried Retrieving: {}'.format(status,attempted))
        #time.sleep(0.5)
    return 'All done!'

In [25]:
get_weekly_publications_category(weekly_lists,'manga')

Successfully Retrieved: ['date=2009-03-15', 'list=manga'] Rate Left: 64
Successfully Retrieved: ['date=2009-03-22', 'list=manga'] Rate Left: 63
Successfully Retrieved: ['date=2009-03-29', 'list=manga'] Rate Left: 62
Successfully Retrieved: ['date=2009-04-05', 'list=manga'] Rate Left: 61
Successfully Retrieved: ['date=2009-04-12', 'list=manga'] Rate Left: 60
Successfully Retrieved: ['date=2009-04-19', 'list=manga'] Rate Left: 59
Successfully Retrieved: ['date=2009-04-26', 'list=manga'] Rate Left: 58
Successfully Retrieved: ['date=2009-05-03', 'list=manga'] Rate Left: 57
Successfully Retrieved: ['date=2009-05-10', 'list=manga'] Rate Left: 56
Successfully Retrieved: ['date=2009-05-17', 'list=manga'] Rate Left: 55
Successfully Retrieved: ['date=2009-05-24', 'list=manga'] Rate Left: 54
Successfully Retrieved: ['date=2009-05-31', 'list=manga'] Rate Left: 53
Successfully Retrieved: ['date=2009-06-07', 'list=manga'] Rate Left: 52
Successfully Retrieved: ['date=2009-06-14', 'list=manga'] Rate L

'All done!'

In [21]:
item = weekly_lists['childrens-middle-grade'][45:-35]
print('array length: {}'.format(len(item)))
print('array items:\n{}'.format(item))

array length: 62
array items:
['2013-10-20', '2013-10-27', '2013-11-03', '2013-11-10', '2013-11-17', '2013-11-24', '2013-12-01', '2013-12-08', '2013-12-15', '2013-12-22', '2013-12-29', '2014-01-05', '2014-01-12', '2014-01-19', '2014-01-26', '2014-02-02', '2014-02-09', '2014-02-16', '2014-02-23', '2014-03-02', '2014-03-09', '2014-03-16', '2014-03-23', '2014-03-30', '2014-04-06', '2014-04-13', '2014-04-20', '2014-04-27', '2014-05-04', '2014-05-11', '2014-05-18', '2014-05-25', '2014-06-01', '2014-06-08', '2014-06-15', '2014-06-22', '2014-06-29', '2014-07-06', '2014-07-13', '2014-07-20', '2014-07-27', '2014-08-03', '2014-08-10', '2014-08-17', '2014-08-24', '2014-08-31', '2014-09-07', '2014-09-14', '2014-09-21', '2014-09-28', '2014-10-05', '2014-10-12', '2014-10-19', '2014-10-26', '2014-11-02', '2014-11-09', '2014-11-16', '2014-11-23', '2014-11-30', '2014-12-07', '2014-12-14', '2014-12-21']


In [27]:
print('books added to collection:',weekly_publications.count())
print('unique books by title:',len(weekly_publications.distinct('book_details.title')))

books added to collection: 110475
unique books by title: 10811


In [7]:
### be mindful of how many weeks to grab per category
weeks = []
for key in weekly_lists:
    temp = [len(weekly_lists[key]),key]
    weeks.append(temp)
pprint(sorted(weeks, reverse=True))

[[509, 'trade-fiction-paperback'],
 [509, 'series-books'],
 [509, 'picture-books'],
 [509, 'paperback-nonfiction'],
 [509, 'hardcover-nonfiction'],
 [509, 'hardcover-fiction'],
 [452, 'mass-market-paperback'],
 [412, 'paperback-graphic-books'],
 [412, 'manga'],
 [412, 'hardcover-graphic-books'],
 [369, 'combined-print-and-e-book-nonfiction'],
 [369, 'combined-print-and-e-book-fiction'],
 [312, 'e-book-nonfiction'],
 [312, 'e-book-fiction'],
 [255, 'paperback-advice'],
 [255, 'hardcover-advice'],
 [254, 'advice-how-to-and-miscellaneous'],
 [236, 'paperback-books'],
 [236, 'chapter-books'],
 [141, 'young-adult'],
 [141, 'childrens-middle-grade'],
 [132, 'young-adult-hardcover'],
 [132, 'childrens-middle-grade-hardcover'],
 [118, 'combined-print-nonfiction'],
 [118, 'combined-print-fiction'],
 [75, 'young-adult-paperback'],
 [75, 'young-adult-e-book'],
 [75, 'childrens-middle-grade-paperback'],
 [75, 'childrens-middle-grade-e-book']]


### Get unique ISBNs from books

In [26]:
list_names = []
for item in weekly_publications.aggregate([{'$group' : {'_id':'$list_name', 'count':{'$sum':1}}}]):
    list_names.append(item)
df_lists = pd.DataFrame(list_names).sort_values('count',ascending=False)
df_lists

Unnamed: 0,_id,count
5,Hardcover Fiction,9895
0,Hardcover Nonfiction,9875
10,Paperback Nonfiction,9215
18,Trade Fiction Paperback,9215
27,Mass Market Paperback,8645
11,Combined Print and E-Book Fiction,7095
14,Combined Print and E-Book Nonfiction,7095
13,E-Book Nonfiction,5865
21,E-Book Fiction,5865
23,Picture Books,5080


In [27]:
weekly_publications.find({ '$or': [{'list_name': 'Paperback Books'}, 
                                   {'list_name': 'Combined Print Nonfiction'}, 
                                   {'list_name': 'Combined Print Fiction'}, 
                                   {'list_name': 'E-Book Nonfiction'}, 
                                   {'list_name': 'E-Book Fiction'}, 
                                   {'list_name': 'Combined Print and E-Book Nonfiction'}, 
                                   {'list_name': 'Combined Print and E-Book Fiction'}, 
                                   {'list_name': 'Hardcover Graphic Books'}, 
                                   {'list_name': 'Paperback Graphic Books'}, 
                                   {'list_name': 'Series Books'}]}
                        ).count()

46305

In [28]:
category_isbn13 = weekly_publications.find(
    { '$and': [{'book_details.primary_isbn13': {'$nin': [None,'']}},
               { '$or': [{'list_name': 'Paperback Books'}, 
                                   {'list_name': 'Combined Print Nonfiction'}, 
                                   {'list_name': 'Combined Print Fiction'}, 
                                   {'list_name': 'E-Book Nonfiction'}, 
                                   {'list_name': 'E-Book Fiction'}, 
                                   {'list_name': 'Combined Print and E-Book Nonfiction'}, 
                                   {'list_name': 'Combined Print and E-Book Fiction'}, 
                                   {'list_name': 'Hardcover Graphic Books'}, 
                                   {'list_name': 'Paperback Graphic Books'}, 
                                   {'list_name': 'Series Books'}]}
              ]}, 
    {'book_details.primary_isbn13': 1})

category_isbn13_list = []
for doc in category_isbn13:
    category_isbn13_list.append(doc['book_details'][0]['primary_isbn13'])

category_isbn13_list = list(set(category_isbn13_list))

print('unique ISBNs for chosen category:',len(category_isbn13_list))
pprint(category_isbn13_list[:5])

unique ISBNs for chosen category: 8322
['9780062371904',
 '9780393609394',
 '9780062000125',
 '9780785131700',
 '9780345521316']


### Pickle unique ISBNs for Google Books API queries

In [29]:
with open('adult_isbn13_8322.pkl', 'wb') as f:
    pickle.dump(category_isbn13_list, f)