################################################################################################################################
#                  Mid Term: Author-Vaishali Lambe, NUID-001286444                            #
################################################################################################################################
# Question 2

#### Collection of data from the NYT API

From the Books API we will try to answer the question: which category of books has the bestseller that was on the list for the greatest number of weeks?

* Get the list of categories
* Get the list of bestsellers for each category

From the Article Search API we will try to answer two questions: how often has Google been in the news each month since 2000, and is it covered as a Technology company or as a Financial company?

In [2]:
# Import the modules we need.
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import datetime
from glob import glob
import json
import os
import re
import requests
import string
import sys
import time


# Downloaded JSON is cached in "../data", resolved relative to the current
# working directory.
data_path = os.path.join("..", "data")
print(data_path)

# Only report whether the cache directory exists; nothing is created here.
if not os.path.isdir(data_path):
    print(data_path + " is NOT a directory - something is wrong :(")
else:
    print(data_path + " is a directory")


# The NYT API key is read from the environment, so it has to be set there
# before this notebook is run.
nyt_archive_key = os.environ.get('nyt_archive_key')

# An unset variable (None) and an empty one ('') both count as missing.
if not nyt_archive_key:
    print("NYT API key is missing - it should be in an environment variable named 'nyt_archive_key'")

..\data
..\data is a directory


In [3]:
# General-purpose utility function for saving an object as JSON to the data directory.
def save_to_json(obj, save_file_path):
    """Serialize ``obj`` as JSON and write it to ``save_file_path``.

    The object is serialized *before* the file is opened. Opening in write
    mode truncates the file immediately, so serializing straight into the
    open file would leave an empty or half-written file behind whenever
    ``obj`` turns out not to be JSON-serializable.

    Args:
        obj: Any JSON-serializable object.
        save_file_path: Path of the file to create or overwrite.

    Raises:
        TypeError, ValueError: If ``obj`` cannot be serialized; the target
            file is left untouched in that case.
        OSError: If the file cannot be opened for writing.
    """
    print("saving to file: " + save_file_path)

    serialized = json.dumps(obj)

    with open(save_file_path, "wt") as f:
        f.write(serialized)
        
# General function for getting JSON, either by downloading or from a cache file.
def resolve_nyt_json(url, cache_file, request_params=None):
    """Return the JSON for an NYT API call, preferring a cache file.

    If ``cache_file`` exists its contents are returned and no request is
    made. Otherwise ``url`` is fetched with ``request_params``; a 200
    response is saved to ``cache_file`` and returned.

    Args:
        url: Endpoint to request when there is no cache file.
        cache_file: Path of the JSON cache file for this call.
        request_params: Optional dict of query-string parameters. Defaults
            to no parameters.

    Returns:
        The decoded JSON object, or ``{}`` when the cache file cannot be
        parsed as JSON or the server answers with a non-200 status.
    """
    # A None default avoids sharing one mutable dict between calls.
    if request_params is None:
        request_params = {}

    if os.path.isfile(cache_file):
        # Cache file exists, so use that.
        with open(cache_file, 'rt') as f:
            try:
                result = json.load(f)
            except ValueError:
                # An unparsable cache file is reported as an empty result.
                result = {}

        print("resolve_nyt_json(): returning value from cache file: " + cache_file)
        return result

    # It's not in the cache, so download and save it.
    print("resolve_nyt_json(): downloading from NYT API")

    response = requests.get(url, params=request_params)
    print(response.status_code)

    # Sleep for 5 seconds after a request, to avoid being rate-limited by the NYT servers.
    time.sleep(5)

    if 200 != response.status_code:
        print("resolve_nyt_json(): error downloading from NYT API ({code})".format(code=response.status_code))
        return {}

    # Decode the body once so the object written to the cache is exactly the
    # object handed back to the caller.
    payload = response.json()
    save_to_json(payload, cache_file)
    return payload

In [4]:
# URL for calls to books/v3/lists/names.
def get_books_list_names_url():
    """Return the URL of the books/v3/lists/names endpoint."""
    endpoint = "https://api.nytimes.com/svc/books/v3/lists/names.json"
    return endpoint

# Name of the cache file for calls to books/v3/lists/names.
def get_books_list_names_cache_file_path():
    """Return the path, inside the data directory, of the list-names cache file."""
    cache_file_name = "books_v3_lists_names.json"
    return os.path.join(data_path, cache_file_name)

# Request parameters for calls to books/v3/lists/names.
def get_books_list_names_params():
    """Return the query parameters for a books/v3/lists/names request.

    The only parameter is the API key held in the module-level
    ``nyt_archive_key``.
    """
    params = {}
    params['api-key'] = nyt_archive_key
    return params

# Convenience routine for getting the names of the bestseller lists.
def resolve_books_list_names():
    """Return the bestseller list names JSON, from the cache file or the API."""
    url = get_books_list_names_url()
    cache_file = get_books_list_names_cache_file_path()
    params = get_books_list_names_params()
    return resolve_nyt_json(url, cache_file, params)

In [5]:
# Load the bestseller list names (downloaded only when no cache file exists)
# and report how many lists came back.
list_names_json = resolve_books_list_names()
list_names_count = len(list_names_json['results'])
print(list_names_count)

resolve_nyt_json(): returning value from cache file: ..\data\books_v3_lists_names.json
53


In [6]:
# URL for calls to books/v3/lists.
def get_books_list_url():
    """Return the URL of the books/v3/lists endpoint."""
    endpoint = "https://api.nytimes.com/svc/books/v3/lists.json"
    return endpoint

# Name of the cache file for calls to books/v3/lists.
def get_books_list_cache_file_path(list_name):
    """Return the cache file path for one bestseller list.

    The file name embeds ``list_name`` and is printed before the path,
    joined onto the data directory, is returned.
    """
    cache_file_name = "books_v3_lists_{}.json".format(list_name)
    print(cache_file_name)
    return os.path.join(data_path, cache_file_name)

# Request parameters for calls to books/v3/lists.
def get_books_list_params(list_name):
    """Return the query parameters for a books/v3/lists request.

    Carries the API key, the requested ``list_name`` and an ascending
    sort order.
    """
    params = {'api-key': nyt_archive_key}
    params['list'] = list_name
    params['sort-order'] = 'ASC'
    return params

# Convenience routine for getting a bestseller list.
def resolve_books_list(list_name):
    """Return the JSON for the bestseller list ``list_name``, from the cache file or the API."""
    url = get_books_list_url()
    cache_file = get_books_list_cache_file_path(list_name)
    params = get_books_list_params(list_name)
    return resolve_nyt_json(url, cache_file, params)

In [7]:
# Resolve every bestseller list that is updated weekly, so that each one is
# available in the cache directory afterwards.
longest_on_list = []
weekly_list_names = [
    entry['list_name_encoded']
    for entry in list_names_json['results']
    if entry['updated'] == 'WEEKLY'
]
for weekly_list in weekly_list_names:
    response = resolve_books_list(weekly_list)
    

books_v3_lists_combined-print-and-e-book-fiction.json
resolve_nyt_json(): returning value from cache file: ..\data\books_v3_lists_combined-print-and-e-book-fiction.json
books_v3_lists_combined-print-and-e-book-nonfiction.json
resolve_nyt_json(): returning value from cache file: ..\data\books_v3_lists_combined-print-and-e-book-nonfiction.json
books_v3_lists_hardcover-fiction.json
resolve_nyt_json(): returning value from cache file: ..\data\books_v3_lists_hardcover-fiction.json
books_v3_lists_hardcover-nonfiction.json
resolve_nyt_json(): returning value from cache file: ..\data\books_v3_lists_hardcover-nonfiction.json
books_v3_lists_trade-fiction-paperback.json
resolve_nyt_json(): returning value from cache file: ..\data\books_v3_lists_trade-fiction-paperback.json
books_v3_lists_mass-market-paperback.json
resolve_nyt_json(): returning value from cache file: ..\data\books_v3_lists_mass-market-paperback.json
books_v3_lists_paperback-nonfiction.json
resolve_nyt_json(): returning value from 

In [8]:
# URL for calls to search/v2/articlesearch.
def get_article_search_url():
    """Return the URL of the search/v2/articlesearch endpoint."""
    endpoint = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
    return endpoint

# Name of the cache file for calls to search/v2/articlesearch.
def get_article_search_cache_file_path(query_term, news_desk, end_date):
    """Return the cache file path for one article search.

    The file name embeds the query term, the news desk and the end date,
    in that order, and is printed before the path, joined onto the data
    directory, is returned.
    """
    cache_file_name = "articlesearch_v2_{}_{}_{}.json".format(query_term, news_desk, end_date)
    print(cache_file_name)
    return os.path.join(data_path, cache_file_name)

# Request parameters for calls to search/v2/articlesearch.
def get_article_search_params(query_term, news_desk, end_date):
    """Return the query parameters for a search/v2/articlesearch request.

    The search term goes in ``q``, the news desk is wrapped in a
    ``news_desk:("...")`` filter query, and the returned fields are limited
    to the headline and the word count.
    """
    params = {'api-key': nyt_archive_key}
    params['q'] = query_term
    params['end_date'] = end_date
    params['fq'] = 'news_desk:("{}")'.format(news_desk)
    params['fl'] = "headline,word_count"
    return params

# Convenience routine for getting the result of an article search.
def resolve_article_search(query_term, news_desk, month):
    """Return the article-search JSON for one term, news desk and month.

    ``month`` is a 'yyyymm' string; the first day of that month is used as
    the search's end date.

    NOTE(review): only an end date is sent, no begin date, so the result
    presumably covers everything up to that date rather than a single
    month - confirm against the Article Search API documentation.
    """
    end_date = month + "01"
    url = get_article_search_url()
    cache_file = get_article_search_cache_file_path(query_term, news_desk, end_date)
    params = get_article_search_params(query_term, news_desk, end_date)
    return resolve_nyt_json(url, cache_file, params)

In [9]:
# Split 'yyyymm' into year and month, then convert to a number of months.
def to_month(yyyymm):
    """Convert a 'yyyymm' string into a count of months (year * 12 + month)."""
    year = int(yyyymm[:4])
    month = int(yyyymm[4:])
    return 12 * year + month

# Get a list of months from 'start' to 'end'.
# Starting and ending months should be strings in yyyymm format.
def get_month_range(start, end):
    """Return every month from ``start`` to ``end`` inclusive, as strings.

    Both arguments are 'yyyymm' strings. Each returned string is the year
    followed by a two-digit, zero-padded month. The list is empty when
    ``end`` is earlier than ``start``.
    """
    # Working with "month count minus one" makes floor division give the
    # year and the remainder give the zero-based month.
    return [
        "{}{:02d}".format(index // 12, index % 12 + 1)
        for index in range(to_month(start) - 1, to_month(end))
    ]

# Every month from January 2000 through April 2017, as 'yyyymm' strings.
month_list = get_month_range(start='200001', end='201704')
print(month_list)    

['200001', '200002', '200003', '200004', '200005', '200006', '200007', '200008', '200009', '200010', '200011', '200012', '200101', '200102', '200103', '200104', '200105', '200106', '200107', '200108', '200109', '200110', '200111', '200112', '200201', '200202', '200203', '200204', '200205', '200206', '200207', '200208', '200209', '200210', '200211', '200212', '200301', '200302', '200303', '200304', '200305', '200306', '200307', '200308', '200309', '200310', '200311', '200312', '200401', '200402', '200403', '200404', '200405', '200406', '200407', '200408', '200409', '200410', '200411', '200412', '200501', '200502', '200503', '200504', '200505', '200506', '200507', '200508', '200509', '200510', '200511', '200512', '200601', '200602', '200603', '200604', '200605', '200606', '200607', '200608', '200609', '200610', '200611', '200612', '200701', '200702', '200703', '200704', '200705', '200706', '200707', '200708', '200709', '200710', '200711', '200712', '200801', '200802', '200803', '200804',

In [10]:
# Download the data for each news desk, for each month, and print the number
# of hits reported for each search.

company_name = "google"
for news_desk in ["Financial", "Technology"]:
    for month in month_list:
        response = resolve_article_search(company_name, news_desk, month)

        # resolve_nyt_json() hands back {} when a download fails or a cache
        # file cannot be parsed. Indexing straight into the result would
        # raise KeyError and abort the remaining months, so look the hit
        # count up defensively and report the gap instead.
        meta = (response.get('response') or {}).get('meta') or {}
        hits = meta.get('hits')
        if hits is None:
            print("{month}: no hit count available".format(month=month + "01"))
            continue

        print("{month}: {count}".format(month=month + "01", count=hits))

articlesearch_v2_google_Financial_20000101.json
resolve_nyt_json(): returning value from cache file: ..\data\articlesearch_v2_google_Financial_20000101.json
20000101: 0
articlesearch_v2_google_Financial_20000201.json
resolve_nyt_json(): returning value from cache file: ..\data\articlesearch_v2_google_Financial_20000201.json
20000201: 0
articlesearch_v2_google_Financial_20000301.json
resolve_nyt_json(): returning value from cache file: ..\data\articlesearch_v2_google_Financial_20000301.json
20000301: 0
articlesearch_v2_google_Financial_20000401.json
resolve_nyt_json(): returning value from cache file: ..\data\articlesearch_v2_google_Financial_20000401.json
20000401: 0
articlesearch_v2_google_Financial_20000501.json
resolve_nyt_json(): returning value from cache file: ..\data\articlesearch_v2_google_Financial_20000501.json
20000501: 0
articlesearch_v2_google_Financial_20000601.json
resolve_nyt_json(): returning value from cache file: ..\data\articlesearch_v2_google_Financial_20000601.jso

In [11]:
# URL for calls to books/v3/reviews.
def get_books_reviews_url():
    """Return the URL of the books/v3/reviews endpoint."""
    endpoint = "https://api.nytimes.com/svc/books/v3/reviews.json"
    return endpoint

# Name of the cache file for calls to books/v3/reviews.
def get_books_reviews_cache_file_path(isbn):
    """Return the cache file path for the reviews of one ISBN.

    The file name embeds ``isbn`` and is printed before the path, joined
    onto the data directory, is returned.
    """
    cache_file_name = "books_v3_reviews_{}.json".format(isbn)
    print(cache_file_name)
    return os.path.join(data_path, cache_file_name)

# Request parameters for calls to books/v3/reviews.
def get_books_reviews_params(isbn):
    """Return the query parameters for a books/v3/reviews request: the API key and the ISBN."""
    params = {'api-key': nyt_archive_key}
    params['isbn'] = isbn
    return params

# Convenience routine for getting a review from an ISBN.
def resolve_books_reviews(isbn):
    """Return the reviews JSON for ``isbn``, from the cache file or the API."""
    url = get_books_reviews_url()
    cache_file = get_books_reviews_cache_file_path(isbn)
    params = get_books_reviews_params(isbn)
    return resolve_nyt_json(url, cache_file, params)

In [12]:
# Build one summary per book found on the weekly bestseller lists, keyed by
# ISBN-13, then attach the number of reviews reported for each ISBN.
#
# NOTE(review): each ISBN maps to a single summary, so a book that shows up on
# several lists keeps only the entry from the last list processed.
# NOTE(review): resolve_nyt_json() returns {} on a failed download, in which
# case the ['results'] / ['num_results'] lookups below raise KeyError.
bestselling_books = {}
for weekly_list in weekly_list_names:
    print(weekly_list)
    response = resolve_books_list(weekly_list)
    for book in response['results']:
        rank = book['rank']
        title = book['book_details'][0]['title']
        weeks_on_list = book['weeks_on_list']

        isbns = book['isbns']
        try:
            isbn = isbns[0]['isbn13']
        except IndexError:
            # Some e-books don't have an ISBN - don't worry about those.
            isbn = None

        summary = {
            'title': title,
            'isbn': isbn,
            'rank': rank,
            'list': weekly_list,
            'weeks_on_list': weeks_on_list,
        }
        print("{rank} {title} ({list_name}, {weeks} weeks) {isbn}".format(
            rank=summary['rank'],
            title=summary['title'],
            list_name=summary['list'],
            weeks=summary['weeks_on_list'],
            isbn=summary['isbn']))

        # Books without an ISBN are printed but cannot be keyed, so skip them.
        if isbn is None:
            continue
        bestselling_books[isbn] = summary

print("Found {count} bestselling books".format(count=len(bestselling_books)))

for isbn, summary in bestselling_books.items():
    review = resolve_books_reviews(isbn)
    count = review['num_results']
    print("{isbn} has {count} reviews".format(isbn=isbn, count=count))

    # Add a new 'reviews' attribute for the book.
    summary['reviews'] = count

print(bestselling_books)    

combined-print-and-e-book-fiction
books_v3_lists_combined-print-and-e-book-fiction.json
resolve_nyt_json(): returning value from cache file: ..\data\books_v3_lists_combined-print-and-e-book-fiction.json
1 THE SHACK (combined-print-and-e-book-fiction, 5 weeks) 9780964729230
2 DEVIL IN SPRING (combined-print-and-e-book-fiction, 1 weeks) 9780062371904
3 BIG LITTLE LIES (combined-print-and-e-book-fiction, 28 weeks) 9780399167065
4 A MAN CALLED OVE (combined-print-and-e-book-fiction, 40 weeks) 9781476738024
5 A DOG'S PURPOSE (combined-print-and-e-book-fiction, 12 weeks) 9780765326263
6 AFTERMATH: EMPIRE'S END (combined-print-and-e-book-fiction, 1 weeks) 9781101966969
7 ECHOES IN DEATH (combined-print-and-e-book-fiction, 3 weeks) 9781250123114
8 HEARTBREAK HOTEL (combined-print-and-e-book-fiction, 2 weeks) 9780345541437
9 LINCOLN IN THE BARDO (combined-print-and-e-book-fiction, 2 weeks) 9780812995343
10 NORSE MYTHOLOGY (combined-print-and-e-book-fiction, 3 weeks) 9780393609097
11 MILK AND HO