<a href="https://colab.research.google.com/github/Zernach/BetterReads-DS/blob/master/How_to_GoogleBooks_API_Version_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
-- Table holding one row per Google Books volume.
-- Column order matters: the Python loader in this notebook inserts rows
-- positionally, in exactly this order (19 values per row).
-- NOTE(review): the loader's INSERT targets "hybrid_model_data", not "gb_test";
-- confirm which table name is intended.
CREATE TABLE gb_test (
	googleId varchar(20) UNIQUE,
	-- UNIQUE here means a second volume with an already-stored title is rejected.
	title varchar(300) UNIQUE,
	authors text[],
	publisher varchar(200),
	publishedDate varchar(40),
	description text,
	isbn varchar(20),
	pageCount integer,
	categories text[],
	thumbnail varchar(200),
	smallThumbnail varchar(200),
	lang varchar(10),
	webReaderLink varchar(150),
	textSnippet varchar(5000),
	isEbook boolean,
	averageRating float,
	maturityRating varchar(20),
	ratingsCount integer,
	-- Last column: no trailing comma (it is a syntax error before the closing paren).
	subtitle varchar(400)
);

In [0]:
import pandas as pd

# Download the goodbooks-10k catalogue straight from GitHub.
books = pd.read_csv(
    'https://raw.githubusercontent.com/zygmuntz/goodbooks-10k/master/books.csv'
)

# Keep the non-null titles, coerce each one to str, and hold them as a
# single-column DataFrame.
titles = pd.DataFrame(books['title'].dropna().apply(str))

# Persist the titles (header row included, row index omitted) for the
# scraping cell further down.
titles.to_csv('10k_Titles', index=False)

In [0]:
import os
import csv
import time
import shelve
import requests
import logging
import threading
from urllib.parse import urljoin, quote

import psycopg2
from psycopg2 import sql

# Log line layout: timestamp, a dash, then the message text.
FORMAT = "%(asctime)s - %(message)s"
logging.basicConfig(level=logging.DEBUG, format=FORMAT)
# NOTE: disabling at CRITICAL suppresses every log call of severity CRITICAL
# and below, so the logging.error / logging.info calls in this file emit
# nothing while this line is active. Comment it out to see log output.
logging.disable(logging.CRITICAL)


def gb_url(search_term, index=None):
    """
    Creates a valid Google Books "volumes" search url for a request.

    search_term: the title text to search for; it is percent-encoded and
        sent with the ``intitle:`` qualifier
    index: defaults to None; when given, it is sent as ``startIndex`` so the
        caller can paginate through results

    Returns the complete url as a string.
    """
    base_url = "https://www.googleapis.com/books/v1/"
    tail = "volumes?q=intitle:" + quote(search_term)

    # Previously both branches built the same url and the index was dropped,
    # so pagination never took effect.
    if index is not None:
        tail += f"&startIndex={index}"

    return urljoin(base_url, tail)


def _dig(container, *path):
    """Follow ``path`` (dict keys / list indexes) into ``container``.

    Returns the value found, or None as soon as any step is missing, out of
    range, or applied to something that cannot be indexed.
    """
    current = container
    for step in path:
        try:
            current = current[step]
        except (KeyError, IndexError, TypeError):
            return None
    return current


def _as_set_text(values):
    """Return ``str(set(values))``, or None when ``values`` is None."""
    if values is None:
        return None
    # NOTE(review): this stores the Python set repr (e.g. "{'A', 'B'}") and
    # its element order is not stable between runs; kept as-is because the
    # target column type of the INSERT is not visible here.
    return str(set(values))


def get_value(book):
    """
    Compiles book data from json response into an iterable
    for use in SQL command
    book: one individual item in googleAPIresponse['items']

    Returns a list of 19 values in the order: googleId, title, authors,
    publisher, publishedDate, description, isbn, pageCount, categories,
    thumbnail, smallThumbnail, lang, webReaderLink, textSnippet, isEbook,
    averageRating, maturityRating, ratingsCount, subtitle.
    Any field that is absent from the response is returned as None.
    """
    info = _dig(book, 'volumeInfo')
    images = _dig(info, 'imageLinks')

    value = [
        _dig(book, 'id'),
        _dig(info, 'title'),
        _as_set_text(_dig(info, 'authors')),
        _dig(info, 'publisher'),
        _dig(info, 'publishedDate'),
        _dig(info, 'description'),
        # Only the first identifier is kept; an empty list yields None
        # instead of raising IndexError.
        _dig(info, 'industryIdentifiers', 0, 'identifier'),
        _dig(info, 'pageCount'),
        _as_set_text(_dig(info, 'categories')),
        _dig(images, 'thumbnail'),
        _dig(images, 'smallThumbnail'),
        _dig(info, 'language'),
        _dig(book, 'accessInfo', 'webReaderLink'),
        _dig(book, 'searchInfo', 'textSnippet'),
        _dig(book, 'saleInfo', 'isEbook'),
        _dig(info, 'averageRating'),
        _dig(info, 'maturityRating'),
        _dig(info, 'ratingsCount'),
        _dig(info, 'subtitle'),
    ]

    return value


def execute_queries(data):
    """Creates SQL connection, inserts every book in ``data``, closes connection.

    data: decoded Google Books API response; must contain an 'items' list
        (KeyError is raised otherwise, before any connection is opened)

    Each book is inserted and committed on its own. A row that fails is
    logged and rolled back, and processing continues with the next row.
    The cursor and connection are always closed, even if something
    unexpected is raised part-way through.
    """
    # Build every row first so a malformed payload cannot leave a
    # connection open.
    rows = [get_value(book) for book in data['items']]

    # Prefer the DATABASE_URL environment variable, which get_all_books
    # also reads.
    # SECURITY: the literal fallback keeps existing behaviour when the variable
    # is unset, but credentials should not live in source control -- rotate
    # them and remove this fallback.
    database_url = os.environ.get(
        "DATABASE_URL",
        'postgresql://betterreadsadmin:betterreadsadmin@betterreads-datascience-database.cvmplnwee5ws.us-east-1.rds.amazonaws.com:5432/betterreads',
    )

    # The statement is identical for every row, so compose it once.
    query = sql.SQL(
        "INSERT INTO hybrid_model_data VALUES "
        "(%s, %s, %s, %s, %s, %s, %s, "
        "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )

    connection = psycopg2.connect(database_url)
    try:
        # The cursor context manager closes the cursor on exit.
        with connection.cursor() as cursor:
            for entry in rows:
                try:
                    cursor.execute(query, entry)
                except Exception as err:
                    # Best effort: one bad row (e.g. a uniqueness violation)
                    # must not abort the remaining inserts.
                    logging.error(f"Error: {err}")
                    connection.rollback()
                else:
                    connection.commit()
    finally:
        connection.close()


def request_and_execute(search_term, index=None, timeout=30):
    """
    Creates a GET request to google books API, collects data, then uses
    this data to execute a SQL query with connection to database

    search_term: title text to search for
    index: optional result offset, passed through to gb_url for pagination
    timeout: seconds to wait for the API before giving up

    Returns True when the response contained items and they were handed to
    execute_queries; returns None when the request failed, the body was not
    valid JSON, or the response contained no items.
    """
    url = gb_url(search_term, index)

    try:
        # Without a timeout a stalled connection would block this thread
        # (and the join() waiting on it) forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as err:
        logging.error(err)

        return None

    if 'items' not in data:
        # The old message referenced an undefined name and raised NameError.
        logging.info(
            f"No items for {search_term} at index {index}"
        )

        return None

    execute_queries(data)

    return True


def get_all_books(search_term, search_index):
    """Fetches many result pages for one search term, one thread per page,
    and stores the books from each page.

    search_term: the term used in the url call to API
    search_index: currently unused; kept so existing callers keep working

    Always returns None. Failures on individual pages are logged and do not
    stop the other pages.
    """
    base = gb_url(search_term)

    def _fetch_page(start_index):
        """Request one page at ``start_index`` and insert its books."""
        page_url = f"{base}&startIndex={start_index}"

        try:
            response = requests.get(page_url)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as err:
            logging.error(err)
            return

        if 'items' not in data:
            logging.info(
                f"No items for {search_term} at index {start_index}"
            )
            return

        execute_queries(data)

    threads = []

    # NOTE(review): offsets 1, 101, 201, ... are kept from the original code.
    # No page size is requested in the url, so this stride presumably skips
    # results between pages -- confirm against the API's startIndex and
    # maxResults rules.
    for index in range(1, 10000, 100):
        # The previous target was request_and_execute called with two
        # arguments, which raised TypeError in every thread.
        thread_obj = threading.Thread(target=_fetch_page, args=(index,))

        threads.append(thread_obj)
        thread_obj.start()

    # wait for all threads to finish
    for thread in threads:
        thread.join()

    return None


def run(term_csv, batch_size=100):
    """
    Gets list of search terms from csv and runs the book retrieval process.

    term_csv: path to a csv file whose first row is a header and whose first
        column holds one search term per row
    batch_size: how many terms are processed concurrently (one thread each)

    Each batch is started, then fully joined before the next batch begins.
    NOTE: resuming from the last position after an http error is not
    implemented; every call starts from the first term.
    """
    # read in search data; blank rows are dropped so row[0] is always valid
    with open(term_csv, newline='', encoding='utf-8') as search_data:
        rows = [row for row in csv.reader(search_data) if row]

    # Row 0 is the header, so real terms start at row 1.
    terms = [row[0] for row in rows[1:]]

    # Slice batch by batch. The old inner loop reused the outer loop variable
    # and re-processed the first 100 rows every time, and raised IndexError
    # when fewer than 100 rows existed.
    for start in range(0, len(terms), batch_size):
        thread_list = [
            threading.Thread(target=request_and_execute, args=(term,))
            for term in terms[start:start + batch_size]
        ]

        for thread in thread_list:
            thread.start()

        for thread in thread_list:
            thread.join()

  """)


In [0]:
# Start the retrieval process using the '10k_Titles' file written by the
# title-export cell earlier in this notebook.
run('10k_Titles')