In [1]:
import numpy as np
import pandas as pd
import requests
from pandas.io.json import json_normalize
from threading import Thread
from queue import Queue
import pandas_import_and_export

# Summer Reading Book Clustering, part 2: Openlibrary search
In this notebook, we query the Open Library database in an attempt to match its book records with the cleaned, user-entered data from part 1. The Open Library search function is quite strict about its input, so a lot of entries will go unmatched, but the matches will provide a good base on which to build the book clusters in part 3.

In [4]:
# Number of rows (from the top of the file) that will be sent to the search API.
MAX_ROWS = 100

df = pandas_import_and_export.read_csv("bookslogged_clean")

# Work on an independent copy of the first MAX_ROWS rows. A plain slice would
# still be tied to the full frame, so filling in the matched columns later
# would trigger pandas' chained-assignment warnings.
df = df[:MAX_ROWS].copy()

# create blank cols to be filled by the lookup step
df['matched_title'] = ""
df['matched_author'] = ""

# preview; must be the last expression in the cell to be displayed
df[:10]

## It's querying time!

Create a function called `get_book`, which takes a user-entered title and author and returns a tuple of (matched title, matched author), or (None, None) if the search returned no results.

Built with this code as a base: https://github.com/saulshanabrook/openpgh-library/blob/master/Untitled.ipynb

Includes many modifications in an attempt to improve the book matching.

In [5]:
def submit_query(query, timeout=30):
    """Run an Open Library search and return its first hit.

    Parameters
    ----------
    query : dict
        Search fields sent as URL query parameters; key is the field name and
        value is the entry for that field. The most relevant field names are
        'title', 'author', and 'q' (searches all fields).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the caller forever.

    Returns
    -------
    dict or None
        The first element of the response's 'docs' list, or None when the
        search returned no results, the request failed, or the response was
        not the expected JSON.
    """
    try:
        resp = requests.get(
            'http://openlibrary.org/search.json',
            params=query,
            timeout=timeout,
        )
        resp.raise_for_status()
        return resp.json()['docs'][0]
    except IndexError:
        # in this case, no results were returned
        return None
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Network/HTTP failure, a body that is not JSON, or JSON without a
        # usable 'docs' list. The lookup is best-effort, so report "no match".
        # Unlike a `return` inside `finally`, this does not swallow
        # KeyboardInterrupt or unrelated programming errors.
        return None

In [6]:
def get_title(d):
    """Return the 'title' field of an Open Library search hit.

    d is a json object (dict) returned from the OL database, or None when the
    search produced no hit. Returns None if there is no hit or the hit has no
    'title' field.
    """
    # Handle the "no hit" case explicitly instead of relying on a catch-all
    # exception handler, which would also hide genuine errors.
    if not isinstance(d, dict):
        return None
    return d.get('title')

In [7]:
def get_author(d):
    """Return the authors of an Open Library search hit as one string.

    d is a json object (dict) returned from the OL database, or None when the
    search produced no hit. The entries of its 'author_name' field are joined
    with ", ". Returns None if there is no hit, the hit has no 'author_name'
    field, or the field cannot be joined as text.
    """
    if not isinstance(d, dict):
        return None

    names = d.get('author_name')
    if names is None:
        return None

    try:
        # join() puts the separator only between entries, so there is no
        # trailing comma to strip afterwards
        return ", ".join(names)
    except TypeError:
        # the field was not an iterable of strings
        return None

In [8]:
def get_book(title, author):
    """Match a user-entered book against the Open Library database.

    title, author are user-entered. Returns a tuple of (title, author) as
    matched by the OL database, or (None, None) if there was no match.
    """
    query = {'title': title}

    # Only constrain the search by author when one was actually entered.
    # A blank field can arrive as "", whitespace, None or NaN (e.g. after a
    # CSV round trip); sending any of those would search for a literal
    # author string and spoil the match.
    author_missing = (
        author is None
        or pd.isna(author)
        or str(author).strip() == ""
    )
    if not author_missing:
        query['author'] = author

    result = submit_query(query)

    return (get_title(result), get_author(result))

In [10]:
#threading version
#make raw when not using

NUM_WORKERS = 20   # number of concurrent lookups
_STOP = object()   # sentinel telling a worker that there is no more work

q = Queue()
results = {}       # row index -> (matched_title, matched_author)

def doWork():
    """Worker loop: take rows off the queue and look each one up.

    Each queue item is (row index, (title, author)). The match is stored in
    `results` rather than written into the DataFrame directly, so the frame
    is only modified from the main thread. The loop ends when the worker
    receives the _STOP sentinel.
    """
    while True:
        # Blocking get: unlike checking q.empty() and then calling
        # get_nowait(), this cannot fail when another worker takes the last
        # item in between.
        item = q.get()
        try:
            if item is _STOP:
                return
            i, (title, author) = item
            results[i] = get_book(title, author)
        finally:
            q.task_done()

#fill the queue
for i, row in df.iterrows():
    q.put((i, (row['title'], row['author'])))

# one stop marker per worker, queued after all the real work
for _ in range(NUM_WORKERS):
    q.put(_STOP)

# Start every worker first and only then wait for them. Joining inside the
# start loop would let the first worker drain the whole queue alone, so
# nothing would actually run concurrently.
threads = []
for _ in range(NUM_WORKERS):
    t = Thread(target=doWork)
    t.daemon = True
    t.start()
    threads.append(t)

# block the main thread until all workers have finished
for t in threads:
    t.join()

# DataFrame.set_value was removed in pandas 1.0; .at is the scalar setter
for i, (matched_title, matched_author) in results.items():
    df.at[i, 'matched_title'] = matched_title
    df.at[i, 'matched_author'] = matched_author

print("done")

done


Finally, export the results to a CSV file so that they can be viewed easily in Excel.

In [14]:
# Hand the DataFrame (including its matched_title / matched_author columns) to
# the project's pandas_import_and_export helper under the name "matchedbooks".
# NOTE(review): the output directory and file extension are decided inside the
# helper, not here -- confirm where the file ends up.
pandas_import_and_export.to_csv(df, "matchedbooks")