In [1]:
import csv
import json
import re
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

In [2]:
# Load the Goodreads library export into memory as a list of row dicts
# (one dict per book, keyed by the CSV header names).
# newline='' is what the csv module asks for so that newlines embedded in
# quoted fields are parsed correctly; the explicit encoding avoids depending
# on the platform default.
# NOTE(review): assumes the export is UTF-8 encoded - confirm against the file.
with open('../data/goodreads_library_export.csv', newline='', encoding='utf-8') as stream:
    book_entry = list(csv.DictReader(stream))

In [3]:
# Pretty-print one raw export row to see which columns exist and how they are formatted.
sample_row_json = json.dumps(book_entry[408], indent=2)
print(sample_row_json)

{
  "Book Id": "35457358",
  "Title": "The Alienist (Dr. Laszlo Kreizler, #1)",
  "Author": "Caleb Carr",
  "Author l-f": "Carr, Caleb",
  "Additional Authors": "",
  "ISBN": "=\"0525510273\"",
  "ISBN13": "=\"9780525510277\"",
  "My Rating": "0",
  "Average Rating": "4.05",
  "Publisher": "Random House Trade",
  "Binding": "Paperback",
  "Number of Pages": "498",
  "Year Published": "2017",
  "Original Publication Year": "1994",
  "Date Read": "",
  "Date Added": "2022/04/20",
  "Bookshelves": "to-read, tr-emotional-damage-expected, ab-current-tbr",
  "Bookshelves with positions": "to-read (#1241), tr-emotional-damage-expected (#1), ab-current-tbr (#12)",
  "Exclusive Shelf": "to-read",
  "My Review": "",
  "Spoiler": "",
  "Private Notes": "",
  "Read Count": "0",
  "Recommended For": "",
  "Recommended By": "",
  "Owned Copies": "0",
  "Original Purchase Date": "",
  "Original Purchase Location": "",
  "Condition": "",
  "Condition Description": "",
  "BCID": ""
}


In [4]:
# TODO: Use Beautiful Soup to scrape author background (ethnicity, etc. -- from Wikipedia?) and the genres from each book's Goodreads page.

In [312]:
import requests
from bs4 import BeautifulSoup
import re
import time
import random
import decimal
from collections import defaultdict

def _clean_genre_label(raw_text: str) -> str:
    """Collapse whitespace and drop the trailing '<count> users' tally from a scraped genre label."""
    label = re.sub(r'\s+', ' ', raw_text).strip()
    # Remove the tally as a suffix. str.rstrip(' users') would strip a *set of
    # characters* and eat genre names ending in those letters ('Thrillers').
    return re.sub(r'\s*\d[\d,]*\s*users?$', '', label)


def format_genres(bookId: str, max_attempts: int = 5) -> Optional[List[Dict[str, Any]]]:
    """
    Gets the genres of a book and then formats them into a list of genres and subgenres.

    :param bookId: The url identifier of the book.
    :param max_attempts: How many times the page is requested before giving up.
    :return: A list of ``{'genre': ..., 'subgenres': [...]}`` dicts for the bookId, or
        None when the page could not be fetched or contains no genre elements.
    """
    page = None
    for _ in range(max_attempts):
        # Goodreads only allows their API and site to be queried at the rate of 1 per sec.
        # Randomize the seconds as an attempt to bypass the site assuming you are a bot.
        time.sleep(random.uniform(1.0, 2.0))
        try:
            response = requests.get('https://goodreads.com/book/show/' + bookId, timeout=30)
        except requests.RequestException:
            # Transient network failure: use up one attempt and try again.
            continue
        if response.status_code == 200:
            page = response
            break

    # Bounded retries: a persistent non-200 answer (e.g. 404) must not loop forever.
    if page is None:
        return None

    soup = BeautifulSoup(page.text, 'html.parser')
    content = soup.find_all('div', {'class': 'elementList'})

    if len(content) == 0:
        return None

    # Plain dicts keep insertion order, so genres come out in page order.
    genres_dict = {}

    for div in content:
        label = _clean_genre_label(div.text)

        # Audiobook is a format, not a genre; empty labels carry no information.
        if label == '' or label == 'Audiobook':
            continue

        # Split on the first '>' only, so deeper chains ('A > B > C') are kept
        # as genre 'A' with subgenre 'B > C' instead of raising on unpacking.
        parts = re.split(r'\s*>\s*', label, maxsplit=1)
        if len(parts) == 1:
            # setdefault keeps subgenres already collected for this genre.
            genres_dict.setdefault(label, [])
        else:
            genre_key, subgenre = parts
            genres_dict.setdefault(genre_key, []).append(subgenre)

    return [{'genre': genre, 'subgenres': subgenres} for genre, subgenres in genres_dict.items()]

In [313]:
# Spot-check the scraper against a handful of book ids, printing a blank
# line between consecutive results.
sample_book_ids = ('49247150', '60101407', '17571777', '50659467')
for sample_position, sample_book_id in enumerate(sample_book_ids):
    if sample_position > 0:
        print()
    print(format_genres(sample_book_id))

[{'genre': 'Fiction', 'subgenres': []}, {'genre': 'LGBT', 'subgenres': ['Queer']}, {'genre': 'Contemporary', 'subgenres': []}, {'genre': 'Literary Fiction', 'subgenres': []}, {'genre': 'Adult', 'subgenres': []}, {'genre': 'Romance', 'subgenres': []}, {'genre': 'Cultural', 'subgenres': ['India']}, {'genre': 'Adult Fiction', 'subgenres': []}]

None

[{'genre': 'Sequential Art', 'subgenres': ['Manga', 'Comics', 'Graphic Novels']}, {'genre': 'Fantasy', 'subgenres': ['Magic']}, {'genre': 'Adventure', 'subgenres': []}, {'genre': 'Manga', 'subgenres': ['Shonen']}, {'genre': 'Action', 'subgenres': []}, {'genre': 'Media Tie In', 'subgenres': ['Anime']}, {'genre': 'Fiction', 'subgenres': []}]

[{'genre': 'Fantasy', 'subgenres': ['Magic', 'Paranormal']}, {'genre': 'Romance', 'subgenres': []}, {'genre': 'Young Adult', 'subgenres': ['Young Adult Fantasy']}, {'genre': 'New Adult', 'subgenres': []}, {'genre': 'Fiction', 'subgenres': []}, {'genre': 'Fairies', 'subgenres': ['Fae']}, {'genre': 'Retellin

In [225]:
def format_isbn(isbn: str) -> str:
    """
    Remove the ``="..."`` wrapper that surrounds ISBN values in the Goodreads CSV.

    :param isbn: Raw ISBN cell from the Goodreads CSV, e.g. ``="0525510273"``.
    :return: The bare ISBN, or None when nothing is left after unwrapping.
    """
    cleaned = isbn
    # Peel the wrapper characters off in order: quote, equals sign, quote.
    for wrapper_char in ('"', '=', '"'):
        cleaned = cleaned.strip(wrapper_char)

    return cleaned if cleaned != '' else None

In [6]:
def split_author(author: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Split a "Last, First" author name into first and last name if applicable.

    :param author: Author name in last-first order from the Goodreads CSV
        (the "Author l-f" column), e.g. ``"Carr, Caleb"``.
    :return: ``(first_name, last_name)``. A name without a ``', '`` separator is
        returned as ``(name, None)``; an empty name yields ``(None, None)``.
    """
    if not author:
        return None, None

    # Split on the FIRST separator only: names containing additional commas
    # (suffixes such as "Jr.") would otherwise raise on tuple unpacking.
    last_name, separator, first_name = author.partition(', ')

    if separator == '':
        # Single-word / mononym authors only have a "first" name.
        return author, None

    return first_name, last_name

In [7]:
def format_additional_authors(additional_authors: str) -> List[str]:
    """
    Turn the comma-separated additional-authors cell into a list of names.

    :param additional_authors: Comma-and-space separated names of any other authors of the publication.
    :return: The individual author names, or None when the cell is empty.
    """
    has_authors = len(additional_authors) > 0

    return additional_authors.split(', ') if has_authors else None

In [8]:
def format_integer(value: str) -> int:
    """
    Parse an integer cell from the dataset.

    :param value: A string representation of an integer datum; may be empty.
    :return: The parsed integer, or None for an empty cell.
    """
    if value == '':
        return None

    return int(value)

In [9]:
def format_user_rating(rating: str) -> int:
    """
    Parse the user's own rating, mapping a rating of 0 to None.

    :param rating: A string representation of a rating.
    :return: The rating as an integer, or None when it is empty or 0.
    """
    parsed_rating = format_integer(rating)

    if parsed_rating == 0:
        return None

    return parsed_rating

In [10]:
def format_date(unformatted_date: str) -> str:
    """
    Convert a ``YYYY/MM/DD`` date string into ``YYYY-MM-DD`` form.

    :param unformatted_date: A string representation of the date, e.g. ``2022/04/20``.
    :return: The date as a ``YYYY-MM-DD`` string, or None when the input is empty.
    """
    if unformatted_date == '':
        return None

    parsed_date = datetime.strptime(unformatted_date, '%Y/%m/%d')
    return parsed_date.strftime('%Y-%m-%d')

In [11]:
def format_text(text: str) -> str:
    """
    Normalize a free-text cell so that an empty cell becomes None.

    :param text: Representation of body of text.
    :return: The text unchanged, or None when it is the empty string.
    """
    if text == '':
        return None

    return text

In [12]:
def format_condition(condition: str) -> str:
    """
    Normalize a book's condition, treating blank and 'Unspecified' as missing.

    :param condition: Representation of the condition of a book.
    :return: The condition unchanged, or None when it is empty or 'Unspecified'.
    """
    if condition in ('', 'Unspecified'):
        return None

    return condition

In [13]:
def format_owned(owned_bit: str) -> bool:
    """
    Format whether a book is owned or not correctly.

    The value comes from the "Owned Copies" column, which is a count, so any
    positive number of copies means the book is owned, not only exactly ``'1'``.

    :param owned_bit: String count of owned copies (``'0'``, ``'1'``, ``'2'``, ...).
    :return: True when at least one copy is owned; False for zero, empty or
        non-numeric input.
    """
    try:
        return int(owned_bit) > 0
    except (TypeError, ValueError):
        # Blank or malformed cell: treat as not owned rather than failing the row.
        return False

In [14]:
def format_spoiler(spoiler: str) -> bool:
    """
    Convert the spoiler cell into a boolean flag.

    :param spoiler: Spoiler marker of a review; the empty string means no spoiler.
    :return: False when the marker is the empty string, True otherwise.
    """
    if spoiler == '':
        return False

    return True

In [15]:
def format_bookshelves_position(shelves_with_pos: str) -> Optional[List[Dict[str, Any]]]:
    """
    Format bookshelves to have a shelf and position when applicable.

    :param shelves_with_pos: Comma-separated shelves with their position, e.g.
        ``"to-read (#1241), ab-current-tbr (#12)"``.
    :return: A list of ``{'shelf': name, 'position': int}`` dicts, or None when the
        input is empty. An entry without a ``(#n)`` suffix gets position None.
    """
    if shelves_with_pos == '':
        return None

    formatted_shelves = []

    for entry in shelves_with_pos.split(', '):
        entry = entry.strip()
        if entry == '':
            continue

        # Raw-string pattern: '<shelf> (#<position>)'. Anchoring on the suffix
        # (instead of splitting on a space) tolerates entries that lack one.
        match = re.fullmatch(r'(?P<shelf>.+?)\s*\(#(?P<position>\d+)\)', entry)
        if match is None:
            formatted_shelves.append({'shelf': entry, 'position': None})
        else:
            formatted_shelves.append({
                'shelf': match.group('shelf'),
                'position': int(match.group('position'))
            })

    return formatted_shelves


In [315]:
# Lists
# Scratch cell: a quick tour of Python's built-in containers. Nothing defined
# here is used by the Goodreads / Elasticsearch pipeline in the other cells.

# Lists
# Ordered and mutable: supports indexing, slicing and item assignment.
my_list = [1, 2, 3, 'a', 'b', 'c']
print(my_list[0], my_list[-1])  # first and last element
print(my_list[0:4])  # slice from index 0 up to (not including) 4
print(my_list[2:])  # from index 2 to the end
print(my_list[:4])  # same as [0:4]

# Item assignment changes the list in place.
my_list[0] = -1
my_list[-1] = 'z'
print(my_list)

# Tuples
# Indexing and slicing work the same way as for lists.
my_tup = (1, 2, 3, 'a', 'b', 'c')
print(my_tup[0], my_tup[-1])
print(my_tup[0:4])
print(my_tup[2:])
print(my_tup[:4])

# Tuples are immutable, so item assignment is not allowed:
# my_tup[0] = -1  # can't do it!!!

# Sets
# Building a set from a list drops the duplicate values.
my_list = [1, 1, 1, 2, 2, 2, 3, 4, 4]
my_set = set(my_list)
print(my_set)

# A set literal with repeated values collapses to the same set.
my_other_set = {1, 1, 1, 2, 2, 2, 3, 4, 4}
print(my_set == my_other_set)

# Dicts
# Key -> value mapping; look values up by key.
my_dict = {
    'key_one': 1,
    'key_two': 2,
    'key_three': 3
}
print(my_dict['key_one'])

# Assigning to an existing key overwrites its value.
my_dict['key_one'] = -1
print(my_dict['key_one'])

# Assigning to a new key adds an entry.
my_dict['aidan_key'] = 5000
print(my_dict)

# .items() yields (key, value) tuples...
my_dict_as_tups = list(my_dict.items())
print(my_dict_as_tups)

# ...and dict() rebuilds an equal dict from those tuples.
my_dict_again = dict(my_dict_as_tups)
print(my_dict == my_dict_again)

# Filling a dict inside a loop.
my_empty_dict = {}
for i in range(3):
    my_empty_dict[str(i)] = i
print(my_empty_dict)

# Tuples and Iteration (Unpacking)
# Works when you have List of Tuples, and All Tuples are Same Size
my_tups = [('a', 1, 'potato'), ('b', 2, 'leek'), ('c', 3, 'turnip'), ('d', 4, 'marshmallow')]
for letter, number, treat in my_tups:
    print('Letter:', letter)
    print('Number:', number)
    print('Treat:', treat)
    
# Inline For Loops to Build Data Structures
# List
# List comprehension: build a new list from each element of an existing one.
my_list = [1, 2, 3, 4]
my_list_double = [elem * 2 for elem in my_list]
print(my_list_double)

# Dict
# Dict comprehension: zip pairs up the letters and numbers position by position.
my_nums = [1, 2, 3, 4]
my_letters = ['a', 'b', 'c', 'd']
my_dict = {letter: number for letter, number in zip(my_letters, my_nums)}
print(my_dict)

# To show how this works...
# zip yields the (letter, number) pairs consumed by the comprehension above.
print(list(zip(my_letters, my_nums)))

1 c
[1, 2, 3, 'a']
[3, 'a', 'b', 'c']
[1, 2, 3, 'a']
[-1, 2, 3, 'a', 'b', 'z']
1 c
(1, 2, 3, 'a')
(3, 'a', 'b', 'c')
(1, 2, 3, 'a')
{1, 2, 3, 4}
True
1
-1
{'key_one': -1, 'key_two': 2, 'key_three': 3, 'aidan_key': 5000}
[('key_one', -1), ('key_two', 2), ('key_three', 3), ('aidan_key', 5000)]
True
{'0': 0, '1': 1, '2': 2}
Letter: a
Number: 1
Treat: potato
Letter: b
Number: 2
Treat: leek
Letter: c
Number: 3
Treat: turnip
Letter: d
Number: 4
Treat: marshmallow
[2, 4, 6, 8]
{'a': 1, 'b': 2, 'c': 3, 'd': 4}
[('a', 1), ('b', 2), ('c', 3), ('d', 4)]


In [316]:
def format_bookshelves(bookshelves: str, exclusive_bookshelf: str) -> List[str]:
    """
    Reduce a book's shelves to the custom ones by dropping its exclusive shelf.

    :param bookshelves: Comma-and-space separated names of all shelves the book is on.
    :param exclusive_bookshelf: The exclusive shelf of the book, which is filtered out.
    :return: The remaining shelf names, or None when there are no shelves left.
    """
    if bookshelves == '':
        return None

    custom_shelves = []
    for shelf_name in bookshelves.split(', '):
        if shelf_name != exclusive_bookshelf:
            custom_shelves.append(shelf_name)

    return custom_shelves if custom_shelves else None

In [317]:
def _build_source(book: Dict[str, str]) -> Dict[str, Any]:
    """Map one raw Goodreads CSV row onto the document fields of the index mapping."""
    first_name, last_name = split_author(book['Author l-f'])
    average_rating = book['Average Rating']

    return {
        'author': book['Author'],
        'authorAdditional': format_additional_authors(book['Additional Authors']),
        'authorFirst': first_name,
        'authorLast': last_name,
        'bcid': format_text(book['BCID']),
        'binding': book['Binding'],
        'bookId': book['Book Id'],
        'bookshelvesCustom': format_bookshelves(book['Bookshelves'], book['Exclusive Shelf']),
        'bookshelvesExclusive': book['Exclusive Shelf'],
        'bookshelvesPosition': format_bookshelves_position(book['Bookshelves with positions']),
        # format_condition (not format_text) so 'Unspecified' is stored as missing.
        'condition': format_condition(book['Condition']),
        'conditionDesc': format_text(book['Condition Description']),
        'dateAdded': format_date(book['Date Added']),
        'dateRead': format_date(book['Date Read']),
        # Scrapes the book's Goodreads page: slow, one throttled HTTP request per book.
        'genres': format_genres(book['Book Id']),
        'isbn': format_isbn(book['ISBN']),
        'isbn13': format_isbn(book['ISBN13']),
        'numberOfPages': format_integer(book['Number of Pages']),
        'ogPurchaseDate': format_date(book['Original Purchase Date']),
        'ogPurchaseLoc': format_text(book['Original Purchase Location']),
        'owned': format_owned(book['Owned Copies']),
        'privateNotes': format_text(book['Private Notes']),
        'publishedYear': format_text(book['Year Published']),
        'publishedYearOriginal': format_text(book['Original Publication Year']),
        'publisher': book['Publisher'],
        # A blank rating cell must not crash float().
        'ratingAverage': None if average_rating == '' else float(average_rating),
        'ratingUser': format_user_rating(book['My Rating']),
        'readCount': format_integer(book['Read Count']),
        'recommendedFor': format_text(book['Recommended For']),
        'recommendedBy': format_text(book['Recommended By']),
        'review': format_text(book['My Review']),
        'reviewSpoiler': format_spoiler(book['Spoiler']),
        'title': book['Title']
    }


def generate_actions(index_name: str) -> List[Dict[str, Any]]:
    """
    Goes through the .csv entries (the module-level ``book_entry`` list) and builds
    one formatted index action per row. The result is passed to the bulk helper to
    create many documents in one call.

    Every action carries the Goodreads book id as ``_id`` so that re-running the
    indexing overwrites existing documents instead of creating duplicates.

    :param index_name: Name of index.
    :return actions: List of actions containing documents to insert into elasticsearch specified index.
    """
    return [
        {
            '_index': index_name,
            '_id': book['Book Id'],
            '_source': _build_source(book)
        }
        for book in book_entry
    ]
        

In [318]:
def create_index(client: Any, index_name: str) -> None:
    """
    Create an index in Elasticsearch if one doesn't already exist.

    The mapping is strict (``'dynamic': 'strict'``): documents containing a field
    that is not listed below are rejected by Elasticsearch.

    :param client: Elasticsearch object with connection.
    :param index_name: Name of index.
    """
    index_body = {
        'settings': {
            'number_of_shards': 1,
            'number_of_replicas': 1
        },
        'mappings': {
            'dynamic': 'strict',
            'properties': {
                'author': {'type': 'text'},
                'authorAdditional': {'type': 'text'},
                'authorFirst': {'type': 'keyword'},
                'authorLast': {'type': 'keyword'},
                'bcid': {'type': 'keyword'},
                'binding': {'type': 'keyword'},
                'bookId': {'type': 'keyword'},
                'bookshelvesCustom': {'type': 'keyword'},
                'bookshelvesExclusive': {'type': 'keyword'},
                'bookshelvesPosition': {
                    'type': 'nested',
                    'properties': {
                        'shelf': {'type': 'keyword'},
                        'position': {'type': 'integer'}
                    }
                },
                'condition': {'type': 'keyword'},
                'conditionDesc': {'type': 'text'},
                'dateAdded': {'type': 'date'},
                'dateRead': {'type': 'date'},
                'genres': {
                    'type': 'nested',
                    'properties': {
                        'genre': {'type': 'keyword'},
                        'subgenres': {'type': 'keyword'}
                    }
                },
                'isbn': {'type': 'keyword'},
                'isbn13': {'type': 'keyword'},
                'numberOfPages': {'type': 'integer'},
                'ogPurchaseDate': {'type': 'date'},
                'ogPurchaseLoc': {'type': 'text'},
                'owned': {'type': 'boolean'},
                'privateNotes': {'type': 'text'},
                'publishedYear': {'type': 'keyword'},
                'publishedYearOriginal': {'type': 'keyword'},
                'publisher': {'type': 'text'},
                'ratingAverage': {'type': 'float'},
                'ratingUser': {'type': 'integer'},
                'readCount': {'type': 'integer'},
                'recommendedBy': {'type': 'text'},
                'recommendedFor': {'type': 'text'},
                'review': {'type': 'text'},
                'reviewSpoiler': {'type': 'boolean'},
                'title': {'type': 'text'}
            }
        }
    }

    # Keyword arguments only: newer elasticsearch-py releases reject positional
    # arguments on API methods, while the keyword form is accepted by older ones too.
    if not client.indices.exists(index=index_name):
        client.indices.create(index=index_name, body=index_body)

In [None]:
number_of_docs = len(book_entry)
index_name = 'proof_book'

# Instantiate the elasticsearch connection
client = Elasticsearch('http://localhost:9200')

# Test: fails here, before any scraping starts, if the cluster is unreachable.
client.info()

# Create index
create_index(client, index_name)

# Do the thing.
# bulk() returns (number of successfully executed actions, errors), so keep the
# result instead of discarding it and report how much was actually indexed.
successes, errors = bulk(client, generate_actions(index_name))
print('Indexed', successes, 'of', number_of_docs, 'documents into', index_name)
if errors:
    print('Errors:', errors)