In [53]:
import csv
import json
from typing import List, Optional, Tuple

In [42]:
# Load the Goodreads export into a list of per-row dicts keyed by column name.
# newline='' is what the csv module asks for, so that newlines embedded in
# quoted fields are parsed correctly.
# NOTE(review): assumes the export file is UTF-8 encoded — confirm. Without an
# explicit encoding, open() falls back to a platform-dependent default.
with open('../data/goodreads_library_export.csv', newline='', encoding='utf-8') as stream:
    book_entry = list(csv.DictReader(stream))

In [67]:
# Pretty-print one raw row to inspect the CSV's column names and raw value
# formats (index 503 appears to be an arbitrary sample).
print(json.dumps(book_entry[503], indent=2))

{
  "Book Id": "27385980",
  "Title": "Invisible Cities",
  "Author": "Italo Calvino",
  "Author l-f": "Calvino, Italo",
  "Additional Authors": "William Weaver",
  "ISBN": "=\"9780156453\"",
  "ISBN13": "=\"\"",
  "My Rating": "0",
  "Average Rating": "4.13",
  "Publisher": "Mariner",
  "Binding": "Paperback",
  "Number of Pages": "165",
  "Year Published": "",
  "Original Publication Year": "1972",
  "Date Read": "",
  "Date Added": "2020/03/01",
  "Bookshelves": "to-read, rec-to-me, ab-current-tbr",
  "Bookshelves with positions": "to-read (#238), rec-to-me (#20), ab-current-tbr (#226)",
  "Exclusive Shelf": "to-read",
  "My Review": "",
  "Spoiler": "",
  "Private Notes": "",
  "Read Count": "0",
  "Recommended For": "",
  "Recommended By": "",
  "Owned Copies": "0",
  "Original Purchase Date": "",
  "Original Purchase Location": "",
  "Condition": "",
  "Condition Description": "",
  "BCID": ""
}


In [44]:
def format_isbn(isbn: str) -> Optional[str]:
    """
    Format ISBN data.

    The sample export row shown in this notebook stores ISBNs wrapped as
    ``="..."`` (e.g. ``="9780156453"``, or ``=""`` when missing); this
    removes that wrapper.

    :param isbn: Unformatted ISBN from Goodreads CSV.
    :return: The bare ISBN string, or None when no ISBN is present.
    """
    # Drop padding whitespace first, then peel any leading/trailing '=' and
    # '"' wrapper characters in a single pass.
    cleaned = isbn.strip().strip('="')
    # An empty result means the cell held only the wrapper (or nothing).
    return cleaned or None

In [45]:
def split_author(author: str) -> Tuple[str, Optional[str]]:
    """
    Split author name into first and last name if applicable.

    :param author: Unformatted author's full-name from Goodreads CSV, written
        in "Last, First" order.
    :return: Formatted author name as first_name, last_name. A name without a
        ", " separator is returned whole as first_name, with last_name None.
    """
    # Split on the FIRST ", " only. A name containing further commas
    # (e.g. "King, Martin Luther, Jr.") used to yield three parts and crash
    # the two-value unpacking with ValueError.
    head, separator, tail = author.partition(', ')

    if not separator:
        # No separator found: partition() leaves the whole name in `head`.
        return head, None

    # "Last, First" -> (first_name, last_name)
    return tail, head

In [61]:
def format_additional_authors(additional_authors: str) -> Optional[List[str]]:
    """
    Format any additional authors if they exist.

    :param additional_authors: Any other authors that worked on the
        publication, as a single ", "-separated string.
    :return: List of additional author names, or None when there are none.
    """
    # Trim each name and discard empty pieces so blank or whitespace-only
    # input, or a stray trailing separator, never produces '' entries.
    stripped = (name.strip() for name in additional_authors.split(', '))
    names = [name for name in stripped if name]
    return names or None

In [74]:
def format_integer(value: str) -> Optional[int]:
    """
    Format integer values in dataset.

    :param value: A string representation of an integer datum.
    :return: The parsed integer, or None when the string is blank.
    :raises ValueError: If the string is non-blank but not a valid integer.
    """
    # Strip first so a whitespace-only cell is treated as missing instead of
    # crashing int().
    text = value.strip()
    return int(text) if text else None

In [75]:
def format_user_rating(rating: str) -> Optional[int]:
    """
    Format user rating correctly.

    A rating of 0 is treated as "no rating" (the sample to-read row shown in
    this notebook carries ``"My Rating": "0"``).

    :param rating: A string representation of a rating.
    :return: The rating as an integer, or None when it is blank or 0.
    """
    # Keep the parsed value in its own name instead of rebinding the str
    # parameter to an int.
    parsed = format_integer(rating)
    # Both None (blank) and 0 (unrated) collapse to None.
    return parsed or None

In [76]:
# Convert each raw CSV row into a typed document. Text columns are copied
# as-is; ISBN, author and numeric columns go through the helper functions.
docs = []
for book in book_entry:
    # 'Author l-f' holds the name in "Last, First" order.
    first_name, last_name = split_author(book['Author l-f'])
    doc = {
        'author': book['Author'],
        'authorAdditional': format_additional_authors(book['Additional Authors']),
        'authorFirst': first_name,
        'authorLast': last_name,
        'bookId': book['Book Id'],
        'isbn': format_isbn(book['ISBN']),
        'isbn13': format_isbn(book['ISBN13']),
        'title': book['Title'],
        'ratingUser': format_user_rating(book['My Rating']),
        # float('') raises ValueError, so map a blank average to None the
        # same way format_integer does for the blank integer columns.
        'ratingAverage': float(book['Average Rating']) if book['Average Rating'] else None,
        'publisher': book['Publisher'],
        'binding': book['Binding'],
        'numberOfPages': format_integer(book['Number of Pages']),
        'publishedYear': format_integer(book['Year Published']),
        'publishedYearOriginal': format_integer(book['Original Publication Year']),
        # TODO: dateRead
        # TODO: dateAdded
    }
    # append() adds in place without building a throwaway one-element list.
    docs.append(doc)

In [79]:
# Spot-check one converted document (index 2382 appears to be an arbitrary
# sample) to confirm the typed fields look right.
print(docs[2382])

{'author': 'John Steinbeck', 'authorAdditional': None, 'authorFirst': 'John', 'authorLast': 'Steinbeck', 'bookId': '890', 'isbn': '0142000671', 'isbn13': '9780142000670', 'title': 'Of Mice and Men', 'ratingUser': 5, 'ratingAverage': 3.88, 'publisher': 'Penguin Books', 'binding': 'Paperback', 'numberOfPages': 112, 'publishedYear': 2002, 'publishedYearOriginal': 1937}
