In [39]:
import gzip
import json

In [40]:
# Upper bound on the number of records read from the raw books dump.
NUM_OF_BOOKS = 350_000

In [41]:
def load_data(file_name, head=10):
    """Load records from a gzip-compressed, line-delimited JSON file.

    Args:
        file_name: Path to the ``.gz`` file; each line must be one JSON document.
        head: Maximum number of records to read. ``None`` reads the whole
            file; zero or a negative value yields an empty list.

    Returns:
        A list with the decoded JSON value of each line read, in file order.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a line is not valid JSON.
    """
    data = []
    with gzip.open(file_name) as fin:
        for line in fin:
            # Check the limit *before* parsing so that head <= 0 returns
            # nothing instead of leaking the first record.
            if head is not None and len(data) >= head:
                break
            data.append(json.loads(line))
    return data

In [42]:
# Read at most NUM_OF_BOOKS records from the compressed dump.
books = load_data('data/goodreads_books.json.gz', head=NUM_OF_BOOKS)

In [43]:
def extract_features_of_interest(books):
    """Project each book record onto the fields used later in the notebook.

    Args:
        books: Iterable of mappings; each must contain every key listed in
            ``kept_fields`` below, otherwise a ``KeyError`` is raised.

    Returns:
        A dict mapping ``book_id`` to a new dict holding only the kept
        fields. If several records share a ``book_id`` the last one wins.
    """
    kept_fields = ('book_id', 'title', 'description', 'similar_books', 'language_code')
    return {
        record['book_id']: {field: record[field] for field in kept_fields}
        for record in books
    }

In [44]:
# Rebinds `books` from the list of raw records to a dict keyed by book_id.
# NOTE: not re-runnable on its own. The function indexes every element with
# book['book_id'], so feeding it this dict (whose iteration yields key
# strings) fails; re-run the load cell first.
books = extract_features_of_interest(books)

In [45]:
def filter_out_non_english_books_(books):
    """Remove, in place, every entry whose ``language_code`` is not ``'eng'``.

    Args:
        books: Dict mapping book id to a record with a ``language_code`` key.
            The dict itself is modified; nothing is returned.
    """
    # Collect the ids first: a dict must not change size while being iterated.
    non_english_ids = [
        book_id
        for book_id, record in books.items()
        if record['language_code'] != 'eng'
    ]
    for book_id in non_english_ids:
        books.pop(book_id)

def filter_out_similar_books_out_of_scope_(books):
    """Prune, in place, each ``similar_books`` list to ids present in ``books``.

    Args:
        books: Dict mapping book id to a record with a ``similar_books``
            iterable of ids. Each record gets a new list containing only the
            ids that are keys of ``books``, in their original order.
    """
    for record in books.values():
        in_scope = []
        for candidate_id in record['similar_books']:
            if candidate_id in books:
                in_scope.append(candidate_id)
        record['similar_books'] = in_scope

In [46]:
# Both helpers mutate `books` in place and return nothing.
# Order matters: non-English entries are dropped first, so the second call
# prunes similar_books against the remaining English books only.
filter_out_non_english_books_(books)
filter_out_similar_books_out_of_scope_(books)

In [47]:
# Sanity check: number of entries left in `books` at this point.
print(f'Number of books after filtering: {len(books)}')

Number of books after filtering: 105433


In [60]:
# Pretty-print one record to eyeball the schema.
# NOTE(review): the literal index 40 looks like an arbitrary sample choice;
# it raises IndexError if fewer than 41 books remain, and list(...) builds a
# list of every value just to pick one.
print('Example book:')
print(json.dumps(list(books.values())[40], sort_keys=True, indent=4))

Example book:
{
    "book_id": "16693030",
    "description": "An unflinching, darkly funny, and deeply moving story of a boy, his seriously ill mother, and an unexpected monstrous visitor.\nAt seven minutes past midnight, thirteen-year-old Conor wakes to find a monster outside his bedroom window. But it isn't the monster Conor's been expecting-- he's been expecting the one from his nightmare, the nightmare he's had nearly every night since his mother started her treatments. The monster in his backyard is different. It's ancient. And wild. And it wants something from Conor. Something terrible and dangerous. It wants the truth. From the final idea of award-winning author Siobhan Dowd-- whose premature death from cancer prevented her from writing it herself-- Patrick Ness has spun a haunting and darkly funny novel of mischief, loss, and monsters both real and imagined.",
    "language_code": "eng",
    "similar_books": [
        "6369113",
        "10836471",
        "8159643"
    ],
   

In [65]:
# Persist the filtered dict as indented, key-sorted JSON.
# The file name embeds len(books), so the output path changes whenever the
# number of remaining books changes.
with open(f'data/top_{len(books)}_eng_books_filtered.json', 'w') as books_json_file:
    json.dump(books, books_json_file, sort_keys=True, indent=4)