In [1]:
import gzip
import json
import pandas as pd

# Parsing the provided GoodReads books dataset

In [2]:
# Upper bound on the number of books read from the raw dump.
NUM_OF_BOOKS = 200_000

In [3]:
def is_non_empty_string(text):
    """Tell whether ``text`` is neither ``None`` nor the empty string.

    Whitespace-only strings are treated as non-empty.
    """
    if text is None:
        return False
    return text != ""

In [4]:
def load_first_n_english_books(file_name, n=10):
    """Read up to ``n`` English books that have a title and a description.

    The file is expected to be gzip-compressed with one JSON object per line.
    A book is kept when its ``language_code`` equals ``'eng'`` and both its
    ``title`` and ``description`` are non-empty.

    Args:
        file_name: Path of the gzipped JSON-lines file to read.
        n: Maximum number of books to return. Values <= 0 give an empty list.

    Returns:
        A list with at most ``n`` parsed book dicts, in file order.
    """
    books = []
    if n <= 0:
        # Nothing was requested, so do not read (or keep) any record.
        return books

    with gzip.open(file_name) as fin:
        for line in fin:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            book = json.loads(line)
            # .get() lets records that lack one of the fields be skipped
            # instead of aborting the whole load with a KeyError.
            if (
                book.get('language_code') == 'eng'
                and is_non_empty_string(book.get('title'))
                and is_non_empty_string(book.get('description'))
            ):
                books.append(book)
                if len(books) >= n:
                    break

    return books

In [5]:
# Read the raw dump, keeping at most NUM_OF_BOOKS matching records.
books = load_first_n_english_books('data/goodreads_books.json.gz', NUM_OF_BOOKS)

In [6]:
def extract_features_of_interest(books):
    """Reduce each book record to the fields used later, indexed by book id.

    Args:
        books: Iterable of book dicts that each provide ``book_id``,
            ``title``, ``description`` and ``similar_books``.

    Returns:
        A dict mapping ``book_id`` to a dict holding just those four fields.
        When a ``book_id`` occurs more than once, the last record wins.
    """
    kept_fields = ('book_id', 'title', 'description', 'similar_books')
    return {
        record['book_id']: {field: record[field] for field in kept_fields}
        for record in books
    }

In [7]:
# Rebinds `books`: the list of raw records becomes a dict keyed by book_id.
# Run this cell once per load; iterating the resulting dict yields keys rather
# than book records, so re-running it on its own output does not work as intended.
books = extract_features_of_interest(books)

In [8]:
def filter_out_similar_books_out_of_scope_(books):
    """Drop, in place, every similar-book id that is not itself a key of ``books``.

    Args:
        books: Dict mapping book id to a book dict with a ``similar_books`` list.

    Each book's ``similar_books`` entry is replaced by a new, filtered list.
    Nothing is returned.
    """
    for record in books.values():
        in_scope = []
        for candidate_id in record['similar_books']:
            if candidate_id in books:
                in_scope.append(candidate_id)
        record['similar_books'] = in_scope

In [9]:
# Mutates `books` in place (the helper returns nothing): afterwards every
# similar_books list only holds ids that are keys of `books`.
filter_out_similar_books_out_of_scope_(books)

In [10]:
# Sanity check: number of books kept after feature extraction.
print(f'Number of books: {len(books)}')

Number of books: 200000


In [11]:
print('Example book:')
# Index 35 picks one record by insertion order; this needs at least 36 books.
print(json.dumps(list(books.values())[35], indent=4))

Example book:
{
    "book_id": "16693030",
    "title": "A Monster Calls",
    "description": "An unflinching, darkly funny, and deeply moving story of a boy, his seriously ill mother, and an unexpected monstrous visitor.\nAt seven minutes past midnight, thirteen-year-old Conor wakes to find a monster outside his bedroom window. But it isn't the monster Conor's been expecting-- he's been expecting the one from his nightmare, the nightmare he's had nearly every night since his mother started her treatments. The monster in his backyard is different. It's ancient. And wild. And it wants something from Conor. Something terrible and dangerous. It wants the truth. From the final idea of award-winning author Siobhan Dowd-- whose premature death from cancer prevented her from writing it herself-- Patrick Ness has spun a haunting and darkly funny novel of mischief, loss, and monsters both real and imagined.",
    "similar_books": [
        "6369113",
        "10165727",
        "10836471",
    

In [12]:
# Persist the filtered books as JSON; the file name embeds the number of books.
with open(f'data/top_{len(books)}_eng_books_filtered.json', 'w') as books_json_file:
    json.dump(books, books_json_file, indent=4)

# Constructing training pairs of similar books

In [13]:
def construct_similar_book_pairs(books):
    """Build one record per (book, similar book) link found in ``books``.

    Args:
        books: Dict mapping book id to a book dict with ``title``,
            ``description`` and ``similar_books`` (a list of book ids).

    Returns:
        A list of dicts with the keys ``book1_id``, ``book1_title``,
        ``book1_description``, ``book2_id``, ``book2_title`` and
        ``book2_description``. Pairs are directional and follow the iteration
        order of ``books`` and of each ``similar_books`` list. Similar-book
        ids that are not keys of ``books`` are skipped.
    """
    pairs = []
    for book_id, book in books.items():
        for similar_book_id in book['similar_books']:
            similar_book = books.get(similar_book_id)
            if similar_book is None:
                # Unknown id: there is no title/description to pair with, so
                # skip it rather than failing the whole run with a KeyError.
                continue

            pairs.append({
                'book1_id': book_id,
                'book1_title': book['title'],
                'book1_description': book['description'],
                'book2_id': similar_book_id,
                'book2_title': similar_book['title'],
                'book2_description': similar_book['description']
            })
    return pairs

In [14]:
# One dict per directed (book, similar book) link.
pairs = construct_similar_book_pairs(books)

In [15]:
print(f'Number of similar book pairs: {len(pairs)}')
print('Example pair:')
# pairs[0] requires that at least one pair was built.
print(json.dumps(pairs[0], indent=4))

Number of similar book pairs: 276064
Example pair:
{
    "book1_id": "7327624",
    "book1_title": "The Unschooled Wizard (Sun Wolf and Starhawk, #1-2)",
    "book1_description": "Omnibus book club edition containing the Ladies of Madrigyn and the Witches of Wenshar.",
    "book2_id": "19997",
    "book2_title": "Through a Brazen Mirror",
    "book2_description": "In a medieval kingdom both like and unlike 13th C. England, a mysterious young man appears at the door of the palace kitchen, seeing to serve the king. In a lonely stone tower, a woman watches the world through her mirror of bronze. The young king mourns the death of his friend in battle and and puts off choosing a bride. A child abandoned on a farmer's doorstep grows up to be a powerful witch. The lives of these characters intersect and intertwine in strange and fateful ways as the young man rises from cook to steward to chamberlain and the king becomes more dependent upon him as the sorceress turns his country upside-down i

In [16]:
# One row per pair; the row index is labelled 'pair_id'.
pairs_df = pd.DataFrame(pairs).rename_axis('pair_id')

In [17]:
print(f'Dataframe shape: {pairs_df.shape}')
# Last expression of the cell, so the notebook renders the first 10 rows.
pairs_df.head(10)

Dataframe shape: (276064, 6)


Unnamed: 0_level_0,book1_id,book1_title,book1_description,book2_id,book2_title,book2_description
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Omnibus book club edition containing the Ladie...,19997,Through a Brazen Mirror,In a medieval kingdom both like and unlike 13t...
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Omnibus book club edition containing the Ladie...,828466,The Hound and the Falcon (The Hound and the Fa...,Alfred of St. Ruan's Abbey is a monk and a sch...
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Omnibus book club edition containing the Ladie...,1569323,The Secret Books of Paradys III & IV (Secret B...,"2-in-1 edition, 373 pages\nThe Book of the Dea..."
3,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Omnibus book club edition containing the Ladie...,1808197,To the Stars,"The epic science fiction trilogy--Homeworld, W..."
4,6066819,Best Friends Forever,Addie Downs and Valerie Adler were eight when ...,7052976,The One That I Want,Tilly Farmer is thirty-two years old and has t...
5,6066819,Best Friends Forever,Addie Downs and Valerie Adler were eight when ...,3134684,Just Breathe,Unexpected change can be like a breath of fres...
6,89376,Heaven,What is Heaven really going to be like? What w...,761197,The Mystery of Providence,"First published in 1678, this little work well..."
7,89376,Heaven,What is Heaven really going to be like? What w...,10081849,Lit!: A Christian Guide to Reading Books,Whether books are your addiction or your phobi...
8,89378,Dog Heaven,In Newbery Medalist Cynthia Rylant's classic b...,6424103,Harry & Hopper,"When Harry's beloved dog Hopper dies, he has a..."
9,89378,Dog Heaven,In Newbery Medalist Cynthia Rylant's classic b...,197998,The Dead Bird,"One day, the children find a bird lying on its..."


In [18]:
# Keep a single row per (book1_id, book2_id) combination. The order matters:
# (A, B) and (B, A) are different combinations and both are kept.
pairs_df = pairs_df.drop_duplicates(subset=['book1_id', 'book2_id'])

In [19]:
# Compare with the shape printed before deduplication to see how many rows were dropped.
print(f'Dataframe shape after removing duplicates: {pairs_df.shape}')
pairs_df.head(10)

Dataframe shape after removing duplicates: (276064, 6)


Unnamed: 0_level_0,book1_id,book1_title,book1_description,book2_id,book2_title,book2_description
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Omnibus book club edition containing the Ladie...,19997,Through a Brazen Mirror,In a medieval kingdom both like and unlike 13t...
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Omnibus book club edition containing the Ladie...,828466,The Hound and the Falcon (The Hound and the Fa...,Alfred of St. Ruan's Abbey is a monk and a sch...
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Omnibus book club edition containing the Ladie...,1569323,The Secret Books of Paradys III & IV (Secret B...,"2-in-1 edition, 373 pages\nThe Book of the Dea..."
3,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Omnibus book club edition containing the Ladie...,1808197,To the Stars,"The epic science fiction trilogy--Homeworld, W..."
4,6066819,Best Friends Forever,Addie Downs and Valerie Adler were eight when ...,7052976,The One That I Want,Tilly Farmer is thirty-two years old and has t...
5,6066819,Best Friends Forever,Addie Downs and Valerie Adler were eight when ...,3134684,Just Breathe,Unexpected change can be like a breath of fres...
6,89376,Heaven,What is Heaven really going to be like? What w...,761197,The Mystery of Providence,"First published in 1678, this little work well..."
7,89376,Heaven,What is Heaven really going to be like? What w...,10081849,Lit!: A Christian Guide to Reading Books,Whether books are your addiction or your phobi...
8,89378,Dog Heaven,In Newbery Medalist Cynthia Rylant's classic b...,6424103,Harry & Hopper,"When Harry's beloved dog Hopper dies, he has a..."
9,89378,Dog Heaven,In Newbery Medalist Cynthia Rylant's classic b...,197998,The Dead Bird,"One day, the children find a bird lying on its..."


In [20]:
# Write the pairs to disk; to_csv includes the index as the first column by default.
pairs_df.to_csv('data/similar_book_pairs.csv')
print('Similar book pairs dataframe serialized to CSV successfully.')

Similar book pairs dataframe serialized to CSV successfully.
