In [380]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import ipyplot

This project looks at changes in Romance books as selected by Publishers Weekly over the last ten years (2013-2023).

Publishers Weekly has been making a list of books by genre since 2011. (In the first two years, data formats varied but that information could be added.) They took submissions from major publishers for any books published in that genre, and in most years selected the top 10.

## Extract & clean book metadata

In [541]:
def find_body_index(body, phrase):
    return min([body.index(x) for x in body if phrase in x.text])

def find_article_lede(body):
    start_top_10 = find_body_index(body, "Top 10") + 1 
    return body[:start_top_10-1][0].text

In [542]:
def clean_isbn(isbn):
    return re.sub(r"[\s-]", "", isbn)

In [543]:
def clean_info(info_text, year, season):
    info_text = info_text.replace("C.L. Wilson","CL Wilson").replace("M.L. Buchman","ML Buchman") # replacing when multiple periods
    author = info_text[:info_text.index('. ')]
    publisher = info_text[info_text.index('. ')+2:info_text.index(', ')]
    price = ''
    isbn = ''
    type = ''
    if year < 2019:
        date = info_text[info_text.index(', ')+2:] ## add year if not avail
    elif (year==2019 and season=="Fall"):
        split = info_text[info_text.index('. ')+2:].split(", ")
        for s in split:
            if s[0]=="$":
                price = s
            elif "ISBN" in s:
                isbn = s.replace("ISBN ","")
            elif bool(re.match("^[A-Z]", s)):
                date = s
            else:
                type = s
    else:
        parens = re.search(r"\(([^$]*\$[^)]*)\)", info_text).group(1)
        date = info_text[info_text.index(', ')+2:info_text.index(parens)].replace("(","").strip() ## add year if not avail
        split = parens.split(", ")
        for s in split:
            if s[0]=="$":
                price = s
            elif "ISBN" in s:
                isbn = s.replace("ISBN ","")
            else:
                type = s
    return author, publisher, date, price, isbn, type


In [544]:
def clean_top_10_list(body, year, season):
    start_top_10 = find_body_index(body, "Top 10") + 1
    if year == 2018 or year >= 2022:
        end_top_10 = find_body_index(body, "Romance & Erotica")
    else:
        end_top_10 = find_body_index(body, "Listings")
    top_10_body = [x.text for x in body[start_top_10:end_top_10]]

    output = []
    for i,x in enumerate(top_10_body):
        if year in (2013, 2014):
            book = {}
            book['year'] = year
            book['season'] = season
            book['title'] = x[:x.index('.')].strip()
            info = clean_info(x[x.index('.')+2:], year, season)
            book['author'] = info[0]
            book['publisher'] = info[1]
            book['date'] = info[2]
            output += [book]
        elif (year > 2012 and i%3 == 0) or (year == 2012 and i%2 == 0):
            book = {}
            book['year'] = year
            book['season'] = season
            book['title'] = x.strip()
        elif (year > 2012 and i%3 == 1) or (year == 2012 and i%2 == 0):
            info = clean_info(x, year, season)
            book['author'] = info[0]
            book['publisher'] = info[1]
            book['date'] = info[2]
            book['price'] = info[3]
            book['ISBN'] = clean_isbn(info[4])
            book['type'] = info[5]
        elif (year > 2012 and i%3 == 2):
            book['description'] = x
            output += [book]
    return output

In [545]:
def clean_listing_description(txt):
    txt = txt.replace(";",",").replace("hardcover $","hardcover, $").replace("trade paper ISBN","trade paper, ISBN") # typos if applicable
    output = {}

    title = txt[:txt.rfind(' by ')].strip()
    author = txt[txt.rfind(' by ')+4:txt.rfind('(')].strip()
    output.update({'title': title})
    output.update({'author': author})

    second_half = txt[txt.rfind('(')+1:].split(', ')

    for s in second_half:
        s = s.strip()
        if s[0]=="$":
            if len(s)>6:
                output.update({'type': re.sub(r'[$\d\.]', '', s).strip()})
                output.update({'price': re.sub(r'[a-zA-Z\s-]', '', s).strip()})
            else:
                output.update({'price': s})
        elif s[0:5]=="ISBN ":
            output.update({'ISBN': clean_isbn(s.replace("ISBN ",""))})
        elif bool(re.match("^[A-Z]", s)): # check date
            output.update({'date': s})
        elif bool(re.match("^[a-z]", s)): # check type
            output.update({'type': s})
    
    return output

In [546]:
def clean_listings(body, year, season):
    if year == 2018 or year >= 2022:
        end_top_10 = find_body_index(body, "Romance & Erotica")
    else:
        end_top_10 = find_body_index(body, "Listings")
    listings = [x.text.strip() for x in body[end_top_10+1:] if x!="" and (len(x)>0 and x.text.strip()[0]!="(")] # remove short () lines in some years
    output = []
    for x in listings:

        ## typo in 2019
        if x == "Dared by the Bad Boy by Melonie Johnson (June 25, mass market,":
            x = ""
        if x.strip() == "$7.99, ISBN 978-1-250-19307-0). A Hollywood actress and a stuntman, who come from different backgrounds and had a mutual crush as teens, get a second chance at love on the set of a blockbuster movie.":
            x = "Dared by the Bad Boy by Melonie Johnson (June 25, mass market, $7.99, ISBN 978-1-250-19307-0). A Hollywood actress and a stuntman, who come from different backgrounds and had a mutual crush as teens, get a second chance at love on the set of a blockbuster movie."
        ## end typo

        if len(x) > 0 and len(x) <= 40: ## proxy for identifying publisher
            publisher = x
        else:
            book = {}
            if ')' in x: # some errors when extra paragraph was added
                book['year'] = year
                book['season'] = season
                book['publisher'] = publisher.strip()
                info = re.search(r"\(([^$]*\$[^)]*)\)", x).group(1)
                end_info_position = x.find(info)+len(info)
                book.update(clean_listing_description(x[:end_info_position]))
                desc = x[end_info_position+1:]
                if desc[0:2] == '. ':
                    book['description'] = desc[2:]
                else:
                    book['description'] = desc
                output += [book]
    return output


In [547]:
def clean_article(year, season, url):
    
    html_text = requests.get(url).text
    soup = BeautifulSoup(html_text)
    body = soup.find("div",{"class":"article-body"}).find_all('p')

    top_10 = clean_top_10_list(body, int(year), season)
    listings = clean_listings(body, int(year), season)
    
    return top_10, listings

In [548]:
from csv import DictReader
# open file in read mode
with open("urls.csv", 'r') as f:
    dict_reader = DictReader(f)
    urls = list(dict_reader)

In [549]:
def clean_all(urls):
    all_top_10 = []
    all_listings = []
    for i, u in enumerate(urls):

        cleaned = clean_article(u['year'], u['season'], u['url'])
        all_top_10 += cleaned[0]
        all_listings += cleaned[1]

    df_top_10 = pd.DataFrame(all_top_10)
    df_all_listings = pd.DataFrame(all_listings)

    return df_top_10, df_all_listings

In [550]:
df_top_10, df_all_listings = clean_all(urls[4:])

In [551]:
## In earlier years the ISBN for the top 10 list was not included but can be matched in the listings info (in most cases)
df_top_10 = df_top_10.merge(df_all_listings[['year','season','title','author','ISBN']], how='left', on=['year','season','title','author'], suffixes=[None,'_from_listings'])
df_top_10['ISBN_combined'] = df_top_10['ISBN_from_listings'].fillna(df_top_10['ISBN'])
df_top_10 = df_top_10.drop(columns=['ISBN','ISBN_from_listings'])
df_top_10 = df_top_10.rename(columns={'ISBN_combined':'ISBN'})

In [552]:
# Export to CSVs to pick up here
df_top_10.to_csv('top_10.csv')
df_all_listings.to_csv('listings.csv')

## Explore book metadata

In [553]:
df_top_10

Unnamed: 0,year,season,title,author,publisher,date,price,type,description,ISBN
0,2013,Spring,Bungalow Nights,Christie Ridgway,HQN,Feb. 26,,,,9780373777457
1,2013,Spring,The Lemon Orchard,Luanne Rice,Viking/Pamela Dorman,July 2,,,,9780670025275
2,2013,Spring,Once Again a Bride,Jane Ashford,Sourcebooks Casablanca,Feb. 1,,,,9781402276729
3,2013,Spring,Big Girl Panties,Stephanie Evanovich,William Morrow,July 9,,,,9780062224842
4,2013,Spring,Sins of a Ruthless Rogue,Anna Randol,Avon,Mar. 26,,,,9780062231352
...,...,...,...,...,...,...,...,...,...,...
198,2023,Spring,Star Bringer,Tracy Wolff and Nina Croft,Red Tower,July 11,$29.99,,Bestselling YA author Wolff teams up with Crof...,9781649374066
199,2023,Spring,The True Love Experiment,Christina Lauren,Gallery,May 16,$27.99,,"Romance author Fizzy, a fan favorite character...",9781982173432
200,2023,Spring,Unladylike Lessons in Love: A Marleigh Sisters...,Amita Murray,Avon,May 16,$17.99 trade paper,,Murray’s Regency debut and series launch stars...,9780063296480
201,2023,Spring,Wild Things,Laura Kay,Vintage,May 23,$17 trade paper,,A queer found family leaves the city for a co-...,9780593470053


In [554]:
df_all_listings

Unnamed: 0,year,season,publisher,title,author,date,type,price,ISBN,description
0,2013,Spring,Avon,Sins of a Ruthless Rogue,Anna Randol,Mar. 26,mass market,$7.99,9780062231352,Amid the glittering palaces of Russia and the ...
1,2013,Spring,Avon,Run to You,Rachel Gibson,Apr. 30,mass market,$7.99,9780062069146,returns to Texas for the second in the Milita...
2,2013,Spring,Avon,"Love at First Sight: A Cupid, Texas Novel",Lori Wilde,May 28,mass market,$7.99,9780062218933,Wilde launches a series set in a small Texas t...
3,2013,Spring,Avon,Any Duchess Will Do,Tessa Dare,May 28,mass market,$5.99,9780062240125,Dare continues her bestselling Spindle Cove hi...
4,2013,Spring,Avon,Once Upon a Tower,Eloisa James,May 28,mass market,$7.99,9780062223876,James continues her bestselling fairy tale ser...
...,...,...,...,...,...,...,...,...,...,...
1108,2023,Spring,Vintage,Sizzle Reel,Carlyn Greenwald,Apr. 18,trade paper,$17,9780593468197,follows newly out bisexual Luna as she naviga...
1109,2023,Spring,W x Wattpad,The Bite,Z.W. Taylor,Feb. 7,trade paper,$17.99,9781990259654,The heroine of Taylor’s paranormal debut flees...
1110,2023,Spring,Zebra,He’s My Cowboy,"Diana Palmer, Delores Fossen, and Rebecca Zanetti",June 27,mass market,$9.99,9781420155327,brings together three western romance novella...
1111,2023,Spring,Zebra,Strawberry Lane,Jodi Thomas,Apr. 25,trade paper,$16.95,9781420155075,launches a new series with the love story bet...


In [555]:
# It turns out that some top 10 chosen were not part of listings... in some years they seemed to have excluded them on purpose.

unique_isbns = [x for x in list(set(list(df_top_10['ISBN']) + list(df_all_listings['ISBN']))) if x!=""]
print(len(unique_isbns), "unique ISBNs")

1199 unique ISBNs


## Extract book covers and Google Books metadata

In [448]:
def google_books_info_by_isbn(isbn):
    isbn = str(isbn)

    isbn = re.sub(r"[\s-]", "", isbn)
    url = "https://www.googleapis.com/books/v1/volumes?q=isbn:{}".format(isbn)
    r = requests.get(url)
    print(url)
    if "totalItems" in r.json().keys():
        if r.json()["totalItems"] == 0:
            url = "https://www.googleapis.com/books/v1/volumes?q=isbn_13:{}".format(isbn)
            r = requests.get(url)
            print("isbn13", url)

            if r.json()["totalItems"] == 0:
                print("NONE FOUND", url)
                return {'ISBN':isbn}

        elif r.json()["totalItems"] > 1:
            print("multiple items", url)

        data = r.json()['items'][0] ## check if multiple items?
        output = {'ISBN': isbn}
        output.update({'id': data['id']}) 
        for x in ['title','description','pageCount','printedPageCount','maturityRating']: # title included for crosscheck
            if x in data['volumeInfo'].keys():
                output.update({'google_{}'.format(x): data['volumeInfo'][x]})
        if 'imageLinks' in data['volumeInfo'].keys(): # add images
            for y in ['thumbnail','smallThumbnail']: # other types available?
                if y in data['volumeInfo']['imageLinks'].keys():
                    output.update({'google_{}'.format(y): data['volumeInfo']['imageLinks'][y]})
        return output
    else:
        print(r.json())

In [449]:
def get_all_google_books_data(isbn_list):
    output = []
    for i,b in enumerate(isbn_list):
        if i%90==0 and i>0: ## THIS IS HERE FOR RATE LIMITING - REQUEST HIGHER RATE LIMIT?
            print(i, "Sleeping")
            time.sleep(61)
        print(i)
        output += [google_books_info_by_isbn(b)]
        df = pd.DataFrame(output)
    return df

google_books_data = get_all_google_books_data(unique_isbns)
google_books_data.to_csv("google_books_data.csv")

0
https://www.googleapis.com/books/v1/volumes?q=isbn:9780063141889
1
https://www.googleapis.com/books/v1/volumes?q=isbn:9780778369455
2
https://www.googleapis.com/books/v1/volumes?q=isbn:9781684630301
3
https://www.googleapis.com/books/v1/volumes?q=isbn:9781503903852
4
https://www.googleapis.com/books/v1/volumes?q=isbn:9780425276488
5
https://www.googleapis.com/books/v1/volumes?q=isbn:9781250801562
6
https://www.googleapis.com/books/v1/volumes?q=isbn:9780062909879
7
https://www.googleapis.com/books/v1/volumes?q=isbn:9780778386094
8
https://www.googleapis.com/books/v1/volumes?q=isbn:9781476700175
9
https://www.googleapis.com/books/v1/volumes?q=isbn:9781538734902
10
https://www.googleapis.com/books/v1/volumes?q=isbn:9781335735249
11
https://www.googleapis.com/books/v1/volumes?q=isbn:9781629725529
12
https://www.googleapis.com/books/v1/volumes?q=isbn:9781501162398
13
https://www.googleapis.com/books/v1/volumes?q=isbn:9781402274800
14
https://www.googleapis.com/books/v1/volumes?q=isbn:9781

In [450]:
google_books_data

Unnamed: 0,ISBN,id,google_title,google_description,google_pageCount,google_maturityRating,google_thumbnail,google_smallThumbnail
0,9780063141889,z1mkzgEACAAJ,An Island Wedding,New York Times bestselling author Jenny Colgan...,400.0,NOT_MATURE,http://books.google.com/books/content?id=z1mkz...,http://books.google.com/books/content?id=z1mkz...
1,9780778369455,qNnduwEACAAJ,Dark Storm,"""What do you mean, Ôshe's gone'?"" Forensic psy...",384.0,NOT_MATURE,http://books.google.com/books/content?id=qNndu...,http://books.google.com/books/content?id=qNndu...
2,9781684630301,yS3CwwEACAAJ,That's Not a Thing,When a recently engaged Manhattanite learns th...,352.0,NOT_MATURE,http://books.google.com/books/content?id=yS3Cw...,http://books.google.com/books/content?id=yS3Cw...
3,9781503903852,M6dJtAEACAAJ,Fling Club,It's revenge--sweet and hot--in the first book...,332.0,NOT_MATURE,http://books.google.com/books/content?id=M6dJt...,http://books.google.com/books/content?id=M6dJt...
4,9780425276488,IgQJDAAAQBAJ,Caged in Winter,In this emotional and sexy New Adult debut fro...,303.0,NOT_MATURE,http://books.google.com/books/content?id=IgQJD...,http://books.google.com/books/content?id=IgQJD...
...,...,...,...,...,...,...,...,...
1184,9780515151381,K_BvDwAAQBAJ,The Counterfeit Mistress,THE BEAUTY. THE SPY. AND THE BEHOLDER. From th...,338.0,NOT_MATURE,http://books.google.com/books/content?id=K_BvD...,http://books.google.com/books/content?id=K_BvD...
1185,9781420148107,OESwDwAAQBAJ,Once a Spy,Love and survival in the shadow of Waterloo . ...,385.0,NOT_MATURE,http://books.google.com/books/content?id=OESwD...,http://books.google.com/books/content?id=OESwD...
1186,9781250193070,h4TZugEACAAJ,Once Upon a Bad Boy,A Hollywood actress and a stuntman from differ...,320.0,NOT_MATURE,http://books.google.com/books/content?id=h4TZu...,http://books.google.com/books/content?id=h4TZu...
1187,9780062379849,PZuvoQEACAAJ,Love and Miss Communication,*Cosmopolitan Must-Read* *InStyle Book Club Pi...,400.0,NOT_MATURE,http://books.google.com/books/content?id=PZuvo...,http://books.google.com/books/content?id=PZuvo...


## Display book covers

In [518]:
df_top_10_csv = pd.read_csv('top_10.csv')
df_google_books_csv = pd.read_csv('google_books_data.csv')

In [519]:
top_10_merged = df_top_10_csv.merge(df_google_books_csv, how='left', on='ISBN')

In [523]:
for year in range(2013,2024):
    print(year)
    t = top_10_merged.loc[(top_10_merged['year']==year) & (top_10_merged['ISBN']!="")] 
    ipyplot.plot_images(list(t['google_smallThumbnail']), list(t['title']), img_width=100)


2013


2014


2015


2016


2017


2018


2019


2020


2021


2022


2023


In [481]:
listings_merged = df_listings_csv.merge(df_google_books_csv, how='left', on='ISBN')

In [522]:
for year in range(2013,2024):
    print(year)
    t = listings_merged.loc[(listings_merged['year']==year) & (listings_merged['ISBN']!="") & (listings_merged['id']!="9foVAAAAQBAJ")] # issue with nan
    ipyplot.plot_images(list(t['google_smallThumbnail']), list(t['title']), img_width=100)


2013


2014


2015


2016


2017


2018


2019


2020


2021


2022


2023
