In [1]:
import pandas as pd
import warnings
from dotenv import load_dotenv
import os

# load_dotenv() comes from python-dotenv; it is expected to populate the process
# environment from a local .env file so the token never has to be hardcoded here.
load_dotenv()
# os.getenv returns None when the variable is unset, so a missing token is NOT an
# error at this point; it only shows up later when the Hardcover API is called.
hardcover_bearer_token = os.getenv('HARDCOVER_BEARER_TOKEN')

# NOTE(review): this silences every warning for the whole kernel session,
# including pandas warnings about chained assignment and deprecations.
warnings.filterwarnings('ignore')

# Widen pandas' display limits so long titles and tag lists are shown in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)

In [2]:
def _parse_goodreads_books_table(raw_table):
    """Clean the raw Goodreads "books" table into one typed row per book.

    Parameters
    ----------
    raw_table : pandas.DataFrame
        Table as returned by ``pd.read_html(..., extract_links='body')``: every
        body cell is a ``(text, href)`` tuple. It must contain at least the
        columns title, author, pages, rating, ratings, pub and votes.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: title, author, pages, rating, ratings, pub, read?,
        goodreads_id. Values that cannot be parsed as numbers become NaN.
        ``raw_table`` itself is not modified.
    """
    wanted = ['title', 'author', 'pages', 'rating', 'ratings', 'pub', 'votes']

    # An explicit copy makes every assignment below act on an independent frame
    # instead of a slice of raw_table (no chained-assignment ambiguity).
    books = raw_table[wanted].copy()

    # The first run of digits in the title cell's link is used as the book id.
    goodreads_id = (
        books['title']
        .apply(lambda cell: cell[1])
        .str.extract(r'(\d+)', expand=False)
    )

    # Keep only the text half of every (text, href) cell.
    for column in wanted:
        books[column] = books[column].apply(lambda cell: cell[0])

    # Each cell text starts with its column label; strip the first occurrence.
    # The .str accessor propagates missing values instead of raising.
    books['title'] = books['title'].str.replace('title ', '', n=1, regex=False)
    books['author'] = (
        books['author']
        .str.replace('author ', '', n=1, regex=False)
        .str.replace(' *', '', n=1, regex=False)
    )

    books['pages'] = pd.to_numeric(
        books['pages'].str.extract(r'(\d+)', expand=False), errors='coerce'
    )
    books['rating'] = pd.to_numeric(
        books['rating'].str.extract(r'(\d+\.\d+)', expand=False), errors='coerce'
    )
    books['ratings'] = pd.to_numeric(
        books['ratings'].str.replace(',', '', regex=False).str.extract(r'(\d+)', expand=False),
        errors='coerce',
    )

    # "pub" should end up as just the year, but the source uses several date
    # formats. The optional non-capturing group skips a leading "<day>, " so
    # the captured number is the year.
    pub_text = books['pub'].str.replace('date pub ', '', n=1, regex=False)
    books['pub'] = pd.to_numeric(
        pub_text.str.extract(r'(?:\b\d{1,2},\s)?(\d{1,4})\b', expand=False),
        errors='coerce',
    )

    # The "votes" column actually carries a "# times read  x" value; x > 0 is
    # turned into the boolean "read?" flag (an unparseable count gives False).
    times_read = pd.to_numeric(
        books['votes'].str.extract(r'(\d+)', expand=False), errors='coerce'
    )
    books = books.rename(columns={'votes': 'read?'})
    books['read?'] = times_read > 0

    books['goodreads_id'] = goodreads_id
    return books


def get_goodreads_user_books_by_page(user, page_num=1):
    """Fetch one page of a Goodreads user's book list as a cleaned DataFrame.

    Parameters
    ----------
    user : str
        Goodreads user id as it appears in the ``/review/list/<user>`` URL.
    page_num : int, default 1
        1-based page of the list to fetch.

    Returns
    -------
    pandas.DataFrame
        One row per book with columns title, author, pages, rating, ratings,
        pub, read? and goodreads_id.

    Notes
    -----
    Performs a network request through ``pd.read_html``; any error raised by
    that call propagates to the caller.
    """
    url = f'https://www.goodreads.com/review/list/{user}?page={page_num}'

    # Read the html table with id="books". extract_links='body' turns every
    # body cell into a (text, href) tuple; displayed_only=False also keeps
    # elements hidden with CSS.
    tables = pd.read_html(url, attrs={'id': 'books'}, extract_links='body', displayed_only=False)

    return _parse_goodreads_books_table(tables[0])

In [3]:
# Example usage: fetch only the first page of one user's list
bens_books_page_1_df = get_goodreads_user_books_by_page('42944663', 1)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
bens_books_page_1_df.info()
bens_books_page_1_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         30 non-null     object 
 1   author        30 non-null     object 
 2   pages         29 non-null     float64
 3   rating        30 non-null     float64
 4   ratings       30 non-null     int64  
 5   pub           26 non-null     float64
 6   read?         30 non-null     bool   
 7   goodreads_id  30 non-null     object 
dtypes: bool(1), float64(3), int64(1), object(3)
memory usage: 1.8+ KB
None


Unnamed: 0,title,author,pages,rating,ratings,pub,read?,goodreads_id
0,Knee Ability Zero,"Patrick, Ben",98.0,4.37,474,,True,59563188
1,The Idea Factory: Bell Labs and the Great Age of American Innovation,"Gertner, Jon",432.0,4.19,7489,2012.0,False,11797471
2,Is Maths Real? How Simple Questions Lead Us to Mathematicsâ Deepest Truths,"Cheng, Eugenia",328.0,3.55,121,,False,62610160
3,"The Sword in the Stone (The Once and Future King, #1)","White, T.H.",352.0,3.89,26323,1938.0,True,316845
4,"Artificial Condition (The Murderbot Diaries, #2)","Wells, Martha",158.0,4.28,128179,2018.0,True,36223860


In [147]:
def get_all_goodreads_user_books(user, max_pages=None):
    """Fetch every page of a Goodreads user's book list and stack the results.

    Pages are requested one at a time, starting at page 1, until a page comes
    back empty (or ``max_pages`` pages have been collected).

    Parameters
    ----------
    user : str
        Goodreads user id, passed through to ``get_goodreads_user_books_by_page``.
    max_pages : int or None, default None
        Optional upper bound on the number of pages to request. ``None`` keeps
        the original behaviour of crawling until the first empty page.

    Returns
    -------
    pandas.DataFrame
        All pages concatenated with a fresh 0..n-1 index. If the very first
        page is empty, an empty ``pd.DataFrame()`` is returned.
    """
    page_frames = []
    page_num = 1

    while max_pages is None or page_num <= max_pages:
        print(f"Fetching {user}'s Page {page_num}...")
        books_on_page = get_goodreads_user_books_by_page(user, page_num)
        if books_on_page.empty:
            print(f'Page {page_num} is empty.')
            break
        page_frames.append(books_on_page)
        page_num += 1

    # pd.concat raises on an empty list, so the "no books at all" case is
    # handled explicitly and keeps returning an empty frame.
    if not page_frames:
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the accumulated frame
    # on every iteration.
    return pd.concat(page_frames, ignore_index=True)

In [148]:
# Crawl every page of this user's Goodreads list (one request per page, with
# progress printed below) and preview the combined frame.
all_bens_books_df = get_all_goodreads_user_books('42944663')
all_bens_books_df.head()

Fetching 42944663's Page 1...
Fetching 42944663's Page 2...
Fetching 42944663's Page 3...
Fetching 42944663's Page 4...
Fetching 42944663's Page 5...
Fetching 42944663's Page 6...
Fetching 42944663's Page 7...
Page 7 is empty.


Unnamed: 0,title,author,pages,rating,ratings,pub,read?,goodreads_id
0,Knee Ability Zero,"Patrick, Ben",98.0,4.38,468,,True,59563188
1,The Idea Factory: Bell Labs and the Great Age of American Innovation,"Gertner, Jon",432.0,4.19,7477,2012.0,False,11797471
2,Is Maths Real?: How Simple Questions Lead Us to Mathematicsâ Deepest Truths,"Cheng, Eugenia",328.0,3.61,114,,False,62610160
3,"The Sword in the Stone (The Once and Future King, #1)","White, T.H.",352.0,3.89,26304,1938.0,True,316845
4,"Artificial Condition (The Murderbot Diaries, #2)","Wells, Martha",158.0,4.28,127052,2018.0,True,36223860


In [149]:
# How many books are flagged as read (times read > 0) versus not read
all_bens_books_df['read?'].value_counts()

read?
True     82
False    80
Name: count, dtype: int64

In [150]:
# Sanity-check the extracted publication years.
# NOTE(review): the saved output shows a minimum of 180; confirm whether that is
# a genuinely ancient work or a date the year regex mis-parsed.
all_bens_books_df['pub'].describe()

count     152.000000
mean     1980.638158
std       154.407286
min       180.000000
25%      1983.750000
50%      2011.000000
75%      2018.000000
max      2023.000000
Name: pub, dtype: float64

In [151]:
all_bens_books_df.info()

# Cool, so every book has a "goodreads_id", "rating", "ratings", and "read?".
# That's technically all we need from Goodreads.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162 entries, 0 to 161
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         162 non-null    object 
 1   author        162 non-null    object 
 2   pages         156 non-null    float64
 3   rating        162 non-null    float64
 4   ratings       162 non-null    int64  
 5   pub           152 non-null    float64
 6   read?         162 non-null    bool   
 7   goodreads_id  162 non-null    object 
dtypes: bool(1), float64(3), int64(1), object(3)
memory usage: 9.1+ KB


In [152]:
import requests
import json

def get_genres_from_hardcover(goodreads_ids):
    """Look up Hardcover tags for a collection of Goodreads book IDs.

    Parameters
    ----------
    goodreads_ids : iterable
        Goodreads IDs (a list or a pandas Series). Each value is converted with
        ``str()``; null values are skipped.

    Returns
    -------
    pandas.DataFrame
        Columns ``external_id`` (the Goodreads ID echoed back by the API) and
        ``tags`` (the book's tag names joined with ", "; empty string when the
        book has no taggings). One row per mapping returned by the API. The two
        columns are always present, even when there are no rows.

    Raises
    ------
    requests.HTTPError
        If the HTTP response has a 4xx/5xx status.
    RuntimeError
        If the GraphQL response body contains an ``errors`` entry.
    """
    result_columns = ['external_id', 'tags']

    # json.dumps produces a correctly quoted and escaped string literal for each
    # ID, so an unusual value cannot break out of the query text.
    id_literals = [json.dumps(str(id_)) for id_ in goodreads_ids if not pd.isna(id_)]
    if not id_literals:
        # Nothing to look up: skip the network call, but keep the schema so a
        # later merge on 'external_id' still works.
        return pd.DataFrame(columns=result_columns)
    ids_string = ', '.join(id_literals)

    url = "https://hardcover-production.hasura.app/v1/graphql"
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {hardcover_bearer_token}'
    }

    # Construct the GraphQL query
    query = f"""
    query GetBookByGoodreadsIDs {{
      book_mappings(
        where: {{platform: {{id: {{_eq: 1}}}}, external_id: {{_in: [{ids_string}]}}}}
      ) {{
        external_id
        book {{
          taggings {{
            tag {{
              tag
            }}
          }}
        }}
      }}
    }}
    """

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.post(
        url,
        headers=headers,
        json={"query": query, "variables": {}},
        timeout=30,
    )
    response.raise_for_status()
    body = response.json()

    # Surface an error reported in the response body instead of failing later
    # with an opaque KeyError on 'data'.
    if body.get('errors'):
        messages = '; '.join(
            str(err.get('message', err)) if isinstance(err, dict) else str(err)
            for err in body['errors']
        )
        raise RuntimeError(f'Hardcover GraphQL request failed: {messages}')

    flattened_data = []
    for entry in body['data']['book_mappings']:
        # Tolerate a mapping whose book (or taggings list) is null.
        taggings = (entry.get('book') or {}).get('taggings') or []

        # Flatten the taggings into a single string separated by commas
        tags = ', '.join(tagging['tag']['tag'] for tagging in taggings)
        flattened_data.append({'external_id': entry['external_id'], 'tags': tags})

    return pd.DataFrame(flattened_data, columns=result_columns)


In [153]:
# Example usage: look up Hardcover tags for every Goodreads ID collected above
# (all IDs are sent in a single request).
goodreads_ids = all_bens_books_df['goodreads_id']
hardcover_genres_df = get_genres_from_hardcover(goodreads_ids)

hardcover_genres_df.head()

Unnamed: 0,external_id,tags
0,10235,"Biography, Nonfiction, General"
1,10353369,
2,10799,"Classics, War"
3,11155890,
4,113934,"Fiction, Business"


In [154]:
# Left join: keep every Goodreads book and attach Hardcover tags where the
# Goodreads ID matches an external_id; books without a match get NaN.
all_bens_books_genres_df = all_bens_books_df.merge(
    hardcover_genres_df,
    how='left',
    left_on='goodreads_id',
    right_on='external_id',
)

all_bens_books_genres_df.head()

Unnamed: 0,title,author,pages,rating,ratings,pub,read?,goodreads_id,external_id,tags
0,Knee Ability Zero,"Patrick, Ben",98.0,4.38,468,,True,59563188,59563188,
1,The Idea Factory: Bell Labs and the Great Age of American Innovation,"Gertner, Jon",432.0,4.19,7477,2012.0,False,11797471,11797471,History
2,Is Maths Real?: How Simple Questions Lead Us to Mathematicsâ Deepest Truths,"Cheng, Eugenia",328.0,3.61,114,,False,62610160,62610160,
3,"The Sword in the Stone (The Once and Future King, #1)","White, T.H.",352.0,3.89,26304,1938.0,True,316845,316845,"Fantasy, Classics, Young Adult, Adventure"
4,"Artificial Condition (The Murderbot Diaries, #2)","Wells, Martha",158.0,4.28,127052,2018.0,True,36223860,36223860,"Science fiction, Space, Fantasy, Adventure, LGBTQ, Adventurous, funny, hopeful, inspiring, mysterious, tense, fast, Character driven, Strong Character Development, Loveable Characters, Diverse Characters, Science fiction, Science fiction, fast, Science fiction, Science fiction, Fiction, Space, Adventurous, funny, mysterious, Loveable Characters, Adventurous, fast, Plot driven, Strong Character Development, Loveable Characters, Diverse Characters, Adventurous, funny, mysterious, fast, A mix driven, Strong Character Development, Loveable Characters, Diverse Characters"


In [155]:
# Check non-null counts after the left merge: books with no Hardcover match
# show up as nulls in external_id and tags.
all_bens_books_genres_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162 entries, 0 to 161
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         162 non-null    object 
 1   author        162 non-null    object 
 2   pages         156 non-null    float64
 3   rating        162 non-null    float64
 4   ratings       162 non-null    int64  
 5   pub           152 non-null    float64
 6   read?         162 non-null    bool   
 7   goodreads_id  162 non-null    object 
 8   external_id   160 non-null    object 
 9   tags          160 non-null    object 
dtypes: bool(1), float64(3), int64(1), object(5)
memory usage: 11.7+ KB


In [156]:
# Books that found no Hardcover mapping, i.e. external_id is null after the merge
all_bens_books_genres_df.loc[lambda frame: frame['external_id'].isna()]

Unnamed: 0,title,author,pages,rating,ratings,pub,read?,goodreads_id,external_id,tags
8,The Covenant of Water,"Verghese, Abraham",724.0,4.49,104975,2023.0,False,180357146,,
12,"Sapiens: A Graphic History, Volume 2 - The Pillars of Civilization","Harari, Yuval Noah",256.0,4.26,3581,2011.0,True,57924373,,


In [12]:
from bs4 import BeautifulSoup
import requests
import re

def get_user_info(user_id):
    """Scrape basic profile facts from a Goodreads user page.

    Parameters
    ----------
    user_id : str
        Identifier used in the ``/user/show/<user_id>`` URL.

    Returns
    -------
    dict
        Keys ``user_id``, ``full_name``, ``first_name``, ``last_name``,
        ``username``, ``books_read`` and ``number_of_friends``. All values are
        strings taken from the page. A field whose element or pattern is not
        found is ``None``, and ``user_id`` falls back to the argument when the
        canonical link is missing.

    Raises
    ------
    requests.HTTPError
        If the profile page responds with a 4xx/5xx status.
    """
    url = f"https://www.goodreads.com/user/show/{user_id}"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    def attribute_of(tag_name, attrs, attribute):
        """Return one attribute of the first matching tag, or None if absent."""
        tag = soup.find(tag_name, attrs)
        return tag.get(attribute) if tag is not None else None

    def first_group(pattern, text):
        """Return capture group 1 of the first match in text, or None."""
        match = re.search(pattern, text or '')
        return match.group(1) if match else None

    # The canonical URL's last path segment is "<id>-<slug>"; keep the part
    # before the first dash. Fall back to the argument when it is unavailable.
    canonical_link = attribute_of('link', {'rel': 'canonical'}, 'href')
    if canonical_link:
        user_id = canonical_link.split('/')[-1].split('-')[0] or user_id

    # Extract the number of books from the <title> tag
    title_tag = soup.find('title')
    books_read = first_group(r'(\d+)\s+books', title_tag.text if title_tag is not None else '')

    # Extract the name fields from the Open Graph / profile meta tags
    full_name = attribute_of('meta', {'property': 'og:title'}, 'content')
    first_name = attribute_of('meta', {'property': 'profile:first_name'}, 'content')
    last_name = attribute_of('meta', {'property': 'profile:last_name'}, 'content')
    username = attribute_of('meta', {'property': 'profile:username'}, 'content')

    # Extract the number of friends from the page text, e.g. " Friends (110)"
    friends = first_group(r" Friends \((\d+)\)", soup.text)

    return {
        'user_id': user_id,
        'full_name': full_name,
        'first_name': first_name,
        'last_name': last_name,
        'username': username,
        'books_read': books_read,
        'number_of_friends': friends
    }

# Example usage: scrape the profile page for one Goodreads user id and show
# the resulting dict.
user_info = get_user_info('48799880')
print(user_info)

{'user_id': '48799880', 'full_name': 'Lisa Tsinis', 'first_name': 'Lisa', 'last_name': 'Tsinis', 'username': 'lisatsinis', 'books_read': '233', 'number_of_friends': '110'}
