# Setup

In [1]:
import pandas as pd
import requests
import json
import math

In [105]:
def google_book_search(title, author, publisher):
    search_terms = " ".join(filter(None, [title, author, publisher]))
    url = 'https://www.googleapis.com/books/v1/volumes?q='
    response = requests.get(url+search_terms)
    data = response.json()
    # Normalizing data
    df = pd.json_normalize(data, record_path=['items'])
    return df


search_terms = "maniac labatut"
url = 'https://www.googleapis.com/books/v1/volumes?q='
response = requests.get(url+search_terms)
data = response.json()
# # Normalizing data
df = pd.json_normalize(data, record_path=['items'])

In [13]:
secret_key = ''
database_id = '1b71653e363148e0b2ba399638473ced'

In [18]:
def query_databases(secret_key, database_id):
    url = "https://api.notion.com/v1/databases/"+database_id+'/query'

    payload = {'id': database_id}
    headers = {
        'Notion-Version': '2021-05-13',
        'Authorization': 'Bearer '+secret_key
    }

    response = requests.request(
        "POST", url, headers=headers, data=payload)
    print(f"The response code is {response.status_code}")
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    else:
        return response.json()

In [21]:
res = query_databases(secret_key, database_id)

The response code is 200


In [89]:
res.get('results')[0].get('properties')[
    'Summary']['rich_text'][0].get('plain_text')

'Dieses eBook: "Moby Dick" ist mit einem detaillierten und dynamischen Inhaltsverzeichnis versehen und wurde sorgfältig korrekturgelesen. Moby-Dick beginnt mit dem Satz: "Call me Ishmael." Es folgt die Ich-Erzählung des Matrosen Ismael (sein voller Name wird nie genannt), der aus einer angesehenen Familie stammt und sich entscheidet, als Matrose zur See zu fahren, um seiner Melancholie zu entfliehen. Er spricht von einem unbändigen Drang in ihm, der ihn überkomme, wenn er des Festlands überdrüssig sei. Ismael hat bereits einige Fahrten auf Handelsschiffen hinter sich, will nun aber auf einem Walfänger anheuern. Das erzählerische Rückgrat des Romans ist die schicksalhafte Fahrt des Walfangschiffes Pequod, dessen einbeiniger Kapitän Ahab mit blindem Hass den weißen Pottwal jagt, der ihm das Bein abgerissen hat. Entlang dieses erzählerischen Fadens reiht Melville zahlreiche philosophische, wissenschaftliche, kunstgeschichtliche und mythologische Exkurse. Dieses eBook ist eine Überarbeitun

In [98]:
notion_columns = ['Category', 'Publisher', 'Summary', 'Current page', 'Link',
                  'Total pages', 'Date started', 'Author', 'Title', 'url', 'page_id']
notion = pd.DataFrame(columns=notion_columns)
print(notion.columns)
print(notion.head())
for page in res.get('results'):
    properties = page.get('properties')
    try:
        author = properties.get('Author').get('rich_text')[0].get('plain_text')
    except IndexError:
        author = None
    try:
        title = properties.get('Title').get('title')[0].get('plain_text')
    except IndexError:
        title = None
    try:
        publisher = properties['Publisher']['select']['name']
    except KeyError:
        publisher = None
    try:
        category = properties['Category']['select']['name']
    except KeyError:
        category = None
    try:
        summary = properties['Summary']['rich_text'][0]['plain_text']
    except IndexError:
        summary = None
    try:
        current_page = properties['Current page']['number']
    except KeyError:
        current_page = None
    try:
        link = properties['Link']['url']
    except KeyError:
        link = None
    total_pages = properties['Total pages']['number']
    try:
        date_started = properties['Date started']['date']['start']
    except KeyError:
        date_started = None

    url = page.get('url')
    page_id = url[-32:]
    # concat the data
    notion = pd.concat([notion, pd.DataFrame([[category, publisher, summary, current_page, link, total_pages,
                       date_started, author, title, url, page_id]], columns=notion_columns)], ignore_index=True)
# drop rows without title
notion = notion.dropna(subset=['Title'])

Index(['Category', 'Publisher', 'Summary', 'Current page', 'Link',
       'Total pages', 'Date started', 'Author', 'Title', 'url', 'page_id'],
      dtype='object')
Empty DataFrame
Columns: [Category, Publisher, Summary, Current page, Link, Total pages, Date started, Author, Title, url, page_id]
Index: []


In [106]:
google_results = pd.DataFrame()

for book in notion.itertuples():

    google_results = pd.concat([google_results, google_book_search(
        book.Title, book.Author, book.Publisher)], ignore_index=True)

In [110]:

google_data = google_results[['selfLink', 'volumeInfo.title',
                              'volumeInfo.subtitle', 'volumeInfo.authors', 'volumeInfo.publisher',
                              'volumeInfo.publishedDate', 'volumeInfo.description', 'volumeInfo.pageCount', 'volumeInfo.categories',
                              'volumeInfo.imageLinks.smallThumbnail', 'volumeInfo.imageLinks.thumbnail', 'saleInfo.country', 'saleInfo.retailPrice.amount',
                              'saleInfo.retailPrice.currencyCode'
                              ]]

In [145]:
import pandas as pd


def clean_google_data(google_df, notion_df):
    filtered_results = []

    for _, notion_row in notion_df.iterrows():
        try:
            # Filter google_df by title
            matches = google_df[google_df['volumeInfo.title']
                                == notion_row['Title']]
        except TypeError as e:
            print(f"TypeError encountered while filtering by title: {e}")
            continue

        try:
            # Further filter by author if available
            if pd.notna(notion_row.get('Author')):
                matches = matches[matches['volumeInfo.authors'].apply(
                    lambda authors: notion_row['Author'] in authors if authors else False)]
        except TypeError as e:
            print(f"TypeError encountered while filtering by author: {e}")
            continue

        try:
            # Further filter by publisher if available
            if pd.notna(notion_row.get('Publisher')):
                matches = matches[matches['volumeInfo.publisher']
                                  == notion_row['Publisher']]
        except TypeError as e:
            print(f"TypeError encountered while filtering by publisher: {e}")
            continue

        try:
            # If there are matches, keep the latest by published_date
            latest_match = matches.sort_values(
                by='volumeInfo.publishedDate', ascending=False).iloc[0]
            filtered_results.append(latest_match)
        except IndexError as e:
            # only single match, append
            print(f"Not match found for {notion_row['Title']}")
            continue
        except KeyError:
            print(f"Not match found for {notion_row['Title']}")
            continue

    # Convert the list of filtered results to a DataFrame
    filtered_df = pd.DataFrame(filtered_results)
    return filtered_df

In [144]:
clean_google_data = clean_google_data(google_data, notion)

Empty DataFrame
Columns: [selfLink, volumeInfo.title, volumeInfo.subtitle, volumeInfo.authors, volumeInfo.publisher, volumeInfo.publishedDate, volumeInfo.description, volumeInfo.pageCount, volumeInfo.categories, volumeInfo.imageLinks.smallThumbnail, volumeInfo.imageLinks.thumbnail, saleInfo.country, saleInfo.retailPrice.amount, saleInfo.retailPrice.currencyCode]
Index: []
Not match found for Moby Dick
Empty DataFrame
Columns: [selfLink, volumeInfo.title, volumeInfo.subtitle, volumeInfo.authors, volumeInfo.publisher, volumeInfo.publishedDate, volumeInfo.description, volumeInfo.pageCount, volumeInfo.categories, volumeInfo.imageLinks.smallThumbnail, volumeInfo.imageLinks.thumbnail, saleInfo.country, saleInfo.retailPrice.amount, saleInfo.retailPrice.currencyCode]
Index: []
Not match found for Winterbienen
TypeError encountered while filtering by author: argument of type 'float' is not iterable
                                             selfLink volumeInfo.title  \
30  https://www.googlea

In [132]:
clean_google_data

Unnamed: 0,selfLink,volumeInfo.title,volumeInfo.subtitle,volumeInfo.authors,volumeInfo.publisher,volumeInfo.publishedDate,volumeInfo.description,volumeInfo.pageCount,volumeInfo.categories,volumeInfo.imageLinks.smallThumbnail,volumeInfo.imageLinks.thumbnail,saleInfo.country,saleInfo.retailPrice.amount,saleInfo.retailPrice.currencyCode
30,https://www.googleapis.com/books/v1/volumes/nG...,Über Menschen,Roman,[Juli Zeh],Luchterhand Literaturverlag,2021-03-22,Dora ist mit ihrer kleinen Hündin aufs Land ge...,351.0,[Fiction],http://books.google.com/books/content?id=nGIGE...,http://books.google.com/books/content?id=nGIGE...,DE,10.99,EUR
40,https://www.googleapis.com/books/v1/volumes/mk...,Zur See,Roman - Der Nummer 1 Bestseller,[Dörte Hansen],Verlagsgruppe Random House GmbH,2022-09-28,Der dritte Roman von Bestsellerautorin Dörte H...,193.0,[Fiction],http://books.google.com/books/content?id=mkVnE...,http://books.google.com/books/content?id=mkVnE...,DE,14.99,EUR
68,https://www.googleapis.com/books/v1/volumes/nc...,To Kill a Mockingbird,,[Harper Lee],Harper Perennial Modern Classics,2006-05-23,Harper Lee's Pulitzer Prize-winning masterwork...,346.0,[Fiction],http://books.google.com/books/content?id=ncuX8...,http://books.google.com/books/content?id=ncuX8...,DE,,
71,https://www.googleapis.com/books/v1/volumes/vR...,The Innovators,Die Vordenker der digitalen Revolution von Ada...,[Walter Isaacson],C. Bertelsmann Verlag,2018-04-23,"Sind sie jetzt Nerds, Weltverbesserer oder Spi...",479.0,[Biography & Autobiography],http://books.google.com/books/content?id=vRs-D...,http://books.google.com/books/content?id=vRs-D...,DE,19.99,EUR
82,https://www.googleapis.com/books/v1/volumes/GU...,The Code Breaker,"Jennifer Doudna, Gene Editing, and the Future ...",[Walter Isaacson],Simon and Schuster,2022-05-03,"""A gripping account of how the pioneering scie...",560.0,[Biography & Autobiography],http://books.google.com/books/content?id=GUSFE...,http://books.google.com/books/content?id=GUSFE...,DE,,
102,https://www.googleapis.com/books/v1/volumes/el...,The Children of Hurin,,[J. R. R. Tolkien],Mariner Books,2008-10,A fantasy adventure saga set in the early days...,313.0,[Fiction],http://books.google.com/books/content?id=eldcP...,http://books.google.com/books/content?id=eldcP...,DE,,
110,https://www.googleapis.com/books/v1/volumes/gW...,Sei du selbst,Geschichte der Philosophie 3,[Richard David Precht],Goldmann Verlag,2019-10-14,Der lang erwartete dritte Band von Prechts fün...,503.0,[Philosophy],http://books.google.com/books/content?id=gWaiC...,http://books.google.com/books/content?id=gWaiC...,DE,19.99,EUR
121,https://www.googleapis.com/books/v1/volumes/a_...,Prima facie,A novel,[Suzie Miller],Carl Hanser Verlag GmbH Co KG,2024-01-29,"Das Gesetz ist dazu da, ALLE Menschen zu besch...",320.0,[Fiction],http://books.google.com/books/content?id=a_LnE...,http://books.google.com/books/content?id=a_LnE...,DE,18.99,EUR
130,https://www.googleapis.com/books/v1/volumes/g9...,SPQR,Die tausendjährige Geschichte Roms,[Mary Beard],S. Fischer Verlag,2016-10-13,Die Geschichte Roms für unsere Zeit: Wer hätte...,774.0,[History],http://books.google.com/books/content?id=g9ARD...,http://books.google.com/books/content?id=g9ARD...,DE,5.99,EUR
150,https://www.googleapis.com/books/v1/volumes/p3...,Offene See,Roman,[Benjamin Myers],Dumont Buchverlag,2020-03-20,Eine zeitlose und geradezu zärtliche Geschicht...,234.0,[Fiction],http://books.google.com/books/content?id=p3nAD...,http://books.google.com/books/content?id=p3nAD...,DE,9.99,EUR


Continue in debugging mode. Some entries like moby dick seem to be excluded from the matches. Have to find reason why.