# Setup

In [16]:
import pandas as pd
import requests
import json
import math
import os
from dotenv import load_dotenv

load_dotenv()

NOTION_SECRET = os.getenv("NOTION_SECRET")
DATABASE_ID = os.getenv("DATABASE_ID")

In [17]:
def google_book_search(title, author, publisher):
    search_terms = " ".join(filter(None, [title, author, publisher]))
    url = 'https://www.googleapis.com/books/v1/volumes?q='
    response = requests.get(url+search_terms)
    data = response.json()
    # Normalizing data
    df = pd.json_normalize(data, record_path=['items'])
    return df


search_terms = "maniac labatut"
url = 'https://www.googleapis.com/books/v1/volumes?q='
response = requests.get(url+search_terms)
data = response.json()
# # Normalizing data
df = pd.json_normalize(data, record_path=['items'])

In [18]:
def query_databases(secret_key, database_id):
    url = "https://api.notion.com/v1/databases/"+database_id+'/query'

    payload = {'id': database_id}
    headers = {
        'Notion-Version': '2021-05-13',
        'Authorization': 'Bearer '+secret_key
    }

    response = requests.request(
        "POST", url, headers=headers, data=payload)
    print(f"The response code is {response.status_code}")
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    else:
        return response.json()

In [19]:
res = query_databases(NOTION_SECRET, DATABASE_ID)

The response code is 200


In [20]:
res.get('results')[0].get('properties')[
    'Summary']['rich_text'][0].get('plain_text')

'Dieses eBook: "Moby Dick" ist mit einem detaillierten und dynamischen Inhaltsverzeichnis versehen und wurde sorgfältig korrekturgelesen. Moby-Dick beginnt mit dem Satz: "Call me Ishmael." Es folgt die Ich-Erzählung des Matrosen Ismael (sein voller Name wird nie genannt), der aus einer angesehenen Familie stammt und sich entscheidet, als Matrose zur See zu fahren, um seiner Melancholie zu entfliehen. Er spricht von einem unbändigen Drang in ihm, der ihn überkomme, wenn er des Festlands überdrüssig sei. Ismael hat bereits einige Fahrten auf Handelsschiffen hinter sich, will nun aber auf einem Walfänger anheuern. Das erzählerische Rückgrat des Romans ist die schicksalhafte Fahrt des Walfangschiffes Pequod, dessen einbeiniger Kapitän Ahab mit blindem Hass den weißen Pottwal jagt, der ihm das Bein abgerissen hat. Entlang dieses erzählerischen Fadens reiht Melville zahlreiche philosophische, wissenschaftliche, kunstgeschichtliche und mythologische Exkurse. Dieses eBook ist eine Überarbeitun

In [21]:
notion_columns = ['Category', 'Publisher', 'Summary', 'Current page', 'Link',
                  'Total pages', 'Date started', 'Author', 'Title', 'url', 'page_id']
notion = pd.DataFrame(columns=notion_columns)
print(notion.columns)
print(notion.head())
for page in res.get('results'):
    properties = page.get('properties')
    try:
        author = properties.get('Author').get('rich_text')[0].get('plain_text')
    except IndexError:
        author = None
    try:
        title = properties.get('Title').get('title')[0].get('plain_text')
    except IndexError:
        title = None
    try:
        publisher = properties['Publisher']['select']['name']
    except KeyError:
        publisher = None
    try:
        category = properties['Category']['select']['name']
    except KeyError:
        category = None
    try:
        summary = properties['Summary']['rich_text'][0]['plain_text']
    except IndexError:
        summary = None
    try:
        current_page = properties['Current page']['number']
    except KeyError:
        current_page = None
    try:
        link = properties['Link']['url']
    except KeyError:
        link = None
    total_pages = properties['Total pages']['number']
    try:
        date_started = properties['Date started']['date']['start']
    except KeyError:
        date_started = None

    url = page.get('url')
    page_id = url[-32:]
    # concat the data
    notion = pd.concat([notion, pd.DataFrame([[category, publisher, summary, current_page, link, total_pages,
                       date_started, author, title, url, page_id]], columns=notion_columns)], ignore_index=True)
# drop rows without title
notion = notion.dropna(subset=['Title'])

Index(['Category', 'Publisher', 'Summary', 'Current page', 'Link',
       'Total pages', 'Date started', 'Author', 'Title', 'url', 'page_id'],
      dtype='object')
Empty DataFrame
Columns: [Category, Publisher, Summary, Current page, Link, Total pages, Date started, Author, Title, url, page_id]
Index: []


In [22]:
google_results = pd.DataFrame()

for book in notion.itertuples():

    google_results = pd.concat([google_results, google_book_search(
        book.Title, book.Author, book.Publisher)], ignore_index=True)

In [23]:

google_data = google_results[['selfLink', 'volumeInfo.title',
                              'volumeInfo.subtitle', 'volumeInfo.authors', 'volumeInfo.publisher',
                              'volumeInfo.publishedDate', 'volumeInfo.description', 'volumeInfo.pageCount', 'volumeInfo.categories',
                              'volumeInfo.imageLinks.smallThumbnail', 'volumeInfo.imageLinks.thumbnail', 'saleInfo.country', 'saleInfo.retailPrice.amount',
                              'saleInfo.retailPrice.currencyCode'
                              ]]

In [35]:
notion

Unnamed: 0,Category,Publisher,Summary,Current page,Link,Total pages,Date started,Author,Title,url,page_id
0,Fiction,dtv,"Dieses eBook: ""Moby Dick"" ist mit einem detail...",20,https://www.googleapis.com/books/v1/volumes/Vq...,920,2024-08-11,Herman Melville,Moby Dick,https://www.notion.so/Moby-Dick-8ae970fcc5034b...,8ae970fcc5034b6ca551eee7e434b2b4
1,Fiction,C.H. Beck,Januar 1944: Während über der Eifel britische ...,250,https://www.googleapis.com/books/v1/volumes/ui...,250,2021-02-28,Norbert Scheuer,Winterbienen,https://www.notion.so/Winterbienen-36a8bcd366d...,36a8bcd366d64786b768eb2509ff3e6f
2,,,Was ist Mythos? Der Autor charakterisiert den ...,,https://www.googleapis.com/books/v1/volumes/BY...,468,2023-11-14,Stephen Fry,Mythos,https://www.notion.so/Mythos-eb60075d9db74e36b...,eb60075d9db74e36ba530479bb452c36
3,,,Dora ist mit ihrer kleinen Hündin aufs Land ge...,,https://www.googleapis.com/books/v1/volumes/nG...,320,2021-04-01,Juli Zeh,Über Menschen,https://www.notion.so/ber-Menschen-a044186a860...,a044186a86014c3b89fb198155814ada
4,,,,,https://www.googleapis.com/books/v1/volumes/PK...,208,2022-12-10,Dörte Hansen,Zur See,https://www.notion.so/Zur-See-b30e1dd1d3c64ce4...,b30e1dd1d3c64ce48d768bde69d3c2a8
...,...,...,...,...,...,...,...,...,...,...,...
67,,,Die Astronomie untersucht die Eigenschaften de...,,https://www.googleapis.com/books/v1/volumes/lq...,1228,2024-03-22,Jeffrey O. Bennett,Astronomie,https://www.notion.so/Astronomie-8a838eaa595d4...,8a838eaa595d45d8bb1899b2246324c6
68,,,Ein liniertes Notizbuch in Standartgröße. Auf ...,,https://www.googleapis.com/books/v1/volumes/tO...,661,,Jonathan Eig,Ali,https://www.notion.so/Ali-f9338061df794fd3b908...,f9338061df794fd3b9086bad63615b78
69,,,The edition of the works of the three sixth-ce...,,https://www.googleapis.com/books/v1/volumes/rR...,195,2023-02-24,Carlo Rovelli,Anaximander,https://www.notion.so/Anaximander-3f8a812a9b9a...,3f8a812a9b9a40afac54d856189e7d7c
70,Fiction,,Mit dem Deutschen Buchpreis 2020 ausgezeichnet...,,https://www.googleapis.com/books/v1/volumes/1n...,201,2023-06-11,Annette Weber,"Annette, ein Heldinnenepos",https://www.notion.so/Annette-ein-Heldinnenepo...,fcfb16d9e44e48768fc04d1f995ffd3a


In [75]:
import pandas as pd


def clean_google_data(google_df, notion_df):
    filtered_results = []

    for _, notion_row in notion_df.iterrows():
        try:
            # Convert Notion title to lowercase and split into words
            notion_title_words = notion_row['Title'].lower().split()
            # Create a regex pattern to match all words
            pattern = '.*'.join(notion_title_words)
            # Filter google_df by title using regex
            matches = google_df[google_df['volumeInfo.title'].str.lower(
            ).str.contains(pattern, regex=True, na=False)]

        except TypeError as e:
            print(f"TypeError encountered while filtering by title: {e}")
            continue

        try:
            # Further filter by author if available
            if pd.notna(notion_row.get('Author')):
                tmp_df = matches[matches['volumeInfo.authors'].apply(
                    lambda authors: notion_row['Author'] in authors if isinstance(authors, list) else False)]
                if not tmp_df.empty:  # If there are matches, keep them
                    matches = tmp_df
        except TypeError as e:
            print(f"TypeError encountered while filtering by author: {e}")
            continue

        try:
            # Further filter by publisher if available
            if pd.notna(notion_row.get('Publisher')):
                tmp_df = matches[matches['volumeInfo.publisher'].apply(
                    lambda publisher: notion_row['Publisher'] == publisher if isinstance(publisher, str) else False)]
                if not tmp_df.empty:  # If there are matches, keep them
                    matches = tmp_df

        except TypeError as e:
            print(f"TypeError encountered while filtering by publisher: {e}")
            continue
        try:
            # If there are matches, keep the latest by published_date
            latest_match = matches.sort_values(
                by='volumeInfo.publishedDate', ascending=False).iloc[0]
            filtered_results.append(latest_match)
        except IndexError as e:
            # only single match, append
            if not matches.empty:
                filtered_results.append(matches)
            else:
                print(f"No match found for {notion_row['Title']}")
            continue
        except KeyError:
            if not matches.empty:
                filtered_results.append(matches)
            else:
                print(f"No match found for {notion_row['Title']}")
            continue

    # Convert the list of filtered results to a DataFrame
    filtered_df = pd.DataFrame(filtered_results)
    return filtered_df

In [76]:
clean_google_data = clean_google_data(google_data, notion)

No match found for Das Universum in der Nussschale
