In [1]:
!pip install openai pandas tqdm rapidfuzz scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting openai
  Downloading openai-1.82.1-py3-none-any.whl.metadata (25 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.11.5-py3-none-any.whl.metadata (67 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadat

In [2]:
import pandas as pd
import numpy as numpy
import re

from openai import OpenAI
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
from rapidfuzz import fuzz, process
import time
from openai import RateLimitError

In [None]:
# Configuration dictionary for API keys and models
llm_config = {
    "openai": {
        "api_key": "MASKED",  # Replace with your actual API key
        "models": {
            "default": "gpt-4o-mini",
            "advanced": "gpt-4",
            "economy": "gpt-4o-mini"
        }
    }
}

# Initialize clients with API keys from config
# Choose which provider(s) you want to use and comment out the others if you don't have all API keys
openai_client = OpenAI(api_key=llm_config["openai"]["api_key"])

In [4]:
def analyze_document(document_text):
    """
    Analyze a Danish 19th-century newspaper announcement and extract book titles and authors.

    Args:
        document_text (str): The text of the document to analyze.

    Returns:
        pd.DataFrame or str: DataFrame with extracted entities or 'NO BOOKS'.
    """
    system_prompt = (
    "Here is an announcement in a Danish nineteenth-century newspaper. Your task is to extract book titles and authors using the following format:\n\n"
    "original_title: <title in Danish>\n"
    "translated_title: <title in English>\n"
    "author: <author name>\n\n"
    "Guidelines:\n"
    "1. Carefully identify the beginning and end of each book title. Pay attention to capitalization, italics, quotation marks, or context that may indicate a book title. Titles are often followed by a description or an author’s name.\n"
    "2. If the announcement mentions multiple book titles, extract each one separately. Ensure each title is uniquely identified.\n"
    "3. If the author is missing or unclear, use 'NO_AUTHOR'. Verify the context to ensure the correct identification of authors.\n"
    "4. Translate the original Danish title into English yourself for the 'translated_title'. Ensure the translation preserves the meaning and context of the original title.\n"
    "5. Pay special attention to context - announcements may contain other text (e.g., product listings, theater plays, chapter titles) that should not be considered book titles. Identify keywords that separate book titles from other content.\n"
    "6. If no book titles are present, return exactly one row with:\n"
    "   original_title: NO_BOOK\n"
    "   translated_title: NO_BOOK\n"
    "   author: NO_BOOK\n\n"
    "Examples with books:\n"
    "Example1: 'Baggesens allerældste Poesier'.\n"
    "→ original_title: allerældste Poesier; translated_title: Oldest Poems; author: Baggesen\n"
    "Example2: 'Kateketisk Magasin af J. C. Wegener, Forstander for det Kongelige Skolelærer-Seminarium paa Joenstrup.'\n"
    "→ original_title: Kateketisk Magasin; translated_title: Catechetical Magazine; author: J.C. Wegener\n"
    "Example3: 'Ceres. Et periodisk Skrivt for dannede Læsere. Udgiver af F. M. Lange. Femte Hefte. Det indeholder: Juliette, eller det hemmelige Ægteskab, af Frederik Kind. - Jagtgildet, af Washington Irving. Subskription modtages hos Vogelius, Boghandler og Bogbinder.'\n"
    "→ original_title: Juliette, eller det hemmelige Ægteskab; translated_title: Juliette, or The Secret Marriage; author: Frederik Kind\n"
    "→ original_title: Jagtgildet; translated_title: The Hunting Feast; author: Washington Irving\n\n"
    "Examples without books:\n"
    "Example1: 'J. Et Parti gode hjemmegjorte Bolster og Dynevaar er i Dag arriveret og sælges billigst muligt af M. N. Samson.'\n"
    "→ original_title: NO_BOOK; translated_title: NO_BOOK; author: NO_BOOK\n"
    "Example2: 'C. Andersen. Første Afdeling: 'Spanierne i Odense, Vaudeville i 1 Act. Anden Afdeling: 'Fem og tyve Aar derefter i Helsingøer, Vaudeville i 1 Act. Billetter a 2 Mk. 8 s., (Børn det Halve) erholdes i mit Logie hos Hr. Kobbersmed Schmidt. Hvo som tager 6 Billetter erholder disse for 2 A. Werligh. Rbd.' This is a theater announcement.\n"
    "→ original_title: NO_BOOK; translated_title: NO_BOOK; author: NO_BOOK\n"
    "Example3: 'Første Binds andet Hefte, indeholdende følgende Katekisationer: 1 Den ægtekristelige Menneskekjærlighed bør være ufortrøden, virksom, uegennyttig og viis 2 Om de Glæder, den sande Menneskekjærlighed skjænker os 5 Om Guds Almagt; 4 Om Guds Alvidenhed; 5 OmGuds Viisdom; 6 Til Lærebogens 6 Kap. 1. 2. 5, 7 Religion er Menneskets vigtigste Anliggende.' These are chapter titles.\n"
    "→ original_title: NO_BOOK; translated_title: NO_BOOK; author: NO_BOOK\n"
    "Use these guidelines and examples to enhance extraction accuracy and maintain the required output format.\n"
)

    try:
        model = llm_config["openai"]["models"]["default"]

        response = openai_client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": document_text}
            ],
            temperature=0.3,
            max_tokens=800
        )

        result_text = response.choices[0].message.content.strip()

        if result_text.upper() == "NO BOOKS":
            return "NO BOOKS"

        # Extract structured results
        pattern = re.compile(
            r"original_title:\s*(.*?)\ntranslated_title:\s*(.*?)\nauthor:\s*(.*?)(?:\n|$)",
            re.DOTALL
        )
        entries = [
            {
                "original_title": match.group(1).strip(),
                "translated_title": match.group(2).strip(),
                "author": match.group(3).strip()
            }
            for match in pattern.finditer(result_text)
        ]

        return pd.DataFrame(entries)

    except Exception as e:
        return f"Error analyzing document with OpenAI: {str(e)}"


In [5]:
# Example usage in a notebook
sample_document = """
Paa Addresse-Contoiret i Mariboe er pr. Commission til Salg et nitid Exemplar af Pragt-Udgaven af Niels Klims underjordiske Reise ved L. Holberg. Oversat efter den lanske Original af Jens Baggesen Kbh. 1789, m. sine Kobb. Prisen er 4 Rd. D. C. Hr. Lientenant K. H. Seidelins Opfordring til Lollands og Falsters Jndbyggere, at tilberede raae Salpeter, med Prosessor Maschmanns Underretning om Salpeters Tilvirkning, kan paa Addresse-Contoiret. i Mariboe bekommes gratis.
"""


print("OPENAI BOOK TITLE EXTRACTION:")
openai_summary = analyze_document(sample_document)
print(openai_summary)
print("-" * 50)

OPENAI BOOK TITLE EXTRACTION:
                                   original_title  \
0                 Niels Klims underjordiske Reise   
1  Opfordring til Lollands og Falsters Jndbyggere   
2                                         NO_BOOK   

                                   translated_title          author  
0                  Niels Klim's Underground Journey      L. Holberg  
1  Appeal to the Inhabitants of Lolland and Falster  K. H. Seidelin  
2                                           NO_BOOK         NO_BOOK  
--------------------------------------------------


In [5]:
# Load book announcements
book_announces = pd.read_csv('../data/book_announces_250503.csv', index_col=0)
book_announces.head()

Unnamed: 0,article_id,date,embedding,n_chunks_orig,clean_category,nøgle,text,category,article_length,characters,embedding_shape,newspaper,label_type,book_announce,comment,predicted_book_announce
37,lol_000038,1809-03-07,[ 0.03518467 0.00932873 -0.0178937 ... -0.02...,1,Bekjendtgjørelser,1809-03-07_52,Paa Addresse-Contoiret i Mariboe bekommes følg...,Bekiendtgiørelser,76,475,[1024],lol,gold,unknown,,y
108,lol_000109,1809-03-17,[ 0.02211799 0.00355701 -0.02588731 ... -0.00...,1,Bekjendtgjørelser,1809-03-17_174,Maskeradeballet i Dannemark 1808. Et Syn af N....,Bekiendtgiørelser,84,495,[1024],lol,gold,unknown,,y
188,lol_000189,1809-03-28,[ 0.04022709 0.02202421 -0.01741452 ... -0.01...,1,Bekjendtgjørelser,1809-03-28_306,Maskeradeballet i Dannemark 1808. Et Syn af N....,Bekiendtgiørelser,29,148,[1024],lol,gold,unknown,,y
328,lol_000330,1809-04-18,[ 0.02150071 0.00016588 -0.00461735 ... -0.02...,1,Bekjendtgjørelser,1809-04-18_554,"Alle Slags Bogbinder-Arbeide, saavel nyt, som ...",Bekiendtgiørelser,50,341,[1024],lol,gold,unknown,,y
470,lol_000476,1809-05-09,[ 0.0191186 0.00117065 -0.01776858 ... -0.03...,1,Bekjendtgjørelser,1809-05-09_826,Paa Addresse-Contoiret i Mariboe er pr. Commis...,Bekiendtgiørelser,70,469,[1024],lol,gold,unknown,,y


In [7]:
# Create sample with the gold standard articles

gold_df = pd.read_csv("../../newspaper_temp_files/training_testing_titles.csv", index_col=0)

random_sample = book_announces[book_announces['article_id'].isin(gold_df['article_id'])]

In [18]:
random_sample.head()

Unnamed: 0,article_id,date,embedding,n_chunks_orig,clean_category,nøgle,text,category,article_length,characters,embedding_shape,newspaper,label_type,book_announce,comment,predicted_book_announce
1382,lol_001393,1809-09-29,[ 0.05600849 0.01557517 -0.02665625 ... -0.03...,1,Bekjendtgjørelser,1809-09-29_2574,"Løier Et Brev paa Vers til Joh. Nordahl Bruun,...",Bekiendtgiørelser,33,185,[1024],lol,gold,unknown,,y
1822,lol_001834,1809-12-12,[ 0.01884314 0.02579251 0.0012215 ... -0.02...,1,Bekjendtgjørelser,1809-12-12_3367,Bekjendtgjørelser. Jndbydelse. Da udenlandske ...,Bekjendtgjørelser,45,296,[1024],lol,gold,unknown,,y
1975,lol_001987,1810-01-09,[ 0.03172351 0.02591745 -0.00467053 ... -0.00...,1,Bekjendtgjørelser,1810-01-09_3675,Bekjendtgjørelser. Gudstjenesten begynder 2den...,Bekjendtgjørelser,30,196,[1024],lol,gold,unknown,,y
2547,lol_002560,1810-03-22,[-0.00690638 0.02288655 -0.03385538 ... 0.00...,1,Bekjendtgjørelser,1810-03-22_4723,Disse sande skjulte Menneskevenner takkes paa ...,Bekjendtgjørelser,12,79,[1024],lol,gold,unknown,,y
3121,lol_003135,1810-06-05,[ 0.03199682 0.00117054 -0.01987574 ... -0.01...,1,Bekjendtgjørelser,1810-06-05_5778,"Tanker i Anledning af Skrivelsen fra Falster, ...",Bekjendtgjørelser,22,130,[1024],lol,gold,unknown,,y


In [None]:
# Create a random sample

#random_sample = book_announces.sample(n=300, random_state=42)

#random_sample = book_announces.groupby('newspaper').sample(n=50, random_state=42)
#random_sample.shape

(300, 16)

In [101]:
# This will hold individual mini-DataFrames returned by analyze_document
books_dfs = []

for idx, row in tqdm(random_sample.iterrows(), total=len(random_sample)):
    article_id = row['article_id']
    date = row['date']
    raw_text = row['text']
    
    # Run your analysis
    result = analyze_document(raw_text)
    
    # Handle case where result is a string (either "NO BOOKS" or unexpected)
    if isinstance(result, str):
        if result.strip().upper() == "NO BOOKS":
            continue
        else:
            # Optional: parse the string into a DataFrame if the output is line-separated
            # But ideally your function should return a proper DataFrame if expected
            print(f"Warning: Expected DataFrame but got string for article_id {article_id}")
            continue
    
    # Now we're sure result is a DataFrame — attach article_id
    result['article_id'] = article_id
    result['date'] = date
    result['text'] = raw_text
    books_dfs.append(result)

# Concatenate all results
books_df = pd.concat(books_dfs, ignore_index=True)

100%|██████████| 293/293 [05:34<00:00,  1.14s/it]


In [9]:
books_df.head()

Unnamed: 0,original_title,translated_title,author,article_id,date,text
0,Løier Et Brev paa Vers til Joh. Nordahl Bruun,Løier A Letter in Verse to Joh. Nordahl Bruun,Niels Tønder Lund Gunnerus,lol_001393,1809-09-29,"Løier Et Brev paa Vers til Joh. Nordahl Bruun,..."
1,Foraaret,The Spring,James Thomson,lol_001393,1809-09-29,"Løier Et Brev paa Vers til Joh. Nordahl Bruun,..."
2,NO_BOOK,NO_BOOK,NO_BOOK,lol_001834,1809-12-12,Bekjendtgjørelser. Jndbydelse. Da udenlandske ...
3,NO_BOOK,NO_BOOK,NO_BOOK,lol_001987,1810-01-09,Bekjendtgjørelser. Gudstjenesten begynder 2den...
4,NO_BOOK,NO_BOOK,C. F. Schultz,lol_002560,1810-03-22,Disse sande skjulte Menneskevenner takkes paa ...


In [86]:
books_df.shape

(425, 6)

In [16]:
no_books = random_sample[~random_sample['article_id'].isin(books_df['article_id'])]
no_books.shape

(160, 16)

In [None]:
books_df.to_csv('../results/prompts/P3_extracted_titles_random_sample_api.csv')
random_sample.to_csv('../results/prompts/P3_random_sample_api.csv')
no_books[['article_id', 'text']].to_csv('../results/prompts/P3_no_books.csv')

### Test prompts and compare with gold standard

In [102]:
# Normalize and prepare
pred_df = books_df[['article_id', 'original_title', 'author']].dropna()
gold_df = pd.read_csv("../../newspaper_temp_files/training_testing_titles.csv", index_col=0)
gold_df = gold_df[['article_id', 'original_title', 'author']].dropna()

# Lowercase and strip for comparison
pred_df = pred_df.applymap(lambda x: str(x).strip().lower())
gold_df = gold_df.applymap(lambda x: str(x).strip().lower())

# Make sure article_id is comparable
pred_df['article_id'] = pred_df['article_id'].astype(str)
gold_df['article_id'] = gold_df['article_id'].astype(str)

  pred_df = pred_df.applymap(lambda x: str(x).strip().lower())
  gold_df = gold_df.applymap(lambda x: str(x).strip().lower())


Full match on article_id, fuzzy match on original_title and author

In [107]:
FUZZY_THRESHOLD = 80

matched_pred = set()
matched_gold = set()

# Join on article_id, compare title+author
for article_id in set(pred_df['article_id']).intersection(gold_df['article_id']):
    preds = pred_df[pred_df['article_id'] == article_id][['original_title', 'author']].values
    golds = gold_df[gold_df['article_id'] == article_id][['original_title', 'author']].values

    for pred in preds:
        best_score = 0
        best_gold = None
        for gold in golds:
            title_score = fuzz.token_sort_ratio(pred[0], gold[0])
            author_score = fuzz.token_sort_ratio(pred[1], gold[1])
            combined_score = (title_score + author_score) / 2
            if combined_score > best_score:
                best_score = combined_score
                best_gold = gold
        if best_score >= FUZZY_THRESHOLD:
            matched_pred.add((article_id, pred[0], pred[1]))
            matched_gold.add((article_id, best_gold[0], best_gold[1]))

# Reconstruct sets
pred_set = set(pred_df.itertuples(index=False, name=None))
gold_set = set(gold_df.itertuples(index=False, name=None))

TP = len(matched_pred)
FP = len(pred_set - matched_pred)
FN = len(gold_set - matched_gold)

precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print(f"Precision: {precision:.2f}")
print(f"Recall:    {recall:.2f}")
print(f"F1 Score:  {f1:.2f}")

Precision: 0.68
Recall:    0.71
F1 Score:  0.70


Full match on article_id, fuzzy match on title

In [111]:
FUZZY_THRESHOLD = 80

matched_pred = set()
matched_gold = set()

# Join on article_id and fuzzy match on original_title
for article_id in set(pred_df['article_id']).intersection(gold_df['article_id']):
    preds = pred_df[pred_df['article_id'] == article_id]['original_title'].values
    golds = gold_df[gold_df['article_id'] == article_id]['original_title'].values

    for pred_title in preds:
        best_score = 0
        best_gold = None
        for gold_title in golds:
            score = fuzz.token_sort_ratio(pred_title, gold_title)
            if score > best_score:
                best_score = score
                best_gold = gold_title
        if best_score >= FUZZY_THRESHOLD:
            matched_pred.add((article_id, pred_title))
            matched_gold.add((article_id, best_gold))

# Reconstruct sets
pred_set = set(pred_df.itertuples(index=False, name=None))
gold_set = set(gold_df.itertuples(index=False, name=None))

# Reduce to tuples of (article_id, original_title)
pred_set = set((row[0], row[1]) for row in pred_set)
gold_set = set((row[0], row[1]) for row in gold_set)

# Metrics
TP = len(matched_pred)
FP = len(pred_set - matched_pred)
FN = len(gold_set - matched_gold)

precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print(f"Precision: {precision:.2f}")
print(f"Recall:    {recall:.2f}")
print(f"F1 Score:  {f1:.2f}")

Precision: 0.79
Recall:    0.82
F1 Score:  0.81


In [11]:
# Create a list of false positives
false_positives = list(pred_set - matched_pred)

# Convert to DataFrame for inspection
fp_df = pd.DataFrame(false_positives, columns=['article_id', 'original_title', 'author'])
fp_df.shape

# Display or export
#fp_df.to_csv('../results/prompts/P3_fp.csv')

(102, 3)

In [14]:
fp_df.to_csv('../results/prompts/P3_fp.csv')
fn_df.to_csv('../results/prompts/P3_fn.csv')

In [12]:
# Create a list of false negatives
false_negatives = list(gold_set - matched_gold)

# Convert to DataFrame for inspection
fn_df = pd.DataFrame(false_negatives, columns=['article_id', 'original_title', 'author'])
fn_df.shape

# Display or export
#fn_df.to_csv('../results/prompts/P3_fn.csv', index=False)

(87, 3)

### Prompt improving loop (RIGHT)

In [83]:
# === Load data ===
gold_df = pd.read_csv("../../newspaper_temp_files/training_testing_titles.csv", index_col=0)
gold_df = gold_df[['original_title', 'author']].applymap(lambda x: str(x).strip().lower())
gold_set = set(gold_df.itertuples(index=False, name=None))

sample_texts = random_sample['text'].dropna().tolist()

# === OpenAI helper ===
def call_with_retry(*args, **kwargs):
    while True:
        try:
            return openai_client.chat.completions.create(*args, **kwargs)
        except RateLimitError:
            print("⚠️ Rate limit hit. Retrying in 1 second...")
            time.sleep(1)

# === Run GPT extraction ===
def extract_books(prompt, text):
    response = call_with_retry(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": text}
        ],
        temperature=0.3,
        max_tokens=800
    )
    return response.choices[0].message.content.strip()

def parse_output(text):
    pattern = re.compile(
        r"original_title:\s*(.*?)\ntranslated_title:\s*(.*?)\nauthor:\s*(.*?)(?:\n|$)",
        re.DOTALL
    )
    return [(m[0].strip().lower(), m[2].strip().lower()) for m in pattern.findall(text)]

def run_extraction(prompt_text, sample_texts):
    results = []
    for text in sample_texts:
        output = extract_books(prompt_text, text)
        parsed = parse_output(output)
        results.append({
            "input": text,
            "output": output,
            "parsed": parsed
        })
    return results

# === Evaluate results ===
def evaluate_results(results, gold_set):
    pred_set = set()
    matched_pred = set()
    matched_gold = set()
    tp_list = []
    fp_list = []

    for r in results:
        for pred in r['parsed']:
            pred_set.add(pred)
            match, score, _ = process.extractOne(pred[0], [g[0] for g in gold_set], scorer=fuzz.token_sort_ratio)
            if score >= 75:
                matched_pred.add(pred)
                matched_gold.add((match,))
                tp_list.append(pred)
            else:
                fp_list.append(pred)

    TP = len(tp_list)
    FP = len(fp_list)
    FN = len(gold_set - matched_gold)

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return {
        "precision": round(precision, 3),
        "recall": round(recall, 3),
        "f1": round(f1, 3),
        "true_positives": tp_list,
        "false_positives": fp_list,
        "results": results
    }

# === Improve prompt ===
def suggest_better_prompt(current_prompt, evaluation):
    system_message = "You are a prompt engineer. Improve prompts based on extraction results. Do not change the output format."

    def format_examples(label, examples):
        if not examples:
            return f"{label}: None"
        return f"{label}:\n" + "\n".join(f"- Title: {title}, Author: {author}" for title, author in examples[:5])

    user_message = f"""Here is the current prompt:

{current_prompt}

Performance:
- Precision: {evaluation['precision']}
- Recall: {evaluation['recall']}
- F1-score: {evaluation['f1']}

{format_examples("True Positives", evaluation['true_positives'])}

{format_examples("False Positives", evaluation['false_positives'])}

"Please revise this prompt to improve extraction accuracy. Keep the required output format:\n"
"original_title: <title in Danish>\n"
"translated_title: <title in English>\n"
"author: <author name>\n\n"

Return only the revised prompt.
"""

    response = call_with_retry(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ],
        temperature=0.7,
        max_tokens=1000
    )
    return response.choices[0].message.content.strip()

# === Initial prompt ===
initial_prompt = """
    "Here is an announcement in a Danish nineteenth-century newspaper. Your task is to extract book titles and authors using the following format:\n\n"
    "original_title: <title in Danish>\n"
    "translated_title: <title in English>\n"
    "author: <author name>\n\n"
    "Guidelines:\n"
    "1. Carefully identify the beginning and end of each book title. Look for capitalization, italics, or quotation marks that may indicate a book title.\n"
    "2. If the announcement mentions multiple book titles, extract each one separately.\n"
    "3. If the author is missing or unclear, use 'NO_AUTHOR'.\n"
    "4. Translate the original Danish title into English yourself for the 'translated_title'.\n"
    "5. Pay special attention to context - announcements may contain other text (e.g., product listings, theater plays) that should not be considered book titles.\n"
    "6. If no book titles are present, return exactly one row with:\n"
    "   original_title: NO_BOOK\n"
    "   translated_title: NO_BOOK\n"
    "   author: NO_BOOK\n\n"
    "Examples with books:\n"
    "Example1: 'Baggesens allerældste Poesier'.\n"
    "→ original_title: allerældste Poesier; translated_title: Oldest Poems; author: Baggesen\n"
    "Example2: 'Kateketisk Magasin af J. C. Wegener, Forstander for det Kongelige Skolelærer-Seminarium paa Joenstrup.'\n"
    "→ original_title: Kateketisk Magasin; translated_title: Catechetical Magazine; author: J.C. Wegener\n"
    "Example3: 'Ceres. Et periodisk Skrivt for dannede Læsere. Udgiver af F. M. Lange. Femte Hefte. Det indeholder: Juliette, eller det hemmelige Ægteskab, af Frederik Kind. - Jagtgildet, af Washington Irving. Subskription modtages hos Vogelius, Boghandler og Bogbinder.'\n"
    "→ original_title: Juliette, eller det hemmelige Ægteskab; translated_title: Juliette, or The Secret Marriage; author: Frederik Kind\n"
    "→ original_title: Jagtgildet; translated_title: The Hunting Feast; author: Washington Irving\n\n"
    "Examples without books:\n"
    "Example1: 'J. Et Parti gode hjemmegjorte Bolster og Dynevaar er i Dag arriveret og sælges billigst muligt af M. N. Samson.'\n"
    "→ original_title: NO_BOOK; translated_title: NO_BOOK; author: NO_BOOK\n"
    "Example2: 'C. Andersen. Første Afdeling: 'Spanierne i Odense, Vaudeville i 1 Act. Anden Afdeling: 'Fem og tyve Aar derefter i Helsingøer, Vaudeville i 1 Act. Billetter a 2 Mk. 8 s., (Børn det Halve) erholdes i mit Logie hos Hr. Kobbersmed Schmidt. Hvo som tager 6 Billetter erholder disse for 2 A. Werligh. Rbd.' This is a theater announcement.\n"
    "→ original_title: NO_BOOK; translated_title: NO_BOOK; author: NO_BOOK\n"
    "Example3: 'Første Binds andet Hefte, indeholdende følgende Katekisationer: 1 Den ægtekristelige Menneskekjærlighed bør være ufortrøden, virksom, uegennyttig og viis 2 Om de Glæder, den sande Menneskekjærlighed skjænker os 5 Om Guds Almagt; 4 Om Guds Alvidenhed; 5 OmGuds Viisdom; 6 Til Lærebogens 6 Kap. 1. 2. 5, 7 Religion er Menneskets vigtigste Anliggende.' These are chapter titles.\n"
    "→ original_title: NO_BOOK; translated_title: NO_BOOK; author: NO_BOOK\n"
    "Use these guidelines and examples to enhance extraction accuracy and maintain the required output format.\n"
"""

# === Main loop ===
current_prompt = initial_prompt
history = []

for i in range(3):  # Run 3 iterations
    print(f"\n🔄 Iteration {i+1}")

    # Step 1: Use prompt to extract
    results = run_extraction(current_prompt, sample_texts)

    # Step 2: Evaluate
    metrics = evaluate_results(results, gold_set)
    print(f"📊 F1: {metrics['f1']} | Precision: {metrics['precision']} | Recall: {metrics['recall']}")

    # Save output to file
    pd.DataFrame(results).to_csv(f"iteration2_{i+1}_outputs.csv", index=False)

    # Track history
    history.append({
        "iteration": i + 1,
        "f1": metrics['f1'],
        "precision": metrics['precision'],
        "recall": metrics['recall'],
        "prompt": current_prompt
    })

    # Get new prompt
    current_prompt = suggest_better_prompt(current_prompt, metrics)
    print(f"📝 New prompt (start):\n{current_prompt[:500]}...")

# === Save history ===
pd.DataFrame(history).to_csv("prompt_tuning_history_2.csv", index=False)

  gold_df = gold_df[['original_title', 'author']].applymap(lambda x: str(x).strip().lower())



🔄 Iteration 1
⚠️ Rate limit hit. Retrying in 1 second...
⚠️ Rate limit hit. Retrying in 1 second...
⚠️ Rate limit hit. Retrying in 1 second...
⚠️ Rate limit hit. Retrying in 1 second...
⚠️ Rate limit hit. Retrying in 1 second...
📊 F1: 0.692 | Precision: 0.882 | Recall: 0.569
📝 New prompt (start):
Revised Prompt:

"Here is an announcement in a Danish nineteenth-century newspaper. Your task is to extract book titles and authors using the following format:

"
"original_title: <title in Danish>
"
"translated_title: <title in English>
"
"author: <author name>

"
"Guidelines:
"
"1. Carefully identify the beginning and end of each book title. Pay attention to capitalization, italics, quotation marks, or context that may indicate a book title. Titles are often followed by a description or an aut...

🔄 Iteration 2
⚠️ Rate limit hit. Retrying in 1 second...
⚠️ Rate limit hit. Retrying in 1 second...
⚠️ Rate limit hit. Retrying in 1 second...
⚠️ Rate limit hit. Retrying in 1 second...
⚠️ Rate li

### API for real

In [7]:
# This will hold individual mini-DataFrames returned by analyze_document
books_dfs = []

for idx, row in tqdm(book_announces.iterrows(), total=len(book_announces), desc="Processing articles"):
    article_id = row['article_id']
    date = row['date']
    raw_text = row['text']
    
    # Run your analysis
    result = analyze_document(raw_text)
    
    # Handle case where result is a string (either "NO BOOKS" or unexpected)
    if isinstance(result, str):
        if result.strip().upper() == "NO BOOKS":
            continue
        else:
            # Optional: parse the string into a DataFrame if the output is line-separated
            # But ideally your function should return a proper DataFrame if expected
            print(f"Warning: Expected DataFrame but got string for article_id {article_id}")
            continue
    
    # Now we're sure result is a DataFrame — attach article_id
    result['article_id'] = article_id
    result['date'] = date
    result['text'] = raw_text
    books_dfs.append(result)

# Concatenate all results
books_df = pd.concat(books_dfs, ignore_index=True)

Processing articles: 100%|██████████| 7531/7531 [2:04:06<00:00,  1.01it/s]  


In [8]:
books_df.to_csv('../results/all_extracted_titles_250530.csv')

In [9]:
books_df.shape

(10176, 6)

In [10]:
books_df['original_title'].value_counts().head(20)

original_title
NO_BOOK                                                                                              4127
De vigtigste indenlandske Tildragelser og de mærkeligste Personers Levnetsbeskrivelser                 24
Haandbog for den læsende Ungdom                                                                        22
De mærkeligste Personers Levnetsbeskrivelse og de vigtigste Tildragelser igjennem alle Tidsaldere      17
Nye Kogebog                                                                                            15
Veiledning til Hovedregning eller mental Regnekunst                                                    14
Videnskabelig Fortegnelse over Provindsialbogsamlingen i Mariboe                                       14
Bibelske Fortællinger med Anvendelse paa Religion og Sædelære                                          13
Thonboes Læsebog                                                                                       12
Underviisning i Religionen for 