In [3]:
import pandas as pd
import glob
import os
from openai import OpenAI
import math

# --- Run configuration -------------------------------------------------------
# Folder that is searched (recursively) for the book .txt files.
root_path = "books"

# Exact model identifier passed to the API, the short label that ends up in the
# output file names, and the sampling temperature used for every request.
model_name = "gpt-4.1-2025-04-14"
generator = "gpt-4.1"
temperature = 0.4

# API client built with its default constructor arguments.
client = OpenAI()

# Every .txt file found at any depth below root_path.
txt_files = list(
    glob.iglob(os.path.join(root_path, "**", "*.txt"), recursive=True)
)


def is_missing(value):
    """Tell whether ``value`` stands for a missing entry.

    A value counts as missing when it is ``None``, a float NaN, or a string
    that spells "nan" in any letter case (no whitespace stripping is done).

    Args:
        value: Any object to inspect.

    Returns:
        bool: ``True`` if the value is missing, ``False`` otherwise.
    """
    return (
        value is None
        or (isinstance(value, float) and math.isnan(value))
        or (isinstance(value, str) and value.lower() == "nan")
    )

# Prompt sent to the model for each book. ``{book_text}`` is the only
# placeholder; it is filled with ``str.format`` using the full contents of the
# book's .txt file. Because the template goes through ``str.format``, any
# literal brace added to the wording would have to be doubled ("{{" / "}}").
# NOTE(review): changing the wording changes what the model is asked, so new
# summaries would presumably not be comparable with ones already saved.
prompt_template = '''Provide a very detailed summary of the plot for the following book: The summary must be of the text provided, do NOT use any internal or previous knowledge that you may have on that book to create the summary. Include all main events and the complete storyline, detailing every key development, situations, events with characters, and the conclusion. Do NOT include any historical context, literary analysis, or philosophical discussion- only the plot.
Full text of the book: "{book_text}"'''


In [4]:
# Produce five independent summary runs (one CSV per run) for every book in
# ``txt_files``. Each run is resumable: a book that already has a summary in
# the run's CSV is skipped, and the CSV is rewritten after every new summary so
# an interrupted run (API error, kernel restart) loses at most one book.
for file_index in range(5):
    output_csv = f"Results_external/books_summaries_generator_{generator}_temperature_{temperature}_{file_index+1}.csv"

    # Create the results folder before any API call is made; otherwise a
    # missing directory would only surface when saving, after the summaries
    # had already been generated.
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)

    if os.path.exists(output_csv):
        df = pd.read_csv(output_csv)
    else:
        df = pd.DataFrame(columns=["book_title", "summary_llm"])

    # Sorted so the processing order does not depend on the filesystem.
    for file_path in sorted(txt_files):
        book_title = os.path.splitext(os.path.basename(file_path))[0]

        # Skip only when some row for this title holds an actual summary.
        # Looking at every matching row (not just the first) keeps a leftover
        # empty row from forcing the book to be regenerated on each run.
        same_title = df["book_title"] == book_title
        if df.loc[same_title, "summary_llm"].notna().any():
            print(f"skipped: {book_title}")
            continue

        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        prompt = prompt_template.format(book_text=content)

        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                    ]
                }
            ],
            max_tokens=8000,
            temperature=temperature,
        )
        summary = response.choices[0].message.content

        # Drop any stale rows for this title (present but without a summary)
        # before appending, so a retry replaces the entry instead of adding a
        # duplicate row.
        df = pd.concat(
            [
                df[~same_title],
                pd.DataFrame([{
                    "book_title": book_title,
                    "summary_llm": summary
                }]),
            ],
            ignore_index=True,
        )

        # Persist progress immediately; the file is small, the API call is not.
        df.to_csv(output_csv, index=False, encoding="utf-8-sig")

    # Final write so the CSV exists even when every book was skipped.
    df.to_csv(output_csv, index=False, encoding="utf-8-sig")
    print(f"saved: {output_csv}")


skipped: A Doll_s House-Henrik Ibsen
skipped: Alice_s Adventures in Wonderland-Lewis Carroll
skipped: Anna Karenina-Leo Tolstoy
skipped: Crime and Punishment-Fyodor Dostoyevsky
skipped: Death in Venice-Thomas Mann
skipped: Don Quixote-Miguel de Cervantes
skipped: Dracula-Bram Stoker
skipped: Frankenstein-Mary Shelley
skipped: Great Expectations-Charles Dickens
skipped: Hamlet-William Shakespeare
skipped: Jane Eyre-Charlotte Brontë
skipped: Madame Bovary-Gustave Flaubert
skipped: Mrs. Dalloway-Virginia Woolf
skipped: Pride and Prejudice-Jane Austen
skipped: Robinson Crusoe-Daniel Defoe
skipped: Six Characters in Search of an Author-Luigi Pirandello
skipped: The Adventures of Huckleberry Finn-Mark Twain
skipped: The Count of Monte Cristo-Alexandre Dumas
skipped: The Decameron-Giovanni Boccaccio
skipped: The Divine Comedy-Dante Alighieri
skipped: The Great Gatsby-F. Scott Fitzgerald
skipped: The Iliad-Homer
skipped: The Scarlet Letter-Nathaniel Hawthorne
skipped: Twenty Thousand Leagues u