In [8]:
import os
import pandas as pd
api_key = os.getenv("ANTHROPIC_API_KEY")

In [10]:
import anthropic
from pathlib import Path
import json
from typing import Dict, List
import logging
from time import sleep

class PlayMetadataProcessor:
    def __init__(self, api_key: str, plays_directory: str):
        self.client = anthropic.Client(api_key=api_key)
        self.plays_dir = Path(plays_directory)
        
    def extract_metadata(self, text: str, play_title: str) -> Dict:
        prompt = f"""Analyze the beginning of this Norwegian/Danish play from the 1800s and extract the following metadata if present:
        - Year (look for years between 1750-1900)
        - Title of the play
        - Number of acts (look for "Acter", "Akt", etc.)
        - Author name
        - Genre (look for terms like: Lystspil, Drama, Sørgespil, Vaudeville, Syngestykke, Comoedie, Tragoedie)
        
        Return ONLY a Python dictionary with these keys: {{"year": null or int, "title": str or null, "acts": int or null, "author": str or null, "genre": str or null}}
        If information isn't found, use null.
        
        Example responses:
        {{"year": 1842, "title": "Den Unge Gudmoder", "acts": 3, "author": "P.A. Heiberg", "genre": "Lystspil"}}
        {{"year": 1835, "title": "Formynder og Myndling", "acts": 5, "author": "Scribe", "genre": "Syngestykke"}}

        Text to analyze:
        {text}"""
        
        try:
            response = self.client.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=1000,
                temperature=0,
                messages=[{"role": "user", "content": prompt}]
            )
            
            raw_response = response.content[0].text.strip()
            print(f"\nProcessing: {play_title}")
            print("Raw LLM response:")
            print(raw_response)
            
            # Find and parse the dictionary
            start_idx = raw_response.find('{')
            end_idx = raw_response.rfind('}') + 1
            if start_idx != -1 and end_idx != -1:
                dict_str = raw_response[start_idx:end_idx]
                try:
                    metadata = json.loads(dict_str)
                    return metadata
                except:
                    print(f"Failed to parse metadata for {play_title}")
                    return {"year": None, "title": None, "acts": None, "author": None, "genre": None}
            return {"year": None, "title": None, "acts": None, "author": None, "genre": None}
            
        except Exception as e:
            print(f"Error processing {play_title}: {str(e)}")
            return {"year": None, "title": None, "acts": None, "author": None, "genre": None}
    
    def process_play(self, filepath: Path) -> Dict:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                play_start = f.read(1000)  # First 1000 characters should contain metadata
            
            metadata = self.extract_metadata(play_start, filepath.stem)
            
            play_info = {
                'filename': filepath.stem,
                'filepath': str(filepath),
                **metadata
            }
            
            print(f"Extracted metadata from {filepath.name}")
            return play_info
            
        except Exception as e:
            print(f"Error reading {filepath}: {str(e)}")
            return {
                'filename': filepath.stem,
                'filepath': str(filepath),
                'year': None,
                'title': None,
                'acts': None,
                'author': None,
                'genre': None
            }

    def process_all_plays(self, delay: float = 1.0) -> List[Dict]:
        play_files = list(self.plays_dir.glob('*.txt'))
        all_plays = []
        
        print(f"Found {len(play_files)} plays to process")
        
        for play_file in play_files:
            play_info = self.process_play(play_file)
            all_plays.append(play_info)
            
            # Save after each play is processed
            output_file = self.plays_dir / 'plays_metadata.json'
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(all_plays, f, indent=2, ensure_ascii=False)
            
            sleep(delay)
                
        return all_plays

In [11]:
processor = PlayMetadataProcessor(api_key, "./plays")
metadata = processor.process_all_plays()

Found 55 plays to process

Processing: stage_3971483
Raw LLM response:
{
    "year": 1778,
    "title": "Til Sæters",
    "acts": null,
    "author": "E. S. Res.",
    "genre": "Dramatisk Idel med Sange"
}
Extracted metadata from stage_3971483.txt

Processing: stage_3970963
Raw LLM response:
{
    "year": null,
    "title": "Rosa og Rosita",
    "acts": 2,
    "author": null,
    "genre": "Lystspil"
}
Extracted metadata from stage_3970963.txt

Processing: stage_3971267
Raw LLM response:
Basert på teksten gir jeg følgende metadata:

{"year": 1851, "title": "Scheik Hassan", "acts": 3, "author": "Henrik Hertz", "genre": "Lystspil"}
Extracted metadata from stage_3971267.txt

Processing: stage_1579397
Raw LLM response:
Basert på den gitte teksten, er den ekstraherte metadata:

{"year": null, "title": "Batailles des dames", "acts": 5, "author": "Scribe og Legeme", "genre": "Lystspil"}

Forklaringer:
- Ingen årstall mellom 1750-1900 er oppgitt, så "year" er null.
- Tittelen "Batailles des dam

In [12]:
# Quick analysis while it's running or after completion
df_metadata = pd.DataFrame(metadata)

# Check which fields we're successfully extracting
print("\nSuccessful extractions per field:")
print(df_metadata.notna().sum())

# For years found
if df_metadata['year'].notna().any():
    print("\nYears found:")
    print(df_metadata[df_metadata['year'].notna()][['filename', 'year', 'title']].head())

# For acts found
if df_metadata['acts'].notna().any():
    print("\nNumber of acts found:")
    print(df_metadata[df_metadata['acts'].notna()][['filename', 'acts', 'title']].head())

# For plays with authors
if df_metadata['author'].notna().any():
    print("\nAuthors found:")
    print(df_metadata[df_metadata['author'].notna()][['filename', 'author', 'title']].head())


Successful extractions per field:
filename    55
filepath    55
year        24
title       44
acts        40
author      30
genre       45
dtype: int64

Years found:
         filename    year                      title
0   stage_3971483  1778.0                 Til Sæters
2   stage_3971267  1851.0              Scheik Hassan
4   stage_1564003  1832.0  Den skjønneste Dagi Livet
9   stage_3971124  1862.0                   Brødrene
12  stage_3971323  1852.0              Huldrens Hjem

Number of acts found:
        filename  acts                      title
1  stage_3970963   2.0             Rosa og Rosita
2  stage_3971267   3.0              Scheik Hassan
3  stage_1579397   5.0        Batailles des dames
4  stage_1564003   2.0  Den skjønneste Dagi Livet
5  stage_1594278   3.0      Han er af god Familie

Authors found:
        filename                author                      title
0  stage_3971483            E. S. Res.                 Til Sæters
2  stage_3971267          Henrik Hertz      

In [14]:
df_metadata.to_excel("metadata.xlsx")

In [9]:
pd.DataFrame(metadata)

Unnamed: 0,filename,filepath,year,title,acts,author
0,stage_3971483,plays/stage_3971483.txt,,,,
1,stage_3970963,plays/stage_3970963.txt,,Rosa og Rosita,2.0,
2,stage_3971267,plays/stage_3971267.txt,1851.0,Scheik Hassan,3.0,Henrik Hertz
3,stage_1579397,plays/stage_1579397.txt,,Batailles des dames,5.0,Scribe og Legeme
4,stage_1564003,plays/stage_1564003.txt,1832.0,Den skjønneste Dagi Livet,2.0,Johan Ludvig Heiberg
5,stage_1594278,plays/stage_1594278.txt,,Hau er af god Familie,3.0,
6,stage_1579385,plays/stage_1579385.txt,,Pariserdringen,2.0,Bayrde Vanderbarn
7,stage_1563964,plays/stage_1563964.txt,,,1.0,
8,stage_1587674,plays/stage_1587674.txt,,Keiserendens Fanger,2.0,Bayar
9,stage_3971124,plays/stage_3971124.txt,1862.0,Brødrene,3.0,H. Olaf Hansen
