In [19]:
import anthropic
from pathlib import Path
import json
from typing import Dict, List
import logging
from time import sleep
import ast

class NorwegianPlayProcessor:
    def __init__(self, api_key: str, plays_directory: str):
        self.client = anthropic.Client(api_key=api_key)
        self.plays_dir = Path(plays_directory)
        self.setup_logging()
        
    def setup_logging(self):
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            filename='play_processing.log',
            encoding='utf-8'
        )
    
    def extract_characters_with_metadata(self, text: str, play_title: str) -> List[Dict]:
        prompt = f"""Analyze this Norwegian play from the 1800s and return ONLY a Python list of dictionaries containing character information.
        Do NOT add any explanatory text before or after the list.
        Return ONLY the list in this exact format:
        [
            {{"name": "Character Name", "gender": "M or F", "status": "social status", "description": "original description"}},
        ]

        Base gender (M/F) on:
        - Titles (Fru, Frøken, Madame = F)
        - Male titles (Herr, Hr. = M)
        - Names and professions
        - Character descriptions

        Text to analyze:
        {text}"""
        
        try:
            response = self.client.messages.create(
                model="claude-3-sonnet-20240229",
                max_tokens=1000,
                temperature=0,
                messages=[{"role": "user", "content": prompt}]
            )
            
            raw_response = response.content[0].text.strip()
            
            # Find the actual list in the response
            start_idx = raw_response.find('[')
            end_idx = raw_response.rfind(']') + 1
            
            if start_idx != -1 and end_idx != -1:
                list_str = raw_response[start_idx:end_idx]
                try:
                    # Convert simple character lists to full metadata format
                    if '"gender"' not in list_str and '"status"' not in list_str:
                        # This handles the case where we got a simple list of names
                        names = ast.literal_eval(list_str)
                        characters = []
                        for name in names:
                            # Basic gender detection
                            gender = 'F' if any(title in name for title in ['Fru ', 'Frøken ', 'Madame ']) else 'M'
                            characters.append({
                                "name": name,
                                "gender": gender,
                                "status": "",
                                "description": ""
                            })
                        return characters
                    else:
                        # Handle full metadata format
                        return ast.literal_eval(list_str)
                except Exception as e:
                    logging.error(f"Parse error for {play_title}: {str(e)}\nText: {list_str}")
                    return []
            else:
                logging.error(f"No list found in response for {play_title}")
                return []
            
        except Exception as e:
            logging.error(f"API error for {play_title}: {str(e)}")
            return []    
    def process_play(self, filepath: Path) -> Dict:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                play_start = f.read(1000)
            
            characters = self.extract_characters_with_metadata(play_start, filepath.stem)
            
            play_info = {
                'title': filepath.stem,
                'characters': characters,
                'filepath': str(filepath),
                'character_count': len(characters),
                'gender_distribution': {
                    'M': sum(1 for c in characters if c['gender'] == 'M'),
                    'F': sum(1 for c in characters if c['gender'] == 'F')
                } if characters else {'M': 0, 'F': 0}
            }
            
            # Debug print to verify data
            print(f"\nProcessed {filepath.name}:")
            print(f"Found {len(characters)} characters")
            print("Play info:", json.dumps(play_info, indent=2, ensure_ascii=False))
            
            return play_info
            
        except Exception as e:
            print(f"Error processing {filepath}: {str(e)}")
            return {'title': filepath.stem, 'characters': [], 'filepath': str(filepath), 'character_count': 0}

    def process_all_plays(self, delay: float = 1.0) -> List[Dict]:
        play_files = list(self.plays_dir.glob('*.txt'))
        all_plays = []
        
        print(f"Found {len(play_files)} plays to process")
        
        for play_file in play_files:
            play_info = self.process_play(play_file)
            all_plays.append(play_info)
            print(f"Added to all_plays: {play_info['title']} with {len(play_info['characters'])} characters")
            sleep(delay)
        
        # Verify data before saving
        print("\nFinal data summary:")
        print(f"Total plays processed: {len(all_plays)}")
        print(f"Plays with characters: {sum(1 for p in all_plays if p['characters'])}")
        
        output_file = self.plays_dir / 'characters_with_metadata.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_plays, f, indent=2, ensure_ascii=False)
        
        return all_plays

In [6]:
import os

api_key = os.getenv("ANTHROPIC_API_KEY")

In [23]:
# First create dataframe from the plays_data
df_plays = pd.DataFrame(plays_data)

# Explode the characters column - this will create one row per character
df_characters = df_plays.explode('characters')

# The characters column now contains dictionaries - we can normalize this
df_chars_normalized = pd.json_normalize(df_characters['characters'])

# Now combine with the play information
df_final = pd.concat([
    df_characters.drop('characters', axis=1).reset_index(drop=True),
    df_chars_normalized.reset_index(drop=True)
], axis=1)

In [24]:
df_final

Unnamed: 0,title,filepath,character_count,gender_distribution,name,gender,status,description
0,stage_3971483,plays/stage_3971483.txt,9,"{'M': 7, 'F': 2}",Sigrid,F,,
1,stage_3971483,plays/stage_3971483.txt,9,"{'M': 7, 'F': 2}",Ragnhild,F,,
2,stage_3971483,plays/stage_3971483.txt,9,"{'M': 7, 'F': 2}",Asmund,M,,
3,stage_3971483,plays/stage_3971483.txt,9,"{'M': 7, 'F': 2}",Nordal,M,,
4,stage_3971483,plays/stage_3971483.txt,9,"{'M': 7, 'F': 2}",Steenby,M,Studenter,
...,...,...,...,...,...,...,...,...
569,stage_1563988,plays/stage_1563988.txt,14,"{'M': 10, 'F': 4}",Johan,M,hans Bontignedring,
570,stage_1563988,plays/stage_1563988.txt,14,"{'M': 10, 'F': 4}",Joseph,M,Falkenskjolds Tjener,
571,stage_1563988,plays/stage_1563988.txt,14,"{'M': 10, 'F': 4}",Berg,M,ham en Hofmand,
572,stage_1563988,plays/stage_1563988.txt,14,"{'M': 10, 'F': 4}",Præsidenten,M,i høieste Ret,


In [25]:
# Average characters per play
print("Average characters per play:", df_final.groupby('title').size().mean())

# Distribution of cast sizes
play_sizes = df_final.groupby('title').size()
print("\nPlay size distribution:")
print(play_sizes.describe())

# Gender balance per play
gender_by_play = df_final.groupby('title')['gender'].value_counts().unstack().fillna(0)
gender_by_play['total'] = gender_by_play.sum(axis=1)
gender_by_play['female_ratio'] = gender_by_play['F'] / gender_by_play['total']

print("\nGender ratio statistics:")
print(gender_by_play['female_ratio'].describe())

# Plays with most characters
print("\nPlays with largest casts:")
print(play_sizes.sort_values(ascending=False).head())

# Plays with most female characters
print("\nPlays with highest female representation (minimum 5 characters):")
print(gender_by_play[gender_by_play['total'] >= 5].sort_values('female_ratio', ascending=False).head())

Average characters per play: 10.436363636363636

Play size distribution:
count    55.000000
mean     10.436364
std       4.898086
min       2.000000
25%       7.000000
50%       9.000000
75%      13.000000
max      20.000000
dtype: float64

Gender ratio statistics:
count    55.000000
mean      0.323611
std       0.145568
min       0.000000
25%       0.226496
50%       0.333333
75%       0.428571
max       0.666667
Name: female_ratio, dtype: float64

Plays with largest casts:
title
stage_1579379    20
stage_1594255    20
stage_1594283    20
stage_1616602    20
stage_1579380    18
dtype: int64

Plays with highest female representation (minimum 5 characters):
gender           ?    F    M  M/F    N  total  female_ratio
title                                                      
stage_3971765  0.0  3.0  2.0  0.0  0.0    5.0      0.600000
stage_1581443  0.0  4.0  3.0  0.0  0.0    7.0      0.571429
stage_1564190  0.0  4.0  4.0  0.0  0.0    8.0      0.500000
stage_1579385  0.0  4.0  4.0  0.0  

In [26]:
# Check the result
print(df_final.head())
print("\nTotal characters:", len(df_final))
print("\nGender distribution:")
print(df_final['gender'].value_counts(normalize=True))

           title                 filepath  character_count  \
0  stage_3971483  plays/stage_3971483.txt                9   
1  stage_3971483  plays/stage_3971483.txt                9   
2  stage_3971483  plays/stage_3971483.txt                9   
3  stage_3971483  plays/stage_3971483.txt                9   
4  stage_3971483  plays/stage_3971483.txt                9   

  gender_distribution      name gender     status description  
0    {'M': 7, 'F': 2}    Sigrid      F       None        None  
1    {'M': 7, 'F': 2}  Ragnhild      F       None        None  
2    {'M': 7, 'F': 2}    Asmund      M       None        None  
3    {'M': 7, 'F': 2}    Nordal      M       None        None  
4    {'M': 7, 'F': 2}   Steenby      M  Studenter        None  

Total characters: 574

Gender distribution:
gender
M      0.688153
F      0.299652
M/F    0.008711
?      0.001742
N      0.001742
Name: proportion, dtype: float64


In [29]:
df_final[["title", "name", "gender", "status", "description"]].to_excel("Karakteranalyse.xlsx")

In [20]:
processor = NorwegianPlayProcessor(api_key, "./plays")
plays_data = processor.process_all_plays()

print(f"\nProsessert {len(plays_data)} skuespill")

Found 55 plays to process

Processed stage_3971483.txt:
Found 9 characters
Play info: {
  "title": "stage_3971483",
  "characters": [
    {
      "name": "Sigrid",
      "gender": "F",
      "status": null,
      "description": null
    },
    {
      "name": "Ragnhild",
      "gender": "F",
      "status": null,
      "description": null
    },
    {
      "name": "Asmund",
      "gender": "M",
      "status": null,
      "description": null
    },
    {
      "name": "Nordal",
      "gender": "M",
      "status": null,
      "description": null
    },
    {
      "name": "Steenby",
      "gender": "M",
      "status": "Studenter",
      "description": null
    },
    {
      "name": "Busk",
      "gender": "M",
      "status": null,
      "description": null
    },
    {
      "name": "Skoleholderen",
      "gender": "M",
      "status": "Skoleholderen",
      "description": null
    },
    {
      "name": "Peer",
      "gender": "M",
      "status": null,
      "description": null
 

In [15]:
for play in plays_data:
    print(f"\n{play['title']}:")
    print(f"- Antall karakterer: {len(play['characters'])}")
    print(f"- Karakterer: {', '.join(play['characters'])}")


stage_3971483:
- Antall karakterer: 0
- Karakterer: 

stage_3970963:
- Antall karakterer: 0
- Karakterer: 

stage_3971267:
- Antall karakterer: 0
- Karakterer: 

stage_1579397:
- Antall karakterer: 0
- Karakterer: 

stage_1564003:
- Antall karakterer: 0
- Karakterer: 

stage_1594278:
- Antall karakterer: 0
- Karakterer: 

stage_1579385:
- Antall karakterer: 0
- Karakterer: 

stage_1563964:
- Antall karakterer: 0
- Karakterer: 

stage_1587674:
- Antall karakterer: 0
- Karakterer: 

stage_3971124:
- Antall karakterer: 0
- Karakterer: 

stage_1558153:
- Antall karakterer: 0
- Karakterer: 

stage_1594283:
- Antall karakterer: 0
- Karakterer: 

stage_3971323:
- Antall karakterer: 0
- Karakterer: 

stage_1579380:
- Antall karakterer: 0
- Karakterer: 

stage_1580805:
- Antall karakterer: 0
- Karakterer: 

stage_1594268:
- Antall karakterer: 0
- Karakterer: 

stage_3971763:
- Antall karakterer: 0
- Karakterer: 

stage_1581439:
- Antall karakterer: 0
- Karakterer: 

stage_1558140:
- Antall kar

In [12]:
import pandas as pd

In [14]:
plays_data

[{'title': 'stage_3971483',
  'characters': [],
  'filepath': 'plays/stage_3971483.txt',
  'character_count': 0},
 {'title': 'stage_3970963',
  'characters': [],
  'filepath': 'plays/stage_3970963.txt',
  'character_count': 0},
 {'title': 'stage_3971267',
  'characters': [],
  'filepath': 'plays/stage_3971267.txt',
  'character_count': 0},
 {'title': 'stage_1579397',
  'characters': [],
  'filepath': 'plays/stage_1579397.txt',
  'character_count': 0},
 {'title': 'stage_1564003',
  'characters': [],
  'filepath': 'plays/stage_1564003.txt',
  'character_count': 0},
 {'title': 'stage_1594278',
  'characters': [],
  'filepath': 'plays/stage_1594278.txt',
  'character_count': 0},
 {'title': 'stage_1579385',
  'characters': [],
  'filepath': 'plays/stage_1579385.txt',
  'character_count': 0},
 {'title': 'stage_1563964',
  'characters': [],
  'filepath': 'plays/stage_1563964.txt',
  'character_count': 0},
 {'title': 'stage_1587674',
  'characters': [],
  'filepath': 'plays/stage_1587674.txt',