In [53]:
import openai
import tiktoken
import pandas as pd
import os
import ast
import re
from time import time
from tqdm import tqdm
from dhlab.nbtokenizer import tokenize
openai.api_key = os.getenv("OPENAI_API_KEY")

In [1]:
import pandas as pd

In [2]:
# Define the character list with indices
characters = [
    'JOHANNES ROSMER',
    'REBEKKA WEST',
    'REKTOR KROLL',
    'ULRIK BRENDEL',
    'PEDER MORTENSGÅRD',
    'MADAM HELSETH'
]

# Create a prefixed character list with indices
character_index = "\n".join([f"{i+1}: {name}" for i, name in enumerate(characters)])

In [70]:
print(character_index)

1: JOHANNES ROSMER
2: REBEKKA WEST
3: REKTOR KROLL
4: ULRIK BRENDEL
5: PEDER MORTENSGÅRD
6: MADAM HELSETH


In [48]:
def create_prompt(character_index, act_name, act_text):
    prompt = f"""Characters in {act_name}:
{character_index}

Structure:
- Each line of dialogue begins with an uppercase character name, followed by their statement.
- Character names always appear in uppercase to distinguish them from regular text.
- Pronouns should refer to the speaker or other characters listed above based on context.

Instructions:
- With each pronoun resolve it and add the character ID(s) behind it, formatted in square brackets (e.g., [1]).
- For collective pronouns like "vi" (we), use a list of character IDs in brackets, e.g., [1, 2].
- Do not add any additional commentary; provide only the modified text with pronouns replaced.
- Return the output as a JSON list of objects, where each object is a tuple like this
(the name of the character speaking, the dialogue text with pronouns replaced by IDs)

Example Output:

[
("MADAM HELSETH", "Å, det er ikke værdt at snakke om det. Sligt noget tror nu ikke De [2] på heller."),
("REBEKKA", "Tror da De [6] på det?"),
  ...
]

- Provide only the JSON output with no labels, explanations, or additional text. Do not prefix the JSON with "json" or any other text.

Text:
{act_text}
"""
    return prompt




In [25]:
import json

In [57]:
with open("Rosmersholm.json") as fp:
    acts = json.load(fp)

In [59]:
def split_text(text, split_by_lines=150):
    lines = text.splitlines()
    midpoint = len(lines) // 2
    part1 = "\n".join(lines[:midpoint])
    part2 = "\n".join(lines[midpoint:])
    return part1, part2

In [60]:
# Use the split function on each act
part1, part2 = split_text(acts[0]['text'])

# Verify token counts for each part
print("Part 1 token count:", count_tokens(part1))
print("Part 2 token count:", count_tokens(part2))

Part 1 token count: 6574
Part 2 token count: 5608


In [14]:
# Your function for analyzing text
def analyze_chunk(input_prompt):
    # Request the model for the completion
    completion = openai.chat.completions.create(
        model="gpt-4o-2024-08-06",  
        messages=[
            {
                "role": "user",
                "content": input_prompt,
            },
        ],
    )

    # Return the response content (the analyzed XML)
    return completion.choices[0].message

In [67]:
#results = []
for act in tqdm(acts[1:], 'act'):
    for part in  tqdm(split_text(act['text']), 'line_part'):
        results.append(analyze_chunk(create_prompt(character_index, act['act_name'], part)))
        

act:   0%|                                                                                        | 0/3 [00:00<?, ?it/s]
line_part:   0%|                                                                                  | 0/2 [00:00<?, ?it/s][A
line_part:  50%|████████████████████████████████████▌                                    | 1/2 [01:48<01:48, 108.21s/it][A
line_part: 100%|█████████████████████████████████████████████████████████████████████████| 2/2 [03:31<00:00, 105.75s/it][A
act:  33%|██████████████████████████▎                                                    | 1/3 [03:31<07:03, 211.51s/it]
line_part:   0%|                                                                                  | 0/2 [00:00<?, ?it/s][A
line_part:  50%|█████████████████████████████████████                                     | 1/2 [01:27<01:27, 87.63s/it][A
line_part: 100%|██████████████████████████████████████████████████████████████████████████| 2/2 [03:09<00:00, 94.72s/it][A
act:  67%|████

In [54]:
len(tokenize(results[0].content))

13233

In [68]:
end_result = []
for r in results:
    end_result.append(r.content)

In [69]:
with open('Rosmersholm_analyse_2.json', "w") as fp:
    json.dump(end_result, fp)

In [132]:
import json

# Example API response with Markdown formatting
response = []
for i, res in enumerate(results):
    response_text = res.content
    # Remove '```json' and '```' if present
    if response_text.startswith("```json"):
        response_text = response_text.replace("```json","")  # Remove the initial ```json
    if response_text.endswith("```"):
        response_text = response_text.replace("```","")  # Remove the trailing ```
    
    #print(response_text)
    # Strip any extra whitespace and parse as JSON
    parsed_response = ast.literal_eval(response_text.strip())
    response.append(("act", str(i)))
    response +=parsed_response
    #print(i)

In [133]:
df_rosmer = pd.DataFrame(response, columns=["Taler", "Dialog"])

In [149]:
df_rosmer.to_excel("Rosmersholm_refs.xlsx")

In [94]:
import ast

In [73]:
import tiktoken

# Use the tokenizer for the specific model you're working with
encoding = tiktoken.encoding_for_model("gpt-4")  # or "gpt-4o" if you have it

# Count tokens in your string
def count_tokens(text):
    return len(encoding.encode(text))

for res in results:
    text = res.content  # Replace with your actual act text
    token_count = count_tokens(text)
    print("Token count:", token_count)


Token count: 8730
Token count: 7817
Token count: 8500
Token count: 8172
Token count: 6890
Token count: 7048
Token count: 5460
Token count: 6209


In [129]:
import re

In [152]:
df_rosmer['Referanser'] = df_rosmer.Dialog.apply(lambda x: re.findall(r'(\[.*?\])', x)) 

In [153]:
df

NameError: name 'df' is not defined