In [None]:
import os
import re
import time
import pandas as pd
import anthropic
from anthropic import Anthropic

In [None]:
## CONVERT AND CLEAN SUBTITLES FROM .SRT TO .TXT

# Raw .srt files are read from input_folder; cleaned .txt files are written to
# output_folder, which is created here if it does not exist yet.
input_folder, output_folder = "../data/subtitles/srt", "../data/subtitles/txt"
os.makedirs(output_folder, exist_ok=True)

def clean_subtitle_text(text):
    """Flatten raw SRT subtitle text into a single line of dialogue.

    Strips a leading byte-order mark, cue timestamps, cue sequence numbers,
    leading dash speaker cues (such as "- JOHN:") and bare leading dashes,
    then joins the remaining non-empty lines with single spaces.

    Args:
        text: Full contents of an .srt file as a string.

    Returns:
        The dialogue as one space-separated string ("" if nothing remains).
    """
    # A UTF-8 byte-order mark survives decoding with plain "utf-8"; left in
    # place it keeps the first sequence number from matching ^\d+$ below.
    text = text.lstrip("\ufeff")
    # Remove timestamps
    text = re.sub(r"\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}", "", text)
    # Remove sequence numbers (lines consisting only of digits)
    text = re.sub(r"^\d+$", "", text, flags=re.MULTILINE)
    # Remove speaker cues (like "- JOHN:"). The colon must be followed by
    # whitespace or end of line so clock times such as "5:30" inside dialogue
    # are not mistaken for the end of a speaker label.
    text = re.sub(r"^- .*?:(?=\s|$)", "", text, flags=re.MULTILINE)
    # Remove the bare leading dash of remaining dialogue lines (like "- Hello")
    text = re.sub(r"^- ", "", text, flags=re.MULTILINE)
    # Remove empty lines and strip spaces
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    return " ".join(lines)

# Convert every .srt file in input_folder into a cleaned .txt file in
# output_folder, named after the same stem (used as the IMDb id).
for filename in os.listdir(input_folder):
    if not filename.endswith(".srt"):
        continue

    imdb_id = os.path.splitext(filename)[0]
    srt_path = os.path.join(input_folder, filename)
    txt_path = os.path.join(output_folder, f"{imdb_id}.txt")

    # Undecodable bytes are dropped (errors="ignore") instead of aborting the batch.
    with open(srt_path, "r", encoding="utf-8", errors="ignore") as srt_file:
        cleaned_text = clean_subtitle_text(srt_file.read())

    with open(txt_path, "w", encoding="utf-8") as txt_file:
        txt_file.write(cleaned_text)

print("All subtitles converted and cleaned.")

In [None]:
## COMBINE SUBTITLES AND DESCRIPTIONS INTO TXT FILES

subs_folder = "../data/subtitles/txt"
desc_folder = "../data/descriptions"
output_folder = "../data/combined"
os.makedirs(output_folder, exist_ok=True)

# The descriptions folder drives the loop: one combined file is written per
# description, followed by the matching subtitle text or a placeholder.
for filename in os.listdir(desc_folder):
    if not filename.endswith(".txt"):
        continue

    imdb_id = os.path.splitext(filename)[0]
    sub_path = os.path.join(subs_folder, f"{imdb_id}.txt")

    with open(os.path.join(desc_folder, filename), "r", encoding="utf-8") as desc_file:
        desc_text = desc_file.read().strip()

    # Build the subtitle section, falling back to a placeholder when the
    # movie has no cleaned subtitle file.
    if os.path.exists(sub_path):
        with open(sub_path, "r", encoding="utf-8") as sub_file:
            subtitle_section = "\n\n--- SUBTITLES ---\n\n" + sub_file.read().strip()
    else:
        subtitle_section = "\n\n--- SUBTITLES ---\n\n(No subtitles available)"

    combined_text = desc_text + subtitle_section

    with open(os.path.join(output_folder, f"{imdb_id}.txt"), "w", encoding="utf-8") as combined_file:
        combined_file.write(combined_text)

print("Combined files (description + optional subtitles) created for all IMDb IDs in the descriptions folder.")

In [None]:
# PREPEND IMDB ID TO TXT FILES

txt_folder = "../data/combined"
all_files = sorted([f for f in os.listdir(txt_folder) if f.endswith(".txt")])

for filename in all_files:
    # splitext matches how the combined files were named when they were created.
    imdb_id = os.path.splitext(filename)[0]
    file_path = os.path.join(txt_folder, filename)

    with open(file_path, "r", encoding="utf-8") as f:
        original_text = f.read()

    header = f"imdb_id: {imdb_id}"
    body = original_text.strip()

    # The files are rewritten in place, so re-running this cell would otherwise
    # stack a second "imdb_id: ..." header on top of the first. Skip files that
    # already start with their own header.
    if body == header or body.startswith(header + "\n"):
        continue

    new_text = f"{header}\n\n{body}"

    with open(file_path, "w", encoding="utf-8") as f:
        f.write(new_text)

In [None]:
## RUN ANTHROPIC PROMPT AND SAVE OUTPUT CSV

# --- Config ---
txt_folder = "../data/combined"
base_prompt_path = "../data/supplementary files/anthropic_prompt.txt"
output_path = "../data/claude_output.csv"
model = "claude-sonnet-4-20250514"
# The key is read from the environment so it is never saved with the notebook.
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
    raise RuntimeError("Set the ANTHROPIC_API_KEY environment variable before running this cell.")
TOKEN_LIMIT_PER_MIN = 30000
TOKEN_REST_SECONDS = 90  # 1.5 minutes

client = Anthropic(api_key=api_key)

# --- Load base prompt ---
with open(base_prompt_path, "r", encoding="utf-8") as f:
    base_prompt = f.read().strip()

# --- Track existing completions ---
# A zero-byte output file is treated like a missing one; pd.read_csv would
# raise on it, and there is nothing to resume from anyway.
if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
    existing_df = pd.read_csv(output_path)
    completed_ids = set(existing_df["imdb_id"].astype(str))
    include_header = False
else:
    completed_ids = set()
    include_header = True

# --- Token tracking ---
# Running total since the last pause (it is not reset by elapsed time).
tokens_used_this_minute = 0

# --- Process Files ---
for filename in sorted(os.listdir(txt_folder)):
    if not filename.endswith(".txt"):
        continue

    imdb_id = os.path.splitext(filename)[0]
    if imdb_id in completed_ids:
        print(f"Skipping {imdb_id} (already completed)")
        continue

    print(f"Processing {imdb_id}")
    with open(os.path.join(txt_folder, filename), "r", encoding="utf-8") as f:
        movie_text = f.read().strip()

    full_prompt = f"{base_prompt}\n\n{movie_text}"

    try:
        response = client.messages.create(
            model=model,
            max_tokens=8192,
            temperature=0,
            messages=[
                {"role": "user", "content": full_prompt}
            ]
        )
        csv_response = response.content[0].text.strip()

        # A response cut off at max_tokens most likely ends in a partial CSV row.
        if getattr(response, "stop_reason", None) == "max_tokens":
            print(f"Warning: response for {imdb_id} hit max_tokens and may be truncated")

        # Prefer the token counts reported by the API; fall back to the rough
        # 4-characters-per-token estimate only if usage is unavailable.
        usage = getattr(response, "usage", None)
        if usage is not None:
            tokens_used = usage.input_tokens + usage.output_tokens
        else:
            tokens_used = int(len(full_prompt) / 4) + int(len(csv_response) / 4)
        tokens_used_this_minute += tokens_used

        # Remove header from subsequent responses
        # (assumes every response starts with a CSV header row)
        lines = csv_response.splitlines()
        if not include_header:
            lines = lines[1:]

        # Write to file before any rate-limit pause, so an interrupt during the
        # sleep cannot lose a response that has already been paid for.
        mode = "w" if include_header else "a"
        with open(output_path, mode, encoding="utf-8") as out_f:
            for line in lines:
                out_f.write(line + "\n")

        print(f"Processed {imdb_id}")
        include_header = False  # Ensure header is only written once

        # Token limit enforcement
        if tokens_used_this_minute >= TOKEN_LIMIT_PER_MIN:
            print(f"Token limit hit ({tokens_used_this_minute}), pausing for {TOKEN_REST_SECONDS} seconds...")
            time.sleep(TOKEN_REST_SECONDS)
            tokens_used_this_minute = 0

        time.sleep(2)

    except Exception as e:
        # Best effort: report the failure and move on. The id is not in the
        # output file, so a later re-run of this cell retries it.
        print(f"Error with {imdb_id}: {e}")

In [None]:
## Add release_year to this file so that it can be used for creating plots

# File paths
# NOTE(review): this reads "claude_output_final.csv", while the prompt-running
# cell writes "claude_output.csv"; presumably the final file is produced by a
# manual step outside this notebook. Confirm.
claude_output_path = "../data/claude_output_final.csv"
sample_movies_path = "../data/sample_100_movies.csv"

# Read both CSVs
claude_df = pd.read_csv(claude_output_path)
sample_df = pd.read_csv(sample_movies_path)

# Validate column presence
if 'imdb_id' not in claude_df.columns or 'imdb_id' not in sample_df.columns:
    raise ValueError("Both files must contain an 'imdb_id' column.")
if 'year_of_release' not in sample_df.columns:
    raise ValueError("'sample_100_movies.csv' must contain a 'year_of_release' column.")

# This cell overwrites its own input, so on a re-run the file already has a
# year_of_release column. Drop it first; otherwise the merge would produce
# year_of_release_x / year_of_release_y and the reorder below would raise KeyError.
if 'year_of_release' in claude_df.columns:
    claude_df = claude_df.drop(columns='year_of_release')

# Merge to bring in year_of_release (left join keeps every Claude output row)
claude_df = claude_df.merge(sample_df[['imdb_id', 'year_of_release']], on='imdb_id', how='left')

# Reorder columns: imdb_id, year_of_release, then everything else
columns = ['imdb_id', 'year_of_release'] + [col for col in claude_df.columns if col not in ['imdb_id', 'year_of_release']]
claude_df = claude_df[columns]

# Overwrite the Claude output file
claude_df.to_csv(claude_output_path, index=False)

print(f"Updated '{claude_output_path}' with year_of_release as the second column.")