# Screenplay Proofreading with Gemma 2 9B IT (4-bit)

This notebook runs the proofreading process using a free T4 or P100 GPU on Google Colab or Kaggle.

### Instructions:
1.  **Runtime**: Ensure you are connected to a GPU Runtime (Colab: Runtime > Change runtime type > T4 | Kaggle: Settings > Accelerator > GPU T4 x2).
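2.  **Files**: Upload your `ParsedScreenplays.zip` file to the notebook's working directory (on Kaggle it may instead be attached as a dataset under `/kaggle/input`). The archive must unpack to a `ParsedScreenplays/` folder containing the `.txt` screenplays.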
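3.  **Secrets**: Add a secret named `HF_TOKEN` with your Hugging Face token (Colab: Secrets panel | Kaggle: Add-ons > Secrets). The account that owns the token must have accepted the Gemma license on the model page, because `google/gemma-2-9b-it` is a gated model.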
4.  **Run All**: Run all cells in order.

In [None]:
# Install necessary libraries
!pip install -q -U transformers accelerate bitsandbytes

In [None]:
from huggingface_hub import login
import os

def _find_hf_token():
    """Look up the 'HF_TOKEN' secret on Colab first, then on Kaggle.

    Returns:
        A ``(token, source)`` tuple where ``source`` is ``"Colab"`` or
        ``"Kaggle"``, or ``(None, None)`` when neither platform yields a token.
    """
    # Each provider is probed independently so that a failed Colab lookup
    # (missing secret, access denied, or the package simply not installed)
    # never prevents the Kaggle lookup from being attempted.
    try:
        from google.colab import userdata
        token = userdata.get('HF_TOKEN')
        if token:
            return token, "Colab"
    except Exception:
        pass  # Best-effort probe: fall through to the next provider.

    try:
        from kaggle_secrets import UserSecretsClient
        token = UserSecretsClient().get_secret("HF_TOKEN")
        if token:
            return token, "Kaggle"
    except Exception:
        pass  # Best-effort probe: fall through to interactive login.

    return None, None


hf_token, _token_source = _find_hf_token()

if hf_token:
    try:
        login(token=hf_token)
        print(f"Logged in via {_token_source} Secret 'HF_TOKEN'!")
    except Exception as exc:
        # The secret exists but was rejected; say so instead of claiming it is missing.
        print(f"Login with {_token_source} Secret 'HF_TOKEN' failed ({exc}). Logging in interactively...")
        login()
else:
    print("Secret 'HF_TOKEN' not found. Logging in interactively...")
    login()

In [None]:
# Unzip the uploaded screenplays
import zipfile
import glob

# Locate the archive. It is expected in the working directory; on Kaggle it may
# instead have been attached as a dataset, in which case it lives somewhere
# under /kaggle/input.
zip_name = 'ParsedScreenplays.zip'
zip_path = zip_name
if not os.path.exists(zip_path):
    # Sort so the choice is deterministic, and prefer an archive with the
    # expected name over any other zip that happens to be attached.
    candidates = sorted(glob.glob('/kaggle/input/**/*.zip', recursive=True))
    preferred = [p for p in candidates if os.path.basename(p) == zip_name]
    if candidates:
        zip_path = (preferred or candidates)[0]
        print(f"Found zip at {zip_path}")

if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('.')
    print("Unzipped ParsedScreenplays!")
    # The processing loop reads from a 'ParsedScreenplays' folder; warn early
    # if the archive did not unpack to that layout.
    if not os.path.isdir('ParsedScreenplays'):
        print("WARNING: Archive extracted, but no 'ParsedScreenplays' folder was created. "
              "Check the layout of the zip file.")
else:
    print("WARNING: ParsedScreenplays.zip not found. Please upload it!")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# SWITCHING TO GEMMA 2 9B to resolve CUDA errors with Gemma 3 4B
model_id = "google/gemma-2-9b-it"

# 4-bit NF4 quantization with float16 compute.
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
}
bnb_config = BitsAndBytesConfig(**quantization_settings)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# device_map="auto" delegates device placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

print(f"Model {model_id} loaded successfully!")

In [None]:
import glob
import re
import tqdm

def split_into_chunks(text, max_chars=6000):
    """Split screenplay text into chunks of at most ``max_chars`` characters.

    The text is cut at scene headers of the form ``=== <number> <title> ===``
    and whole scenes are packed greedily into chunks, so a scene stays intact
    whenever it fits. Any piece that is longer than ``max_chars`` on its own
    (an oversized scene, or text without any scene header) is hard-split at
    ``max_chars`` boundaries so that no chunk ever exceeds the limit.

    Args:
        text: Full screenplay text.
        max_chars: Maximum chunk length in characters; must be positive.

    Returns:
        A list of non-empty strings. Concatenating them reproduces ``text``.

    Raises:
        ValueError: If ``max_chars`` is not positive.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    # The capturing group keeps the headers in the split result:
    # [preamble, header1, body1, header2, body2, ...]
    scene_pattern = re.compile(r'(=== \d+ .*? ===)')
    parts = scene_pattern.split(text)

    # Re-attach each header to its body; the preamble is its own segment.
    segments = [parts[0]]
    for i in range(1, len(parts), 2):
        body = parts[i + 1] if i + 1 < len(parts) else ""
        segments.append(parts[i] + body)

    chunks = []
    current_chunk = ""
    for segment in segments:
        if not segment:
            continue
        # Flush first if this segment would overflow the chunk being built
        # (never flush an empty chunk).
        if current_chunk and len(current_chunk) + len(segment) > max_chars:
            chunks.append(current_chunk)
            current_chunk = ""
        current_chunk += segment
        # A single segment may itself exceed the limit: peel off full-size pieces.
        while len(current_chunk) > max_chars:
            chunks.append(current_chunk[:max_chars])
            current_chunk = current_chunk[max_chars:]

    if current_chunk:
        chunks.append(current_chunk)
    return chunks

def proofread_chunk(text, filename, chunk_index, max_new_tokens=4000):
    """Ask the loaded model to proofread one screenplay chunk.

    Builds a single-turn chat prompt containing the proofreading instructions,
    the filename/chunk number and the chunk text, generates a sampled
    completion, and returns only the newly generated text.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: The screenplay segment to proofread.
        filename: Name of the source file, quoted in the prompt.
        chunk_index: Chunk number, quoted in the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded model response with special tokens removed.
    """
    prompt = (
        "You are a professional proofreader. Proofread the following screenplay text segment, "
        "identifying typos, formatting inconsistencies, or logical errors. "
        "For each error you find:\n"
        "1. Cite the Episode (from filename) and Scene (from headers).\n"
        "2. Explain the error.\n"
        "3. Provide the corrected text for that section.\n\n"
        "After listing the errors, provide the full validated and corrected version of the text segment. "
        "Do not leave out any part of the original text in your corrected version.\n\n"
        f"Filename: {filename} (Chunk {chunk_index})\n\n"
        "Screenplay Content:\n"
        f"{text}"
    )

    messages = [
        {"role": "user", "content": prompt}
    ]

    # return_dict=True yields both input_ids and attention_mask regardless of
    # the library's default return type; the tensors go to whichever device
    # the model was placed on rather than a hard-coded "cuda".
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
    )

    # generate() returns prompt + completion; drop the prompt tokens.
    prompt_length = inputs["input_ids"].shape[-1]
    response = outputs[0][prompt_length:]
    return tokenizer.decode(response, skip_special_tokens=True)

In [None]:
# Main Processing Loop
#
# Every screenplay is written to "<output>.partial" while it is being processed
# and renamed to its final name only once all chunks succeeded. A final output
# file therefore always means "fully processed", so an interrupted or partly
# failed run is retried on the next execution instead of being skipped.

input_dir = "ParsedScreenplays"
output_dir = "ProofreadScreenplays"

os.makedirs(output_dir, exist_ok=True)

screenplay_paths = sorted(glob.glob(os.path.join(input_dir, "*.txt")))
print(f"Found {len(screenplay_paths)} screenplays.")

for file_path in tqdm.tqdm(screenplay_paths, desc="Proofreading"):
    filename = os.path.basename(file_path)
    output_path = os.path.join(output_dir, f"Proofread_{filename}")
    partial_path = output_path + ".partial"

    if os.path.exists(output_path):
        print(f"Skipping {filename} (exists)")
        continue

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    chunks = split_into_chunks(content, max_chars=6000)
    print(f"\nProcessing {filename} in {len(chunks)} chunks...")

    failed_chunks = []
    with open(partial_path, 'w', encoding='utf-8') as out:
        out.write(f"Proofreading Output for {filename}\n")

        for i, chunk in enumerate(chunks, start=1):
            # Retry logic could be added here, but usually local/cloud inference is stable unless OOM
            try:
                result = proofread_chunk(chunk, filename, i)
            except Exception as e:
                failed_chunks.append(i)
                print(f"Error on chunk {i}: {e}")
                # Leave a visible marker so the gap is obvious in the output.
                out.write(f"\n=== Chunk {i} Output ===\n[ERROR: chunk could not be processed: {e}]\n")
            else:
                out.write(f"\n=== Chunk {i} Output ===\n{result}\n")
                print(f"Done Chunk {i}/{len(chunks)}")
            # Persist progress after every chunk in case the session is cut off.
            out.flush()

    if failed_chunks:
        print(f"Finished {filename} with errors in chunk(s) {failed_chunks}; "
              f"partial output kept at {partial_path} and the file will be retried on the next run.")
    else:
        os.replace(partial_path, output_path)
        print(f"Finished {filename}")

In [None]:
# Zip results for download
import shutil

# Pure-Python equivalent of `zip -r ProofreadScreenplays.zip ProofreadScreenplays`:
# the archive contains the ProofreadScreenplays/ folder at its top level.
shutil.make_archive(
    "ProofreadScreenplays", "zip", root_dir=".", base_dir="ProofreadScreenplays"
)
print("Zipped results! Downloading... (If on Colab)")

try:
    # Imported under an alias so it cannot shadow a notebook variable named `files`.
    from google.colab import files as colab_files
except ImportError:
    print("On Kaggle: Output file 'ProofreadScreenplays.zip' is in the working directory.")
    # Kaggle auto-saves outputs, no explicit download call needed
else:
    try:
        colab_files.download('ProofreadScreenplays.zip')
    except Exception as exc:
        # Do not fail the cell if the browser download cannot be triggered.
        print(f"Automatic download failed ({exc}). "
              "Output file 'ProofreadScreenplays.zip' is in the working directory.")