## 1. Setup Environment

In [None]:
# Install dependencies
!pip install -q sentence-transformers chromadb scikit-learn transformers accelerate bitsandbytes pymongo pdfplumber torch

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Navigate to project directory.
# Later cells use relative paths (src/, data/, embeddings/, summaries/),
# so they resolve against this folder.
%cd /content/drive/MyDrive/bachelor_project

In [None]:
# Make the project's src/ directory importable from this notebook.
import sys
import os

# src/ is expected directly under the current working directory.
current_dir = os.getcwd()
src_path = os.path.join(current_dir, 'src')

# Register the directory only once, so re-running the cell does not add duplicates.
if src_path not in sys.path:
    sys.path.insert(0, src_path)
    print(f"‚úì Added to path: {src_path}")

# Report what is (or is not) there so a missing upload is noticed early.
if not os.path.exists(src_path):
    print(f"‚ö†Ô∏è Warning: src/ directory not found at {src_path}")
    print("Make sure you've uploaded your project files to Google Drive")
else:
    src_files = os.listdir(src_path)
    print(f"‚úì Found {len(src_files)} files in src/")
    print(f"  Files: {', '.join(name for name in src_files if name.endswith('.py'))}")

In [None]:
# Print an overview of the working directory: its entries, the Python files
# under src/, and any .txt files at the top level.
import os

print("üìÅ Current directory:", os.getcwd())
print("\nüìÇ Directory contents:")
for item in os.listdir('.'):
    # Folders and files get different icons.
    if os.path.isdir(item):
        item_type = "üìÅ"
    else:
        item_type = "üìÑ"
    print(f"  {item_type} {item}")

# Report on src/ and list only its .py files.
if not os.path.exists('src'):
    print("\n‚ö†Ô∏è src/ folder not found!")
else:
    print("\n‚úì src/ folder found!")
    print("üìÇ Files in src/:")
    for item in filter(lambda name: name.endswith('.py'), os.listdir('src')):
        print(f"  üìÑ {item}")

# Top-level .txt files; nothing is printed when there are none.
txt_files = list(filter(lambda name: name.endswith('.txt'), os.listdir('.')))
if txt_files:
    print(f"\n‚úì Found {len(txt_files)} .txt file(s):")
    print("\n".join(f"  üìÑ {f}" for f in txt_files))

## 2. Load Llama 3.1 Model

**Model Seçenekleri:**
- **Llama 3.2 3B**: Meta'nın son modeli, hafif (HF token gerekli) (ÖNERİLEN)
- **Llama 3.1 8B**: Daha güçlü ama ağır (HF token gerekli)
- **Qwen 2.5 7B**: Türkçe dahil çok dilli, token gerekmez
- **Mistral 7B**: Açık erişimli, token gerekmez

Llama kullanmak için HuggingFace token'ını Colab Secrets'a eklemelisin.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Report the installed PyTorch version and whether CUDA is available.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Name of CUDA device 0.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Get HuggingFace token from Colab Secrets.
# On success the token is kept in HF_TOKEN and exported as the HF_TOKEN
# environment variable; on any failure setup instructions are printed and
# HF_TOKEN is set to None.
from google.colab import userdata
import os

try:
    HF_TOKEN = userdata.get('HF_TOKEN')
    os.environ['HF_TOKEN'] = HF_TOKEN
    print("‚úì HuggingFace token loaded from Colab Secrets")
    # Only the length is reported. Token characters are never echoed, because
    # cell output is saved with the notebook and would leak the secret.
    print(f"Token length: {len(HF_TOKEN)} characters")
except Exception as e:
    # Best-effort: explain how to add the secret instead of aborting the notebook.
    print(f"‚ùå Token bulunamadƒ±: {e}")
    print("\n=== TOKEN EKLEME ADIMLARI ===")
    print("1. Sol panelde üîë (Key/Secrets) simgesine tƒ±kla")
    print("2. 'Add new secret' butonuna bas")
    print("3. Name: HF_TOKEN")
    print("4. Value: HuggingFace token'ƒ±nƒ± yapƒ±≈ütƒ±r")
    print("5. 'Notebook access' toggle'ƒ±nƒ± A√á (√∂nemli!)")
    print("6. Bu h√ºcreyi tekrar √ßalƒ±≈ütƒ±r")
    print("\nToken almak i√ßin: https://huggingface.co/settings/tokens")
    HF_TOKEN = None

### HuggingFace Token Setup

**Colab Secrets'a token eklemek için:**
1. Sol panelde 🔑 (Key) simgesine tıkla
2. "Add new secret" butonuna bas
3. Name: `HF_TOKEN`
4. Value: HuggingFace token'ını yapıştır (https://huggingface.co/settings/tokens)
5. "Notebook access" toggle'ını aç

Token eklendikten sonra aşağıdaki hücreleri çalıştır.

In [None]:
# Load Model for Summarization
# Run this cell when you're ready to generate summaries (Step 6)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os

# Checkpoint selection: keep exactly one MODEL_ID assignment active.
# Option 3: Qwen 2.5 7B (no token needed, excellent Turkish support) - RECOMMENDED
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

# Option 4: Mistral 7B (no token needed)
# MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

# Option 1: Llama 3.2 3B (gated, requires HF token)
# MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"

# Option 2: Llama 3.1 8B (gated, requires HF token, more powerful)
# MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Quantization settings: 4-bit NF4 weights with double quantization,
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# The tokenizer is loaded first; the model follows.
print(f"Loading tokenizer from {MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

print("Loading model (this may take a few minutes)...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
)

# Confirmation lines, one per line.
print(
    "‚úì Model loaded successfully!",
    f"Model: {MODEL_ID}",
    f"Device: {model.device}",
    sep="\n",
)

## 3. Parse and Prepare Screenplay

In [None]:
import os
import re

# Input screenplay: either a plain-text script or a PDF that is converted first.
script_file = "the_addams_family.txt"  # Change this to your file

# Descriptive metadata. "type" decides how output folders are named later on.
# For series, add: "season": 1, "episode": 1
metadata = dict(
    title="The Addams Family",  # Film or series name
    type="movie",  # "movie" or "series"
    year=1991,  # Optional
)

# Reject unsupported formats up front, then resolve the text file to work on.
if not script_file.endswith(('.txt', '.pdf')):
    raise ValueError(f"Unsupported file format: {script_file}. Use .txt or .pdf")

if script_file.endswith('.txt'):
    txt_file = script_file
    print(f"‚úì Using existing text file: {txt_file}")
else:
    # PDF input: convert with the project's parser, which returns the text file path.
    from parser_pdf_to_txt import PDFParser
    parser = PDFParser(output_dir="data/raw_scripts")
    txt_file = parser.parse_pdf(script_file)
    print(f"‚úì Parsed PDF to: {txt_file}")

# Stop early when the resolved text file is missing.
if not os.path.exists(txt_file):
    raise FileNotFoundError(f"File not found: {txt_file}")

# ========== PREPROCESSING: Clean screenplay text (INLINE) ==========
print("\nüìã PREPROCESSING: Cleaning screenplay...")

# Regular expressions used by clean_screenplay().
patterns = {
    # Whole-line matches: the line is dropped.
    'scene_header': r'^(INT\.|EXT\.)\s+[^\n]*(DAY|NIGHT|DAWN|DUSK|CONTINUOUS)',
    'transition': r'^(CUT TO|FADE IN|FADE OUT|DISSOLVE TO|SMASH CUT)\b',
    # In-line matches: only the matched text is removed.
    'technical': r'\b(CAMERA|SOUND|MUSIC|MONTAGE|TITLE CARD)\b',
    'parenthetical': r'\([^)]*(?:whispering|shouting|V\.O\.|O\.S\.|beat|pause)\)',
    'whitespace': r'\s+',
    # Separator rules such as "-----" or "*****".
    'formatting': r'^[-=*]{3,}$',
}

def clean_screenplay(text, min_line_length=5):
    """Remove screenplay formatting noise and return the remaining text.

    Every line is stripped and blank lines are dropped. Lines that are scene
    headers, transitions or separator rules are dropped entirely. In the
    remaining lines, matching parentheticals and technical cue words are
    deleted and runs of whitespace are collapsed to a single space.

    Args:
        text: Full screenplay text.
        min_line_length: Lines shorter than this after cleaning are discarded.

    Returns:
        The kept lines joined by single newline characters. The result contains
        no blank lines and is an empty string when nothing survives.
    """
    cleaned_lines = []

    for line in text.split('\n'):
        original = line.strip()
        if not original:
            continue

        # Skip scene headers, transitions, formatting
        if (re.match(patterns['scene_header'], original, re.IGNORECASE) or
            re.match(patterns['formatting'], original) or
            re.match(patterns['transition'], original, re.IGNORECASE)):
            continue

        cleaned = re.sub(patterns['parenthetical'], '', original, flags=re.IGNORECASE)
        # Technical cues are matched case-sensitively. Ignoring case would also
        # delete ordinary words such as "music" or "sound" from dialogue and
        # corrupt the text that is summarized later.
        cleaned = re.sub(patterns['technical'], '', cleaned)
        cleaned = re.sub(patterns['whitespace'], ' ', cleaned).strip()

        if cleaned and len(cleaned) >= min_line_length:
            cleaned_lines.append(cleaned)

    # Every kept line is non-empty and already stripped, so the joined text
    # needs no further newline collapsing or trimming.
    return '\n'.join(cleaned_lines)

# Read the raw script and run it through the cleaner.
with open(txt_file, 'r', encoding='utf-8') as f:
    original_text = f.read()

cleaned_text = clean_screenplay(original_text)

# Statistics: line and character counts before and after cleaning.
orig_lines = len(original_text.split('\n'))
clean_lines = len(cleaned_text.split('\n'))
orig_chars = len(original_text)
clean_chars = len(cleaned_text)
# Guarded so an empty input file does not divide by zero.
noise_pct = round((1 - clean_chars / orig_chars) * 100, 1) if orig_chars > 0 else 0

print("\nüìä Cleaning Statistics:")
print(f"  Lines: {orig_lines} ‚Üí {clean_lines} ({orig_lines - clean_lines} removed, {round((orig_lines-clean_lines)/orig_lines*100, 1)}%)")
print(f"  Characters: {orig_chars:,} ‚Üí {clean_chars:,}")
print(f"  Noise removed: {noise_pct}%")

# Save cleaned version next to the source file.
# The suffix is inserted before the extension with splitext. A plain
# str.replace('.txt', ...) would rewrite every ".txt" in the path and, when the
# path contains none, would leave it unchanged so the raw script is overwritten.
txt_file_cleaned = "{}_cleaned{}".format(*os.path.splitext(txt_file))
with open(txt_file_cleaned, 'w', encoding='utf-8') as f:
    f.write(cleaned_text)

# Later cells read the cleaned file.
txt_file = txt_file_cleaned
print(f"\n‚úì Using cleaned text: {txt_file}")

In [None]:
# Split into scenes
from scene_splitter import SceneSplitter

# Scene output folder: data/scenes/<title slug>, with an _sXXeYY suffix for
# anything that is not a movie (season and episode default to 1).
output_dir = "data/scenes/" + metadata['title'].replace(' ', '_').lower()
if metadata["type"] != "movie":
    output_dir += f"_s{metadata.get('season', 1):02d}e{metadata.get('episode', 1):02d}"

# Split the script file into scenes with the project's splitter.
splitter = SceneSplitter(output_dir=output_dir)
scenes = splitter.process_script(txt_file, metadata=metadata)

print(
    f"‚úì Split into {len(scenes)} scenes",
    f"‚úì Saved to: {output_dir}",
    sep="\n",
)

In [None]:
# Chunk long scenes
from chunker import SceneChunker

# The chunk prefix is the title slug, extended with _sXXeYY for anything that
# is not a movie. The chunk folder is data/chunks/<prefix>.
prefix = metadata['title'].replace(' ', '_').lower()
if metadata["type"] != "movie":
    prefix += f"_s{metadata.get('season', 1):02d}e{metadata.get('episode', 1):02d}"
chunks_dir = f"data/chunks/{prefix}"

# Chunk the scenes found in the scene output folder.
chunker = SceneChunker(output_dir=chunks_dir)
chunks = chunker.process_all_scenes(output_dir, prefix=prefix)

print(
    f"‚úì Created {len(chunks)} chunks",
    f"‚úì Saved to: {chunks_dir}",
    sep="\n",
)

## 4. Generate Embeddings and Store in ChromaDB

In [None]:
from embedder import EmbeddingManager
from vectorstore import VectorStore

# Generate embeddings for the chunks with the project's EmbeddingManager.
# Later cells read chunk['embedding'] from the returned items, so each returned
# chunk presumably carries its vector under that key (verify in embedder.py).
embedder = EmbeddingManager()
chunks_with_embeddings = embedder.add_embeddings_to_chunks(chunks)

print(f"‚úì Generated embeddings for {len(chunks_with_embeddings)} chunks")

In [None]:
# Store in ChromaDB (persisted under the local "embeddings" directory).
import re

vectorstore = VectorStore(persist_directory="embeddings")

# Name the collection after the screenplay being processed. The same `prefix`
# is used for the chunk files, so each title gets its own collection instead of
# all of them sharing one placeholder name.
# Characters outside [a-z0-9_-] are replaced and leading/trailing separators
# trimmed, because Chroma restricts which characters a collection name may use.
collection_name = re.sub(r'[^a-z0-9_-]+', '_', prefix).strip('_-')

vectorstore.add_chunks_to_collection(collection_name, chunks_with_embeddings)

print(f"‚úì Stored in collection: {collection_name}")

## 5. Select Representative Scenes (BRV)

In [None]:
from clustering import RepresentativeSceneSelector
import numpy as np

# Extract embeddings into one array.
# NOTE(review): `embeddings` is not passed to the selector below, which
# receives the chunk dicts directly.
embeddings = np.array([chunk['embedding'] for chunk in chunks_with_embeddings])

# Select representatives.
# NOTE(review): n_clusters=10 is given together with auto_select_k=True; how the
# selector reconciles the two is not visible in this notebook (confirm in clustering.py).
selector = RepresentativeSceneSelector(n_clusters=10)
representatives = selector.select_brv_scenes(chunks_with_embeddings, auto_select_k=True)

print(f"‚úì Selected {len(representatives)} representative scenes")

In [None]:
# Show header, scene/cluster ids and the first 150 characters of up to five
# representative scenes, numbered from 1.
for i, rep in enumerate(representatives[:5], start=1):
    preview = rep['content'][:150]
    print(f"\n{i}. {rep['header']}")
    print(f"   Scene: {rep['scene_number']}, Cluster: {rep['cluster_id']}")
    print(f"   Preview: {preview}...")

In [None]:
# Visualize embeddings in 2D space
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np

# Collect per-chunk vectors, cluster labels (-1 when a chunk has no
# 'cluster_id') and scene numbers.
embeddings_array = np.array([chunk['embedding'] for chunk in chunks_with_embeddings])
cluster_labels = np.array([chunk.get('cluster_id', -1) for chunk in chunks_with_embeddings])
scene_numbers = [chunk['scene_number'] for chunk in chunks_with_embeddings]

# Project to 2D with t-SNE. Perplexity is capped below the number of chunks and
# the seed is fixed so the layout is repeatable.
print("üîÑ Reducing embeddings to 2D (this may take a minute)...")
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(embeddings_array)-1))
embeddings_2d = tsne.fit_transform(embeddings_array)

# One figure with a single axes; everything below draws on `ax`.
fig, ax = plt.subplots(figsize=(14, 10))

# One scatter series per cluster, colored from the tab10 colormap.
unique_clusters = np.unique(cluster_labels)
colors = plt.cm.tab10(np.linspace(0, 1, len(unique_clusters)))

for color, cluster_id in zip(colors, unique_clusters):
    mask = cluster_labels == cluster_id
    ax.scatter(
        embeddings_2d[mask, 0],
        embeddings_2d[mask, 1],
        c=[color],
        label=f'Cluster {int(cluster_id)}',
        alpha=0.6,
        s=100,
    )

# Locate the representative chunks in the 2D projection.
# Assumes chunk_id is 1-indexed and equals the chunk's position in
# chunks_with_embeddings plus one - TODO confirm against the chunker.
rep_indices = [chunk['chunk_id']-1 for chunk in representatives]
rep_coords = embeddings_2d[rep_indices]

# Draw the representatives on top as large red stars.
ax.scatter(
    rep_coords[:, 0],
    rep_coords[:, 1],
    c='red',
    marker='*',
    s=500,
    edgecolors='black',
    linewidths=2,
    label='Representatives',
    zorder=5,
)

# Tag each star R1, R2, ... in the order of `representatives`.
for label_number, (x, y) in enumerate(rep_coords, start=1):
    ax.annotate(
        f"R{label_number}",
        (x, y),
        xytext=(5, 5),
        textcoords='offset points',
        fontsize=10,
        fontweight='bold',
        bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.7),
    )

ax.set_title('Chunk Embeddings Visualization (2D t-SNE)', fontsize=16, fontweight='bold')
ax.set_xlabel('t-SNE Dimension 1', fontsize=12)
ax.set_ylabel('t-SNE Dimension 2', fontsize=12)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

print(f"‚úì Visualized {len(embeddings_array)} chunks in {len(unique_clusters)} clusters")
print(f"‚≠ê {len(representatives)} representative chunks highlighted in red")

### Visualize Clusters

Embedding'leri 2D'ye düşürüp cluster'ları görselleştirelim.

## 6. Generate Summary (Map-Reduce)

In [None]:
# MAP phase precondition: the summarization model and tokenizer must already
# be in memory (load them with the "Load Model for Summarization" cell in Step 2).

try:
    # Bare name lookups; either one raises NameError when the name is undefined.
    tokenizer, model
except NameError:
    print("‚ùå ERROR: Model not loaded!")
    print("\nüîÑ STEPS TO FIX:")
    print("1. Scroll up to Step 2, Cell 11 (Load Model for Summarization)")
    print("2. Run that cell to load the Qwen model")
    print("3. Wait for 'Model loaded successfully!' message")
    print("4. Then come back here and run this cell again")
    # Stop the cell so the summarization code below does not run without a model.
    raise NameError("Model and tokenizer must be loaded before summarization. Run Cell 11 first!")
else:
    print("‚úì Model and tokenizer are loaded")

from tqdm import tqdm
import gc

def summarize_scene(scene_content, max_tokens=200, max_input_tokens=1200):
    """Summarize one scene with the globally loaded chat model.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        scene_content: Text of the scene (or chunk) to summarize.
        max_tokens: Maximum number of new tokens to generate.
        max_input_tokens: Upper bound on the prompt length in tokens. Overlong
            scenes are shortened so that the instructions and the assistant
            generation prompt at the end of the chat template are kept.

    Returns:
        The generated summary only (the echoed prompt is not included),
        stripped of surrounding whitespace.
    """
    system_prompt = """You are a professional film critic. Summarize scenes clearly and concisely.
Focus on key events, character interactions, and plot developments.
Write in narrative style, not bullet points."""

    def render_prompt(content):
        # Build the chat-formatted prompt string for the given scene text.
        user_prompt = f"""Summarize this scene:

{content}

Provide a clear, narrative summary (1-2 paragraphs)."""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Shorten the scene text itself instead of truncating the rendered prompt.
    # Truncating the rendered prompt cuts from the end, which removes the
    # closing instruction and the assistant header the model needs to answer.
    # A small margin absorbs token-count drift when the text is re-tokenized.
    template_tokens = len(tokenizer(render_prompt(""), add_special_tokens=False)["input_ids"])
    content_budget = max(max_input_tokens - template_tokens - 16, 1)
    content_ids = tokenizer(scene_content, add_special_tokens=False)["input_ids"]
    if len(content_ids) > content_budget:
        scene_content = tokenizer.decode(content_ids[:content_budget], skip_special_tokens=True)

    # The chat template already contains the special tokens, so none are added
    # again here. The hard cap remains as a safety net for memory.
    inputs = tokenizer(
        render_prompt(scene_content),
        return_tensors="pt",
        add_special_tokens=False,
        truncation=True,
        max_length=max_input_tokens
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            # KV cache disabled as a memory-saving choice.
            # NOTE(review): without the cache each decoding step re-processes
            # the whole sequence, so generation is slower - confirm the trade-off.
            use_cache=False
        )

    # generate() returns prompt + continuation. Decode only the new tokens
    # rather than searching the text for a model-specific "assistant" marker,
    # which is absent for e.g. Qwen and leaves the whole prompt in the summary.
    summary = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Aggressive memory cleanup
    del inputs, outputs
    torch.cuda.empty_cache()
    gc.collect()

    return summary

# MAP phase driver: summarize every representative scene and keep its scene
# number and header with the summary.
map_summaries = []
for i, rep in enumerate(tqdm(representatives, desc="MAP phase")):
    summary = summarize_scene(rep['content'])
    map_summaries.append(dict(
        scene_number=rep['scene_number'],
        header=rep['header'],
        summary=summary,
    ))

    # Release cached GPU memory and collect garbage after every scene.
    torch.cuda.empty_cache()
    gc.collect()

    # Report allocated GPU memory only on every third scene.
    if (i + 1) % 3 != 0:
        continue
    mem = torch.cuda.memory_allocated() / 1e9
    print(f"  [{i+1}/{len(representatives)}] Memory: {mem:.2f}GB")

print(f"\n‚úì Generated {len(map_summaries)} scene summaries")

In [None]:
# REDUCE phase: Combine into final summary (with memory optimization)
import gc

def combine_summaries(summaries, max_tokens=400, max_input_tokens=1500):
    """Merge the per-scene summaries into one narrative with the loaded chat model.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        summaries: Sequence of dicts; the 'summary' value of each is used.
        max_tokens: Maximum number of new tokens to generate.
        max_input_tokens: Upper bound on the prompt length in tokens. When the
            joined summaries are too long they are shortened from the end, so
            the instructions and the assistant generation prompt are kept.

    Returns:
        The generated overall summary only (the echoed prompt is not included),
        stripped of surrounding whitespace.
    """
    system_prompt = """You are a professional film critic writing a complete screenplay summary.
Combine the scene summaries into one coherent, chronological narrative.
Be concise but comprehensive."""

    # More compact joining
    combined_text = "\n".join([s['summary'] for s in summaries])

    def render_prompt(content):
        # Build the chat-formatted prompt string for the given summaries text.
        user_prompt = f"""Combine these scene summaries into one complete story:

{content}

Write a concise narrative summary."""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Shorten the summaries text itself instead of truncating the rendered
    # prompt. Truncating the rendered prompt cuts from the end, which removes
    # the closing instruction and the assistant header the model needs.
    # A small margin absorbs token-count drift when the text is re-tokenized.
    template_tokens = len(tokenizer(render_prompt(""), add_special_tokens=False)["input_ids"])
    content_budget = max(max_input_tokens - template_tokens - 16, 1)
    content_ids = tokenizer(combined_text, add_special_tokens=False)["input_ids"]
    if len(content_ids) > content_budget:
        combined_text = tokenizer.decode(content_ids[:content_budget], skip_special_tokens=True)

    # The chat template already contains the special tokens, so none are added
    # again here. The hard cap remains as a safety net for memory.
    inputs = tokenizer(
        render_prompt(combined_text),
        return_tensors="pt",
        add_special_tokens=False,
        truncation=True,
        max_length=max_input_tokens
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    print("Generating final summary...")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            # KV cache disabled as a memory-saving choice.
            # NOTE(review): without the cache each decoding step re-processes
            # the whole sequence, so generation is slower - confirm the trade-off.
            use_cache=False
        )

    # generate() returns prompt + continuation. Decode only the new tokens
    # rather than searching the text for a model-specific "assistant" marker,
    # which is absent for e.g. Qwen and leaves the whole prompt in the summary.
    summary = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    # Cleanup
    del inputs, outputs
    torch.cuda.empty_cache()
    gc.collect()

    return summary

# Free cached GPU memory and collect garbage before the REDUCE generation.
torch.cuda.empty_cache()
gc.collect()

# REDUCE: fold all scene summaries into the final narrative.
final_summary = combine_summaries(map_summaries)

# Blank line, then a banner of 60 "=" above and below the heading, then the summary.
print(
    "\n" + "="*60,
    "FINAL SUMMARY",
    "="*60,
    final_summary,
    sep="\n",
)

## 7. Save to MongoDB

In [None]:
from mongodb_manager import MongoDBManager

# Connect to MongoDB (use MongoDB Atlas for cloud).
# The connection string contains credentials, so it is read from the
# MONGODB_URI environment variable or, failing that, from a Colab secret of the
# same name. It must never be pasted into the notebook itself.
# Expected form: mongodb+srv://<username>:<password>@cluster.mongodb.net/
import os

mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    from google.colab import userdata
    # Expected to raise when the secret is missing or notebook access is not granted.
    mongodb_uri = userdata.get("MONGODB_URI")

mongo = MongoDBManager(uri=mongodb_uri, db_name="screenplay_summaries")

# Describe the stored document with the metadata of the screenplay that was
# actually processed, instead of fixed placeholder values.
is_series = metadata["type"] != "movie"
season = metadata.get("season", 1) if is_series else None
episode = metadata.get("episode", 1) if is_series else None
summary_title = metadata["title"]
if is_series:
    summary_title = f"{summary_title} S{season:02d}E{episode:02d}"

# Save summary
doc_id = mongo.save_summary(
    title=summary_title,
    final_summary=final_summary,
    map_outputs=map_summaries,
    metadata={
        "show": metadata["title"],
        "season": season,    # None for movies
        "episode": episode,  # None for movies
        "processed_on": "colab"
    }
)

print(f"‚úì Saved to MongoDB with ID: {doc_id}")

## 8. Export Results

In [None]:
# Save final summary to file.
import os

# The file is named after the processed screenplay (same `prefix` as the chunk
# files). The target folder is created first, because open() fails when
# summaries/final/ does not exist yet.
summary_path = f"summaries/final/{prefix}.txt"
os.makedirs(os.path.dirname(summary_path), exist_ok=True)
with open(summary_path, "w", encoding="utf-8") as f:
    f.write(final_summary)

print("‚úì Summary saved to file")

# Download to local machine
from google.colab import files
files.download(summary_path)