# Gaza Journalist Video Classifier - Validation

**Multimodal classification with Audio + Vision + OCR**

## Instructions:
1. **Run Cell 1** - Setup (takes ~5-10 minutes) - Run ONCE
2. **Run Cell 2** - Load processing functions - Run ONCE  
3. **Run Cell 3** - Upload Excel & Process videos - Run MANY times

## Cell 1: Setup (Run Once)

In [None]:
%%bash
# One-time environment setup: Python/system packages, whisper.cpp (build +
# model), and Ollama with the two models used by the processing functions.

echo "[1/6] Installing Python packages..."
pip install -q yt-dlp pandas openpyxl pytesseract pillow requests

echo "[2/6] Installing system packages..."
apt-get update -qq > /dev/null 2>&1
apt-get install -qq tesseract-ocr tesseract-ocr-ara ffmpeg git build-essential

echo "[3/6] Setting up Whisper.cpp..."
if [ ! -d "whisper.cpp" ]; then
    git clone https://github.com/ggerganov/whisper.cpp.git
fi
# Guard the build on the binary rather than on the checkout directory, so a
# build that failed on an earlier run is retried instead of silently skipped.
if [ ! -x "whisper.cpp/build/bin/whisper-cli" ]; then
    (cd whisper.cpp && make -j4)
fi

echo "[4/6] Downloading Whisper model..."
if [ ! -f "whisper.cpp/models/ggml-base.bin" ]; then
    (cd whisper.cpp && bash ./models/download-ggml-model.sh base)
fi

echo "[5/6] Installing Ollama..."
if ! command -v ollama &> /dev/null; then
    curl -fsSL https://ollama.com/install.sh | sh
fi

echo "[6/6] Starting Ollama and pulling models..."
# Only launch a server when none is answering yet; re-running this cell would
# otherwise start a second instance and truncate the running server's log.
if ! curl -sf http://localhost:11434/ > /dev/null 2>&1; then
    nohup ollama serve > /tmp/ollama.log 2>&1 &
fi
# Poll for readiness (up to ~30 s) instead of relying on a fixed sleep, so the
# pulls below do not race the server start-up.
for _ in $(seq 1 30); do
    if curl -sf http://localhost:11434/ > /dev/null 2>&1; then
        break
    fi
    sleep 1
done

echo "  - Pulling Qwen 2.5 72B..."
ollama pull qwen2.5:72b
echo "  - Pulling LLaVA..."
ollama pull llava-llama-3:8b

echo ""
echo "✓ Setup complete!"

## Cell 2: Load Functions (Run Once After Setup)

In [None]:
# All processing functions embedded here
import json, subprocess, pandas as pd, tempfile, time, re, os, requests, base64
from pathlib import Path
from typing import Dict, List

# Category labels offered to the classification model (exactly one per video).
CATEGORIES = [
    "Destruction of Property",
    "Displacement",
    "IDF",
    "Jewish Dissent",
    "Inhumane Acts",
    "Imprisonment",
    "Resilience",
    "Starvation of Civilian",
    "Testimonials",
    "Willful Killing",
]

# Tag labels offered to the classification model (zero or more per video).
TAGS = [
    "Birth Prevention",
    "Call to Action",
    "Ceasefire Violation",
    "Children",
    "Ethnic Cleansing",
    "Food",
    "Healthcare workers",
    "Hospitals",
    "IDF",
    "Journalists",
    "Media and Journalism",
    "Other",
    "Repression",
    "Schools",
    "Torture",
    "Water",
    "Women",
]

# whisper.cpp binary and model file, relative to the notebook's working directory.
WHISPER_PATH = "./whisper.cpp/build/bin/whisper-cli"
WHISPER_MODEL = "./whisper.cpp/models/ggml-base.bin"

# Ollama model names (vision model, text model) and the local generate endpoint.
LLAVA_MODEL = "llava-llama-3:8b"
LLM_MODEL = "qwen2.5:72b"
OLLAMA_URL = "http://localhost:11434/api/generate"

def download_video(url: str, output: str) -> bool:
    """Download the video at ``url`` to ``output`` with ``yt-dlp``.

    Args:
        url: URL handed to ``yt-dlp`` unchanged.
        output: Destination path passed to ``yt-dlp -o``.

    Returns:
        True when ``yt-dlp`` exits with status 0 and ``output`` exists.
        False on a non-zero exit, on the 120-second timeout, or when the
        ``yt-dlp`` process cannot be started.
    """
    cmd = ["yt-dlp", "-f", "best[ext=mp4]", "-o", output, url]
    try:
        r = subprocess.run(cmd, capture_output=True, timeout=120)
    except (subprocess.SubprocessError, OSError, ValueError):
        # Best-effort: timeouts, a missing binary, or an unusable argument
        # count as a failed download. KeyboardInterrupt is deliberately not
        # caught so a long batch can still be interrupted.
        return False
    return r.returncode == 0 and Path(output).exists()

def extract_audio(video: str, audio: str) -> bool:
    """Extract the audio track of ``video`` into a WAV file at ``audio``.

    ``ffmpeg`` is asked for 16-bit PCM, 16 kHz, mono output (``-acodec
    pcm_s16le -ar 16000 -ac 1``) and overwrites ``audio`` if it exists.

    Args:
        video: Path of the input video file.
        audio: Path of the WAV file to write.

    Returns:
        True when ``ffmpeg`` exits with status 0. False on a non-zero exit,
        on the 60-second timeout, or when ``ffmpeg`` cannot be started.
    """
    cmd = [
        "ffmpeg", "-i", video, "-vn",
        "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
        "-y", audio,
    ]
    try:
        r = subprocess.run(cmd, capture_output=True, timeout=60)
    except (subprocess.SubprocessError, OSError, ValueError):
        # Best-effort: report failure, but let KeyboardInterrupt propagate.
        return False
    return r.returncode == 0

def transcribe_audio(audio: str, lang: str = "ar") -> str:
    """Transcribe ``audio`` with the whisper.cpp command-line tool.

    Args:
        audio: Path of the audio file passed to whisper via ``-f``.
        lang: Language code passed via ``-l``; ``"auto"`` requests
            whisper's language auto-detection.

    Returns:
        The tool's stripped standard output, or ``""`` when the binary at
        ``WHISPER_PATH`` is missing, the run exceeds 120 seconds, or the
        process cannot be started.
    """
    if not os.path.exists(WHISPER_PATH):
        return ""
    # Always pass -l: when the flag is omitted whisper-cli uses its own
    # default language rather than auto-detecting, so "auto" must be
    # forwarded explicitly to take effect.
    cmd = [WHISPER_PATH, "-m", WHISPER_MODEL, "-f", audio, "-nt", "-l", lang]
    try:
        done = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    except (subprocess.SubprocessError, OSError, ValueError):
        # ValueError also covers output that cannot be decoded as text.
        return ""
    return done.stdout.strip()

def extract_frames(video: str, n: int = 5) -> List[str]:
    """Grab up to ``n`` evenly spaced JPEG frames from ``video``.

    The duration is read with ``ffprobe``; if that fails or its output is
    not a number, 30 seconds is assumed. Frames are taken at multiples of
    ``max(1.0, duration / (n + 1))`` seconds and written to a fresh
    temporary directory, which the caller is responsible for removing.

    Args:
        video: Path of the input video file.
        n: Number of frames to attempt.

    Returns:
        Paths of the frames that were actually written, in time order.
        May be shorter than ``n`` (or empty) when extraction fails.
    """
    # Failures we treat as "skip and carry on"; KeyboardInterrupt is not
    # among them, so the batch stays interruptible.
    recoverable = (subprocess.SubprocessError, OSError, ValueError)

    temp = tempfile.mkdtemp()
    probe_cmd = [
        "ffprobe", "-v", "error", "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1", video,
    ]
    try:
        probe = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=10)
        dur = float(probe.stdout.strip())
    except recoverable:
        dur = 30.0

    frames = []
    interval = max(1.0, dur / (n + 1))
    for i in range(n):
        t = interval * (i + 1)
        f = Path(temp) / f"frame_{i+1:03d}.jpg"
        grab_cmd = [
            "ffmpeg", "-ss", str(t), "-i", video,
            "-frames:v", "1", "-q:v", "2", "-y", str(f),
        ]
        try:
            subprocess.run(grab_cmd, capture_output=True, timeout=10)
        except recoverable:
            continue
        # ffmpeg can exit without writing anything (e.g. seeking past the
        # end of the video), so only existing files are reported.
        if f.exists():
            frames.append(str(f))
    return frames

def extract_text(img: str) -> str:
    """Run Tesseract OCR (Arabic + English) on the image at ``img``.

    Args:
        img: Path of the image file.

    Returns:
        The recognised text with surrounding whitespace stripped, or ``""``
        when the OCR libraries are unavailable, the image cannot be opened,
        or Tesseract fails.
    """
    try:
        # Imported lazily so a missing OCR stack degrades to "no text".
        import pytesseract
        from PIL import Image

        # The context manager releases the image file handle after OCR.
        with Image.open(img) as frame:
            return pytesseract.image_to_string(frame, lang='ara+eng').strip()
    except Exception:
        # Best-effort boundary: any OCR failure means "no text", while
        # KeyboardInterrupt/SystemExit still propagate.
        return ""

def analyze_vision(img: str, ctx: str = "") -> str:
    """Ask the LLaVA model (via Ollama) to describe one video frame.

    Args:
        img: Path of the frame image; sent base64-encoded.
        ctx: Optional context text (the transcript in this notebook); only
            its first 500 characters are placed in the prompt.

    Returns:
        The model's ``response`` text, or ``""`` when the request fails,
        times out (60 s), returns a non-200 status, or the reply is not
        the expected JSON object.

    Raises:
        OSError: If ``img`` cannot be opened for reading.
    """
    with open(img, 'rb') as f:
        b64 = base64.b64encode(f.read()).decode('utf-8')
    prompt = f"""Analyze this Gaza journalist report frame.\nContext: {ctx[:500] if ctx else 'No audio'}\n\nDescribe:\n- People (children, women, injured, medical staff, soldiers)\n- Setting (hospital, tent, destroyed building)\n- Situation (what's happening)\n- Evidence (visual indicators)"""
    payload = {"model": LLAVA_MODEL, "prompt": prompt, "images": [b64], "stream": False}
    try:
        r = requests.post(OLLAMA_URL, json=payload, timeout=60)
        if r.status_code != 200:
            return ""
        body = r.json()
    except (requests.RequestException, ValueError):
        # Network errors/timeouts, or a body that is not valid JSON.
        return ""
    if not isinstance(body, dict):
        return ""
    return body.get("response", "")

def classify(transcript: str, ocr: str, vision: List[str]) -> Dict:
    """Classify one video from its transcript, OCR text and frame descriptions.

    Sends a single prompt to the text model via Ollama (JSON output mode,
    temperature 0.1) and parses the reply.

    Args:
        transcript: Audio transcript, or ``""`` if none.
        ocr: Text recognised in the frames, or ``""`` if none.
        vision: Per-frame descriptions from the vision model.

    Returns:
        A dict that always contains ``category``, ``tags``, ``confidence``
        and ``reasoning``. When the request fails, times out (180 s), or
        the reply is not a JSON object, the fallback
        ``{"category": "Unknown", "tags": [], "confidence": "low",
        "reasoning": "Error"}`` is returned. Keys the model omitted are
        filled with neutral defaults so callers can index them safely.
    """
    fallback = {"category": "Unknown", "tags": [], "confidence": "low", "reasoning": "Error"}

    sys_prompt = f"""Analyze Gaza journalist reports using audio, text, and vision.\n\n**CATEGORIES**: Deaths→Willful Killing, Food→Starvation, Buildings→Destruction, Tents→Displacement\n\n{json.dumps(CATEGORIES, indent=2)}\n**TAGS**: {json.dumps(TAGS, indent=2)}\n\nRespond JSON: {{"category": "name", "tags": ["tag1"], "confidence": "high|medium|low", "reasoning": "..."}}"""
    content = f"""{'='*80}\nMULTIMODAL ANALYSIS\n{'='*80}\n\n1. AUDIO:\n{transcript or '[No audio]'}\n\n2. OCR:\n{ocr or '[No text]'}\n\n3. VISION:\n{chr(10).join(f'Frame {i+1}: {v}' for i, v in enumerate(vision)) if vision else '[No vision]'}\n\n{'='*80}\nClassify:"""
    payload = {
        "model": LLM_MODEL,
        "prompt": f"{sys_prompt}\n\n{content}",
        "stream": False,
        "format": "json",
        "options": {"temperature": 0.1},
    }

    try:
        r = requests.post(OLLAMA_URL, json=payload, timeout=180)
        if r.status_code != 200:
            return fallback
        txt = r.json().get("response", "").strip()
        # Strip a Markdown code fence in case the model wrapped its JSON.
        if txt.startswith("```json"):
            txt = txt[7:]
        elif txt.startswith("```"):
            txt = txt[3:]
        if txt.endswith("```"):
            txt = txt[:-3]
        parsed = json.loads(txt.strip())
    except (requests.RequestException, ValueError, AttributeError):
        # Network failure, invalid JSON (outer or inner), or a reply whose
        # shape is not the expected object-with-string-response.
        return fallback

    if not isinstance(parsed, dict):
        return fallback
    # Guarantee the keys downstream code reads, without touching values
    # the model did supply.
    parsed.setdefault("category", "Unknown")
    parsed.setdefault("tags", [])
    parsed.setdefault("confidence", "low")
    parsed.setdefault("reasoning", "")
    return parsed

def process_video(video: str, lang: str = "ar") -> Dict:
    """Run the full audio + OCR + vision pipeline on one video file.

    Steps: extract and transcribe the audio, grab 5 frames, OCR every
    frame, describe the first 3 frames with the vision model (giving it
    the first 500 transcript characters as context), then classify.

    Args:
        video: Path of the downloaded video file.
        lang: Language code forwarded to ``transcribe_audio``.

    Returns:
        The classification dict from ``classify`` plus
        ``transcript_length``, ``ocr_length`` and ``vision_frames``.
    """
    with tempfile.TemporaryDirectory() as tmp:
        audio = Path(tmp) / "audio.wav"
        transcript = transcribe_audio(str(audio), lang) if extract_audio(video, str(audio)) else ""
        frames = extract_frames(video, 5)
        try:
            ocr = "\n".join(extract_text(f) for f in frames)
            vision = [analyze_vision(f, transcript[:500]) for f in frames[:3]]
        finally:
            # extract_frames writes into its own temporary directory and
            # never removes it; delete the frame files (and then their
            # directory, if that leaves it empty) so repeated runs do not
            # accumulate JPEGs in temp storage.
            for frame in frames:
                try:
                    os.remove(frame)
                except OSError:
                    pass
            for folder in {os.path.dirname(frame) for frame in frames}:
                try:
                    os.rmdir(folder)
                except OSError:
                    pass
        result = classify(transcript, ocr, vision)
        return {"transcript_length": len(transcript), "ocr_length": len(ocr), "vision_frames": len(vision), **result}

def norm_cat(c):
    """Normalise a human-assigned category cell.

    The spelling "Wilful Killing" is mapped to "Willful Killing", missing
    values (as judged by ``pd.isna``) become "Unknown", and anything else
    is returned as a whitespace-stripped string.
    """
    label = str(c).strip()
    if label == "Wilful Killing":
        return "Willful Killing"
    if pd.isna(c):
        return "Unknown"
    return label
def norm_tags(t):
    """Split a tags cell on commas/semicolons into a list of trimmed tags.

    Missing values (as judged by ``pd.isna``) give an empty list; empty
    pieces left over after trimming are dropped.
    """
    if pd.isna(t):
        return []
    tags = []
    for piece in re.split(r'[,;]', str(t)):
        cleaned = piece.strip()
        if cleaned:
            tags.append(cleaned)
    return tags

# Confirmation shown once the definitions above have been executed.
print("✓ Functions loaded!")

## Cell 3: Upload Excel & Run Validation (Run Multiple Times)

In [None]:
from google.colab import files

# Configuration
SAMPLE_SIZE = 30  # Change this number
OUTPUT_DIR = "validation_output"

from urllib.parse import urlparse


def detect_source(url) -> str:
    """Return the platform name for ``url`` ('other' when unrecognised).

    Instagram, Twitter, YouTube and Facebook are recognised by their name
    appearing anywhere in the lower-cased URL. The short ``x.com`` domain is
    matched against the parsed host name only, because a plain substring
    test also hits unrelated hosts such as dropbox.com or netflix.com.
    """
    text = str(url).lower()
    if 'instagram' in text:
        return 'instagram'
    # Parse with a '//' prefix when the scheme is missing so urlparse still
    # identifies the host of inputs like "x.com/user/status/1".
    candidate = text.strip()
    if '://' not in candidate:
        candidate = '//' + candidate
    try:
        host = urlparse(candidate).hostname or ''
    except ValueError:
        host = ''
    if 'twitter' in text or host == 'x.com' or host.endswith('.x.com'):
        return 'twitter'
    if 'youtube' in text:
        return 'youtube'
    if 'facebook' in text:
        return 'facebook'
    return 'other'


# Upload Excel
print("Upload your Excel file:\n")
uploaded = files.upload()
excel = list(uploaded.keys())[0]
print(f"\n✓ Uploaded: {excel}\n")

# Read and filter: only rows whose link points at a recognised platform
# are candidates for download.
df = pd.read_excel(excel)
print(f"Total entries: {len(df)}")
df['source'] = df['Source Link/URL'].apply(detect_source)
proc = df[df['source'] != 'other']
print(f"Processable: {len(proc)}\n")

# Sample (fixed seed so repeated runs on the same sheet pick the same rows)
sample = proc.sample(n=min(SAMPLE_SIZE, len(proc)), random_state=42)

# Setup output
out = Path(OUTPUT_DIR)
out.mkdir(exist_ok=True)
vid_dir = out / "videos"
vid_dir.mkdir(exist_ok=True)

results = []
print(f"Processing {len(sample)} videos...\n" + "="*80)

# Process each video
for idx, (i, row) in enumerate(sample.iterrows(), 1):
    url = row['Source Link/URL']
    human_cat = norm_cat(row['Category'])
    human_tags = norm_tags(row['Tags (optional)'])
    src = row['source']

    print(f"\n[{idx}/{len(sample)}] {src.upper()}")
    print(f"  Human: {human_cat}")

    # Name the file after the DataFrame index so each row maps to one video.
    vpath = vid_dir / f"video_{i}.mp4"
    if download_video(url, str(vpath)):
        print(f"  [+] Downloaded")
        try:
            start = time.time()
            cls = process_video(str(vpath))
            elapsed = time.time() - start
            auto_cat = cls['category']
            match = auto_cat == human_cat
            print(f"  [+] Auto: {auto_cat} ({cls['confidence']})")
            print(f"  {'✓ MATCH' if match else '✗ MISMATCH'} | {elapsed:.1f}s")
            # Spread the model output first so the bookkeeping fields below
            # always win if the model happens to emit a key such as "url",
            # "time" or "error".
            results.append({**cls,
                          "url": url, "source": src, "human_category": human_cat, "human_tags": human_tags,
                          "automated_category": auto_cat, "automated_tags": cls['tags'], "category_match": match,
                          "confidence": cls['confidence'], "time": elapsed})
        except Exception as e:
            print(f"  [ERROR] {str(e)}")
            results.append({"url": url, "source": src, "error": str(e), "human_category": human_cat})
    else:
        print(f"  [ERROR] Download failed")
        results.append({"url": url, "source": src, "error": "Download failed", "human_category": human_cat})

# Report: accuracy and confidence are computed over videos that were
# processed without error.
print("\n" + "="*80 + "\nVALIDATION REPORT\n" + "="*80)
success = [r for r in results if "error" not in r]
matches = sum(1 for r in success if r.get("category_match", False))
print(f"\nProcessed: {len(results)} | Successful: {len(success)} | Failed: {len(results)-len(success)}")
if success:
    acc = 100 * matches / len(success)
    print(f"\nAccuracy: {matches}/{len(success)} ({acc:.1f}%)")
    avg_time = sum(r.get("time", 0) for r in success) / len(success)
    print(f"Avg time: {avg_time:.1f}s/video")
    print(f"\nConfidence:")
    for c in ["high", "medium", "low"]:
        cnt = sum(1 for r in success if r.get('confidence') == c)
        print(f"  {c.capitalize()}: {cnt} ({100*cnt/len(success):.1f}%)")

# Save
with open(out / "results.json", 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print(f"\n✓ Results: {out}/results.json")