In [None]:
import pandas as pd
import ollama
from tqdm import tqdm
import os

# --- CONFIGURATION ---
# NOTE(review): FILE_PATH is an absolute path on one machine; adjust it before
# running this anywhere else.
FILE_PATH = '/Users/ayushipatel/Desktop/media-concensus-dashboard/data/raw/final_labels_MBIC.csv'
OUTPUT_PATH = 'audit_results_phase1.csv'
MODEL_NAME = "llama3:8b"
SAMPLE_SIZE = 20  # Start small to test

# 1. LOAD DATASET
# Only the read itself is guarded, so problems in the column handling below
# surface with their own message instead of being reported as a CSV error.
try:
    # sep=None lets the python engine sniff the delimiter; on_bad_lines='warn'
    # skips malformed rows with a warning instead of aborting the load.
    df = pd.read_csv(FILE_PATH, sep=None, engine='python', on_bad_lines='warn').head(SAMPLE_SIZE)
except Exception as e:
    print(f"❌ Error loading CSV: {e}")
    # Raise rather than call exit(): exit() is a site/IPython helper that is
    # not guaranteed to stop execution right here (e.g. inside a notebook
    # cell), which would let the audit run against a missing or stale `df`.
    raise SystemExit(1) from e

# Clean the header names: drop a byte-order mark if one got attached to the
# first header (str.strip() does not remove it), then strip surrounding
# whitespace (e.g. ' text ' -> 'text').
df.columns = df.columns.str.replace('\ufeff', '', regex=False).str.strip()

print(f"✅ Columns found (cleaned): {df.columns.tolist()}")

# The audit reads from a column named 'text'; fall back to 'sentence' when
# this CSV version uses that name instead.
if 'text' not in df.columns:
    print(f"❌ 'text' column not found. Available: {df.columns.tolist()}")
    if 'sentence' in df.columns:
        df = df.rename(columns={'sentence': 'text'})
        print("🔄 Renamed 'sentence' to 'text'")
    else:
        # Stop here with a clear reason instead of failing later with a bare
        # KeyError: 'text' when the audit tries to read the column.
        raise SystemExit(
            "No 'text' or 'sentence' column in the dataset; "
            f"available columns: {df.columns.tolist()}"
        )

# 2. ROBUST AUDIT FUNCTION
def _normalize_label(raw):
    """Map a raw model reply onto 'Left', 'Right' or 'Center' when it matches one."""
    # Same clean-up the reply always received: trim whitespace, drop periods.
    cleaned = raw.strip().replace('.', '')
    # Models often echo the quotes from the prompt or add markup around the
    # label, so compare without surrounding quotes/punctuation and ignore case.
    candidate = cleaned.strip(" \t\r\n'\"`*:;,!").casefold()
    for label in ('Left', 'Right', 'Center'):
        if candidate == label.casefold():
            return label
    # Not one of the expected categories: keep the cleaned reply so unexpected
    # answers stay visible in the results instead of being hidden.
    return cleaned


def audit_bias(text):
    """Ask the configured Ollama model to classify one sentence's media bias.

    Args:
        text: Sentence to classify. Values that are not non-blank strings
            (for example a NaN from an empty CSV cell) are not sent to the
            model.

    Returns:
        str: "Empty Text" for missing or blank input; 'Left', 'Right' or
        'Center' when the reply matches one of those categories (ignoring
        case and surrounding quotes/punctuation); otherwise the cleaned reply
        as returned by the model; or "Error: <message>" if the call failed.
    """
    if not isinstance(text, str) or not text.strip():
        return "Empty Text"

    system_prompt = (
        "You are a media bias auditor. Classify the sentence into: 'Left', 'Right', or 'Center'. "
        "Respond ONLY with the category name. No explanations."
    )

    try:
        response = ollama.chat(
            model=MODEL_NAME,
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': f"Sentence: {text}"},
            ],
            options={'temperature': 0},  # Set to 0 for consistent, scientific results
        )
        return _normalize_label(response['message']['content'])
    except Exception as e:
        # Deliberate best-effort: one failed request is recorded in the result
        # column rather than aborting the whole batch.
        return f"Error: {e}"

# 3. RUN AUDIT WITH PROGRESS BAR
# Fail before any model calls, with the available columns in the message,
# if the column the audit reads from is missing.
if 'text' not in df.columns:
    raise KeyError(f"'text' column not found; available columns: {df.columns.tolist()}")

print(f"🚀 Starting Llama 3 Audit using {MODEL_NAME}...")
tqdm.pandas()  # enables Series.progress_apply used below

# We use the 'text' column from your MBIC data
df['llama3_judgment'] = df['text'].progress_apply(audit_bias)

# 4. SAVE & PREVIEW
df.to_csv(OUTPUT_PATH, index=False)
print(f"\n✅ Audit complete! Results saved to: {os.path.abspath(OUTPUT_PATH)}")

# 5. QUICK COMPARISON (Llama 3 vs. Expert)
# MBIC uses 'label_bias' for Biased/Non-Biased.
# We'll see how Llama 3's 'Left/Right/Center' compares to the source outlet.
print("\n--- Sample Results ---")
# Show only the comparison columns that exist, so a CSV version without
# 'outlet' or 'label_bias' does not raise after the results were saved.
wanted_columns = ('outlet', 'label_bias', 'llama3_judgment')
preview_columns = [name for name in wanted_columns if name in df.columns]
skipped_columns = [name for name in wanted_columns if name not in df.columns]
if skipped_columns:
    print(f"(columns not in dataset, skipped: {skipped_columns})")
print(df[preview_columns].head(10))

✅ Successfully loaded 20 rows.
🚀 Starting Llama 3 Audit using llama3:8b...


KeyError: 'text'