# Political Corruption Article Sampling, Translation, and LLM Annotation Support

This notebook performs a multi-step pipeline to support human annotation of news articles for political corruption. It:

1. Loads and balances samples of news articles across selected countries.
2. Translates non-English articles into English using the Gemma3 or NLLB model, with chunking and retry logic for robustness.
3. Uses an LLM to identify whether political corruption is a central theme, highlighting relevant sentences, providing a rationale, tentative label, and confidence score.
4. Highlights keywords and model-identified sentences to assist annotators.
5. Outputs a CSV with translations, model suggestions, and fields for human labeling.

The final output is designed to streamline annotation workflows and improve label consistency and quality.


In [66]:
import os
import pandas as pd
import random
import requests
import difflib
from tqdm import tqdm
from transformers import pipeline

# ========== Config ==========
# Folder expected to hold one "<country>_news.csv" file per country
# (this is the path pattern load_and_prepare_data builds).
NEWS_FOLDER = "/home/akroon/data/volume_2/RESPOND_NEWS"
# Country names; they double as the CSV file-name prefixes.
SELECTED_COUNTRIES = ["Bulgaria", "Italy", "Netherlands", "United_Kingdom"]
# NOTE(review): presumably the total_samples value passed to balanced_sample;
# the call site is not visible in this notebook export.
TOTAL_SAMPLES = 1000
# NOTE(review): not referenced by any visible code; presumably the size of the
# translation trial run (the captured progress bar shows 20 items). Confirm.
TRANSLATION_SAMPLE_SIZE = 20

# Two-letter language code of each country's articles. The translation loop
# skips translation when the code is "en".
COUNTRY_TO_LANG = {
    "Bulgaria": "bg",
    "Italy": "it",
    "Netherlands": "nl",
    "United_Kingdom": "en"
}

# Load NLLB fallback pipeline
# NLLB-200 identifies languages with FLORES-200 codes (ISO 639-3 + script), so
# Bulgarian is "bul_Cyrl". The previous value "bul_BUL" is not a known code.
# NOTE(review): this pipeline is fixed to Bulgarian -> English and is not called
# by any code visible in this notebook; Italian ("ita_Latn") and Dutch
# ("nld_Latn") would need their own src_lang if the fallback is wired in.
nllb_translator = pipeline("translation", model="facebook/nllb-200-distilled-600M", src_lang="bul_Cyrl", tgt_lang="eng_Latn")

# ========== Utility Functions ==========

def truncate_text(text, max_chars=512):
    """Return *text* cut to *max_chars* characters, with "..." appended when cut.

    Text that already fits is returned unchanged (no ellipsis).
    """
    if len(text) > max_chars:
        return text[:max_chars] + "..."
    return text

def is_probably_same_language(original, translation):
    """Heuristically flag a "translation" that is still mostly non-English.

    Returns True when more than 30% of the characters in *translation* are
    non-ASCII. *original* is accepted for interface symmetry but is not used.
    An empty translation yields False.
    """
    foreign_count = sum(ord(ch) > 127 for ch in translation)
    total = len(translation) or 1  # avoid dividing by zero on empty output
    return foreign_count / total > 0.3

# ========== Load & Sample ==========

def load_and_prepare_data(news_folder, countries):
    """Load the per-country news CSVs and stack them into one DataFrame.

    For every name in *countries* the file ``<news_folder>/<country>_news.csv``
    is read. Files that cannot be read are reported and skipped. Each loaded
    frame gets:

    * ``date`` parsed to datetime (unparseable values become NaT),
    * ``year`` as an int, with 0 standing in for a missing date,
    * ``combined_text`` built from ``title`` + " " + ``body`` when the column
      is not already present,
    * ``country`` set to the country name.

    Each readable file must contain a ``date`` column, plus ``title`` and
    ``body`` columns when ``combined_text`` is absent (a KeyError is raised
    otherwise).

    Returns:
        The concatenated DataFrame with a fresh index. When no file could be
        loaded, an empty DataFrame with the derived columns is returned
        instead of letting ``pd.concat`` fail on an empty list.
    """
    all_dfs = []
    for country in countries:
        file_path = os.path.join(news_folder, f"{country}_news.csv")
        print(f"Loading {file_path} ...")
        try:
            df = pd.read_csv(file_path, low_memory=False)
        except Exception as e:
            # Best effort: a missing or broken file must not abort the other countries.
            print(f"Error reading {file_path}: {e}")
            continue

        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        # Year 0 marks rows whose date could not be parsed.
        df['year'] = df['date'].dt.year.fillna(0).astype(int)

        if 'combined_text' not in df.columns:
            df['title'] = df['title'].fillna("").astype(str)
            df['body'] = df['body'].fillna("").astype(str)
            df['combined_text'] = df['title'] + " " + df['body']

        df['country'] = country
        all_dfs.append(df)

    if not all_dfs:
        # pd.concat([]) raises ValueError("No objects to concatenate").
        print("No news files could be loaded; returning an empty DataFrame.")
        return pd.DataFrame(columns=['date', 'year', 'combined_text', 'country'])

    return pd.concat(all_dfs, ignore_index=True)

def balanced_sample(df, total_samples=1000, countries=None):
    """Draw a sample split evenly over countries and proportionally over years.

    Each country gets ``total_samples // len(countries)`` slots. Within a
    country the slots are distributed over the non-zero years in proportion to
    each year's share of that country's articles; the rounding remainder is
    applied to the year with the largest allocation. Rows with ``year == 0``
    (unparsed date) are only sampled when a country has no valid year at all.

    Args:
        df: DataFrame with at least ``country`` and ``year`` columns.
        total_samples: Target number of rows over all countries.
        countries: Countries to sample; defaults to every country in *df*.

    Returns:
        The sampled rows with a fresh index. The result can be smaller than
        *total_samples* when a country or year has too few rows, and is empty
        when there are no countries to sample from.
    """
    if countries is None:
        countries = df['country'].unique()

    if len(countries) == 0:
        # Nothing to split the budget over; avoid dividing by zero below.
        print("Sampled total 0 articles across 0 countries.")
        return df.iloc[0:0].reset_index(drop=True)

    samples_per_country = total_samples // len(countries)
    sampled_dfs = []

    for country in countries:
        df_country = df[df['country'] == country]
        years = df_country['year'].unique()
        years = years[years != 0]

        if len(years) == 0:
            # No usable dates: fall back to a plain random sample of the country.
            sample = df_country.sample(n=min(samples_per_country, len(df_country)), random_state=42)
            sampled_dfs.append(sample)
            continue

        total_country_articles = len(df_country)
        year_sizes = {year: int((df_country['year'] == year).sum()) for year in years}
        samples_for_year = {
            year: int(round(samples_per_country * (size / total_country_articles)))
            for year, size in year_sizes.items()
        }

        # Push the rounding remainder (positive or negative) onto the largest year.
        diff = samples_per_country - sum(samples_for_year.values())
        if diff != 0:
            biggest_year = max(samples_for_year, key=samples_for_year.get)
            samples_for_year[biggest_year] += diff

        samples = []
        for year, n_samples in samples_for_year.items():
            df_year = df_country[df_country['year'] == year]
            # Clamp to [0, available]: a large negative remainder can drive the
            # quota below zero, which DataFrame.sample rejects.
            n_samples = max(0, min(n_samples, len(df_year)))
            samples.append(df_year.sample(n=n_samples, random_state=42))

        sampled_country = pd.concat(samples)
        sampled_dfs.append(sampled_country)

    final_sample = pd.concat(sampled_dfs).reset_index(drop=True)
    print(f"Sampled total {len(final_sample)} articles across {len(countries)} countries.")
    return final_sample

# ========== Translation Logic ==========
import requests
from tqdm import tqdm

# Upper bound, in characters, on each chunk sent to the translation model.
MAX_CHUNK_SIZE = 1500  # max chars per chunk
# A translated chunk shorter than this fraction of its source is treated as
# truncated or summarized and is retried.
MIN_TRANSLATION_RATIO = 0.7  # minimum ratio of output length to input length to accept translation

def truncate_text(text, max_chars=MAX_CHUNK_SIZE):
    """Return *text* cut to *max_chars* characters, with "..." appended when cut.

    This redefinition replaces the earlier ``truncate_text`` and defaults to
    the chunk size limit. Text that already fits is returned unchanged.
    """
    if len(text) > max_chars:
        return text[:max_chars] + "..."
    return text

def is_probably_same_language(original, translation, threshold=0.3):
    """Heuristically flag a "translation" that is still mostly non-English.

    Returns True when the share of non-ASCII characters in *translation* is
    strictly greater than *threshold*. *original* is not used. An empty
    translation yields a share of 0.

    NOTE(review): this only detects leftovers in non-Latin scripts; untranslated
    Italian or Dutch text is mostly ASCII and would not be flagged.
    """
    # heuristic: fraction of non-ascii chars in translation
    foreign_count = sum(ord(ch) > 127 for ch in translation)
    total = len(translation) or 1  # avoid dividing by zero on empty output
    return foreign_count / total > threshold

def split_text_into_chunks(text, max_chunk_size=MAX_CHUNK_SIZE):
    """Split *text* into chunks of at most *max_chunk_size* characters.

    Paragraphs (separated by a blank line) are packed greedily into chunks.
    A paragraph longer than the limit is cut into fixed-size slices. Every
    chunk is stripped of surrounding whitespace; chunks that are empty after
    stripping are dropped, so empty or whitespace-only input yields ``[]``.

    Raises:
        ValueError: if *max_chunk_size* is smaller than 1 (the slicing loop
            could never make progress).
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be at least 1")

    chunks = []

    def _emit(piece):
        # Only keep chunks that still carry text after stripping.
        piece = piece.strip()
        if piece:
            chunks.append(piece)

    current_chunk = ""
    for para in text.split("\n\n"):
        # +2 accounts for the paragraph separator re-added below.
        if len(current_chunk) + len(para) + 2 <= max_chunk_size:
            current_chunk += para + "\n\n"
            continue

        _emit(current_chunk)
        # forcibly split long paragraph if needed
        while len(para) > max_chunk_size:
            _emit(para[:max_chunk_size])
            para = para[max_chunk_size:]
        current_chunk = para + "\n\n"

    _emit(current_chunk)
    return chunks

def translate_chunk_with_gemma3(chunk_text, source_lang):
    """Translate one chunk into English via the local Gemma3 translator model.

    Sends a chat request to the HTTP endpoint on localhost:11434 and returns
    the model's reply with a leading "Here's the translation" preamble
    removed.

    Args:
        chunk_text: Text to translate.
        source_lang: Source language label inserted verbatim into the system
            prompt; "en" disables the untranslated-output check.

    Returns:
        The translated text, or "" when the request fails or the reply is
        judged untranslated by ``is_probably_same_language``.
    """
    system_prompt = (
        f"You are a translation assistant. Translate the following {source_lang} text into English. "
        "**Translate the entire text fully and exactly, do NOT shorten or summarize.** "
        "Do NOT explain or paraphrase. Output ONLY the translated text."
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": chunk_text}
    ]

    try:
        response = requests.post(
            "http://localhost:11434/api/chat",
            json={
                "model": "zongwei/gemma3-translator:4b",
                "messages": messages,
                "stream": False
            },
            timeout=30
        )
        response.raise_for_status()
        result = response.json()
        raw_translation = result.get("message", {}).get("content", "").strip()

        # Optional cleanup: drop a chatty first paragraph. Models emit both the
        # typographic and the ASCII apostrophe, so accept either spelling.
        preambles = ("here’s the translation", "here's the translation")
        if raw_translation.lower().startswith(preambles):
            parts = raw_translation.split("\n\n", 1)
            if len(parts) == 2:
                raw_translation = parts[1].strip()

        if source_lang != "en" and is_probably_same_language(chunk_text, raw_translation):
            print("⚠️ Suspected untranslated chunk output — marking empty")
            return ""

        return raw_translation

    except Exception as e:
        # Deliberate best effort: the caller retries and then marks the chunk as failed.
        print(f"❌ Translation error (Gemma3): {e}")
        return ""

def translation_is_too_short(original, translated, threshold=MIN_TRANSLATION_RATIO):
    """Return True when *translated* is shorter than *threshold* x len(*original*).

    Lengths are compared in characters; the comparison is strict.
    """
    minimum_length = threshold * len(original)
    return len(translated) < minimum_length

def translate_article_with_chunking(text, lang):
    """Translate an article chunk by chunk and join the results.

    Each chunk from ``split_text_into_chunks`` is sent to
    ``translate_chunk_with_gemma3``. An empty or too-short result triggers one
    retry; a second failure replaces the chunk with "[Translation Failed]".

    NOTE(review): the retry passes the chunk through ``truncate_text`` with the
    same MAX_CHUNK_SIZE the splitter used, so the retried input appears to be
    identical to the first attempt. Confirm whether a smaller limit was intended.

    Returns:
        The translated chunks joined by blank lines.
    """
    chunks = split_text_into_chunks(text, max_chunk_size=MAX_CHUNK_SIZE)
    translated_chunks = []

    for i, chunk in enumerate(chunks):
        print(f"Translating chunk {i+1}/{len(chunks)} (chars: {len(chunk)})")
        attempt = translate_chunk_with_gemma3(chunk, source_lang=lang)

        first_try_ok = bool(attempt) and not translation_is_too_short(chunk, attempt)
        if not first_try_ok:
            print("🔁 Retry chunk with truncated input...")
            retry_input = truncate_text(chunk, max_chars=MAX_CHUNK_SIZE)
            attempt = translate_chunk_with_gemma3(retry_input, source_lang=lang)

            retry_ok = bool(attempt) and not translation_is_too_short(chunk, attempt)
            if not retry_ok:
                print(f"❌ Failed to translate chunk {i+1} properly.")
                attempt = "[Translation Failed]"

        translated_chunks.append(attempt)

    return "\n\n".join(translated_chunks)

# ========== Main translation loop ==========

# Rows that go to the annotation CSV, and a separate log of problem cases.
output_rows = []
failed_translations = []

# NOTE(review): `sampled_articles` is not defined in the visible cells; the
# loop expects an iterable of (language code, article text) pairs.
for lang, original_text in tqdm(sampled_articles, desc="Translating"):
    # English articles are passed through untouched.
    translated_text = (
        original_text
        if lang == "en"
        else translate_article_with_chunking(original_text, lang)
    )

    # Hard failure: nothing usable came back, so the article is logged and skipped.
    hard_failure = not translated_text or "[Translation Failed]" in translated_text
    if hard_failure:
        failed_translations.append({
            "lang": lang,
            "original_text": original_text,
            "issue": "Failed or empty translation"
        })
        continue

    # Soft failure: output looks untranslated. It is logged for review but
    # still written to the annotation file below.
    if is_probably_same_language(original_text, translated_text):
        print("⚠️ Suspected untranslated output — marking for review")
        failed_translations.append({
            "lang": lang,
            "original_text": original_text,
            "translated_text": translated_text,
            "issue": "Likely not translated"
        })

    output_rows.append({
        "original_text": original_text,
        "translated_text": translated_text,
        "label": ""
    })

# Save translated samples
out_df = pd.DataFrame(output_rows)
out_df.to_csv("sample_for_annotation.csv", index=False)
print("✅ Done! Saved translated samples to 'sample_for_annotation.csv'")

# Save failed or suspicious translations
if len(failed_translations) > 0:
    fail_df = pd.DataFrame(failed_translations)
    fail_df.to_csv("failed_translations_log.csv", index=False)
    print(f"⚠️ Logged {len(failed_translations)} failed/suspect translations to 'failed_translations_log.csv'")

Translating:   0%|          | 0/20 [00:00<?, ?it/s]

Translating chunk 1/2 (chars: 1479)


model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

Translating chunk 2/2 (chars: 499)


Translating:   5%|▌         | 1/20 [00:04<01:23,  4.41s/it]

Translating chunk 1/3 (chars: 1322)
Translating chunk 2/3 (chars: 1445)
Translating chunk 3/3 (chars: 1463)


Translating:  10%|█         | 2/20 [00:14<02:19,  7.77s/it]

Translating chunk 1/3 (chars: 1392)
Translating chunk 2/3 (chars: 1373)
Translating chunk 3/3 (chars: 258)


Translating:  15%|█▌        | 3/20 [00:22<02:16,  8.00s/it]

Translating chunk 1/1 (chars: 889)


Translating:  25%|██▌       | 5/20 [00:24<00:59,  3.99s/it]

Translating chunk 1/1 (chars: 1479)


Translating:  30%|███       | 6/20 [00:28<00:55,  3.98s/it]

Translating chunk 1/2 (chars: 1500)
Translating chunk 2/2 (chars: 692)


Translating:  35%|███▌      | 7/20 [00:35<01:02,  4.81s/it]

Translating chunk 1/4 (chars: 1482)
Translating chunk 2/4 (chars: 1279)
Translating chunk 3/4 (chars: 1445)
Translating chunk 4/4 (chars: 910)


Translating:  40%|████      | 8/20 [00:46<01:21,  6.80s/it]

Translating chunk 1/1 (chars: 1348)


Translating:  50%|█████     | 10/20 [00:50<00:45,  4.56s/it]

Translating chunk 1/5 (chars: 1283)
Translating chunk 2/5 (chars: 1397)
Translating chunk 3/5 (chars: 1202)
Translating chunk 4/5 (chars: 1244)
Translating chunk 5/5 (chars: 518)


Translating:  55%|█████▌    | 11/20 [01:05<01:03,  7.09s/it]

Translating chunk 1/2 (chars: 1364)
Translating chunk 2/2 (chars: 358)


Translating:  70%|███████   | 14/20 [01:09<00:24,  4.04s/it]

Translating chunk 1/1 (chars: 1251)


Translating:  75%|███████▌  | 15/20 [01:12<00:19,  3.88s/it]

Translating chunk 1/3 (chars: 1432)
Translating chunk 2/3 (chars: 1359)
Translating chunk 3/3 (chars: 887)


Translating:  80%|████████  | 16/20 [01:19<00:18,  4.65s/it]

Translating chunk 1/1 (chars: 1381)


Translating:  85%|████████▌ | 17/20 [01:23<00:13,  4.38s/it]

Translating chunk 1/6 (chars: 1125)
Translating chunk 2/6 (chars: 1248)
Translating chunk 3/6 (chars: 1468)
Translating chunk 4/6 (chars: 1150)
Translating chunk 5/6 (chars: 1442)
Translating chunk 6/6 (chars: 237)


Translating:  90%|█████████ | 18/20 [01:39<00:14,  7.48s/it]

Translating chunk 1/4 (chars: 1280)
Translating chunk 2/4 (chars: 1434)
Translating chunk 3/4 (chars: 1114)
Translating chunk 4/4 (chars: 564)


Translating:  95%|█████████▌| 19/20 [01:50<00:08,  8.22s/it]

Translating chunk 1/3 (chars: 1353)
Translating chunk 2/3 (chars: 1447)
Translating chunk 3/3 (chars: 1169)


Translating: 100%|██████████| 20/20 [01:59<00:00,  5.96s/it]

✅ Done! Saved translated samples to 'sample_for_annotation.csv'





In [3]:
import pandas as pd
import requests
from tqdm import tqdm
import re
from typing import List  # Add this at the top of your script


# ========= Config ==========
# Input: the CSV written by the translation step (needs a "translated_text" column).
TRANSLATED_FILE = "sample_for_annotation.csv"
# Output: the same rows plus the LLM suggestion columns.
OUTPUT_FILE = "sample_with_llm_suggestions.csv"

# Chat endpoint of the local model server used for classification.
LLM_ENDPOINT = "http://localhost:11434/api/chat"
LLM_MODEL_NAME = "llama3"  # Replace if needed

# ========= Prompt Builder ==========
def build_detailed_prompt(article_text: str) -> str:
    """Return the annotation-support prompt with *article_text* embedded.

    The prompt defines political corruption, lists the task steps, and fixes
    the reply layout that ``classify_article`` parses: a ``Highlights:`` bullet
    list followed by ``Tentative Label:``, ``Reasoning:`` and ``Confidence:``
    lines. Four labels are offered: Yes, Mentioned but not central, No, Unsure.
    """
    return f"""You are helping a human annotator identify whether a news article is primarily about **political corruption**.

### Definition

**Political corruption** refers to situations where public power is misused for personal or political gain, especially in the context of political decision-making.

It involves **public officials** such as:
- Government ministers, members of parliament, or judges
- Mayors, governors, or local council members
- Leaders of regulatory agencies

(NOTE: Do **not** consider cases involving only police chiefs, military commanders, or leaders of state-owned companies.)

Common forms include:
- **Bribery** – accepting money or gifts for influence or decisions
- **Embezzlement** – stealing or misusing public funds
- **Nepotism / Cronyism** – appointing unqualified relatives or friends
- **Fraud, kickbacks, or money laundering** – illicit financial conduct
- **Abuse of authority** – rigging elections, silencing dissent, shielding allies

> These behaviors must involve public officials misusing public trust in political roles.

### Task Instructions

1. Highlight **full sentences** that indicate or describe political corruption — even if indirect or ambiguous.
2. Pay attention to keywords such as: bribery, fraud, abuse of power, nepotism, embezzlement, etc.
3. Use your judgment to decide whether political corruption is the **main focus** of the article.
4. Then, provide:
   - A list of the most relevant sentence highlights
   - A **tentative label**: Yes / Mentioned but not central / No / Unsure
   - A **brief explanation** of your reasoning
   - A **confidence score** from 0–100

### Output Format

Highlights:
- [Sentence 1]
- [Sentence 2]
...

Tentative Label: Yes / Mentioned but not central / No / Unsure  
Reasoning: [Your explanation]  
Confidence: [0–100]

---

Article:
{article_text}

Assistant Output:"""


# ========= LLM Request ==========
# Canonical spelling of every label the prompt offers, keyed by lowercase form.
_TENTATIVE_LABELS = {
    "yes": "Yes",
    "no": "No",
    "unsure": "Unsure",
    "mentioned but not central": "Mentioned but not central",
}


def _parse_llm_answer(answer: str) -> dict:
    """Extract highlights, label, rationale and confidence from the model reply."""
    highlights = []
    tentative_label = "Unclear"  # kept when no recognizable label is found
    rationale_lines = []
    confidence = None
    section = None  # "highlights", "rationale" or None

    for line in answer.splitlines():
        line_strip = line.strip()
        # Section headers are matched without Markdown bold markers, since
        # models often reply with "**Tentative Label:** Yes".
        header = line_strip.replace("*", "").strip().lower()

        if header == "highlights:":
            section = "highlights"
            continue
        if header.startswith("tentative label:"):
            section = None
            value = header.split(":", 1)[1].strip(" \t.\"'[]")
            tentative_label = _TENTATIVE_LABELS.get(value, tentative_label)
            continue
        if header.startswith("reasoning:"):
            section = "rationale"
            first = line_strip.split(":", 1)[1].strip().lstrip("*").strip()
            if first:
                rationale_lines.append(first)
            continue
        if header.startswith("confidence:"):
            section = None
            match = re.search(r"\d{1,3}", line_strip)
            if match:
                confidence = int(match.group(0))
            continue

        # Accumulate content
        if section == "highlights" and line_strip.startswith(("- ", "* ", "• ")):
            item = line_strip[2:].strip()
            # The prompt shows highlights as "[Sentence]"; drop echoed brackets
            # so the sentence can be located in the article text.
            if len(item) >= 2 and item[0] == "[" and item[-1] == "]":
                item = item[1:-1].strip()
            if item:
                highlights.append(item)
        elif section == "rationale" and line_strip:
            rationale_lines.append(line_strip)

    return {
        "tentative_label": tentative_label,
        "rationale": " ".join(rationale_lines).strip(),
        "confidence": confidence,
        "highlights": highlights
    }


def classify_article(article_text: str) -> dict:
    """Ask the LLM whether the article is about political corruption.

    Builds the prompt with ``build_detailed_prompt``, posts it to
    ``LLM_ENDPOINT`` and parses the reply.

    Returns:
        A dict with keys ``tentative_label`` ("Yes", "No", "Unsure",
        "Mentioned but not central", or "Unclear" when no valid label was
        found), ``rationale`` (str), ``confidence`` (int or None) and
        ``highlights`` (list of str). On any request or parsing error the
        label is "Error", the rationale holds the error text, and the other
        fields are None / [].
    """
    prompt = build_detailed_prompt(article_text)

    try:
        response = requests.post(
            LLM_ENDPOINT,
            json={
                "model": LLM_MODEL_NAME,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False
            },
            timeout=30
        )
        response.raise_for_status()
        result = response.json()
        answer = result.get("message", {}).get("content", "").strip()
        return _parse_llm_answer(answer)

    except Exception as e:
        # Best effort: one failed article must not stop the whole batch.
        print(f"❌ Classification error: {e}")
        return {
            "tentative_label": "Error",
            "rationale": str(e),
            "confidence": None,
            "highlights": []
        }


# ========= Highlight Helper ==========
def highlight_translated_text(text: str, highlights: List[str]) -> str:
    """Wrap the first occurrence of each highlight sentence in <highlight> tags.

    Matching is literal and case-insensitive. Blank highlights, and highlights
    that repeat one already applied (ignoring case), are skipped. Sentences
    that do not occur in *text* leave it unchanged.
    """
    applied = set()
    for sentence in highlights:
        escaped = re.escape(sentence.strip())
        key = escaped.lower()
        if escaped == "" or key in applied:
            continue
        text, n_replaced = re.subn(
            escaped,
            r"<highlight>\g<0></highlight>",
            text,
            count=1,
            flags=re.IGNORECASE,
        )
        if n_replaced:
            applied.add(key)
    return text

# Corruption-related keywords that the main workflow passes to
# highlight_keywords as a visual aid for annotators.
KEY_TERMS = [
    "bribery", "embezzlement", "nepotism", "corruption", "fraud",
    "abuse of power", "favoritism", "money laundering", "kickback", "cronyism"
]

def highlight_keywords(text: str, terms: List[str]) -> str:
    """Wrap whole-word, case-insensitive matches of *terms* in <highlight> tags.

    Text that already sits inside a ``<highlight>...</highlight>`` span is left
    untouched, so keywords in an already highlighted sentence do not produce
    nested tags. All terms are matched in one pass with longer terms tried
    first, so a phrase such as "abuse of power" wins over a shorter term it
    contains. Empty terms are ignored; with no usable terms *text* is returned
    unchanged.
    """
    # De-duplicate while keeping a deterministic order, longest first.
    usable = sorted(dict.fromkeys(t for t in terms if t), key=len, reverse=True)
    if not usable:
        return text

    keyword_re = re.compile(
        r"\b(" + "|".join(re.escape(t) for t in usable) + r")\b",
        re.IGNORECASE,
    )
    # The capturing group makes re.split keep the existing highlight spans:
    # even indices hold text outside any span, odd indices hold the spans.
    parts = re.split(r"(<highlight>.*?</highlight>)", text, flags=re.DOTALL)
    parts[::2] = [
        keyword_re.sub(r"<highlight>\1</highlight>", outside)
        for outside in parts[::2]
    ]
    return "".join(parts)


# ========= Main Workflow ==========
if __name__ == "__main__":
    # Step 1: read the translated sample produced by the previous cell.
    print(f"🔍 Loading translated articles from {TRANSLATED_FILE}...")
    df = pd.read_csv(TRANSLATED_FILE)

    # Step 2: classify every article and mark up its text for annotators.
    print("🧠 Generating LLM suggestions for annotation support...")
    results = []
    highlighted_texts = []

    articles = df["translated_text"].astype(str)
    for text in tqdm(articles, desc="Processing articles"):
        result = classify_article(text)
        results.append(result)
        # Model-chosen sentences first, then the fixed keyword list.
        highlighted = highlight_translated_text(text, result["highlights"])
        highlighted = highlight_keywords(highlighted, KEY_TERMS)
        highlighted_texts.append(highlighted)

    # Merge LLM results into DataFrame (the tagged text replaces the plain
    # translation; the column order below is the order written to the CSV).
    df["translated_text"] = highlighted_texts
    for column, key in (
        ("tentative_label", "tentative_label"),
        ("llm_confidence", "confidence"),
        ("llm_rationale", "rationale"),
    ):
        df[column] = [r[key] for r in results]
    df["llm_evidence"] = ["; ".join(r["highlights"]) for r in results]
    df["human_label"] = ""
    df["qualtrics_qid"] = [f"Q{i+1}" for i in range(len(df))]

    # Step 3: persist the enriched table.
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"✅ Saved to: {OUTPUT_FILE}")


🔍 Loading translated articles from sample_for_annotation.csv...
🧠 Generating LLM suggestions for annotation support...


Processing articles: 100%|██████████| 1000/1000 [42:45<00:00,  2.57s/it] 

✅ Saved to: sample_with_llm_suggestions.csv





In [4]:
# Also export the enriched table as an Excel workbook next to the CSV.
# Relies on `df` and `OUTPUT_FILE` left in the notebook namespace by the
# previous cell.
excel_output_file = OUTPUT_FILE.replace(".csv", ".xlsx")
df.to_excel(excel_output_file, index=False)
print(f"✅ Saved to: {excel_output_file}")

✅ Saved to: sample_with_llm_suggestions.xlsx
