# In [2]:
import pandas as pd
import re
import csv
from typing import List

# Step 1: Cleaning the Data
def clean_data(raw_text: str) -> str:
    """Clean raw text by removing URLs and redundant details.

    The cleaning steps, in order:

    1. Drop parenthesised ``http``/``https`` URLs such as ``(https://x.org)``.
    2. Remove every occurrence of ``"The "`` and rewrite
       ``" is responsible for "`` as ``" oversees "``.
    3. Abbreviate ``"United Nations"`` to ``"UN"`` and capitalise
       ``"member states"`` as ``"Member States"``.
    4. Collapse runs of whitespace to a single space and strip the ends.

    Args:
        raw_text: The text to clean. Missing values (anything for which
            ``pd.isna`` is true, e.g. ``None`` or ``NaN``) are accepted.

    Returns:
        The cleaned text, or ``""`` when ``raw_text`` is missing.
    """
    if pd.isna(raw_text):
        return ""

    # Seed the working copy from the input; non-string cell values (e.g.
    # numbers read from a DataFrame) are stringified so the regex and
    # replace calls below do not fail on them.
    text = raw_text if isinstance(raw_text, str) else str(raw_text)

    text = re.sub(r'\(https?://[^\s)]+\)', '', text)
    text = text.replace("The ", "").replace(" is responsible for ", " oversees ")
    text = text.replace("United Nations", "UN").replace("member states", "Member States")
    # URL removal can leave doubled spaces behind; normalise them last.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Step 2: Summarizing the Data
def summarize_data(cleaned_text: str) -> str:
    """Summarize cleaned text into a single templated sentence.

    The summary names the first entity found in the text (the phrase
    ``"Member States"`` or any run of two or more capital letters, such as
    ``UN`` or ``UNESCO``) together with a focus word chosen from the text:
    ``"sustainability"`` if it mentions "sustainable", otherwise
    ``"cooperation"`` if it mentions "cooperation", otherwise
    ``"development"``.

    Args:
        cleaned_text: Text to summarize; may be empty.

    Returns:
        ``"No data provided."`` for empty input, a generic sentence when no
        entity is found, or ``"<entity> manages <focus> within its
        jurisdictional scope."`` otherwise.
    """
    if not cleaned_text:
        return "No data provided."

    # Regex alternation is ordered, so listing "UN" ahead of longer acronyms
    # would truncate "UNESCO"/"UNICEF" to "UN". The greedy uppercase run
    # already covers UN, UNESCO, WMO and FAO, and matches each acronym whole.
    entities = re.findall(r'Member States|[A-Z]{2,}', cleaned_text)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_entities = list(dict.fromkeys(entities))

    lowered = cleaned_text.lower()
    if "sustainable" in lowered:
        focus = "sustainability"
    elif "cooperation" in lowered:
        focus = "cooperation"
    else:
        focus = "development"

    if unique_entities:
        return f"{unique_entities[0]} manages {focus} within its jurisdictional scope."
    return "The organization manages development within its scope."

# Step 3: Creating Themes
def assign_themes(cleaned_text: str, summary: str) -> List[str]:
    """Assign themes to a text by case-insensitive keyword matching.

    Each of the seven candidate themes has a short keyword list; a theme is
    assigned when at least one of its keywords occurs as a substring of the
    lower-cased ``cleaned_text``. If nothing matches, the single theme
    ``"Regulatory and Legal Oversight"`` is used as the fallback.

    Args:
        cleaned_text: The text whose content determines the themes.
        summary: Accepted for interface compatibility; it is not consulted
            when choosing themes.

    Returns:
        The assigned theme names in alphabetical order (never empty).
    """
    theme_keywords = {
        "Scientific and Technical Domains": (
            "science", "research", "technology", "data", "innovation",
        ),
        "Human Welfare and Rights": (
            "health", "food", "nutrition", "rights", "welfare",
        ),
        "Regulatory and Legal Oversight": (
            "regulation", "legal", "standards", "policy", "governance",
        ),
        "Economic Development and Trade": (
            "trade", "economic", "industry", "investment", "growth",
        ),
        "Environmental Protection and Sustainability": (
            "environment", "sustainability", "biodiversity", "conservation", "climate",
        ),
        "Social Equity and Inclusion": (
            "equality", "gender", "social", "inclusion", "justice",
        ),
        "Global and Emerging Challenges": (
            "disaster", "climate change", "global", "emerging", "resilience",
        ),
    }

    lowered = cleaned_text.lower()
    themes = {
        theme
        for theme, keywords in theme_keywords.items()
        if any(keyword in lowered for keyword in keywords)
    }

    # Guarantee at least one theme so downstream output is never blank.
    if not themes:
        themes.add("Regulatory and Legal Oversight")

    return sorted(themes)

# Step 4: Constructing the Table
def process_column(df: pd.DataFrame, column_name: str, output_file: str = "subject_matter_jurisdiction_processed.csv"):
    """Run the clean / summarize / theme pipeline over one column and save a CSV.

    For every row of ``df`` the value in ``column_name`` is cleaned,
    summarized and tagged with themes; the results are written to
    ``output_file`` next to the row's ``"Institution"`` value.

    Args:
        df: Input frame; must have an ``"Institution"`` column and the
            column named by ``column_name``.
        column_name: Name of the text column to process.
        output_file: Path of the CSV file to write (UTF-8).

    Raises:
        ValueError: If either required column is missing from ``df``.
    """
    required_columns = ("Institution", column_name)
    if not all(col in df.columns for col in required_columns):
        raise ValueError("DataFrame must contain 'Institution' and the specified column.")

    cleaned_key = f"Cleaned {column_name}"
    summary_key = f"Summarized {column_name}"
    header = ["Institution", cleaned_key, summary_key, "Themes"]

    # Build all output records before touching the file system, so a failure
    # during processing never leaves a partial file behind.
    records = []
    for _, record in df.iterrows():
        cleaned_value = clean_data(record[column_name])
        summary_value = summarize_data(cleaned_value)
        theme_list = assign_themes(cleaned_value, summary_value)
        records.append({
            "Institution": record["Institution"],
            cleaned_key: cleaned_value,
            summary_key: summary_value,
            "Themes": ";".join(theme_list),
        })

    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        out = csv.DictWriter(handle, fieldnames=header)
        out.writeheader()
        out.writerows(records)

    print(f"Processing complete. Output saved to {output_file}")

# process_column(df, "Subject Matter Jurisdiction")