In [1]:
# read excell shet to df
import pandas as pd

path = "./Ocean Governance and ocean economy governance matrix_IGOs.xlsx"
data = pd.read_excel(path, sheet_name="Sheet1")

# backup
df = data.copy(deep=True)

In [3]:
import pandas as pd
import re
import csv
from typing import List

# Cleaning Function
def clean_data(raw_text: str) -> str:
    if pd.isna(raw_text):
        return ""
    text = re.sub(r'\(https?://[^\s)]+\)', '', text)
    text = text.replace("The ", "").replace(" is responsible for ", " oversees ").replace("Memer", "Member")
    text = text.replace("United Nations", "UN").replace("member states", "Member States")
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Summarizing Function
def summarize_data(cleaned_text: str, column_name: str) -> str:
    if not cleaned_text:
        return "No data provided."
    entities = re.findall(r'(UN|UNESCO|WMO|FAO|Member States|[A-Z]{2,})', cleaned_text)
    unique_entities = list(dict.fromkeys(entities))[:5]
    focus = "sustainability" if "sustainable" in cleaned_text.lower() else "cooperation" if "cooperation" in cleaned_text.lower() else "development"
    if unique_entities:
        if column_name == "Strategies":
            return f"{unique_entities[0]} drives {focus} through strategic initiatives."
        elif column_name == "Defined inter-institutional Relationship":
            return f"{unique_entities[0]} collaborates with partners for {focus}."
        elif column_name == "Practical Horizontal Coordination":
            return f"{unique_entities[0]} coordinates with peers on {focus}."
        elif column_name == "Practical Vertical Coordination":
            return f"{unique_entities[0]} supports {focus} at national and local levels."
        return f"{unique_entities[0]} focuses on {focus} within its scope."
    return f"The organization manages {focus} within its scope."

# Theme Assignment Functions
def assign_themes(cleaned_text: str, summary: str, column_name: str) -> List[str]:
    themes = set()
    cleaned_lower = cleaned_text.lower()
    summary_lower = summary.lower()
    combined_text = cleaned_lower + " " + summary_lower
    
    if column_name == "Subject Matter Jurisdiction":
        theme_scores = {
            "Scientific and Technical Domains": 0, "Human Welfare and Rights": 0, "Regulatory and Legal Oversight": 0,
            "Economic Development and Trade": 0, "Environmental Protection and Sustainability": 0,
            "Social Equity and Inclusion": 0, "Global and Emerging Challenges": 0
        }
        checks = {
            "Scientific and Technical Domains": ["science", "research", "technology", "data", "innovation"],
            "Human Welfare and Rights": ["health", "food", "nutrition", "rights", "welfare"],
            "Regulatory and Legal Oversight": ["regulation", "legal", "standards", "policy", "governance"],
            "Economic Development and Trade": ["trade", "economic", "industry", "investment", "growth"],
            "Environmental Protection and Sustainability": ["environment", "sustainability", "biodiversity", "conservation", "climate"],
            "Social Equity and Inclusion": ["equality", "gender", "social", "inclusion", "justice"],
            "Global and Emerging Challenges": ["disaster", "climate change", "global", "emerging", "resilience"]
        }
    elif column_name == "Defined inter-institutional Relationship":
        theme_scores = {
            "UN System Integration": 0, "Regional Collaboration": 0, "Scientific Partnerships": 0,
            "Regulatory Alignment": 0, "Environmental Cooperation": 0, "Economic Linkages": 0,
            "Multi-Stakeholder Networks": 0
        }
        checks = {
            "UN System Integration": ["un", "unesco", "who", "fao"],
            "Regional Collaboration": ["regional", "westpac", "iocAfrica"],
            "Scientific Partnerships": ["science", "research", "ices"],
            "Regulatory Alignment": ["regulation", "standards", "imo"],
            "Environmental Cooperation": ["environment", "unep", "cbd"],
            "Economic Linkages": ["trade", "economic", "unctad"],
            "Multi-Stakeholder Networks": ["stakeholder", "partnership", "global compact"]
        }
    elif column_name == "Practical Horizontal Coordination":
        theme_scores = {
            "Data Sharing Networks": 0, "Joint Program Initiatives": 0, "Policy Harmonization": 0,
            "Technical Assistance Partnerships": 0, "Environmental Synergies": 0, "Regional Task Forces": 0,
            "Cross-Sectoral Forums": 0
        }
        checks = {
            "Data Sharing Networks": ["data", "exchange", "sharing"],
            "Joint Program Initiatives": ["joint", "program", "initiative"],
            "Policy Harmonization": ["policy", "harmonization", "standards"],
            "Technical Assistance Partnerships": ["technical", "assistance", "support"],
            "Environmental Synergies": ["environment", "synergy", "conservation"],
            "Regional Task Forces": ["regional", "task", "force"],
            "Cross-Sectoral Forums": ["cross", "sector", "forum"]
        }
    elif column_name == "Practical Vertical Coordination":
        theme_scores = {
            "National Policy Implementation": 0, "Capacity Building Support": 0, "Local Project Execution": 0,
            "Regional Framework Adaptation": 0, "Monitoring and Reporting": 0, "Funding Mechanisms": 0,
            "Stakeholder Engagement": 0
        }
        checks = {
            "National Policy Implementation": ["national", "policy", "implementation"],
            "Capacity Building Support": ["capacity", "training", "support"],
            "Local Project Execution": ["local", "project", "execution"],
            "Regional Framework Adaptation": ["regional", "framework", "adaptation"],
            "Monitoring and Reporting": ["monitoring", "reporting", "assessment"],
            "Funding Mechanisms": ["funding", "loan", "grant"],
            "Stakeholder Engagement": ["stakeholder", "engagement", "community"]
        }
    else:  # Default to "Defined Objectives" or "Strategies" processed earlier
        return []

    for theme, terms in checks.items():
        if any(term in combined_text for term in terms):
            theme_scores[theme] += 2
    
    for theme, score in theme_scores.items():
        if score >= 2:
            themes.add(theme)
    
    if not themes:
        themes.add(list(theme_scores.keys())[0])
    
    return sorted(themes)

# Processing Function
def process_column(df: pd.DataFrame, column_name: str, output_file: str = "processed_data.csv"):
    table = []
    for index, row in df.iterrows():
        institution = row["Institutions"]
        raw_text = row[column_name]
        
        cleaned = clean_data(raw_text)
        summary = summarize_data(cleaned, column_name)
        themes = assign_themes(cleaned, summary, column_name)
        
        table.append({
            "Institutions": institution,
            f"Cleaned {column_name}": cleaned,
            f"Summarized {column_name}": summary,
            "Themes": ";".join(themes)
        })
    
    return table

# Full Data Processing
if __name__ == "__main__":
    # Load your full dataset here (simplified for brevity; replace with your actual CSV loading)
    df = df  # Replace with actual file path
    
    all_columns = [
        "Subject Matter Jurisdiction", "Defined Objectives", "Strategies",
        "Defined inter-institutional Relationship", "Practical Horizontal Coordination",
        "Practical Vertical Coordination"
    ]
    full_table = []
    
    for column in all_columns:
        column_data = process_column(df, column)
        if not full_table:
            full_table = column_data
        else:
            for i, entry in enumerate(full_table):
                entry.update({
                    f"Cleaned {column}": column_data[i][f"Cleaned {column}"],
                    f"Summarized {column}": column_data[i][f"Summarized {column}"],
                    "Themes": entry["Themes"] + ";" + column_data[i]["Themes"]
                })
    
    with open("full_processed_data.csv", 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ["Institution"] + [f"Cleaned {col}" for col in all_columns] + \
                     [f"Summarized {col}" for col in all_columns] + ["Themes"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in full_table:
            writer.writerow(row)
    
    print("Processing complete. Output saved to full_processed_data.csv")

UnboundLocalError: cannot access local variable 'text' where it is not associated with a value