In [3]:
import pandas as pd
import re
import csv
from typing import List

In [4]:
# Read the governance matrix from the Excel workbook into a DataFrame.
# pandas is already imported (as pd) in the notebook's import cell.
path = "./Ocean Governance and ocean economy governance matrix_IGOs.xlsx"
data = pd.read_excel(path, sheet_name="Sheet1")

# `data` stays untouched as the pristine copy of the sheet; `df` is an
# independent deep copy that the following cells are free to modify.
df = data.copy(deep=True)

In [6]:
# Preview the first three rows of the working copy to check the column layout.
df.head(3)

Unnamed: 0,Institutions,Year,Scale,Spatial Jurisdiction,Subject Matter Jurisdiction,Source of Jurisdiction,Defined Objectives,Strategies,Defined inter-institutional Relationship,Vertical,Horizontal,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18
0,Intergovernmental Oceanographic Commission (IOC),1960.0,Global,IOC jurisdiction is global delineated by the b...,The IOC's subject matter jurisdiction encompas...,The IOC’s authority is derived from its statut...,The objectives of the Intergovernmental Oceano...,IOC implements its objectives through series o...,IOC collaborates with UN specialized agencies ...,Vertical coordination within the IOC involves ...,Horizontal coordination within the IOC encompa...,,,,,,,,
1,Food and Agriculture Organization of the Unite...,1945.0,Global,The FAO’s jurisdiction spans a vast array of m...,"FAO’s remit includes nutrition, food and agric...",The FAO’s jurisdiction is established through ...,"As stated in Article 1 of the Constitution, FA...",The FAO executes its objectives through a ser...,"As stated in its constitution, the FAO maintai...",The FAO’s vertical coordination involves colla...,Horizontal coordination within the FAO involve...,https://www.jus.uio.no/english/services/librar...,FAO https://www.fao.org/strategic-framework/en,,,,,,
2,Convention on the Intergovernmental Maritime C...,1948.0,Global,The IMO’s authority spans a global geographica...,The IMO's jurisdiction encompasses a comprehen...,The IMO's jurisdiction is established by the C...,"According to Part I, Article 1 of the Internat...",IMO implements its objectives and mandates thr...,The IMO collaborates with a diverse array of o...,Vertical coordination within IMO involves coll...,Horizontal coordination within the IMO involve...,https://wwwcdn.imo.org/localresources/en/About...,https://wwwcdn.imo.org/localresources/en/Knowl...,https://www.imo.org/en/MediaCentre/HotTopics/P...,,,,,


In [7]:
# Narrow the working frame to the institution names and the Strategies text.
# Note that the sheet's column is spelled "Institutions" (plural).
df = df[["Institutions", "Strategies"]]

In [13]:
# Step 1: Cleaning the Data
def clean_data(raw_text: str) -> str:
    """Normalise one raw text cell before it is summarised.

    The steps run in this order:

    1. Remove URLs that are wrapped in parentheses, e.g. ``(https://x.org)``.
       Bare URLs without parentheses are left in place.
    2. Apply a small set of fixed, case-sensitive phrase replacements
       ("United Nations" -> "UN", "member states" -> "Member States", ...).
    3. Collapse every run of whitespace to a single space and strip the ends.

    Args:
        raw_text: The cell value. Missing values (None, NaN, pd.NA) are
            accepted; other non-string values are converted with ``str``.

    Returns:
        The cleaned text, or an empty string when the cell is missing.
    """
    if pd.isna(raw_text):
        return ""

    # Seed the working variable from the input. Without this assignment the
    # first substitution below would read `text` before it exists.
    text = str(raw_text)

    text = re.sub(r'\(https?://[^\s)]+\)', '', text)
    text = text.replace("the objectives of the", "The").replace("include to:", "include:")
    text = text.replace("United Nations", "UN").replace("member states", "Member States")
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Step 2: Summarizing the Data
def summarize_data(cleaned_text: str) -> str:
    """Build a one-sentence template summary from cleaned text.

    The sentence names the first entity found in the text (the literal
    "Member States" or any run of two or more capital letters) together with
    a focus word chosen by keyword: "sustainability" if the text contains
    "sustainable", otherwise "cooperation" if it contains "cooperation",
    otherwise "development". Keyword checks are case-insensitive.

    Args:
        cleaned_text: Text as returned by the cleaning step; may be empty.

    Returns:
        "No data provided." for empty input; a fixed fallback sentence when
        no entity is found (the fallback does not use the detected focus);
        otherwise "The <entity> aims to promote <focus> through its defined
        objectives."
    """
    if not cleaned_text:
        return "No data provided."

    # Regex alternation takes the first alternative that matches, not the
    # longest one. Listing "UN" ahead of longer acronyms would cut "UNESCO"
    # or "UNEP" down to "UN". The greedy [A-Z]{2,} branch already captures
    # every all-caps acronym in full, so only the multi-word literal needs
    # its own alternative.
    entities = re.findall(r'Member States|[A-Z]{2,}', cleaned_text)
    unique_entities = list(dict.fromkeys(entities))  # de-duplicate, keep first-seen order

    lowered = cleaned_text.lower()
    if "sustainable" in lowered:
        focus = "sustainability"
    elif "cooperation" in lowered:
        focus = "cooperation"
    else:
        focus = "development"

    if unique_entities:
        return f"The {unique_entities[0]} aims to promote {focus} through its defined objectives."
    return "The organization aims to promote development through its objectives."

# Step 3: Creating Themes
def assign_themes(cleaned_text: str, summary: str) -> List[str]:
    """Pick the themes whose keyword score reaches the threshold of 2.

    Scoring has two parts:

    * Each of the ten key phrases adds its weight to its theme when the
      phrase (lower-cased) occurs in the cleaned text or the summary.
    * Each theme gains a further 2 points when any of its short keywords
      occurs in the cleaned text alone.

    Args:
        cleaned_text: Cleaned source text.
        summary: Summary sentence generated for the same text.

    Returns:
        Alphabetically sorted theme names scoring at least 2. When no theme
        qualifies, the single default theme
        "Regulatory Frameworks and Resource Management" is returned.
    """
    # (key phrase, theme it supports, weight)
    weighted_phrases = [
        ("Promote marine science and technology transfer", "Scientific and Environmental Progress", 2),
        ("Enhance global food security", "Human Sustenance and Development", 2),
        ("Ensure maritime safety and environmental protection", "Safety and Climate Resilience", 2),
        ("Support legal frameworks for ocean governance", "Regulatory Frameworks and Resource Management", 2),
        ("Facilitate climate change mitigation and adaptation", "Safety and Climate Resilience", 1),
        ("Regulate seabed resources for sustainable use", "Regulatory Frameworks and Resource Management", 2),
        ("Coordinate environmental policies and global monitoring", "Scientific and Environmental Progress", 2),
        ("Advance sustainable development goals (SDGs)", "Human Sustenance and Development", 1),
        ("Promote economic development through maritime trade", "Economic and Ecological Balance", 2),
        ("Conserve marine biodiversity and ecosystems", "Economic and Ecological Balance", 2),
    ]

    # Theme -> short keywords; a hit on any one of them is worth 2 points.
    theme_keywords = {
        "Scientific and Environmental Progress": ("science", "research", "monitoring", "technology"),
        "Human Sustenance and Development": ("food", "health", "poverty", "rights", "sustainable development"),
        "Safety and Climate Resilience": ("safety", "climate", "protection", "mitigation"),
        "Regulatory Frameworks and Resource Management": ("legal", "policy", "regulation", "governance", "resources"),
        "Economic and Ecological Balance": ("trade", "economic", "biodiversity", "ecosystems", "sustainable use"),
    }

    body = cleaned_text.lower()
    haystack = body + " " + summary.lower()

    scores = dict.fromkeys(theme_keywords, 0)

    # Key phrases are searched in the text and the summary together.
    for phrase, theme, weight in weighted_phrases:
        if phrase.lower() in haystack:
            scores[theme] += weight

    # Short keywords are searched in the cleaned text only.
    for theme, keywords in theme_keywords.items():
        if any(keyword in body for keyword in keywords):
            scores[theme] += 2

    selected = [theme for theme, score in scores.items() if score >= 2]
    if not selected:
        selected = ["Regulatory Frameworks and Resource Management"]

    return sorted(selected)

# Step 4: Constructing the Table
def process_column(df: pd.DataFrame, column_name: str, output_file: str = "defined_objectives_refined.csv"):
    """Clean, summarise and theme one text column, then write the result to CSV.

    Each row's value in ``column_name`` is passed through ``clean_data``,
    ``summarize_data`` and ``assign_themes``. The output CSV has four
    columns: "Institution", "Cleaned <column_name>",
    "Summarized <column_name>" and "Themes" (theme names joined with ";").

    Args:
        df: Frame holding the institution names and the text column.
            The institution column may be named "Institution" or
            "Institutions"; "Institution" wins if both are present.
        column_name: Name of the text column to process.
        output_file: Path of the CSV file to write (UTF-8, overwritten).

    Raises:
        ValueError: If ``column_name`` or the institution column is missing.
    """
    # Accept both spellings: the function originally required "Institution",
    # but the frame may carry the plural header "Institutions" instead.
    institution_column = next(
        (name for name in ("Institution", "Institutions") if name in df.columns),
        None,
    )
    if column_name not in df.columns or institution_column is None:
        raise ValueError(
            "DataFrame must contain 'Institution' (or 'Institutions') and the specified column."
        )

    cleaned_key = f"Cleaned {column_name}"
    summary_key = f"Summarized {column_name}"

    table = []
    for institution, raw_text in zip(df[institution_column], df[column_name]):
        cleaned = clean_data(raw_text)
        summary = summarize_data(cleaned)
        themes = assign_themes(cleaned, summary)

        table.append({
            "Institution": institution,
            cleaned_key: cleaned,
            summary_key: summary,
            "Themes": ";".join(themes)
        })

    # newline='' lets the csv module control line endings itself.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ["Institution", cleaned_key, summary_key, "Themes"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(table)

    print(f"Processing complete. Output saved to {output_file}")

In [11]:
# Name the output file after the processed column explicitly; the function's
# default file name refers to a different column ("defined objectives").
process_column(df, "Strategies", output_file="strategies_processed.csv")

Processing complete. Output saved to strategies_processed.csv
