## <center>Ocean Governance</center>

### Relevant Libraries

In [None]:
# set up and tools
import pandas as pd

## 1 Data

### 1.1 Load file to data frame

In [None]:
# File path (relative) of the raw Excel workbook
file_path = "../raw_data/Ocean Governance and ocean economy governance matrix_IGOs.xlsx"
# Load the dataset. header=[0,1] reads the first two spreadsheet rows as a
# two-level header, so every column label becomes a (top, sub) tuple.
df = pd.read_excel(file_path, header=[0,1])


# Preview the first three rows to check the structure
df.head(3)

### 1.2 Data Cleaning
<p>To automate the extraction and summarization of the Spatial Jurisdiction text in Python, the following steps are taken. Natural Language Processing (NLP) techniques are used to systematically extract key information such as the geographic scope, boundaries, jurisdictional limits in nautical miles, and relevant references. Below is a step-by-step guide to creating a Python script for this task:</p>

#### 1.2.1 Renaming Columns

In [None]:
# Relabel a frame's columns through a lookup table
def rename_columns(df, column_map):
    """Replace the column labels of ``df`` in place and return ``df``.

    Every existing label is looked up in ``column_map``; a label with no
    entry is kept as it is. The new labels are assigned as a plain list, so
    multi-level (tuple) labels that are mapped to strings become flat names.
    """
    relabelled = []
    for label in df.columns:
        relabelled.append(column_map.get(label, label))
    df.columns = relabelled

    return df

# Define the column mappings (old name -> new name).
# Keys are the (top-level, second-level) header tuples of the loaded frame;
# values are the flat names used in the rest of the notebook.
# The 'Unnamed: N_level_1' parts are presumably the placeholders pandas
# generates where the second header row is empty - verify against the sheet.
column_map = {
    ('Institutions', 'Unnamed: 0_level_1'): 'Institution',
    ('Year', 'Unnamed: 1_level_1'): 'Year',
    ('Scale', 'Unnamed: 2_level_1'): 'Scale',
    ('Jurisdictional Scope', 'Spatial Jurisdiction'): 'Spatial Jurisdiction',
    ('Jurisdictional Scope', 'Subject Matter Jurisdiction'): 'Subject Matter Jurisdiction',
    ('Source of Jurisdiction', 'Unnamed: 5_level_1'): 'Source of Jurisdiction',
    ('Defined Objectives', 'Unnamed: 6_level_1'): 'Defined Objectives',
    ('Strategies', 'Unnamed: 7_level_1'): 'Strategies',
    ('Defined inter-institutional Relationship', 'Unnamed: 8_level_1'): 'Inter-institutional Relationship',
    ('Practical- Coordination', 'Vertical'): 'Practical Vertical Coordination',
    ('Practical- Coordination', 'Horizontal'): 'Practical Horizontal Coordination',
    ('Practical- Coordination', 'Horizontal.1'): 'Horizontal Coordination 1',
    ('Practical- Coordination', 'Horizontal.2'): 'Horizontal Coordination 2',
    ('Practical- Coordination', 'Horizontal.3'): 'Horizontal Coordination 3',
    ('Practical- Coordination', 'Horizontal.4'): 'Horizontal Coordination 4',
    ('Practical- Coordination', 'Horizontal.5'): 'Horizontal Coordination 5',
    ('Practical- Coordination', 'Horizontal.6'): 'Horizontal Coordination 6',
    ('Practical- Coordination', 'Horizontal.7'): 'Horizontal Coordination 7',
    ('Practical- Coordination', 'Horizontal.8'): 'Horizontal Coordination 8'
}

# Apply the renaming function; labels missing from column_map are left as-is
df = rename_columns(df, column_map)

#### 1.2.2 Relevant Column

In [None]:
# Subsetting relevant columns: keep the first 11 columns and drop the row
# labelled 48 (noted in the notebook as "used in search").
# drop() is chained without inplace=True so new_df is a fresh frame rather
# than an in-place edit of a slice of df, which is the pattern that makes
# pandas emit SettingWithCopyWarning.
new_df = df.iloc[:, :11].drop(index=48)

# Show the last three rows to confirm the result
new_df.tail(3)

In [None]:
# Rebind df to an independent deep copy of the trimmed frame
df = new_df.copy(deep=True)

In [None]:
# SAVE THE NEW SHEET
# Append the cleaned frame as an extra sheet of the source workbook.
# - engine='openpyxl' is set explicitly because append mode is not supported
#   by the xlsxwriter engine, which pandas may otherwise select for .xlsx.
# - if_sheet_exists='replace' keeps the cell re-runnable; with the default
#   ('error') a second run raises because the sheet is already present.
# NOTE(review): the sheet name is longer than Excel's 31-character limit for
# sheet names - confirm the workbook still opens cleanly or shorten the name.
with pd.ExcelWriter(file_path, mode='a', engine='openpyxl', if_sheet_exists='replace') as writer:
    df.to_excel(writer, sheet_name='Ocean Governance and ocean economy governance')

In [None]:
# Show the first three rows of the cleaned frame
df.head(3)

### NLP

In [None]:
import re
import spacy
import pandas as pd
from spacy.matcher import Matcher
# Load the SpaCy NLP model by name through spacy.load, which needs only the
# `spacy` import above (no visible cell imports an `en_core_web_sm` module,
# so calling en_core_web_sm.load() would raise NameError).
# The model must be installed first: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# Initialize Matcher for boundary terms
matcher = Matcher(nlp.vocab)
# Matcher patterns for boundary detection (can be expanded as needed).
# Each pattern is a sequence of per-token constraints on the lowercase form,
# so every word of a phrase needs its own entry.
boundary_patterns = [
    {"label": "EEZ", "pattern": [{"lower": "exclusive"}, {"lower": "economic"}, {"lower": "zone"}]},
    {"label": "Territorial Seas", "pattern": [{"lower": "territorial"}, {"lower": "seas"}]},
    {"label": "International Waters", "pattern": [{"lower": "international"}, {"lower": "waters"}]},
    {"label": "Coastal Areas", "pattern": [{"lower": "coastal"}, {"lower": "areas"}]},
    {"label": "Inland Waters", "pattern": [{"lower": "inland"}, {"lower": "waters"}]},
    {"label": "Oceanic Resources", "pattern": [{"lower": "oceanic"}, {"lower": "resources"}]},
    # Includes the "and" token so the phrase named by the label can match
    {"label": "Maritime Safety and Security", "pattern": [{"lower": "maritime"}, {"lower": "safety"}, {"lower": "and"}, {"lower": "security"}]},
    {"label": "Environmental Protection", "pattern": [{"lower": "environmental"}, {"lower": "protection"}]},
    {"label": "High Seas", "pattern": [{"lower": "high"}, {"lower": "seas"}]},
    {"label": "Outer Limits", "pattern": [{"lower": "outer"}, {"lower": "limits"}]},
    {"label": "Outer Space", "pattern": [{"lower": "outer"}, {"lower": "space"}]},
    {"label": "Desertification Regions", "pattern": [{"lower": "desertification"}]},
    {"label": "Ocean Space", "pattern": [{"lower": "ocean"}, {"lower": "space"}]}
]

# Register every pattern with the matcher under its label
for pattern in boundary_patterns:
    matcher.add(pattern["label"], [pattern["pattern"]])

def extract_geographic_scope(text):
    """Classify a jurisdiction description by its broad geographic scope.

    Matching is a case-insensitive substring search, checked in this order:

    1. any global keyword        -> "Global"
    2. any regional keyword      -> "Regional/Local"
    3. "outer space"             -> "Outer Space"
    4. "oceans"                  -> "Oceans"
    5. "land-based" or "desertification" -> "Land-based"
    6. otherwise                 -> "No Specific"

    The special-scope keywords of steps 3-5 are deliberately kept out of the
    regional keyword list; if they were in it, step 2 would always win and
    the special labels could never be returned.

    Args:
        text: The jurisdiction description. Non-string values (for example
            NaN from an empty spreadsheet cell) are treated as having no
            recognisable scope.

    Returns:
        One of the six scope labels listed above.
    """
    # Empty Excel cells arrive as float NaN, which has no .lower()
    if not isinstance(text, str):
        return "No Specific"

    # Lowercase once so every keyword test is case-insensitive
    lowered = text.lower()

    global_patterns = (
        "global", "covers all countries", "entire", "all regions", "worldwide",
        "international", "across the world", "globally", "universal", "covering all regions",
        "world’s oceans", "all oceans",
    )

    regional_patterns = (
        "regional", "local", "subregion", "in the region", "within the region",
        "africa", "asia", "europe", "pacific", "mediterranean", "caribbean",
        "latin america", "middle east", "south america", "north america",
        "southeast asia", "sub-saharan africa", "populations",
        "coastal", "rural",
    )

    # Check for global scope
    if any(keyword in lowered for keyword in global_patterns):
        return "Global"

    # Check for regional/local scope
    if any(keyword in lowered for keyword in regional_patterns):
        return "Regional/Local"

    # Check for special scopes like "outer space", "oceans", or "land-based"
    if "outer space" in lowered:
        return "Outer Space"
    if "oceans" in lowered:
        return "Oceans"
    if "land-based" in lowered or "desertification" in lowered:
        return "Land-based"

    # If no specific scope is found, return "No Specific"
    return "No Specific"

# Function to extract specific boundaries mentioned in the jurisdiction text
def extract_boundaries(text):
    """List the jurisdictional boundary terms mentioned in ``text``.

    Three sources are combined, in this order:

    1. hits of the module-level SpaCy ``matcher`` (the matched text itself
       is recorded, as written in ``text``);
    2. case-insensitive regular expressions, each contributing a fixed
       canonical label;
    3. the first numeric distance followed by a unit (nautical miles,
       kilometers, miles, km).

    If none of these produce anything and the text mentions "global" (in any
    letter case, consistent with the other checks), a generic global
    jurisdiction entry is used. Entries that repeat are listed once, in
    first-seen order.

    Args:
        text: The jurisdiction description. Non-string values (for example
            NaN from an empty spreadsheet cell) yield "Not Specified".

    Returns:
        The entries joined with ", ", or "Not Specified" when there are none.
    """
    # Empty Excel cells arrive as float NaN, which nlp() cannot process
    if not isinstance(text, str):
        return "Not Specified"

    boundaries = []

    # Use SpaCy NLP to detect custom jurisdictional terms
    doc = nlp(text)
    for _match_id, start, end in matcher(doc):
        boundaries.append(doc[start:end].text)

    # Boundary terms matched by regex, paired with the label they produce
    labelled_patterns = (
        (r'\bexclusive economic zones?\b', "Exclusive Economic Zones (EEZs)"),
        (r'\bterritorial seas?\b', "Territorial Seas"),
        (r'\binternational waters?\b', "International Waters"),
        (r'\bcoastal areas?\b', "Coastal Areas"),
        (r'\binland waters?\b', "Inland Waters"),
        (r'\boceanic resources?\b', "Oceanic Resources"),
        (r'\bmaritime safety and security\b', "Maritime Safety and Security"),
        (r'\benvironmental protection\b', "Environmental Protection"),
        (r'\bhigh seas\b', "High Seas"),
        (r'\bouter limits\b', "Outer Limits of National Jurisdiction"),
        (r'\bouter space\b', "Outer Space"),
        (r'\bdesertification\b', "Desertification Regions"),
    )
    for regex, label in labelled_patterns:
        if re.search(regex, text, re.IGNORECASE):
            boundaries.append(label)

    # Match distances (e.g., 200 nautical miles, 500 km); only the first is kept
    distance_match = re.search(r'(\d+)\s*(nautical miles|kilometers|miles|km)', text, re.IGNORECASE)
    if distance_match:
        boundaries.append(f"Jurisdiction defined by {distance_match.group(1)} {distance_match.group(2)}")

    # Fallback if no specific boundaries were found
    if not boundaries and "global" in text.lower():
        boundaries.append("Global Jurisdiction (No specific boundaries)")

    # dict.fromkeys drops repeats while keeping first-seen order
    unique_boundaries = list(dict.fromkeys(boundaries))
    return ', '.join(unique_boundaries) if unique_boundaries else "Not Specified"

# Function to extract relevant references (URLs, treaties, legal documents)
def extract_references(text):
    """Collect the URLs and named legal instruments cited in ``text``.

    URLs are everything from ``http://`` or ``https://`` up to the next
    whitespace, with trailing sentence punctuation removed. A trailing ")"
    is removed only when the URL holds no "(", so links with balanced
    parentheses stay intact. Treaty names are matched case-insensitively
    as whole words from a fixed list and reported as written in ``text``.

    Args:
        text: The jurisdiction description. Non-string values (for example
            NaN from an empty spreadsheet cell) yield "None".

    Returns:
        URLs first, then treaty names, each listed once in first-seen order
        and joined with ", "; the string "None" when nothing is found.
    """
    # Empty Excel cells arrive as float NaN, which re cannot search
    if not isinstance(text, str):
        return "None"

    trailing_punctuation = ".,;:!?\"'"
    urls = []
    for raw_url in re.findall(r'https?://[^\s]+', text):
        url = raw_url.rstrip(trailing_punctuation)
        # An unmatched closing bracket belongs to the sentence, not the link
        if url.endswith(")") and "(" not in url:
            url = url.rstrip(")").rstrip(trailing_punctuation)
        urls.append(url)

    treaties = re.findall(r'\b(UNCLOS|Paris Agreement|UN Convention on the Law of the Sea|Sustainable Development Goals|SDGs|Basel Convention|Minamata Convention|CBD|CITES|CMS|Outer Space Treaty|Outer Space Law)\b', text, re.IGNORECASE)

    # dict.fromkeys drops repeats while keeping first-seen order
    references = list(dict.fromkeys(urls + treaties))
    return ', '.join(references) if references else "None"

# Bundle the three extractions for one jurisdiction text into a labelled row
def summarize_spatial_jurisdiction(text):
    """Return scope, boundaries and references for ``text`` as a Series.

    The Series is indexed by "Geographic Scope", "Boundaries" and
    "References", holding the results of ``extract_geographic_scope``,
    ``extract_boundaries`` and ``extract_references`` respectively.
    """
    summary = {
        "Geographic Scope": extract_geographic_scope(text),
        "Boundaries": extract_boundaries(text),
        "References": extract_references(text),
    }
    return pd.Series(list(summary.values()), index=list(summary))

# Apply the summarization to the Spatial Jurisdiction column.
# The function returns a three-entry Series per row, so apply() yields a
# three-column frame that is assigned to the new columns named here.
df[['Geographic Scope', 'Boundaries', 'References']] = df['Spatial Jurisdiction'].apply(summarize_spatial_jurisdiction)


In [None]:
from spatial import SpatialJurisdictionExtractor

In [None]:
# Instantiate the extractor class imported from the project-local `spatial` module
extractor = SpatialJurisdictionExtractor()

In [None]:
# Apply the summarization to the Spatial Jurisdiction column
# NOTE(review): this passes the SpatialJurisdictionExtractor class itself to
# apply(), so each cell's text is handed to the constructor; the `extractor`
# instance created earlier is never used. Presumably a method of `extractor`
# was intended here - confirm against the `spatial` module.
df[['Geographic Scope', 'Boundaries', 'References']] = df['Spatial Jurisdiction'].apply(SpatialJurisdictionExtractor)


In [None]:
# Select the institution name, the raw jurisdiction text and the three
# extracted summary columns into a separate frame (nothing is displayed here)
spatial_df = df[['Institution', 'Spatial Jurisdiction','Geographic Scope', 'Boundaries', 'References']]

In [None]:
# Print the column names, non-null counts and dtypes of the summary frame
spatial_df.info()

In [None]:
# # Save Spatial df sheet
# with pd.ExcelWriter(file_path, mode='a') as writer:
#     spatial_df.to_excel(writer, sheet_name='Spatial Jurisdiction')

In [None]:
# Inspect entry 22 of the 'Indicator Presence' column (chained indexing).
# NOTE(review): no visible cell creates an 'Indicator Presence' column on df;
# confirm it exists at this point, otherwise this lookup raises KeyError.
df['Indicator Presence'][22]

## Subject Matter Jurisdiction

In [None]:
# List the column labels of the trimmed frame
new_df.columns