# In [None]:
"""
Patent Owner Cleaning & Harmonization Pipeline
Author: Anna-Pauliina Kokko
Purpose: Production-grade pipeline for Lens.org exports.
Logic decoupled from sensitive and confidential data.
"""

import os, json, csv, hashlib, datetime, unicodedata, re
import pandas as pd
from typing import List, Dict, Optional
from rapidfuzz import fuzz

# --- 1. SETTINGS & EXTERNAL CONFIG ---
SETTINGS = {
    # Excel workbook that run_pipeline() reads.
    "INPUT_FILE": "input_data.xlsx",
    # Prefix of the exported workbook; run_pipeline() appends "_<YYYYMMDD>.xlsx".
    "OUTPUT_BASENAME": "Cleaned_Dataset",
    # Not referenced in this section; presumably a rapidfuzz score cut-off
    # on a 0-100 scale -- TODO confirm.
    "FUZZY_THRESHOLD": 87,
    # "broad" makes restore_diacritics() rewrite "oe"/"ae" as "ö"/"ä";
    # any other value leaves tokens untouched.
    "DIACRITIC_MODE": "broad",
    # Not referenced in this section; presumably toggles fuzzy matching on
    # the main table -- TODO confirm.
    "APPLY_FUZZY_TO_MAIN": False,
}

# Load External Knowledge Bases (Prevents Confidentiality Leaks)
def load_config(file_path: str, default_val):
    """Load an external knowledge-base file, falling back to a default.

    Args:
        file_path: Path of the file to load. A name ending in ``.json``
            (any letter case) is parsed with ``json.load``; every other
            file is read as CSV with ``pandas.read_csv``.
        default_val: Value returned unchanged when ``file_path`` does not
            exist.

    Returns:
        The parsed JSON object, a ``pandas.DataFrame`` for CSV input, or
        ``default_val`` when the file is missing.

    Raises:
        Parse errors from ``json.load`` / ``pandas.read_csv`` propagate, so
        a present-but-malformed file is not silently replaced by the default.
    """
    if not os.path.exists(file_path):
        return default_val

    # Compare the extension case-insensitively so "RULES.JSON" is not handed
    # to the CSV parser.
    if file_path.lower().endswith(".json"):
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)

    # pandas opens the path itself; holding a second, unused handle open
    # while it reads is unnecessary.
    return pd.read_csv(file_path, encoding="utf-8")

# Externalized data files
# Harmonization table read from CSV; an empty DataFrame stands in when the
# file is absent. (Not referenced elsewhere in this section.)
HARMONIZATION_RULES = load_config(
    "harmonization_map.csv",
    pd.DataFrame(),
)
# Mapping of parent name -> regex patterns, consumed by detect_parent().
PARENT_PATTERNS = load_config(
    "parent_patterns.json",
    {},
)
# Keyword cues consumed by classify_entity() to recognise companies.
COMPANY_CUES = load_config(
    "company_cues.json",
    ["oy", "ab", "inc", "ltd"],  # Generic defaults
)

# --- 2. TEXT NORMALIZATION ---
# Typographic punctuation folded to ASCII before NFKC (NFKC leaves these
# characters untouched). Opening and closing quotes are handled as pairs.
_PUNCTUATION_TABLE = str.maketrans({
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
})


def normalize_unicode(s: str) -> str:
    """Fold typographic punctuation to ASCII and normalize to NFKC.

    Args:
        s: Text to normalize. Any non-string value (``None``, NaN, numbers)
            is treated as missing.

    Returns:
        The normalized string, or ``""`` when ``s`` is not a string.
    """
    if not isinstance(s, str):
        return ""
    return unicodedata.normalize("NFKC", s.translate(_PUNCTUATION_TABLE))

def restore_diacritics(token: str) -> str:
    """Rewrite ASCII digraphs as Nordic letters when the mode is "broad".

    With ``SETTINGS["DIACRITIC_MODE"] == "broad"`` every "oe" becomes "ö"
    and then every "ae" becomes "ä"; the substitution is unconditional, so
    it also applies to words where the digraph is original. Any other mode
    returns ``token`` unchanged.
    """
    if SETTINGS["DIACRITIC_MODE"] != "broad":
        return token
    # Order matters only for reproducibility: "oe" is rewritten before "ae".
    for digraph, letter in ((r"oe", "ö"), (r"ae", "ä")):
        token = re.sub(digraph, letter, token)
    return token

# --- 3. CORE CLEANING ENGINE ---
# Text in parentheses, e.g. "(publ)".
_PARENTHETICAL_RE = re.compile(r"\([^)]*\)")
# Generic legal suffixes as whole words, plus an abbreviation period directly
# after them ("inc.") as long as that period is not followed by more letters.
_LEGAL_SUFFIX_RE = re.compile(
    r"\b(?:oy|oyj|ab|abp|gmbh|ltd|inc|plc|corp|sa|spa|as|aps)\b(?:\.(?!\w))?",
    flags=re.IGNORECASE,
)


def clean_token(token: str) -> Optional[str]:
    """Clean one raw owner/applicant name.

    Stages: Unicode normalization and lowercasing, removal of parenthesised
    text, removal of legal-form words, whitespace collapsing, trimming of
    separator punctuation left at the edges, and diacritic restoration.

    Args:
        token: Raw name, e.g. ``"Acme, Inc."``.

    Returns:
        The cleaned name (``"acme"`` for the example above), or ``None``
        when the input is empty or nothing remains after cleaning.
    """
    if not token:
        return None
    x = normalize_unicode(token).lower()
    x = _PARENTHETICAL_RE.sub(" ", x)
    x = _LEGAL_SUFFIX_RE.sub(" ", x)
    # Removing "inc" from "acme, inc." used to leave "acme, ."; trim the
    # orphaned separators along with the surrounding whitespace.
    x = re.sub(r"\s+", " ", x).strip(" ,;.")
    return restore_diacritics(x) or None

# --- 4. CLASSIFICATION & HIERARCHY ---
def classify_entity(name: str, company_cues: Optional[List[str]] = None) -> str:
    """Classify an entity name from keyword cues.

    Academia and nonprofit keywords are matched as substrings. Company cues
    are matched as whole words, case-insensitively: short legal-form cues
    such as "oy", "ab" or "inc" would otherwise hit ordinary names
    ("boyd", "abraham", "lincoln"). Empty cues are ignored, since an empty
    string is a substring of every name.

    Args:
        name: Entity name to classify.
        company_cues: Cues that mark a company. Defaults to the module-level
            ``COMPANY_CUES`` when omitted.

    Returns:
        One of ``"academia"``, ``"nonprofit"``, ``"company"`` or
        ``"individual"`` (the fallback when no cue matches).
    """
    n = name.lower()
    if any(cue in n for cue in ("university", "college", "institute")):
        return "academia"
    if any(cue in n for cue in ("foundation", "stiftung", "säätiö")):
        return "nonprofit"

    cues = COMPANY_CUES if company_cues is None else company_cues
    for cue in cues:
        cue = str(cue).strip().lower()
        # Lookarounds instead of \b so cues that start or end with
        # punctuation still match as standalone words.
        if cue and re.search(rf"(?<!\w){re.escape(cue)}(?!\w)", n):
            return "company"
    return "individual"

def detect_parent(name: str) -> str:
    """Resolve a cleaned name to its parent entity.

    Walks ``PARENT_PATTERNS`` in order and returns the first parent that has
    a regex matching ``name`` (case-insensitive search). When no pattern
    matches, ``name`` itself is returned.
    """
    for parent_name, regexes in PARENT_PATTERNS.items():
        for regex in regexes:
            if re.search(regex, name, re.I):
                return parent_name
    return name

# --- 5. EXECUTION PIPELINE ---
def _split_names(cell) -> List[str]:
    """Split one semicolon-separated cell into raw names; missing cells yield []."""
    # str(NaN) is "nan", which would otherwise be cleaned and classified as
    # if it were a real owner name.
    if pd.isna(cell):
        return []
    return str(cell).split(";")


def _clean_names(cell) -> List[str]:
    """Clean every name in a cell, dropping entries that clean to nothing."""
    cleaned = (clean_token(part) for part in _split_names(cell))
    return [name for name in cleaned if name]


def _classify_names(cell) -> List[str]:
    """Classify every name in a cell, index-aligned with _clean_names(cell)."""
    categories = []
    for part in _split_names(cell):
        cleaned = clean_token(part)
        if not cleaned:
            continue
        category = classify_entity(cleaned)
        if category == "individual":
            # clean_token() strips legal-form words ("oy", "inc", ...), so
            # cues of that kind are only visible in the raw text.
            category = classify_entity(part)
        categories.append(category)
    return categories


def run_pipeline():
    """Read the input workbook, clean and classify names, and export the result.

    Reads ``SETTINGS["INPUT_FILE"]``, adds ``owners_clean`` /
    ``applicants_clean`` list columns for whichever of ``Owners`` and
    ``Applicants`` exist, adds ``category`` and ``parent_entity`` (one entry
    per cleaned owner, in the same order) and writes
    ``<OUTPUT_BASENAME>_<YYYYMMDD>.xlsx``. The stamp is the current date only,
    so a second run on the same day overwrites the earlier export.
    """
    df = pd.read_excel(SETTINGS["INPUT_FILE"])

    # Process Owners & Applicants
    for col in ("Owners", "Applicants"):
        if col in df.columns:
            df[f"{col.lower()}_clean"] = df[col].apply(_clean_names)

    # Apply Classification and Parent Detection; both need the owner column,
    # which is optional in the loop above.
    if "Owners" in df.columns:
        df["category"] = df["Owners"].apply(_classify_names)
        df["parent_entity"] = df["owners_clean"].apply(
            lambda names: [detect_parent(name) for name in names]
        )
    else:
        print("Warning: no 'Owners' column found; skipping classification and parent detection.")

    # Export under a date-stamped file name
    timestamp = datetime.datetime.now().strftime("%Y%m%d")
    df.to_excel(f"{SETTINGS['OUTPUT_BASENAME']}_{timestamp}.xlsx", index=False)
    print(f"Success: Processed {len(df)} rows.")

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    run_pipeline()