<a href="https://colab.research.google.com/github/apk-maker-bit/AppendixD/blob/source-code/Refined_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Patent Owner Cleaning & Harmonization Pipeline
Author: Anna-Pauliina Kokko
Purpose: Production-grade pipeline for Lens.org exports.
Logic decoupled from sensitive and confidential data.
"""

import os, json, datetime, unicodedata, re
import pandas as pd
from typing import List, Dict, Optional

# --- 1. SETTINGS & EXTERNAL CONFIG ---
SETTINGS = {
    # Excel workbook that run_pipeline() reads with pd.read_excel.
    "INPUT_FILE": "data/input_data.xlsx",
    # Prefix of the output workbook; run_pipeline() appends "_<YYYYMMDD>.xlsx".
    "OUTPUT_BASENAME": "Cleaned_Dataset",
    # NOTE(review): not referenced anywhere in the visible code — presumably a
    # 0-100 fuzzy-matching cutoff for a later harmonization step; confirm.
    "FUZZY_THRESHOLD": 87,
    # "broad" makes restore_diacritics() rewrite "oe" -> "ö" and "ae" -> "ä";
    # any other value leaves tokens untouched.
    "DIACRITIC_MODE": "broad",
}

def load_config(file_path: str, default_val):
    """Load an external configuration file, or fall back to a default.

    Args:
        file_path: Path to the configuration file. A path ending in ``.json``
            (case-insensitive) is parsed as UTF-8 JSON; anything else is read
            as CSV with ``pandas.read_csv``.
        default_val: Value returned unchanged when ``file_path`` does not
            point at an existing regular file.

    Returns:
        The parsed JSON object, a ``pandas.DataFrame`` for CSV input, or
        ``default_val`` when the file is absent.

    Raises:
        json.JSONDecodeError: If a ``.json`` file exists but is malformed.
        pandas.errors.ParserError: If a CSV file exists but cannot be parsed.
    """
    # isfile() rather than exists(): a directory with the same name must fall
    # back to the default instead of blowing up inside open().
    if not os.path.isfile(file_path):
        return default_val

    if file_path.lower().endswith(".json"):
        with open(file_path, "r", encoding="utf-8") as f:
            return json.load(f)

    # pandas opens the path itself, so no file handle is needed here.
    return pd.read_csv(file_path)

# Load externalized data files. Each call falls back to the supplied default
# when the file is not present (see load_config).
# NOTE(review): HARMONIZATION_RULES is not referenced in the visible code.
HARMONIZATION_RULES = load_config("data/harmonization_map.csv", pd.DataFrame())
# Expected to map a parent name to a list of regex patterns; detect_parent()
# iterates it with .items() and re.search.
PARENT_PATTERNS = load_config("data/parent_patterns.json", {})
# Cue strings classify_entity() looks for to label a name "company".
COMPANY_CUES = load_config("data/company_cues.json", ["oy", "ab", "inc", "ltd"])

# --- 2. TEXT NORMALIZATION ---
# Typographic punctuation folded to plain ASCII before NFKC normalization.
# Both members of each quote pair and both dash widths are covered so that
# e.g. an opening curly quote is not left behind while the closing one is fixed.
_PUNCT_TABLE = str.maketrans({
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark / apostrophe
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
})


def normalize_unicode(s: str) -> str:
    """Return ``s`` with typographic punctuation folded and NFKC applied.

    Curly quotes and en/em dashes are replaced with their ASCII equivalents,
    then the string is normalized to Unicode form NFKC (which folds
    compatibility characters such as ligatures and full-width letters).

    Args:
        s: Text to normalize. Any non-string value (``None``, NaN, numbers)
            is treated as missing.

    Returns:
        The normalized string, or ``""`` when ``s`` is not a ``str``.
    """
    if not isinstance(s, str):
        return ""
    return unicodedata.normalize("NFKC", s.translate(_PUNCT_TABLE))

def restore_diacritics(token: str) -> str:
    """Rewrite ASCII digraphs as Nordic vowels when broad mode is active.

    When ``SETTINGS["DIACRITIC_MODE"]`` is ``"broad"``, every occurrence of
    "oe" becomes "ö" and then every occurrence of "ae" becomes "ä". The
    substitution is unconditional (no dictionary or language check). In any
    other mode the token is returned unchanged.
    """
    if SETTINGS["DIACRITIC_MODE"] != "broad":
        return token

    # Applied in this order: "oe" first, then "ae".
    for digraph, vowel in (("oe", "ö"), ("ae", "ä")):
        token = re.sub(digraph, vowel, token)
    return token

# --- 3. CORE CLEANING ENGINE ---
def clean_token(token: str) -> Optional[str]:
    """Reduce one raw owner/applicant name to its comparable core.

    Steps, in order: Unicode normalization and lowercasing, removal of
    parenthesised fragments, removal of stand-alone legal-form suffixes
    (oy, ab, gmbh, ltd, inc, ...), whitespace collapsing, trimming of
    punctuation left dangling at either end, and diacritic restoration.

    Args:
        token: A single raw name (one element of a ';'-separated cell).

    Returns:
        The cleaned name, or ``None`` when the input is empty or nothing
        meaningful remains after cleaning.
    """
    if not token:
        return None

    x = normalize_unicode(token).lower()
    x = re.sub(r"\([^)]*\)", " ", x)

    # Text is already lowercased, so no IGNORECASE flag is needed.
    suffix_pattern = r"\b(oy|oyj|ab|abp|gmbh|ltd|inc|plc|corp|sa|spa|as|aps)\b"
    x = re.sub(suffix_pattern, " ", x)

    # Removing a suffix from "nokia, inc." leaves "nokia, ." behind; trim
    # separators and whitespace from both ends so the result is "nokia".
    x = re.sub(r"\s+", " ", x).strip(" ,.;:-")

    return restore_diacritics(x) or None

# --- 4. CLASSIFICATION & HIERARCHY ---
def classify_entity(name: str, company_cues: Optional[List[str]] = None) -> str:
    """Classify a name as academia, nonprofit, company or individual.

    Academic and nonprofit cues are long, unambiguous words and are matched
    as substrings (so plurals such as "institutes" still hit). Company cues
    are typically short legal forms ("oy", "ab", "inc"), so they must match
    as a whole word or phrase; otherwise "ab" would fire on "abraham" and
    "oy" on "troy".

    Args:
        name: The entity name to classify. Matching is case-insensitive.
        company_cues: Cue words/phrases that mark a company. Defaults to the
            module-level ``COMPANY_CUES`` when omitted. Blank cues are ignored.

    Returns:
        One of ``"academia"``, ``"nonprofit"``, ``"company"`` or
        ``"individual"`` (the fallback when no cue matches).
    """
    n = name.lower()
    if any(cue in n for cue in ("university", "college", "institute")):
        return "academia"
    if any(cue in n for cue in ("foundation", "stiftung", "säätiö")):
        return "nonprofit"

    cues = COMPANY_CUES if company_cues is None else company_cues
    for cue in cues:
        cue = str(cue).strip().lower()
        if not cue:
            # An empty cue is a substring of everything; never let it match.
            continue
        # Lookarounds instead of \b so cues that start or end with
        # punctuation (e.g. "s.a.") are still bounded correctly.
        if re.search(rf"(?<!\w){re.escape(cue)}(?!\w)", n):
            return "company"
    return "individual"

def detect_parent(name: str) -> str:
    """Map a name to its parent entity using ``PARENT_PATTERNS``.

    ``PARENT_PATTERNS`` is walked in its own iteration order; the first parent
    with at least one pattern that matches ``name`` (case-insensitive
    ``re.search``) wins. When nothing matches, ``name`` is returned as is.
    """
    matching_parents = (
        parent
        for parent, patterns in PARENT_PATTERNS.items()
        if any(re.search(p, name, re.I) for p in patterns)
    )
    # Lazy generator: evaluation stops at the first hit.
    return next(matching_parents, name)

def _clean_pairs(cell) -> List[tuple]:
    """Split one ';'-separated cell into ``(raw, cleaned)`` pairs, dropping empty names."""
    if not isinstance(cell, str):
        if pd.isna(cell):
            # Blank Excel cells arrive as NaN; str(NaN) would otherwise
            # produce a bogus owner called "nan".
            return []
        cell = str(cell)

    pairs = []
    for raw in cell.split(";"):
        cleaned = clean_token(raw)
        if cleaned:
            pairs.append((raw.strip(), cleaned))
    return pairs


def run_pipeline():
    """Read the input workbook, clean and classify owners, write the result.

    For each of the "Owners" and "Applicants" columns that is present, a
    ``<column>_clean`` column holding the list of cleaned names is added.
    When "Owners" is present, ``category`` and ``parent_entity`` lists are
    added as well, index-aligned with ``owners_clean``. The frame is written
    to ``<OUTPUT_BASENAME>_<YYYYMMDD>.xlsx`` in the working directory.

    A missing input file is reported on stdout and the function returns
    without writing anything.
    """
    try:
        df = pd.read_excel(SETTINGS["INPUT_FILE"])
    except FileNotFoundError:
        print(f"Error: Could not find {SETTINGS['INPUT_FILE']}. Please ensure data is in the /data folder.")
        return

    for col in ["Owners", "Applicants"]:
        if col not in df.columns:
            continue

        pairs = df[col].apply(_clean_pairs)
        df[f"{col.lower()}_clean"] = pairs.apply(lambda ps: [cleaned for _, cleaned in ps])

        if col == "Owners":
            # Classify on the normalized name *before* suffix stripping:
            # clean_token() removes legal forms such as "oy"/"ab"/"inc"/"ltd",
            # which are exactly the cues that identify a company.
            df["category"] = pairs.apply(
                lambda ps: [
                    classify_entity(restore_diacritics(normalize_unicode(raw).lower()))
                    for raw, _ in ps
                ]
            )
            df["parent_entity"] = pairs.apply(
                lambda ps: [detect_parent(cleaned) for _, cleaned in ps]
            )

    if "Owners" not in df.columns:
        print("Warning: no 'Owners' column found; skipping category and parent_entity.")

    timestamp = datetime.datetime.now().strftime("%Y%m%d")
    df.to_excel(f"{SETTINGS['OUTPUT_BASENAME']}_{timestamp}.xlsx", index=False)
    print(f"Success: Processed {len(df)} rows.")

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    run_pipeline()