In [1]:
# Direct keyword matching to find ECLI cases (no clustering)

import pandas as pd
import numpy as np

# Read the ECLI export; `df` is simply a second name for the same DataFrame
df_ecli = pd.read_excel("DATA ecli_nummers juni 2025 v1 (version 1).xlsx", engine="openpyxl")
df = df_ecli

# Pick the text column: the first well-known column name that exists wins
text_candidates = ["text", "tekst", "ecli_text", "ecli_tekst", "body", "content", "uitspraak", "summary", "samenvatting"]
text_col = next((name for name in text_candidates if name in df.columns), None)

if text_col is None:
    # No known name present: fall back to the object-dtype column whose
    # values are, on average, the longest once converted to strings
    obj_cols = [name for name in df.columns if df[name].dtype == "object"]
    if not obj_cols:
        raise ValueError("No text-like (object) columns found.")
    lens = {}
    for name in obj_cols:
        char_counts = df[name].astype(str).str.len()
        lens[name] = char_counts.replace([np.inf, -np.inf], np.nan).fillna(0).mean()
    text_col = max(lens, key=lens.get)

print(f"Using text column: {text_col}")
print(f"Total cases: {len(df)}\n")

# Keyword lists per domain: nine domains with keywords plus one catch-all.
# Keys are the domain labels written to df["domain"]; values are the keywords
# looked up in each case text.
#
# Things to keep in mind when editing these lists:
# - count_keywords_in_text matches with a plain substring test, so a short
#   keyword also hits inside longer words (e.g. "bouw" inside "bouwen",
#   "vergunning" inside "omgevingsvergunning") and each listed variant
#   adds one to the score.
# - A few keywords are listed under more than one domain ("horeca" and
#   "evenement" under 4 and 5, "afvalstoffenheffing" under 2 and 7,
#   "openbare ruimte" under 2 and 9); a hit on those counts for both domains.
# - Only the first five matched keywords (in list order) are stored per case,
#   so the order inside a list affects what is shown.
domain_keywords = {
    "1. Verkeer & vervoer": [
        "fiets", "fietsenstalling", "parkeerbelasting", "parkeervergunning", "verkeersbesluit",
        "fietsparkeren", "weesfiets", "weesfietsen", "parkeren", "verkeersboete", "verkeersboetes",
        "verkeersbord", "verkeersborden", "afsluiting", "afsluitingen", "verkeer", "vervoer",
        "voertuig", "auto", "cbr", "geparkeerd", "weggedeelten", "kantonrechter"
    ],
    "2. Afval & openbare ruimte": [
        "afval", "container", "huisvuil", "bijplaatsing", "reiniging", "grofvuil",
        "afvalcontainer", "afvalstoffenverordening", "afvalstoffenheffing", "afvalstoffen",
        "openbare ruimte", "schoonmaak", "vuilnis", "afvalzak", "afvalzakken"
    ],
    "3. Openbare orde & handhaving": [
        "handhaving", "bestuursdwang", "dwangsom", "apv", "overtreding", "overtreder",
        "bestuurlijke boete", "last", "handhavend", "optreden", "treden", "aangetroffen"
    ],
    "4. Vergunningen & bouwen (Omgevingsrecht)": [
        "omgevingsvergunning", "bestemmingsplan", "bouw", "monument", "bouwen", "verbouwen",
        "bouwplan", "bouwvergunning", "perceel", "terras", "exploitatievergunning", "inrichting",
        "horeca", "dakterras", "vergunning", "vergunninghouder", "plangebied", "ruimtelijke",
        "bestemming", "bomen", "stichting", "evenement"
    ],
    "5. Geluid & overlast": [
        "geluid", "overlast", "horeca", "evenement", "geluidsoverlast", "horeca-overlast",
        "geluidshinder", "lawaai", "geluidsnorm", "geluidsnormen"
    ],
    "6. Sociale voorzieningen & subsidies": [
        "bijstand", "wmo", "subsidie", "voorziening", "bijzondere bijstand", "subsidies",
        "voorzieningen", "sociale", "uitkering", "bijstandsverlening"
    ],
    "7. Belastingen & heffingen": [
        "belasting", "heffing", "leges", "aanslag", "belastingdienst", "toeslagen",
        "precario", "afvalstoffenheffing", "gemeentelijke belasting", "heffingsambtenaar",
        "belastingen", "heffingen", "belastingrecht", "belastingwet", "belastingwetten",
        "inkomstenbelasting", "omzetbelasting", "btw", "loonbelasting", "vpb"
    ],
    "8. Wonen & huisvesting": [
        "woning", "huisvesting", "urgent", "huur", "woningtoewijzing", "huisvestingsvergunning",
        "kamerverhuur", "urgentieverklaring", "woonruimte", "huisvestingsverordening",
        "huisvestingswet", "bewoning", "toeristen", "medische", "kinderen", "ggd",
        "huisvestingsprobleem", "hardheidsclausule"
    ],
    "9. Openbare voorzieningen & infrastructuur": [
        "openbare ruimte", "weg", "groen", "infrastructuur", "wegen", "verlichting",
        "groenbeheer", "openbaar", "openbare voorziening", "openbare voorzieningen",
        "straatverlichting", "openbaar groen", "openbaar gebied", "openbare weg",
        "riolering", "riool", "waterleiding", "nutsvoorziening", "nutsvoorzieningen",
        "openbare werken", "wegenbeheer", "wegbeheer", "openbare faciliteit", "openbare faciliteiten"
    ],
    "10. Overig / gemengd": []  # Catch-all (other/mixed): no keywords; assign_domain skips it when scoring
}

def count_keywords_in_text(text, keywords, whole_word=False):
    """Count how many of the given keywords occur in the text (case-insensitive).

    Parameters
    ----------
    text : any
        The text to search. It is converted with ``str()`` first; ``None``
        and float NaN are treated as empty text so that a missing value is
        not searched as the literal string "none" / "nan".
    keywords : iterable of str
        Keywords or phrases to look for. Each keyword counts at most once,
        no matter how often it occurs in the text.
    whole_word : bool, default False
        ``False`` keeps the original substring test, so "last" is also found
        inside "belasting". ``True`` only accepts a hit that is not directly
        preceded or followed by a word character.

    Returns
    -------
    tuple (int, list of str)
        The number of keywords found and the keywords themselves, in the
        order in which they appear in ``keywords``.
    """
    import re

    # Missing values: nothing to match (NaN is the only float unequal to itself)
    if text is None or (isinstance(text, float) and text != text):
        return 0, []

    text_lower = str(text).lower()
    matched_keywords = []
    for keyword in keywords:
        needle = keyword.lower()
        if whole_word:
            # Lookarounds instead of \b so phrases and hyphenated keywords work too
            pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
            found = re.search(pattern, text_lower) is not None
        else:
            found = needle in text_lower
        if found:
            matched_keywords.append(keyword)
    return len(matched_keywords), matched_keywords

def assign_domain(text, domain_keywords, min_matches=2):
    """
    Pick a domain for a text by counting keyword hits per domain.

    Returns a tuple (domain_name, score, matched_keywords).
    The catch-all "10. Overig / gemengd" is returned with an empty keyword
    list when nothing matches, when the best score is below min_matches,
    when domain 9 leads on weak evidence only, or when the top two scores
    are tied or within one of each other among several leading domains.
    """
    fallback_domain = "10. Overig / gemengd"
    infra_domain = "9. Openbare voorzieningen & infrastructuur"

    # Score every real domain; the catch-all has no keywords of its own
    scores = {}
    all_matches = {}
    for name, words in domain_keywords.items():
        if name != fallback_domain:
            scores[name], all_matches[name] = count_keywords_in_text(text, words)

    best = max(scores.values(), default=0)
    if best == 0:
        return fallback_domain, 0, []

    leaders = [name for name, hits in scores.items() if hits == best]

    # Domain 9 is not trusted when its only evidence is the phrase
    # "openbare ruimte" on its own, or just "weg" and/or "openbaar"
    if infra_domain in leaders:
        infra_hits = all_matches[infra_domain]
        infra_lower = {kw.lower() for kw in infra_hits}
        n_hits = len(infra_hits)
        has_weg = "weg" in infra_lower
        has_openbaar = "openbaar" in infra_lower

        weak_evidence = (
            (n_hits == 1 and "openbare ruimte" in infra_hits and best < 2)
            or (n_hits == 2 and has_weg and has_openbaar)
            or (n_hits == 1 and (has_weg or has_openbaar))
        )

        if weak_evidence:
            # Hand over to the remaining domains, provided one of them
            # reaches the threshold; otherwise give up on this text
            rest = {name: hits for name, hits in scores.items() if name != infra_domain}
            if not rest or max(rest.values()) < min_matches:
                return fallback_domain, best, []
            best = max(rest.values())
            leaders = [name for name, hits in rest.items() if hits == best]

    # Too few hits to be convincing
    if best < min_matches:
        return fallback_domain, best, []

    # A single leader wins outright
    if len(leaders) == 1:
        winner = leaders[0]
        return winner, best, all_matches[winner]

    # Several leaders: compare the two highest scores over all scored domains
    ranked = sorted(scores.values(), reverse=True)
    if len(ranked) > 1 and ranked[0] - ranked[1] <= 1:
        return fallback_domain, best, []

    # Otherwise take the first leader in dictionary order
    return leaders[0], best, all_matches[leaders[0]]

# Assign domain to each case
print("=" * 80)
print("ASSIGNING DOMAINS USING KEYWORD MATCHING")
print("=" * 80)
print("\nProcessing...")

# Each element of `results` is the (domain, score, matched_keywords) tuple
# returned by assign_domain for one case text
results = df[text_col].apply(
    lambda x: assign_domain(x, domain_keywords, min_matches=2)
)

# Spread the tuple over three columns
df["domain"] = results.apply(lambda x: x[0])
df["domain_score"] = results.apply(lambda x: x[1])
df["domain_matched_keywords"] = results.apply(lambda x: ", ".join(x[2][:5]) if x[2] else "")  # Save only first 5 matched keywords

# Display results
print("\n" + "=" * 80)
print("DOMAIN DISTRIBUTION")
print("=" * 80)
domain_counts = df["domain"].value_counts()
print(f"\n{'Domain':<50} {'Cases':<10} {'Share':<10}")
print("-" * 80)
for domain, count in domain_counts.items():
    print(f"{domain:<50} {count:<10} {count/len(df)*100:.2f}%")

# Display detailed information for each domain, in the order of domain_keywords
print("\n" + "=" * 80)
print("DETAILED DOMAIN INFORMATION")
print("=" * 80)

for domain_name in domain_keywords:
    domain_cases = df[df["domain"] == domain_name]
    if len(domain_cases) == 0:
        continue

    print(f"\n{domain_name}:")
    print(f"  Cases: {len(domain_cases)} ({len(domain_cases)/len(df)*100:.2f}%)")
    print(f"  Avg keyword matches: {domain_cases['domain_score'].mean():.2f}")
    print(f"  Min matches: {domain_cases['domain_score'].min()}, Max matches: {domain_cases['domain_score'].max()}")

    # Show some example cases
    print(f"  Example cases (first 3):")
    for idx in domain_cases.index[:3]:
        # str() first: an empty (NaN) text cell is a float and cannot be sliced;
        # such rows end up in the catch-all domain, which is part of this loop
        text_snippet = str(df.loc[idx, text_col])[:200].replace("\n", " ")
        print(f"    - {text_snippet}...")

print("\n" + "=" * 80)
print("Done! Results saved to df['domain'] column")
print("=" * 80)

Using text column: ecli_tekst
Total cases: 2447

ASSIGNING DOMAINS USING KEYWORD MATCHING

Processing...

DOMAIN DISTRIBUTION

Domain                                             Cases      Share     
--------------------------------------------------------------------------------
4. Vergunningen & bouwen (Omgevingsrecht)          1070       43.73%
3. Openbare orde & handhaving                      460        18.80%
10. Overig / gemengd                               348        14.22%
8. Wonen & huisvesting                             241        9.85%
9. Openbare voorzieningen & infrastructuur         138        5.64%
1. Verkeer & vervoer                               68         2.78%
6. Sociale voorzieningen & subsidies               54         2.21%
7. Belastingen & heffingen                         53         2.17%
2. Afval & openbare ruimte                         11         0.45%
5. Geluid & overlast                               4          0.16%

DETAILED DOMAIN INFORMATION

1. Ver

In [2]:
# List the least matching cases in each domain

print("=" * 80)
print("LEAST MATCHING CASES IN EACH DOMAIN (LOWEST SCORES)")
print("=" * 80)

# For each domain, find cases with lowest matching scores
# NOTE: this is a plain string sort, so "10. ..." is listed right after "1. ..."
for domain_name in sorted(df["domain"].unique()):
    domain_cases = df[df["domain"] == domain_name].copy()

    if len(domain_cases) == 0:
        continue

    # Sort by matching score, find lowest scoring cases
    domain_cases_sorted = domain_cases.sort_values("domain_score", ascending=True)

    # Find lowest scoring cases (may have multiple)
    min_score = domain_cases_sorted["domain_score"].iloc[0]
    least_matching = domain_cases_sorted[domain_cases_sorted["domain_score"] == min_score]

    print(f"\n{'='*80}")
    print(f"{domain_name}")
    print(f"{'='*80}")
    print(f"Total cases: {len(domain_cases)}")
    print(f"Lowest match score: {min_score}")
    print(f"Number of cases with lowest score: {len(least_matching)}")
    print(f"Average match score: {domain_cases['domain_score'].mean():.2f}")
    print(f"Highest match score: {domain_cases['domain_score'].max()}")

    # Display least matching cases (max 5)
    print(f"\nLeast matching cases (top {min(5, len(least_matching))}):")
    print("-" * 80)

    for idx, (row_idx, row) in enumerate(least_matching.head(5).iterrows()):
        print(f"\nCase {idx+1} (index: {row_idx}, match score: {row['domain_score']}):")
        # The keyword column holds "" (not NaN) when nothing matched, so an
        # empty string must trigger the 'None' placeholder as well
        matched = row['domain_matched_keywords']
        print(f"  Matched keywords: {matched if pd.notna(matched) and matched != '' else 'None'}")

        # Display text snippet (first 500 chars)
        text_snippet = str(row[text_col])[:500].replace("\n", " ")
        print(f"  Text snippet (first 500 chars):")
        print(f"  {text_snippet}...")

        # If ECLI number exists, also display it
        if 'ecli' in df.columns:
            print(f"  ECLI: {row.get('ecli', 'N/A')}")
        elif 'ECLI' in df.columns:
            print(f"  ECLI: {row.get('ECLI', 'N/A')}")

print("\n" + "=" * 80)
print("Done!")
print("=" * 80)

LEAST MATCHING CASES IN EACH DOMAIN (LOWEST SCORES)

1. Verkeer & vervoer
Total cases: 68
Lowest match score: 2
Number of cases with lowest score: 3
Average match score: 6.10
Highest match score: 10

Least matching cases (top 3):
--------------------------------------------------------------------------------

Case 1 (index: 219, match score: 2):
  Matched keywords: afsluiting, verkeer
  Text snippet (first 500 chars):
   AB 2005, 214 met annotatie van C.M. Bitter O&A 2004, 76     http://deeplink.rechtspraak.nl/uitspraak?id=ECLI:NL:RVS:2004:AQ1051 text/html public 2013-04-04T21:13:12 2004-07-14 Raad voor de Rechtspraak nl ECLI:NL:RVS:2004:AQ1051 Raad van State , 14-07-2004 / 200306296/1     Bij besluit van 5 december 2001 heeft het college van burgemeester en wethouders van Haarlem (hierna: het college) geweigerd appellante compensatie te verlenen voor de terugval in resultaat van de door haar gedreven ondernemi...

Case 2 (index: 1982, match score: 2):
  Matched keywords: verkeer, aut

In [None]:
# Classify a new advice letter or any Dutch text
# This function can be used to classify new incoming documents

def classify_text(text, domain_keywords, min_matches=2):
    """
    Classify one text (for example an advice letter) and report every domain's score.

    Parameters:
    -----------
    text : str
        The text to classify
    domain_keywords : dict
        Mapping of domain name to its list of keywords
    min_matches : int
        Minimum number of keyword matches required (default: 2);
        passed straight through to assign_domain

    Returns:
    --------
    dict with keys:
        'domain'           - domain chosen by assign_domain
        'score'            - score returned by assign_domain
        'matched_keywords' - keywords returned by assign_domain
        'all_scores'       - keyword count per domain (catch-all excluded)
    """
    domain, score, matched = assign_domain(text, domain_keywords, min_matches)

    # Per-domain counts for transparency; the catch-all has no keywords to count
    all_scores = {
        name: count_keywords_in_text(text, words)[0]
        for name, words in domain_keywords.items()
        if name != "10. Overig / gemengd"
    }

    return {
        'domain': domain,
        'score': score,
        'matched_keywords': matched,
        'all_scores': all_scores,
    }

# Demo: run the classifier on one sample text and print the outcome
banner = "=" * 80
print(banner)
print("CLASSIFY NEW ADVICE LETTER OR TEXT")
print(banner)
print("\nExample: Classify a sample text")
print("-" * 80)

# Sample text; swap in the content of a real advice letter
example_text = """
Bij besluit van 15 maart 2024 heeft het college van burgemeester en wethouders 
een aanvraag om een omgevingsvergunning voor het bouwen van een uitbreiding 
aan de woning afgewezen. Het bestemmingsplan staat deze bouw niet toe.
"""

result = classify_text(example_text, domain_keywords, min_matches=2)

print("\nText to classify:")
print(f"{example_text[:200]}...")
print("\nClassification result:")
print(f"  Domain: {result['domain']}")
print(f"  Match score: {result['score']}")
print(f"  Matched keywords: {', '.join(result['matched_keywords'][:10])}")

# Only domains with at least one hit are listed, highest count first
print("\nAll domain scores:")
scored_domains = [(name, hits) for name, hits in result['all_scores'].items() if hits > 0]
for name, hits in sorted(scored_domains, key=lambda pair: pair[1], reverse=True):
    print(f"  {name}: {hits}")

print(f"\n{banner}")
print("To classify a new advice letter, use:")
print("  result = classify_text(your_text, domain_keywords)")
print(banner)