# Adaptive Fuzzy University Clustering Demo
This notebook illustrates how to use the `adaptive_fuzzy` package on a small synthetic dataset.

In [22]:
import sys
from pathlib import Path
sys.path.append(str(Path('..').resolve()))

import pandas as pd
from adaptive_fuzzy import (
    LabelledExample,
    PairCandidate,
    build_clusters,
    clusters_to_frame,
    compute_features,
    fit_classifier,
    generate_pair_candidates,
    score_candidates,
)
from adaptive_fuzzy.cli import DEFAULT_ARCHIVE_PATH, save_labelled_examples


In [23]:
# Demo label archive; the path is relative to the current working directory.
DEMO_ARCHIVE_PATH = Path('notebooks', 'demo_label_history.csv')
# Remove any archive left over from an earlier run so this run starts with no saved labels.
if DEMO_ARCHIVE_PATH.exists():
    DEMO_ARCHIVE_PATH.unlink()
print(f'Demo label archive: {DEMO_ARCHIVE_PATH.resolve()}')


Demo label archive: /home/yk0581/h1bworkers/code/notebooks/demo_label_history.csv


## Create a toy dataset
We assemble a small list of university names that includes a few aliases and obvious duplicates.


In [24]:
# Each inner tuple holds different spellings of one institution; the groups are
# flattened, in order, into the single list of raw names used by later cells.
names = [
    spelling
    for institution_spellings in (
        (
            'University of California Los Angeles',
            'UCLA',
            'University of California, Los Angeles',
        ),
        ('Massachusetts Institute of Technology', 'MIT'),
        ('California Institute of Technology', 'Caltech'),
    )
    for spelling in institution_spellings
]
names


## Generate pair candidates
The candidate list ranks potential matches using a baseline RapidFuzz score (higher implies more similar).


In [25]:
# Build the list of candidate name pairs from the raw names.
candidates = generate_pair_candidates(names)
# Show how many pairs were produced, together with the first entry of the list.
(len(candidates), candidates[0])


(1411,
 PairCandidate(name_a='Kyung Hee University', name_b='KyungHee University', score=97.43589743589743))

## Collect human labels interactively
Enter `y` (match), `n` (not a match), `u` (unsure/skip), or `q` (quit) to respond. Each `y`/`n` answer is saved to the demo archive defined above; skipped pairs are not recorded.


In [28]:
def prompt_label(candidate):
    """Ask the user whether two university names refer to the same institution.

    Parameters
    ----------
    candidate
        Object exposing ``name_a`` and ``name_b`` attributes; only these two
        attributes are read.

    Returns
    -------
    int or None
        ``1`` for a match, ``0`` for a non-match, ``None`` when the user is
        unsure or skips (an empty answer counts as a skip), and ``-1`` when the
        user asks to quit or the input stream is closed.
    """
    prompt = (
        f'Match these universities?\n  A) {candidate.name_a}\n  B) {candidate.name_b}\n'
        'Enter y (match), n (not a match), u (unsure/skip), or q (quit): '
    )
    while True:
        try:
            answer = input(prompt).strip().lower()
        except EOFError:
            # A closed input stream can never produce a valid answer, so treat
            # it like an explicit quit instead of surfacing a raw traceback.
            return -1
        if answer in {'y', 'yes', '1'}:
            return 1
        if answer in {'n', 'no', '0'}:
            return 0
        # The prompt advertises "unsure/skip", so both words must be accepted.
        if answer in {'u', 'unsure', 'skip', ''}:
            return None
        if answer in {'q', 'quit', 'exit'}:
            return -1
        print('Please respond with y, n, u, or q.')

def collect_labels(candidates, limit=10, archive_path=DEMO_ARCHIVE_PATH):
    """Prompt for labels on the first ``limit`` candidates.

    Every pair in ``candidates[:limit]`` is shown to the user via
    ``prompt_label``. A match / non-match answer is stored as a
    ``LabelledExample`` keyed by ``(name_a, name_b)`` and the whole collection
    is passed to ``save_labelled_examples`` straight away. Skipped pairs are
    ignored, and a quit answer ends the session early.

    Returns the dict of labels gathered so far (possibly empty).
    """
    gathered = {}
    for pair in candidates[:limit]:
        verdict = prompt_label(pair)
        if verdict == -1:
            print('Stopping label collection.')
            break
        if verdict is not None:
            key = (pair.name_a, pair.name_b)
            gathered[key] = LabelledExample(compute_features(*key), verdict)
            # Persist after every answer so an interrupted session keeps its labels.
            save_labelled_examples(gathered, archive_path)
    return gathered

def iterative_labelling_loop(
    candidates,
    initial_labels=5,
    batch_size=5,
    max_iterations=3,
    convergence_threshold=0.01,
    archive_path=DEMO_ARCHIVE_PATH,
):
    """Alternate between collecting human labels and refitting the classifier.

    The loop first gathers seed labels with ``collect_labels`` and fits an
    initial model. In each iteration it then shows the user the pairs whose
    scored probability is closest to 0.5, refits on the enlarged label set, and
    stops early once the largest probability change across pairs scored in both
    rounds is at most ``convergence_threshold``.

    Parameters
    ----------
    candidates
        Sequence of objects with ``name_a`` / ``name_b`` attributes.
    initial_labels
        Number of leading candidates offered for the seed labelling pass.
    batch_size
        Maximum number of pairs shown to the user per iteration.
    max_iterations
        Upper bound on the number of review rounds.
    convergence_threshold
        Stop when the maximum absolute probability change is at or below this.
    archive_path
        Destination handed to ``save_labelled_examples`` after each new label.

    Returns
    -------
    tuple
        ``(model, labelled)`` where ``labelled`` maps ``(name_a, name_b)`` to a
        ``LabelledExample``. The model is always fitted on every label present
        in ``labelled``, including when the user quits part-way through a batch.

    Raises
    ------
    RuntimeError
        If the seed labelling pass yields no labels.
    """
    labelled = collect_labels(candidates, limit=initial_labels, archive_path=archive_path)
    if not labelled:
        raise RuntimeError('No initial labels collected.')
    model = fit_classifier(labelled.values())

    def _score_pairs(current_model):
        # NOTE(review): the third argument is presumably the set of pairs that
        # score_candidates should leave out - confirm against the package.
        return {
            (cand.name_a, cand.name_b): prob
            for cand, prob in score_candidates(current_model, candidates, labelled.keys())
        }

    prev_scores = _score_pairs(model)
    # Pairs the user marked "unsure"; they are not offered again in later rounds.
    skipped = set()

    for iteration in range(1, max_iterations + 1):
        reviewable = [item for item in prev_scores.items() if item[0] not in skipped]
        if not reviewable:
            print('No more unlabelled candidates to review.')
            break
        # Most uncertain first: smallest distance between probability and 0.5.
        reviewable.sort(key=lambda item: abs(0.5 - item[1]))
        to_inspect = reviewable[:batch_size]
        print(f'Iteration {iteration}: reviewing {len(to_inspect)} pairs')
        any_new = False
        for (name_a, name_b), probability in to_inspect:
            # The model probability is passed as PairCandidate's third field;
            # prompt_label only reads the two names.
            response = prompt_label(PairCandidate(name_a, name_b, probability))
            if response == -1:
                print('Stopping labelling loop.')
                if any_new:
                    # Refit so the returned model reflects the labels gathered
                    # in this unfinished batch.
                    model = fit_classifier(labelled.values())
                return model, labelled
            if response is None:
                skipped.add((name_a, name_b))
                continue
            features = compute_features(name_a, name_b)
            labelled[(name_a, name_b)] = LabelledExample(features, response)
            save_labelled_examples(labelled, archive_path)
            any_new = True

        if not any_new:
            print('No new labels collected; ending loop.')
            break

        model = fit_classifier(labelled.values())
        current_scores = _score_pairs(model)
        # Compare only pairs that were scored both before and after the refit.
        shared = prev_scores.keys() & current_scores.keys()
        if shared:
            max_delta = max(abs(current_scores[k] - prev_scores[k]) for k in shared)
            print(f'Max probability change this iteration: {max_delta:.4f}')
            if max_delta <= convergence_threshold:
                print('Convergence threshold reached; stopping iterative labelling.')
                break
        prev_scores = current_scores

    return model, labelled


## Run the adaptive labelling loop
Apply the iterative process that repeatedly proposes uncertain pairs until convergence or the maximum number of iterations is reached.


In [29]:
# Collect 5 seed labels, then run at most 3 review rounds of up to 5 pairs each,
# stopping early once probabilities move by no more than 0.01 between rounds.
model, labelled = iterative_labelling_loop(
    candidates, initial_labels=5, batch_size=5, max_iterations=3, convergence_threshold=0.01
)
# Number of labelled pairs gathered in this session.
len(labelled)


Iteration 1: reviewing 5 pairs
Max probability change this iteration: 0.0000
Convergence threshold reached; stopping iterative labelling.


10

In [30]:
# Load the persisted labels when the archive file exists. The frame is evaluated
# as the cell's final top-level expression because Jupyter only renders that one;
# an expression nested inside an ``if`` block produces no output.
archive_df = pd.read_csv(DEMO_ARCHIVE_PATH) if DEMO_ARCHIVE_PATH.exists() else None
if archive_df is None:
    print('No labels have been persisted yet.')
archive_df


## Train a probabilistic matcher
We fit a random forest classifier on the collected examples.


In [31]:
# Fail fast when there is nothing to train on.
if not labelled:
    raise RuntimeError('No labels collected – rerun the previous cell to gather labels.')

# Refit on every collected label. The model is evaluated as the cell's final
# top-level expression so Jupyter renders it; inside an ``if`` block it would not.
model = fit_classifier(labelled.values())
model


## Build clusters
Using the learned model we connect pairs with predicted probability above 0.6 and read off clusters.


In [None]:
# Group the names using the fitted model with a 0.6 probability threshold
# (per the notebook text, pairs above the threshold are connected).
clusters = build_clusters(names, model, candidates, threshold=0.6)
# Convert the clusters into a table and show it as the cell output.
cluster_frame = clusters_to_frame(clusters)
cluster_frame


Unnamed: 0,cluster_root,university_name
0,Ajou University,Ajou University
1,Asia Pacific International School,Asia Pacific International School
2,Baewha Women's University,Baewha Women's University
3,Catholic University of Daegu,Catholic University of Daegu
4,Chonbuk National University,Chonbuk National University
...,...,...
171,한동대학교,한동대학교
172,한림대학교(Hallym University),한림대학교(Hallym University)
173,한성대학교,한성대학교
174,한양대학교,한양대학교


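The result groups aliases together while leaving unrelated institutions in their own clusters.

**Note:** the outputs saved in this notebook (1,411 candidate pairs, Korean university names) do not match the seven-name toy list and appear to come from a run on the real dataset described below. Restart the kernel and run all cells to regenerate outputs for the toy data.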


## Optional: real dataset snippet
```python
import duckdb as ddb
from config import root
con = ddb.connect()
x = con.read_parquet(f"{root}/data/int/wrds_users_sep2.parquet")
df = con.sql("SELECT * FROM x WHERE university_country = 'South Korea'").to_df()
names = df.sample(1000)['university_raw'].unique().tolist()
```
