In [12]:
import os, re, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from skimage import io

# ---------------- parsing + sampling ----------------
def _parse_from_path(p):
    # .../slide001_core003/512/40/0_1024.png -> (slide, core, size, mag, x, y)
    parts = p.strip().split('/')
    core = parts[-4]                               # e.g., slide001_core003
    slide = core.split('_')[0]                     # e.g., slide001
    size  = int(parts[-3])
    mag   = int(parts[-2])
    xy    = os.path.splitext(parts[-1])[0]
    m = re.match(r'^(\d+)_(\d+)$', xy)
    if not m:
        raise ValueError(f"Bad xy in filename: {p}")
    x, y = int(m.group(1)), int(m.group(2))
    return slide, core, size, mag, x, y

def find_far_same_label_pairs(
    csv_path,
    magnifications=None,     # e.g., [40]; None = all
    patch_sizes=None,        # e.g., [512]; None = all
    min_steps=6,             # Euclidean distance in "patch steps" (>= this)
    samples_per_label=8,
    same_core=True,
    rng_seed=42,
):
    """Sample pairs of same-label patches that lie far apart on the patch grid.

    The CSV is read with ``pd.read_csv`` and must provide a ``path`` column
    (parsed by ``_parse_from_path``) and a ``hard_label`` column. Rows are
    grouped by magnification, patch size and label (and by core when
    ``same_core`` is true). Within each group, random index pairs are drawn and
    kept when their Euclidean distance, measured in grid steps
    (``coordinate // size``), is at least ``min_steps``.

    Parameters
    ----------
    csv_path : str or file-like
        Anything accepted by ``pd.read_csv``.
    magnifications, patch_sizes : iterable of int or None
        Keep only rows whose magnification / patch size is listed; ``None``
        keeps everything.
    min_steps : float
        Minimum distance between the two patches, in patch steps.
    samples_per_label : int
        Maximum number of distinct pairs returned per group; drawing stops
        after ``samples_per_label * 150`` attempts or once every possible pair
        has been examined.
    same_core : bool
        When true, both patches of a pair come from the same core. When false,
        pairs may span cores; ``slide``/``core`` then describe the first patch
        only and the distance is still computed from the raw coordinates.
    rng_seed : int
        Seed for the pair sampling; the same seed and input give the same pairs.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns ``label, slide, core, mag, size, x1, y1,
        path1, x2, y2, path2, dx, dy, steps, dist_px``, sorted by label, mag,
        size, slide, core and descending ``steps``. The columns are present
        even when no pair was found.
    """
    # Sampling uses a local NumPy generator so that `rng_seed` really controls
    # the result; the original seeded only the stdlib generator while drawing
    # from the unseeded global NumPy one. The stdlib seed is kept because
    # callers may rely on that side effect.
    random.seed(rng_seed)
    rng = np.random.default_rng(rng_seed)

    df = pd.read_csv(csv_path)

    parsed_cols = ['slide', 'core', 'size', 'mag', 'x', 'y']
    parsed = df['path'].apply(_parse_from_path)
    # Naming the columns keeps this assignment valid when the CSV has no rows.
    df[parsed_cols] = pd.DataFrame(parsed.tolist(), index=df.index, columns=parsed_cols)

    if magnifications is not None: df = df[df['mag'].isin(magnifications)]
    if patch_sizes is not None:    df = df[df['size'].isin(patch_sizes)]

    out_cols = ['label', 'slide', 'core', 'mag', 'size',
                'x1', 'y1', 'path1', 'x2', 'y2', 'path2',
                'dx', 'dy', 'steps', 'dist_px']
    out_rows = []
    group_keys = ['mag', 'size', 'hard_label'] + (['core'] if same_core else [])

    for _, gdf in df.groupby(group_keys, sort=False):
        n = len(gdf)
        if n < 2: continue
        size_val = int(gdf['size'].iloc[0])
        xs = (gdf['x'] // size_val).to_numpy()
        ys = (gdf['y'] // size_val).to_numpy()

        total_pairs = n * (n - 1) // 2
        seen = set()             # unordered index pairs already examined
        picked, attempts, max_attempts = 0, 0, samples_per_label * 150
        while picked < samples_per_label and attempts < max_attempts and len(seen) < total_pairs:
            attempts += 1
            i1, i2 = (int(i) for i in rng.choice(n, size=2, replace=False))
            pair = (min(i1, i2), max(i1, i2))
            if pair in seen: continue      # never emit (or re-test) the same pair twice
            seen.add(pair)

            dx = abs(int(xs[i1]) - int(xs[i2])); dy = abs(int(ys[i1]) - int(ys[i2]))
            steps = (dx*dx + dy*dy) ** 0.5
            if steps < min_steps: continue

            r1 = gdf.iloc[i1]; r2 = gdf.iloc[i2]
            out_rows.append({
                'label' : int(r1['hard_label']),
                'slide' : r1['slide'],
                'core'  : r1['core'],         # same for both when same_core=True
                'mag'   : int(r1['mag']),
                'size'  : int(r1['size']),
                'x1'    : int(r1['x']),  'y1': int(r1['y']),  'path1': r1['path'],
                'x2'    : int(r2['x']),  'y2': int(r2['y']),  'path2': r2['path'],
                'dx'    : int(dx), 'dy': int(dy),
                'steps' : float(steps),
                'dist_px': float(steps * size_val),
            })
            picked += 1

    # Explicit columns so an empty result still exposes 'label', 'path1', ...
    out = pd.DataFrame(out_rows, columns=out_cols)
    if not out.empty:
        out = out.sort_values(['label','mag','size','slide','core','steps'],
                              ascending=[True, True, True, True, True, False]).reset_index(drop=True)
    return out

# ---------------- visualization ----------------
def _imread_rgb(p):
    """Load the image at ``p`` and return it with three colour channels.

    A 2-D (single-plane) image is replicated into three identical channels.
    An image whose last axis has length 4 keeps only its first three channels.
    Any other array is returned exactly as loaded.
    """
    pixels = io.imread(p)
    if pixels.ndim == 2:
        # grayscale: reuse the single plane for R, G and B
        return np.stack((pixels, pixels, pixels), axis=-1)
    has_alpha = pixels.shape[-1] == 4
    # RGBA -> RGB by dropping the fourth channel
    return pixels[..., :3] if has_alpha else pixels

# --- label tables: dataset code (str) <-> training id (int) ---
# Dataset code -> training id.
MAP_VPC = {
    '1': 0,
    '3': 1,
    '4': 2,
    '5': 3,
    '0': 4,
    '6': 5,
}
# Inverse lookup: training id 0..5 -> dataset code string.
ORIG_BY_HARD = dict(zip(MAP_VPC.values(), MAP_VPC.keys()))
# Display name for each dataset code.
NAME_BY_ORIG = dict([
    ('0', 'Normal (0)'),
    ('1', 'Benign (1)'),
    ('3', 'Gleason 3'),
    ('4', 'Gleason 4'),
    ('5', 'Gleason 5'),
    ('6', 'Gleason 6'),
])

def pretty_label_name(hard_id: int) -> str:
    """Return the display name for a training label id.

    The id is translated to its dataset code through ``ORIG_BY_HARD`` and the
    code to a name through ``NAME_BY_ORIG``. This is the single definition of
    the function: the notebook previously defined it twice, with the second
    copy silently shadowing the first.

    Parameters
    ----------
    hard_id : int
        Training label id; any value accepted by ``int()`` works.

    Returns
    -------
    str
        The mapped name (e.g. ``'Benign (1)'`` for id 0), or
        ``f'Class {hard_id}'`` when the id is unmapped or cannot be converted
        to an integer (for instance NaN).
    """
    fallback = f'Class {hard_id}'
    try:
        key = int(hard_id)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, inf or non-numeric text: label it generically rather than
        # aborting a whole ``Series.map`` call.
        return fallback
    orig = ORIG_BY_HARD.get(key)
    return NAME_BY_ORIG.get(orig, fallback)

def visualize_far_pairs(pairs_df, max_pairs=6, random_sample=False, seed=0, save_dir=None):
    """Show, and optionally save, one side-by-side figure per patch pair.

    Parameters
    ----------
    pairs_df : pandas.DataFrame
        Pair table with columns ``label, slide, core, mag, size, x1, y1, path1,
        x2, y2, path2, steps, dist_px``. A ``label_name`` column is used when
        present, otherwise it is derived with ``pretty_label_name``. The input
        frame is not modified.
    max_pairs : int
        Maximum number of pairs to draw.
    random_sample : bool
        If true, draw a random sample of pairs (seeded by ``seed``); otherwise
        take the first ``max_pairs`` rows.
    seed : int
        ``random_state`` passed to ``DataFrame.sample``.
    save_dir : str or None
        When given, each figure is also written there as a PNG. The directory
        is created if needed.

    Returns
    -------
    None
        Prints ``"No pairs to visualize."`` and returns when ``pairs_df`` is
        empty.
    """
    if pairs_df.empty:
        print("No pairs to visualize."); return

    df = pairs_df.copy()
    if 'label_name' not in df.columns:
        df['label_name'] = df['label'].map(pretty_label_name)

    if random_sample:
        df = df.sample(n=min(max_pairs, len(df)), random_state=seed)
    else:
        df = df.head(max_pairs)

    # Create the output directory once, and only if something will be saved.
    if save_dir and not df.empty:
        os.makedirs(save_dir, exist_ok=True)

    for _, row in df.iterrows():
        # Uses the module-level reader instead of a nested duplicate of it.
        img1 = _imread_rgb(row['path1'])
        img2 = _imread_rgb(row['path2'])

        fig, ax = plt.subplots(1, 2, figsize=(9, 4.5), dpi=130)

        # show images
        ax[0].imshow(img1); ax[0].axis('off')
        ax[1].imshow(img2); ax[1].axis('off')

        # per-patch titles with explicit field names
        ax[0].set_title(
            f"Patch A — Coordinates: ({row['x1']}, {row['y1']})",
            fontsize=10
        )
        ax[1].set_title(
            f"Patch B — Coordinates: ({row['x2']}, {row['y2']})",
            fontsize=10
        )

        # global title with labeled fields
        fig.suptitle(
            "Slide: {slide} | Core: {core} | Label: {lname} [id={lid}] | "
            "Magnification: {mag} | Patch size: {size} | "
            "Distance: {steps:.1f} steps (~{px}px)".format(
                slide=row['slide'], core=row['core'],
                lname=row['label_name'], lid=int(row['label']),
                mag=int(row['mag']), size=int(row['size']),
                steps=row['steps'], px=int(row['dist_px'])
            ),
            fontsize=11
        )

        # optional: tiny path captions under each image (basenames)
        ax[0].text(0.5, -0.08, f"Path A: {os.path.basename(row['path1'])}",
                   ha='center', va='top', transform=ax[0].transAxes, fontsize=8)
        ax[1].text(0.5, -0.08, f"Path B: {os.path.basename(row['path2'])}",
                   ha='center', va='top', transform=ax[1].transAxes, fontsize=8)

        # leave room at the top for the suptitle
        fig.tight_layout(rect=[0, 0, 1, 0.88])

        if save_dir:
            # Both patch coordinates are part of the name: without them, pairs
            # from the same core/label with the same integer step count would
            # overwrite each other.
            base = (
                f"{row['slide']}_{row['core']}_L{int(row['label'])}"
                f"_m{int(row['mag'])}_s{int(row['size'])}_{int(row['steps'])}steps"
                f"_{int(row['x1'])}-{int(row['y1'])}_{int(row['x2'])}-{int(row['y2'])}"
            )
            fig.savefig(os.path.join(save_dir, base + ".png"), bbox_inches='tight', dpi=160)
        plt.show()
        # Release the figure so memory does not grow with the number of pairs.
        plt.close(fig)


In [None]:

# ---------------- sample far-apart same-label pairs from the labels CSV ----------------
labels_csv = "../../data/VPC/patch_labels_majority.csv"   # patch-level labels file

pairs_df = find_far_same_label_pairs(
    labels_csv,
    rng_seed=42,
    same_core=True,
    samples_per_label=8,
    min_steps=6,           # keep pairs at least 6 patch-steps apart
    patch_sizes=[512],     # change to match your data
    magnifications=[40],   # change to match your data
)

print(f"Found {len(pairs_df)} pairs")
# first 12 rows as a table
display(pairs_df.head(12))



In [11]:
# Needs pretty_label_name and visualize_far_pairs to exist already: run the
# definitions cell before this one (running it out of order raises NameError).
# An empty sampling result may carry no 'label' column, so only derive the
# readable names when it is there; visualize_far_pairs reports the empty case.
if 'label' in pairs_df.columns:
    pairs_df['label_name'] = pairs_df['label'].map(pretty_label_name)
visualize_far_pairs(pairs_df, max_pairs=6, random_sample=True, seed=7, save_dir=None)


NameError: name 'pretty_label_name' is not defined