In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

from scipy.spatial.distance import cdist
from sklearn.preprocessing import StandardScaler

import cvxpy as cp
import plotly.express as px
import plotly.graph_objects as go

# import zipfile
# import os

In [None]:
# Read the breed table from the current directory.
# NOTE(review): later cells read the columns 'Breed', 'height', 'weight' and
# 'cluster' from this frame — confirm the CSV provides all four.
file_path = 'AKC.csv'
df = pd.read_csv(file_path)


# Convert the 'cluster' column to a categorical variable with sorted categories.
# Missing labels are left out of the category list: a null cannot be used as a
# category, and rows without a label simply keep a missing value in the column.
cluster_labels = sorted(df['cluster'].dropna().unique())
df['cluster'] = pd.Categorical(df['cluster'], categories=cluster_labels)

# Print the DataFrame to check the contents
print(df)

In [None]:

# Flag rows that repeat an earlier (cluster, height, weight) combination.
# The first row of each combination is kept; only later repeats are flagged.
duplicate_mask = df.duplicated(subset=['cluster', 'height', 'weight'], keep='first')
suppressed_rows = df.loc[duplicate_mask]

# Report the rows that are about to be removed
print("Suppressed duplicate rows (same cluster, height, and weight):")
print(suppressed_rows)

# Keep the unflagged rows only and renumber the index from zero
df = df.loc[~duplicate_mask].reset_index(drop=True)

# Scatter plot of weight against height: one color per cluster, breed name on
# hover, clusters listed in sorted order
cluster_order = sorted(df['cluster'].unique())
fig = px.scatter(
    data_frame=df,
    x='height',
    y='weight',
    color='cluster',
    hover_name='Breed',
    category_orders={'cluster': cluster_order},
)

# Fixed 600x600 canvas so the plot area appears square; the legend stays visible
fig.update_layout(autosize=True, width=600, height=600, showlegend=True)

# Show the plot
fig.show(renderer="notebook")


In [None]:
# Step 1: Restrict the analysis to a single cluster.
# TARGET_CLUSTER is the label of the cluster studied in the cells that follow.
TARGET_CLUSTER = 5
cluster_df = df[df['cluster'] == TARGET_CLUSTER].copy()

# Fail early with a clear message: an empty selection (label absent, or stored
# under a different type such as the string '5') would otherwise only show up
# later as a less direct error in the standardization step.
if cluster_df.empty:
    raise ValueError(f"No rows found for cluster {TARGET_CLUSTER!r}")

In [None]:
# Step 2: Put 'height' and 'weight' on a common scale with a StandardScaler
# fitted on this cluster only
scaler = StandardScaler()
coordinates = scaler.fit_transform(cluster_df[['height', 'weight']])

# Keep the standardized values next to the raw ones (column 0 is height,
# column 1 is weight, matching the selection order above)
cluster_df['height_std'] = coordinates[:, 0]
cluster_df['weight_std'] = coordinates[:, 1]

# n = number of points in the cluster, d = number of features (2 in this case)
n, d = coordinates.shape

In [None]:
# Step 3: Locate the medoid, i.e. the point whose summed Euclidean distance
# to all points of the cluster is smallest
distances = cdist(coordinates, coordinates, metric='euclidean')
distance_sums = distances.sum(axis=1)

# Report the summed distance for every breed
print("Distance sums for each point:")
for row_pos, breed_name in enumerate(cluster_df['Breed']):
    print(f"{breed_name}: Total Distance = {distance_sums[row_pos]:.4f}")

# Position of the smallest sum (the first one if several points tie)
medoid_index = distance_sums.argmin()
medoid_coords = coordinates[medoid_index]
medoid_breed = cluster_df['Breed'].iloc[medoid_index]

In [None]:
# Step 4: Show the standardized cluster and mark its medoid
fig = px.scatter(
    data_frame=cluster_df,
    x='height_std',
    y='weight_std',
    hover_name='Breed',
    color_discrete_sequence=['orange'],  # every cluster point is drawn in orange
)

# Red cross on the medoid, listed as 'Prototype' in the legend
medoid_marker = dict(color='red', size=12, symbol='x')
fig.add_scatter(
    x=[medoid_coords[0]],
    y=[medoid_coords[1]],
    mode='markers',
    marker=medoid_marker,
    name='Prototype',
)

# Fixed 600x600 canvas. The plotted values are standardized, but the axes are
# titled with the plain feature names.
fig.update_layout(
    autosize=True,
    width=600,
    height=600,
    xaxis_title='Height',
    yaxis_title='Weight',
)

fig.show(renderer="notebook")

# Text summary of the medoid
medoid_height_std, medoid_weight_std = medoid_coords
print(f"Medoid: {medoid_breed}")
print(f"Coordinates of Medoid (standardized): Height={medoid_height_std}, Weight={medoid_weight_std}")

In [None]:

# Treat every point of the cluster in turn as the reference point x0: solve for
# the shift Delta of smallest Euclidean norm that satisfies the distance-sum
# constraints built below, then plot the original and the shifted position.
# NOTE(review): the constraints appear to encode "the shifted point becomes the
# medoid" (its distance to j would appear on both sides and cancel) — confirm.
for ref_idx in range(n):
    x0 = coordinates[ref_idx]
    x0_breed = cluster_df['Breed'].iloc[ref_idx]
    other_indices = [idx for idx in range(n) if idx != ref_idx]

    # D_bar[k]: summed distance from the k-th other point to the remaining
    # other points (the reference point itself is left out of the sum)
    D_bar = np.array([
        sum(
            np.linalg.norm(coordinates[j] - coordinates[i])
            for i in other_indices
            if i != j
        )
        for j in other_indices
    ])

    # Decision variable: the shift applied to the reference point
    Delta = cp.Variable(d)
    objective = cp.Minimize(cp.norm(Delta, 2))

    # One constraint per other point j: the summed distance from the shifted
    # reference point to the other points (excluding j) must not exceed D_bar
    # for j
    constraints = [
        sum(
            cp.norm(x0 + Delta - coordinates[i], 2)
            for i in other_indices
            if i != j
        ) <= D_bar[idx_j]
        for idx_j, j in enumerate(other_indices)
    ]

    problem = cp.Problem(objective, constraints)
    problem.solve(verbose=False)

    # Report the solution for this reference point
    print(f"\nReference Point {ref_idx}: {x0_breed}")
    print("Optimal Delta:", Delta.value)
    print("Optimal objective value:", problem.value)

    # --- Plot in standardized coordinates ---
    fig = px.scatter(
        data_frame=cluster_df,
        x='height_std',
        y='weight_std',
        hover_name='Breed',
        labels={'height_std': 'Height', 'weight_std': 'Weight'},
        color_discrete_sequence=['orange'],  # all cluster points in orange
    )

    fig.update_layout(
        title=f"Reference Point (Standardized): {x0_breed}",
        legend_title="Points",
        showlegend=True,
        autosize=True,
        width=600,
        height=600,
    )

    # Larger orange circle on the reference point (x0)
    fig.add_trace(go.Scatter(
        x=[x0[0]],
        y=[x0[1]],
        mode='markers',
        marker=dict(color='orange', size=12, symbol='circle'),
        name=f'Reference: {x0_breed}',
    ))

    # Draw the shifted point only when the solver returned a vector of length d
    shift = Delta.value
    if shift is None or shift.shape != (d,):
        print("Delta.value is invalid or empty.")
    else:
        optimized_x0 = x0 + shift.flatten()

        # Dotted line from the reference point to its shifted position
        fig.add_trace(go.Scatter(
            x=[x0[0], optimized_x0[0]],
            y=[x0[1], optimized_x0[1]],
            mode='lines',
            line=dict(color='gray', dash='dot'),
            showlegend=False,
        ))

        # Green cross on the shifted position
        fig.add_trace(go.Scatter(
            x=[optimized_x0[0]],
            y=[optimized_x0[1]],
            mode='markers',
            marker=dict(color='green', size=12, symbol='x'),
            name='Counterfactual prototype',
        ))

    # Red cross on the medoid (identical in every iteration)
    fig.add_trace(go.Scatter(
        x=[medoid_coords[0]],
        y=[medoid_coords[1]],
        mode='markers',
        marker=dict(color='red', size=12, symbol='x'),
        name='Prototype',
    ))

    fig.show(renderer="notebook")


In [None]:

# Counterfactual shifts for every point of the cluster (unweighted objective).
#
# Each point in turn is the reference point x0. For it, the shift Delta with the
# smallest Euclidean norm is sought such that, for every other point j, the
# summed distance from x0 + Delta to the other points (excluding j) does not
# exceed the same sum measured from j. The shifts are tabulated in standardized
# and in original units and drawn in standardized space.
# NOTE(review): the constraints appear to encode "the shifted point becomes the
# medoid" (its distance to j would appear on both sides and cancel) — confirm.

# Prepare data
raw_coordinates = cluster_df[['height', 'weight']].values
breeds = cluster_df['Breed'].values

# Per-feature factors the fitted scaler divided by. Reusing them (rather than
# recomputing a standard deviation) converts shifts back with exactly the
# transformation that produced `coordinates`, including for a constant
# feature, where StandardScaler uses a factor of 1 instead of 0.
stds = scaler.scale_

# n and d (number of points / features) come from the standardization cell
results = []
ref_points = []
cf_points = []

# Loop over each point
for ref_idx in range(n):
    x0 = coordinates[ref_idx]
    breed = breeds[ref_idx]
    x0_raw = raw_coordinates[ref_idx]
    other_indices = [i for i in range(n) if i != ref_idx]

    # D_bar[k]: summed distance from the k-th other point to the remaining
    # other points (the reference point is left out of the sum)
    D_bar = np.array([
        sum(np.linalg.norm(coordinates[j] - coordinates[i]) for i in other_indices if i != j)
        for j in other_indices
    ])

    Delta = cp.Variable(d)
    objective = cp.Minimize(cp.norm(Delta, 2))
    constraints = []

    # One constraint per other point j (see the header comment)
    for idx_j, j in enumerate(other_indices):
        dist_sum = 0
        for i in other_indices:
            if i != j:
                dist_sum += cp.norm(x0 + Delta - coordinates[i], 2)
        constraints.append(dist_sum <= D_bar[idx_j])

    problem = cp.Problem(objective, constraints)
    try:
        problem.solve(verbose=False)
    except cp.error.SolverError as exc:
        # A solver failure for one breed is recorded as a row without a
        # solution (Delta.value stays None) instead of aborting the whole loop.
        print(f"Solver failed for {breed}: {exc}")

    # Columns shared by rows with and without a solution, so the table has the
    # same layout whichever kind of row comes first
    record = {
        'Breed': breed,
        'Reference Height': x0_raw[0],
        'Reference Weight': x0_raw[1],
        'normalized Height': x0[0],
        'normalized Weight': x0[1],
    }

    if Delta.value is not None and Delta.value.shape == (d,):
        delta_val = Delta.value.flatten()
        delta_original = delta_val * stds  # Convert to original space
        x_cf = x0 + delta_val
        ref_points.append(x0)
        cf_points.append(x_cf)
        record.update({
            'Delta Height (normalized)': delta_val[0],
            'Delta Weight (normalized)': delta_val[1],
            'Delta Height (original)': delta_original[0],
            'Delta Weight (original)': delta_original[1],
            'Objective Value (normalized space)': problem.value,
        })
    else:
        record.update({
            'Delta Height (normalized)': np.nan,
            'Delta Weight (normalized)': np.nan,
            'Delta Height (original)': np.nan,
            'Delta Weight (original)': np.nan,
            'Objective Value (normalized space)': np.nan,
        })

    results.append(record)

# DataFrame
results_df = pd.DataFrame(results)

# Display the table
print(results_df)

# Plotting
# reshape keeps both arrays two-dimensional even when no problem was solved, so
# the column indexing below works on an empty result instead of raising
ref_points = np.array(ref_points).reshape(-1, d)
cf_points = np.array(cf_points).reshape(-1, d)

fig = go.Figure()

# Plot all normalized data points (small gray markers)
fig.add_trace(go.Scatter(
    x=coordinates[:, 0],
    y=coordinates[:, 1],
    mode='markers',
    marker=dict(color='gray', size=6),
    name='Normalized Data Points'
))

# Reference points with a solution (orange circles)
fig.add_trace(go.Scatter(
    x=ref_points[:, 0],
    y=ref_points[:, 1],
    mode='markers',
    marker=dict(color='orange', size=10, symbol='circle'),
    name='Reference Points (x₀)'
))

# Counterfactual points (green crosses)
fig.add_trace(go.Scatter(
    x=cf_points[:, 0],
    y=cf_points[:, 1],
    mode='markers',
    marker=dict(color='green', size=10, symbol='x'),
    name='Counterfactual Points (x₀ + Δ)'
))

# Dotted lines from x₀ to x₀ + Δ
for i in range(len(ref_points)):
    fig.add_trace(go.Scatter(
        x=[ref_points[i, 0], cf_points[i, 0]],
        y=[ref_points[i, 1], cf_points[i, 1]],
        mode='lines',
        line=dict(color='gray', dash='dot'),
        showlegend=False
    ))

# Add medoid (red cross)
fig.add_trace(go.Scatter(
    x=[medoid_coords[0]],
    y=[medoid_coords[1]],
    mode='markers',
    marker=dict(color='red', size=12, symbol='x'),
    name=f'Medoid ({medoid_breed})',  # Append breed name to the trace label
))

# The legend is switched off for the whole figure; the trace names above are
# then only visible on hover
fig.update_layout(
    xaxis_title='Height',
    yaxis_title='Weight',
    width=750,
    height=750,
    showlegend=False
)

fig.show(renderer="notebook")

In [None]:

# Counterfactual shifts for every point of the cluster (weighted objective).
#
# Each point in turn is the reference point x0. For it, the shift Delta that
# minimizes the Euclidean norm of the component-wise weighted shift w * Delta is
# sought such that, for every other point j, the summed distance from
# x0 + Delta to the other points (excluding j) does not exceed the same sum
# measured from j. The shifts are tabulated in standardized and in original
# units and drawn in standardized space.
# NOTE(review): the constraints appear to encode "the shifted point becomes the
# medoid" (its distance to j would appear on both sides and cancel) — confirm.

# Prepare data
raw_coordinates = cluster_df[['height', 'weight']].values
breeds = cluster_df['Breed'].values

# Per-feature factors the fitted scaler divided by. Reusing them (rather than
# recomputing a standard deviation) converts shifts back with exactly the
# transformation that produced `coordinates`, including for a constant
# feature, where StandardScaler uses a factor of 1 instead of 0.
stds = scaler.scale_

# Objective weights per component of Delta, in column order (height, weight):
# a change in height costs three times as much as the same change in weight.
# The vector does not depend on the reference point, so it is built once.
w = np.array([0.75, 0.25])

# n and d (number of points / features) come from the standardization cell
results = []
ref_points = []
cf_points = []

# Loop over each point
for ref_idx in range(n):
    x0 = coordinates[ref_idx]
    breed = breeds[ref_idx]
    x0_raw = raw_coordinates[ref_idx]
    other_indices = [i for i in range(n) if i != ref_idx]

    # D_bar[k]: summed distance from the k-th other point to the remaining
    # other points (the reference point is left out of the sum)
    D_bar = np.array([
        sum(np.linalg.norm(coordinates[j] - coordinates[i]) for i in other_indices if i != j)
        for j in other_indices
    ])

    Delta = cp.Variable(d)
    objective = cp.Minimize(cp.norm(cp.multiply(w, Delta), 2))
    constraints = []

    # One constraint per other point j (see the header comment); the
    # constraints use the unweighted Euclidean distance
    for idx_j, j in enumerate(other_indices):
        dist_sum = 0
        for i in other_indices:
            if i != j:
                dist_sum += cp.norm(x0 + Delta - coordinates[i], 2)
        constraints.append(dist_sum <= D_bar[idx_j])

    problem = cp.Problem(objective, constraints)
    try:
        problem.solve(verbose=False)
    except cp.error.SolverError as exc:
        # A solver failure for one breed is recorded as a row without a
        # solution (Delta.value stays None) instead of aborting the whole loop.
        print(f"Solver failed for {breed}: {exc}")

    # Columns shared by rows with and without a solution, so the table has the
    # same layout whichever kind of row comes first
    record = {
        'Breed': breed,
        'Reference Height': x0_raw[0],
        'Reference Weight': x0_raw[1],
        'normalized Height': x0[0],
        'normalized Weight': x0[1],
    }

    if Delta.value is not None and Delta.value.shape == (d,):
        delta_val = Delta.value.flatten()
        delta_original = delta_val * stds  # Convert to original space
        x_cf = x0 + delta_val
        ref_points.append(x0)
        cf_points.append(x_cf)
        record.update({
            'Delta Height (normalized)': delta_val[0],
            'Delta Weight (normalized)': delta_val[1],
            'Delta Height (original)': delta_original[0],
            'Delta Weight (original)': delta_original[1],
            # For this cell the objective is the weighted norm of Delta
            'Objective Value (normalized space)': problem.value,
        })
    else:
        record.update({
            'Delta Height (normalized)': np.nan,
            'Delta Weight (normalized)': np.nan,
            'Delta Height (original)': np.nan,
            'Delta Weight (original)': np.nan,
            'Objective Value (normalized space)': np.nan,
        })

    results.append(record)

# DataFrame
results_df = pd.DataFrame(results)

# Display the table
print(results_df)

# Plotting
# reshape keeps both arrays two-dimensional even when no problem was solved, so
# the column indexing below works on an empty result instead of raising
ref_points = np.array(ref_points).reshape(-1, d)
cf_points = np.array(cf_points).reshape(-1, d)

fig = go.Figure()

# Plot all normalized data points (small gray markers)
fig.add_trace(go.Scatter(
    x=coordinates[:, 0],
    y=coordinates[:, 1],
    mode='markers',
    marker=dict(color='gray', size=6),
    name='Normalized Data Points'
))

# Reference points with a solution (orange circles)
fig.add_trace(go.Scatter(
    x=ref_points[:, 0],
    y=ref_points[:, 1],
    mode='markers',
    marker=dict(color='orange', size=10, symbol='circle'),
    name='Reference Points (x₀)'
))

# Counterfactual points (green crosses)
fig.add_trace(go.Scatter(
    x=cf_points[:, 0],
    y=cf_points[:, 1],
    mode='markers',
    marker=dict(color='green', size=10, symbol='x'),
    name='Counterfactual Points (x₀ + Δ)'
))

# Dotted lines from x₀ to x₀ + Δ
for i in range(len(ref_points)):
    fig.add_trace(go.Scatter(
        x=[ref_points[i, 0], cf_points[i, 0]],
        y=[ref_points[i, 1], cf_points[i, 1]],
        mode='lines',
        line=dict(color='gray', dash='dot'),
        showlegend=False
    ))

# Add medoid (red cross)
fig.add_trace(go.Scatter(
    x=[medoid_coords[0]],
    y=[medoid_coords[1]],
    mode='markers',
    marker=dict(color='red', size=12, symbol='x'),
    name=f'Medoid ({medoid_breed})',  # Append breed name to the trace label
))

# The legend is switched off for the whole figure; the trace names above are
# then only visible on hover
fig.update_layout(
    xaxis_title='Height',
    yaxis_title='Weight',
    width=750,
    height=750,
    showlegend=False
)

fig.show(renderer="notebook")