In [51]:
!pip install -U kaleido
import pandas as pd
import plotly.express as px
from io import StringIO
from google.colab import drive
import plotly.express as px


import pandas as pd

# Mount Google Drive (Colab only) and point at the breed table stored there.
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/AKC.csv'

df = pd.read_csv(file_path)

# Re-type 'cluster' as a categorical whose categories are the distinct
# cluster ids in ascending order.
cluster_levels = sorted(df['cluster'].unique())
df['cluster'] = pd.Categorical(df['cluster'], categories=cluster_levels)

# Dump the frame as a sanity check of the load.
print(df)

# Height-vs-weight scatter: colour by cluster, breed name on hover,
# cluster entries listed in ascending order.
scatter_options = dict(
    x='height',
    y='weight',
    color='cluster',
    hover_name='Breed',
    category_orders={'cluster': sorted(df['cluster'].unique())},
)
fig = px.scatter(df, **scatter_options)

# Fixed 600x600 canvas so the figure is square.
fig.update_layout(autosize=True, height=600, width=600)
fig.show()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                           Breed  height  weight cluster
0                  Affenpinscher    10.5    10.0       5
1                   Afghan Hound    26.0    55.0       2
2                Airdale Terrier    23.0    45.0       3
3                          Akita    27.0   100.0       1
4                American Eskimo    14.0    27.5       4
..                           ...     ...     ...     ...
143                Welsh Terrier    15.0    20.5       4
144  West Highland White Terrier    11.0    14.0       4
145                      Whippet    20.0    28.5       4
146  Wirehaired Pointing Griffon    22.0    52.5       3
147            Yorkshire Terrier     8.0     5.0       5

[148 rows x 4 columns]


In [10]:
import pandas as pd

# Reload the breed table from the path set when Drive was mounted, and
# re-type 'cluster' as a categorical with ascending categories.
df = pd.read_csv(file_path)
df['cluster'] = pd.Categorical(df['cluster'], categories=sorted(df['cluster'].unique()))

# Rows sharing cluster, height and weight would be drawn on top of each other.
# Flag every repeat after the first occurrence, report the flagged rows, then
# drop them and renumber the index.
dedup_keys = ['cluster', 'height', 'weight']
duplicate_mask = df.duplicated(subset=dedup_keys, keep='first')
suppressed_rows = df.loc[duplicate_mask]

print("Suppressed duplicate rows (same cluster, height, and weight):")
print(suppressed_rows)

df = df.loc[~duplicate_mask].reset_index(drop=True)

# Dump the de-duplicated frame as a sanity check.
print(df)

# Same height-vs-weight scatter as before, now without overlapping points.
scatter_options = dict(
    x='height',
    y='weight',
    color='cluster',
    hover_name='Breed',
    category_orders={'cluster': sorted(df['cluster'].unique())},
)
fig = px.scatter(df, **scatter_options)

# Square 600x600 canvas with the legend hidden.
fig.update_layout(autosize=True, height=600, width=600, showlegend=False)
fig.show()

# Write the figure to a PDF, then hand the file to the browser (Colab only).
pdf_path = "/content/akc_nodupl.pdf"
fig.write_image(pdf_path)

from google.colab import files
files.download(pdf_path)


Suppressed duplicate rows (same cluster, height, and weight):
                          Breed  height  weight cluster
20             Belgian Tervuren    24.0    67.5       2
46                    Chow Chow    20.5    50.0       3
50    Collie (Rough) & (Smooth)    24.0    62.5       2
56             English Foxhound    23.5    67.5       2
57               English Setter    25.0    62.5       2
59          English Toy Spaniel    10.0    12.0       4
64         Fox Terrier Wirehair    14.5    17.5       4
68   German Shorthaired Pointer    23.5    65.0       2
73                Gordon Setter    25.0    62.5       2
78                      Harrier    20.0    50.0       3
86                     Keeshond    18.0    42.5       3
97                 Newfoundland    27.0   125.0       1
102                    Papillon     9.5     7.5       5
137             Tibetan Spaniel    10.0    12.0       4
                           Breed  height  weight cluster
0                  Affenpinscher    10.5 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
import numpy as np
import pandas as pd
import plotly.express as px
from scipy.spatial.distance import cdist
from sklearn.preprocessing import StandardScaler

# Work on an independent copy of the cluster-5 rows of the current `df`.
cluster_df = df.loc[df['cluster'] == 5].copy()

# Z-score height and weight within the cluster and keep the standardized
# values as two extra columns.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(cluster_df[['height', 'weight']])
cluster_df['height_std'] = standardized_values[:, 0]
cluster_df['weight_std'] = standardized_values[:, 1]

# Pairwise Euclidean distances in standardized space; each row sum is that
# breed's total distance to every breed in the cluster.
coordinates = standardized_values
distances = cdist(coordinates, coordinates, metric='euclidean')
distance_sums = distances.sum(axis=1)

print("Distance sums for each point:")
for row_pos, breed in enumerate(cluster_df['Breed']):
    dist_sum = distance_sums[row_pos]
    print(f"{breed}: Total Distance = {dist_sum:.4f}")

# The medoid is the breed with the smallest total distance.
medoid_index = distance_sums.argmin()
medoid_coords = coordinates[medoid_index]
medoid_breed = cluster_df['Breed'].iloc[medoid_index]

# Scatter of the standardized cluster (orange) with the medoid drawn as a
# red cross on top.
fig = px.scatter(
    cluster_df,
    x='height_std',
    y='weight_std',
    hover_name='Breed',
    color_discrete_sequence=['orange'],
)

medoid_marker = dict(color='red', size=12, symbol='x')
fig.add_scatter(
    x=[medoid_coords[0]],
    y=[medoid_coords[1]],
    mode='markers',
    marker=medoid_marker,
    name='Prototype',
)

fig.update_layout(
    autosize=True,
    height=600,
    width=600,
    xaxis_title='Height',
    yaxis_title='Weight',
)

fig.show()

print(f"Medoid: {medoid_breed}")
print(f"Coordinates of Medoid (standardized): Height={medoid_coords[0]}, Weight={medoid_coords[1]}")

# Write the figure to a PDF, then hand the file to the browser (Colab only).
pdf_path = "/content/akc_xs_normalized.pdf"
fig.write_image(pdf_path)

from google.colab import files
files.download(pdf_path)

Distance sums for each point:
Affenpinscher: Total Distance = 19.9266
Chihuahua: Total Distance = 26.0033
Chinese Crested: Total Distance = 18.2963
Italian Greyhound: Total Distance = 23.7644
Japanese Chin: Total Distance = 15.2413
Maltese: Total Distance = 17.3163
Manchester Terrier (Toy): Total Distance = 14.6179
Pomeranian: Total Distance = 19.6301
Poodle Toy: Total Distance = 20.1467
Toy Fox Terrier: Total Distance = 15.2750
Yorkshire Terrier: Total Distance = 20.2710


Medoid: Manchester Terrier (Toy)
Coordinates of Medoid (standardized): Height=0.4251952027621867, Weight=0.08737040566610373


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
import numpy as np
import cvxpy as cp
import plotly.express as px
import plotly.graph_objects as go
from scipy.spatial.distance import cdist
from sklearn.preprocessing import StandardScaler
import zipfile
import os

# Directory that receives one PDF per reference breed (created if missing).
output_dir = "plots"
os.makedirs(output_dir, exist_ok=True)

# Step 0: Standardize height and weight of the cluster selected earlier
# (`cluster_df` comes from a previous cell) and store the z-scores on it.
scaler = StandardScaler()
standardized_coords = scaler.fit_transform(cluster_df[['height', 'weight']])
cluster_df[['height_std', 'weight_std']] = standardized_coords

coordinates = standardized_coords  # Use standardized data
n = len(coordinates)
d = coordinates.shape[1]  # Dimensionality (2 in this case)

# Step 1: Compute the medoid (point with smallest sum of distances to all others)
distances = cdist(coordinates, coordinates, metric='euclidean')
distance_sums = distances.sum(axis=1)
medoid_index = np.argmin(distance_sums)
medoid_coords = coordinates[medoid_index].copy()
medoid_breed = cluster_df.iloc[medoid_index]['Breed']

# PDFs written by THIS run. Only these are archived, so files left in
# `output_dir` by an earlier run (e.g. for another cluster) are not shipped.
written_pdfs = []

# Treat every point in turn as the reference x0 and look for the smallest
# shift Delta (Euclidean norm) that satisfies the distance-sum constraints below.
for ref_idx in range(n):
    x0 = coordinates[ref_idx]
    x0_breed = cluster_df.iloc[ref_idx]['Breed']
    other_indices = [i for i in range(n) if i != ref_idx]

    # D_bar[k]: total distance from point j = other_indices[k] to every other
    # point except the reference. distances[j, j] is 0, so this is row j's sum
    # minus its distance to the reference; no need to recompute the norms.
    D_bar = distance_sums[other_indices] - distances[other_indices, ref_idx]

    # Define optimization variable and problem
    Delta = cp.Variable(d)
    objective = cp.Minimize(cp.norm(Delta, 2))

    # Distance from the shifted reference to each other point, built once and
    # reused by every constraint.
    shifted_norms = {i: cp.norm(x0 + Delta - coordinates[i], 2) for i in other_indices}

    # For each competitor j: the shifted reference's total distance to the
    # remaining points (j excluded) must not exceed D_bar for j.
    constraints = []
    for idx_j, j in enumerate(other_indices):
        dist_sum = sum(shifted_norms[i] for i in other_indices if i != j)
        constraints.append(dist_sum <= D_bar[idx_j])

    problem = cp.Problem(objective, constraints)
    problem.solve(verbose=False)

    # --- Plotting ---
    fig = px.scatter(
        cluster_df,
        x='height_std',
        y='weight_std',
        hover_name='Breed',
        labels={'height_std': 'Height', 'weight_std': 'Weight'},
        color_discrete_sequence=['orange']
    )

    fig.update_layout(
        showlegend=True,
        autosize=True,
        height=600,
        width=600,
    )

    # Add reference point (x0)
    fig.add_trace(go.Scatter(
        x=[x0[0]], y=[x0[1]],
        mode='markers',
        marker=dict(color='orange', size=12, symbol='circle'),
        name=f'Reference: {x0_breed}'
    ))

    # Add the counterfactual point and a dotted connector, but only when the
    # solver returned a usable Delta (its value is None otherwise).
    if Delta.value is not None and Delta.value.shape == (d,):
        optimized_x0 = x0 + Delta.value.flatten()

        fig.add_trace(go.Scatter(
            x=[x0[0], optimized_x0[0]],
            y=[x0[1], optimized_x0[1]],
            mode='lines',
            line=dict(color='gray', dash='dot'),
            showlegend=False
        ))

        fig.add_trace(go.Scatter(
            x=[optimized_x0[0]], y=[optimized_x0[1]],
            mode='markers',
            marker=dict(color='green', size=12, symbol='x'),
            name='Counterfactual prototype'
        ))

    # Add medoid point (same for every reference)
    fig.add_trace(go.Scatter(
        x=[medoid_coords[0]], y=[medoid_coords[1]],
        mode='markers',
        marker=dict(color='red', size=12, symbol='x'),
        name='Prototype'
    ))

    # Save to PDF (requires kaleido). Spaces become underscores; a path
    # separator in a breed name is replaced so the file stays in output_dir.
    safe_breed = x0_breed.replace(' ', '_').replace('/', '-')
    filename_base = f"{output_dir}/ref_{ref_idx}_{safe_breed}"
    pdf_path = f"{filename_base}.pdf"
    fig.write_image(pdf_path)
    written_pdfs.append(pdf_path)

# Bundle the PDFs produced above into a single archive (flat, no directories).
zip_path = '/content/standardized_plots.zip'
with zipfile.ZipFile(zip_path, 'w') as zipf:
    for filepath in written_pdfs:
        zipf.write(filepath, arcname=os.path.basename(filepath))

# Download the zip file (Colab only)
from google.colab import files
files.download(zip_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
import numpy as np
import cvxpy as cp
import plotly.express as px
import plotly.graph_objects as go

from scipy.spatial.distance import cdist
from sklearn.preprocessing import StandardScaler

# Purpose of this cell: for every breed in `cluster_df` (built in an earlier
# cell), solve for the smallest shift Delta of that breed in standardized
# (height, weight) space that satisfies the distance-sum constraints below,
# print the solution, and show one figure per breed.

# Step 0: Standardize height and weight (z-scores) and store them on cluster_df
scaler = StandardScaler()
standardized_coords = scaler.fit_transform(cluster_df[['height', 'weight']])
cluster_df[['height_std', 'weight_std']] = standardized_coords

coordinates = standardized_coords  # Use standardized data
n = len(coordinates)
d = coordinates.shape[1]  # Dimensionality (2 in this case)

# Step 1: Compute the Medoid (point with smallest sum of distances to all others)
distances = cdist(coordinates, coordinates, metric='euclidean')
distance_sums = distances.sum(axis=1)
medoid_index = np.argmin(distance_sums)
medoid_coords = coordinates[medoid_index].copy()
medoid_breed = cluster_df.iloc[medoid_index]['Breed']

# Loop over each point as reference
for ref_idx in range(n):
    x0 = coordinates[ref_idx]
    x0_breed = cluster_df.iloc[ref_idx]['Breed']
    other_indices = [i for i in range(n) if i != ref_idx]

    # Compute D̄_j for each j ≠ ref_idx: the total distance from point j to
    # every other point, leaving out both j itself and the reference.
    D_bar = np.array([
        sum(np.linalg.norm(coordinates[j] - coordinates[i]) for i in other_indices if i != j)
        for j in other_indices
    ])

    # Define optimization variable and problem: minimise the Euclidean norm
    # of the shift Delta applied to the reference point.
    Delta = cp.Variable(d)
    objective = cp.Minimize(cp.norm(Delta, 2))

    # One constraint per competitor j: the shifted reference's total distance
    # to the remaining points (j excluded) must be at most D̄_j.
    # NOTE(review): this looks like "x0 + Delta beats every j as medoid" with
    # the shared x0<->j distance cancelled from both sides; confirm intent.
    constraints = []
    for idx_j, j in enumerate(other_indices):
        dist_sum = 0
        for i in other_indices:
            if i != j:
                dist_sum += cp.norm(x0 + Delta - coordinates[i], 2)
        constraints.append(dist_sum <= D_bar[idx_j])

    problem = cp.Problem(objective, constraints)
    problem.solve(verbose=False)

    # Print results (Delta.value is None when the solver produced no solution)
    print(f"\nReference Point {ref_idx}: {x0_breed}")
    print("Optimal Delta:", Delta.value)
    print("Optimal objective value:", problem.value)

    # --- Plotting standardized coordinates ---
    fig = px.scatter(
        cluster_df,
        x='height_std',
        y='weight_std',
        hover_name='Breed',
        labels={'height_std': 'Standardized Height (z-score)', 'weight_std': 'Standardized Weight (z-score)'},
        color_discrete_sequence=['orange']  # Set the color of the points to orange
    )

    fig.update_layout(
        showlegend=True,
        legend_title="Points",
        autosize=True,
        height=600,
        width=600,
        title=f"Reference Point (Standardized): {x0_breed}"
    )

    # Add reference point (x0) as a larger orange circle
    fig.add_trace(go.Scatter(
        x=[x0[0]], y=[x0[1]],
        mode='markers',
        marker=dict(color='orange', size=12, symbol='circle'),
        name=f'Reference: {x0_breed}'
    ))

    # Add optimized counterfactual medoid and connecting line, only when the
    # solver returned a Delta of the expected shape
    if Delta.value is not None and Delta.value.shape == (d,):
        optimized_x0 = x0 + Delta.value.flatten()

        # Line from reference point to counterfactual
        fig.add_trace(go.Scatter(
            x=[x0[0], optimized_x0[0]],
            y=[x0[1], optimized_x0[1]],
            mode='lines',
            line=dict(color='gray', dash='dot'),
            showlegend=False
        ))

        # Counterfactual point (green cross)
        fig.add_trace(go.Scatter(
            x=[optimized_x0[0]], y=[optimized_x0[1]],
            mode='markers',
            marker=dict(color='green', size=12, symbol='x'),
            name='Counterfactual prototype'
        ))
    else:
        print("Delta.value is invalid or empty.")

    # Add medoid point (same for all iterations), drawn as a red cross
    fig.add_trace(go.Scatter(
        x=[medoid_coords[0]], y=[medoid_coords[1]],
        mode='markers',
        marker=dict(color='red', size=12, symbol='x'),
        name=f'Prototype'
    ))

    fig.show()



Reference Point 0: Affenpinscher
Optimal Delta: [-0.17219106 -1.1187709 ]
Optimal objective value: 1.1319443872461197



Reference Point 1: Chihuahua
Optimal Delta: [1.37856917 1.35205578]
Optimal objective value: 1.9309344344146517



Reference Point 2: Chinese Crested
Optimal Delta: [-0.70035704 -0.47167519]
Optimal objective value: 0.8443799287500151



Reference Point 3: Italian Greyhound
Optimal Delta: [-1.48240716 -0.36849572]
Optimal objective value: 1.527520898423009



Reference Point 4: Japanese Chin
Optimal Delta: [ 0.17524274 -0.17353686]
Optimal objective value: 0.24662737038333124



Reference Point 5: Maltese
Optimal Delta: [0.50927093 0.61661834]
Optimal objective value: 0.7997343638740392



Reference Point 6: Manchester Terrier (Toy)
Optimal Delta: [-3.04073609e-12 -1.33086325e-12]
Optimal objective value: 3.3192277674578376e-12



Reference Point 7: Pomeranian
Optimal Delta: [-0.78183188  0.53133269]
Optimal objective value: 0.9452912304772738



Reference Point 8: Poodle Toy
Optimal Delta: [ 0.03015534 -1.14851227]
Optimal objective value: 1.1489080796447393



Reference Point 9: Toy Fox Terrier
Optimal Delta: [-0.01274813  0.27439752]
Optimal objective value: 0.2746934860757581



Reference Point 10: Yorkshire Terrier
Optimal Delta: [1.07616328 0.67327099]
Optimal objective value: 1.26941767460972


In [30]:
import numpy as np
import cvxpy as cp
import pandas as pd
from scipy.spatial.distance import cdist

# Purpose of this cell: solve the per-breed minimal-shift problem in z-score
# space and tabulate, for every breed of `cluster_df` (built in an earlier
# cell), the shift in normalized units and converted back to original units.

# Extract raw coordinates and breed
raw_coordinates = cluster_df[['height', 'weight']].values
breeds = cluster_df['Breed'].values

# Normalize the data (z-score, population standard deviation)
means = raw_coordinates.mean(axis=0)
stds = raw_coordinates.std(axis=0)
normalized_coordinates = (raw_coordinates - means) / stds

n, d = normalized_coordinates.shape

# Pairwise distances in normalized space; the row sums identify the medoid
# and are reused below for the constraint bounds.
distances = cdist(normalized_coordinates, normalized_coordinates, metric='euclidean')
distance_sums = distances.sum(axis=1)
medoid_index = np.argmin(distance_sums)

# One record per reference breed
results = []

# Loop over each point as reference
for ref_idx in range(n):
    x0 = normalized_coordinates[ref_idx]
    breed = breeds[ref_idx]
    x0_raw = raw_coordinates[ref_idx]
    other_indices = [i for i in range(n) if i != ref_idx]

    # D_bar[k]: total distance from point j = other_indices[k] to every other
    # point except the reference. distances[j, j] is 0, so this is row j's sum
    # minus its distance to the reference.
    D_bar = distance_sums[other_indices] - distances[other_indices, ref_idx]

    # Optimization: smallest Euclidean shift of the reference point
    Delta = cp.Variable(d)
    objective = cp.Minimize(cp.norm(Delta, 2))

    # Distance from the shifted reference to each other point, built once and
    # reused by every constraint.
    shifted_norms = {i: cp.norm(x0 + Delta - normalized_coordinates[i], 2) for i in other_indices}

    # For each competitor j: the shifted reference's total distance to the
    # remaining points (j excluded) must not exceed D_bar for j.
    constraints = []
    for idx_j, j in enumerate(other_indices):
        dist_sum = sum(shifted_norms[i] for i in other_indices if i != j)
        constraints.append(dist_sum <= D_bar[idx_j])

    problem = cp.Problem(objective, constraints)
    problem.solve(verbose=False)

    # Delta.value is None when the solver produced no solution; record NaN
    # for the solver-dependent fields in that case.
    if Delta.value is not None and Delta.value.shape == (d,):
        delta_val = Delta.value.flatten()
        obj_val = problem.value
    else:
        delta_val = np.full(d, np.nan)
        obj_val = np.nan

    # A shift in z-score units times the per-feature std gives original units
    # (the mean cancels for differences).
    delta_original = delta_val * stds

    # Every record carries the same keys in the same order, so the table's
    # columns do not depend on which solves succeeded.
    results.append({
        'Breed': breed,
        'Reference Height': x0_raw[0],
        'Reference Weight': x0_raw[1],
        'normalized Height': x0[0],
        'normalized Weight': x0[1],
        'Delta Height (normalized)': delta_val[0],
        'Delta Weight (normalized)': delta_val[1],
        'Delta Height (original)': delta_original[0],
        'Delta Weight (original)': delta_original[1],
        'Objective Value (normalized space)': obj_val
    })

# Create DataFrame from results
results_df = pd.DataFrame(results)

# Display the table
print(results_df)


                       Breed  Reference Height  Reference Weight  \
0              Affenpinscher              10.5              10.0   
1                  Chihuahua               7.5               3.5   
2            Chinese Crested              12.0               8.5   
3          Italian Greyhound              13.5               8.0   
4              Japanese Chin               9.5               7.5   
5                    Maltese               9.0               5.0   
6   Manchester Terrier (Toy)              11.0               7.0   
7                 Pomeranian              12.0               5.0   
8                 Poodle Toy              10.0              10.0   
9            Toy Fox Terrier              10.0               5.5   
10         Yorkshire Terrier               8.0               5.0   

    normalized Height  normalized Weight  Delta Height (normalized)  \
0            0.132874           1.528982              -1.721911e-01   
1           -1.621057          -1.594510 

In [49]:
import numpy as np
import cvxpy as cp
import pandas as pd
import plotly.graph_objects as go
from scipy.spatial.distance import cdist

# Purpose of this cell: solve the per-breed minimal-shift problem in z-score
# space for every breed of `cluster_df` (built in an earlier cell), tabulate
# the shifts, and draw all reference -> counterfactual moves on one figure.

# Prepare data
raw_coordinates = cluster_df[['height', 'weight']].values
breeds = cluster_df['Breed'].values

# Normalize (z-score, population standard deviation)
means = raw_coordinates.mean(axis=0)
stds = raw_coordinates.std(axis=0)
normalized_coordinates = (raw_coordinates - means) / stds

n, d = normalized_coordinates.shape
results = []
ref_points = []  # reference points whose solve succeeded
cf_points = []   # matching counterfactual points (x0 + Delta)

# Compute medoid (in normalized space)
distances = cdist(normalized_coordinates, normalized_coordinates, metric='euclidean')
distance_sums = distances.sum(axis=1)
medoid_index = np.argmin(distance_sums)
medoid_point = normalized_coordinates[medoid_index]
medoid_breed = breeds[medoid_index]

# Loop over each point
for ref_idx in range(n):
    x0 = normalized_coordinates[ref_idx]
    breed = breeds[ref_idx]
    x0_raw = raw_coordinates[ref_idx]
    other_indices = [i for i in range(n) if i != ref_idx]

    # D_bar[k]: total distance from point j = other_indices[k] to every other
    # point except the reference. distances[j, j] is 0, so this is row j's sum
    # minus its distance to the reference.
    D_bar = distance_sums[other_indices] - distances[other_indices, ref_idx]

    # Smallest Euclidean shift of the reference point
    Delta = cp.Variable(d)
    objective = cp.Minimize(cp.norm(Delta, 2))

    # Distance from the shifted reference to each other point, built once and
    # reused by every constraint.
    shifted_norms = {i: cp.norm(x0 + Delta - normalized_coordinates[i], 2) for i in other_indices}

    # For each competitor j: the shifted reference's total distance to the
    # remaining points (j excluded) must not exceed D_bar for j.
    constraints = []
    for idx_j, j in enumerate(other_indices):
        dist_sum = sum(shifted_norms[i] for i in other_indices if i != j)
        constraints.append(dist_sum <= D_bar[idx_j])

    problem = cp.Problem(objective, constraints)
    problem.solve(verbose=False)

    # Delta.value is None when the solver produced no solution
    if Delta.value is not None and Delta.value.shape == (d,):
        delta_val = Delta.value.flatten()
        obj_val = problem.value
        x_cf = x0 + delta_val
        ref_points.append(x0)
        cf_points.append(x_cf)
        results.append({
            'Breed': breed,
            'Reference Height': x0_raw[0],
            'Reference Weight': x0_raw[1],
            'Delta Height (normalized)': delta_val[0],
            'Delta Weight (normalized)': delta_val[1],
            'Objective Value (normalized space)': obj_val
        })
    else:
        results.append({
            'Breed': breed,
            'Reference Height': x0_raw[0],
            'Reference Weight': x0_raw[1],
            'Delta Height (normalized)': np.nan,
            'Delta Weight (normalized)': np.nan,
            'Objective Value (normalized space)': np.nan
        })

# DataFrame
results_df = pd.DataFrame(results)

# Display the table
print(results_df)

# Plotting. Force a (k, d) shape so the column slices below stay valid even
# when no solve succeeded and the lists are empty.
ref_points = np.array(ref_points).reshape(-1, d)
cf_points = np.array(cf_points).reshape(-1, d)

fig = go.Figure()

# Plot all normalized data points (small gray markers)
fig.add_trace(go.Scatter(
    x=normalized_coordinates[:, 0],
    y=normalized_coordinates[:, 1],
    mode='markers',
    marker=dict(color='gray', size=6),
    name='Normalized Data Points'
))

# Reference points (orange circles)
fig.add_trace(go.Scatter(
    x=ref_points[:, 0],
    y=ref_points[:, 1],
    mode='markers',
    marker=dict(color='orange', size=10, symbol='circle'),
    name='Reference Points (x₀)'
))

# Counterfactual points (green crosses)
fig.add_trace(go.Scatter(
    x=cf_points[:, 0],
    y=cf_points[:, 1],
    mode='markers',
    marker=dict(color='green', size=10, symbol='x'),
    name='Counterfactual Points (x₀ + Δ)'
))

# Dotted lines from x₀ to x₀ + Δ
for i in range(len(ref_points)):
    fig.add_trace(go.Scatter(
        x=[ref_points[i, 0], cf_points[i, 0]],
        y=[ref_points[i, 1], cf_points[i, 1]],
        mode='lines',
        line=dict(color='gray', dash='dot'),
        showlegend=False
    ))

# Add medoid (red cross)
fig.add_trace(go.Scatter(
    x=[medoid_point[0]],
    y=[medoid_point[1]],
    mode='markers',
    marker=dict(color='red', size=12, symbol='x'),
    name=f'Medoid ({medoid_breed})',  # Append breed name to the legend label
))

fig.update_layout(
    xaxis_title='Height',
    yaxis_title='Weight',
    width=750,
    height=750,
    showlegend=False
)

fig.show()

# Write the figure to a PDF (requires kaleido)
fig.write_image("/content/xs_all.pdf")

# Hand the file to the browser (Colab only)
from google.colab import files
files.download("/content/xs_all.pdf")


                       Breed  Reference Height  Reference Weight  \
0              Affenpinscher              10.5              10.0   
1                  Chihuahua               7.5               3.5   
2            Chinese Crested              12.0               8.5   
3          Italian Greyhound              13.5               8.0   
4              Japanese Chin               9.5               7.5   
5                    Maltese               9.0               5.0   
6   Manchester Terrier (Toy)              11.0               7.0   
7                 Pomeranian              12.0               5.0   
8                 Poodle Toy              10.0              10.0   
9            Toy Fox Terrier              10.0               5.5   
10         Yorkshire Terrier               8.0               5.0   

    Delta Height (normalized)  Delta Weight (normalized)  \
0               -1.721911e-01              -1.118771e+00   
1                1.378569e+00               1.352056e+00   
2  

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [52]:
import numpy as np
import cvxpy as cp
import pandas as pd
import plotly.graph_objects as go
from scipy.spatial.distance import cdist

# Purpose of this cell: same per-breed shift problem as the unweighted
# version, but the objective penalises the height and weight components of
# the shift with different weights. Tabulates the shifts and draws them all
# on one figure. `cluster_df` comes from an earlier cell.

# Prepare data
raw_coordinates = cluster_df[['height', 'weight']].values
breeds = cluster_df['Breed'].values

# Normalize (z-score, population standard deviation)
means = raw_coordinates.mean(axis=0)
stds = raw_coordinates.std(axis=0)
normalized_coordinates = (raw_coordinates - means) / stds

n, d = normalized_coordinates.shape
results = []
ref_points = []  # reference points whose solve succeeded
cf_points = []   # matching counterfactual points (x0 + Delta)

# Per-feature weights of the objective, in (height, weight) order.
# Identical for every reference point, so defined once outside the loop.
w = np.array([0.75, 0.25])

# Compute medoid (in normalized space)
distances = cdist(normalized_coordinates, normalized_coordinates, metric='euclidean')
distance_sums = distances.sum(axis=1)
medoid_index = np.argmin(distance_sums)
medoid_point = normalized_coordinates[medoid_index]
medoid_breed = breeds[medoid_index]

# Loop over each point
for ref_idx in range(n):
    x0 = normalized_coordinates[ref_idx]
    breed = breeds[ref_idx]
    x0_raw = raw_coordinates[ref_idx]
    other_indices = [i for i in range(n) if i != ref_idx]

    # D_bar[k]: total distance from point j = other_indices[k] to every other
    # point except the reference. distances[j, j] is 0, so this is row j's sum
    # minus its distance to the reference.
    D_bar = distance_sums[other_indices] - distances[other_indices, ref_idx]

    # Minimise the Euclidean norm of the element-wise weighted shift.
    Delta = cp.Variable(d)
    objective = cp.Minimize(cp.norm(cp.multiply(w, Delta), 2))

    # Distance from the shifted reference to each other point (unweighted),
    # built once and reused by every constraint.
    shifted_norms = {i: cp.norm(x0 + Delta - normalized_coordinates[i], 2) for i in other_indices}

    # For each competitor j: the shifted reference's total distance to the
    # remaining points (j excluded) must not exceed D_bar for j.
    constraints = []
    for idx_j, j in enumerate(other_indices):
        dist_sum = sum(shifted_norms[i] for i in other_indices if i != j)
        constraints.append(dist_sum <= D_bar[idx_j])

    problem = cp.Problem(objective, constraints)
    problem.solve(verbose=False)

    # Delta.value is None when the solver produced no solution
    if Delta.value is not None and Delta.value.shape == (d,):
        delta_val = Delta.value.flatten()
        # problem.value is the weighted norm minimised above, not the plain
        # Euclidean length of the shift.
        obj_val = problem.value
        x_cf = x0 + delta_val
        ref_points.append(x0)
        cf_points.append(x_cf)
        results.append({
            'Breed': breed,
            'Reference Height': x0_raw[0],
            'Reference Weight': x0_raw[1],
            'Delta Height (normalized)': delta_val[0],
            'Delta Weight (normalized)': delta_val[1],
            'Objective Value (normalized space)': obj_val
        })
    else:
        results.append({
            'Breed': breed,
            'Reference Height': x0_raw[0],
            'Reference Weight': x0_raw[1],
            'Delta Height (normalized)': np.nan,
            'Delta Weight (normalized)': np.nan,
            'Objective Value (normalized space)': np.nan
        })

# DataFrame
results_df = pd.DataFrame(results)

# Display the table
print(results_df)

# Plotting. Force a (k, d) shape so the column slices below stay valid even
# when no solve succeeded and the lists are empty.
ref_points = np.array(ref_points).reshape(-1, d)
cf_points = np.array(cf_points).reshape(-1, d)

fig = go.Figure()

# Plot all normalized data points (small gray markers)
fig.add_trace(go.Scatter(
    x=normalized_coordinates[:, 0],
    y=normalized_coordinates[:, 1],
    mode='markers',
    marker=dict(color='gray', size=6),
    name='Normalized Data Points'
))

# Reference points (orange circles)
fig.add_trace(go.Scatter(
    x=ref_points[:, 0],
    y=ref_points[:, 1],
    mode='markers',
    marker=dict(color='orange', size=10, symbol='circle'),
    name='Reference Points (x₀)'
))

# Counterfactual points (green crosses)
fig.add_trace(go.Scatter(
    x=cf_points[:, 0],
    y=cf_points[:, 1],
    mode='markers',
    marker=dict(color='green', size=10, symbol='x'),
    name='Counterfactual Points (x₀ + Δ)'
))

# Dotted lines from x₀ to x₀ + Δ
for i in range(len(ref_points)):
    fig.add_trace(go.Scatter(
        x=[ref_points[i, 0], cf_points[i, 0]],
        y=[ref_points[i, 1], cf_points[i, 1]],
        mode='lines',
        line=dict(color='gray', dash='dot'),
        showlegend=False
    ))

# Add medoid (red cross)
fig.add_trace(go.Scatter(
    x=[medoid_point[0]],
    y=[medoid_point[1]],
    mode='markers',
    marker=dict(color='red', size=12, symbol='x'),
    name=f'Medoid ({medoid_breed})',  # Append breed name to the legend label
))

fig.update_layout(
    xaxis_title='Height',
    yaxis_title='Weight',
    width=750,
    height=750,
    showlegend=False
)

fig.show()

# Write the figure to a PDF (requires kaleido)
fig.write_image("/content/xs_all_w.pdf")

# Hand the file to the browser (Colab only)
from google.colab import files
files.download("/content/xs_all_w.pdf")


Solver used: CLARABEL
Solver used: CLARABEL
Solver used: CLARABEL
Solver used: CLARABEL
Solver used: CLARABEL
Solver used: CLARABEL
Solver used: CLARABEL
Solver used: CLARABEL
Solver used: CLARABEL
Solver used: CLARABEL
Solver used: CLARABEL
                       Breed  Reference Height  Reference Weight  \
0              Affenpinscher              10.5              10.0   
1                  Chihuahua               7.5               3.5   
2            Chinese Crested              12.0               8.5   
3          Italian Greyhound              13.5               8.0   
4              Japanese Chin               9.5               7.5   
5                    Maltese               9.0               5.0   
6   Manchester Terrier (Toy)              11.0               7.0   
7                 Pomeranian              12.0               5.0   
8                 Poodle Toy              10.0              10.0   
9            Toy Fox Terrier              10.0               5.5   
10        

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>