In [4]:
import sys, os, platform
import numpy as np
import pandas as pd
import gurobipy as gp
from gurobipy import GRB

In [5]:
# The CSV uses bare carriage returns ('\r', classic Mac OS) as its row
# separator, so the terminator has to be passed explicitly to the parser.
breast_cancer = pd.read_csv(
    "data/breastcancer_processed.csv",
    lineterminator='\r',
)

# Preview the first rows to sanity-check the parse
display(breast_cancer.head())

Unnamed: 0,Benign,ClumpThickness,UniformityOfCellSize,UniformityOfCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses
0,0,5,1,1,1,2,1,3,1,1
1,0,5,4,4,5,7,10,3,2,1
2,0,3,1,1,1,2,2,3,1,1
3,0,6,8,8,1,3,4,3,7,1
4,0,4,1,1,3,2,1,3,1,1


## Select two candidates to compare

We will pick two samples X and Y to explain why X is ranked/classified differently than Y (or just to compare them).
For this example, let's pick row 0 as X and row 1 as Y.

In [6]:
# Positional row numbers of the two samples under comparison
X_idx, Y_idx = 0, 1

x_row, y_row = breast_cancer.iloc[X_idx], breast_cancer.iloc[Y_idx]

# Every column except the target class 'Benign' is a feature to compare
features = [col for col in breast_cancer.columns if col != "Benign"]

# Equal weight of 1 per feature for simplicity; replace with domain-informed
# weights if they are available
weights = dict.fromkeys(features, 1.0)

print(f"Comparing Sample {X_idx} (X) vs Sample {Y_idx} (Y)")
print("X class:", x_row["Benign"])
print("Y class:", y_row["Benign"])

# One row per feature: its weight and the value taken by each sample
score_columns = {
    "feature": features,
    "weight": [weights[name] for name in features],
    "x": [x_row[name] for name in features],
    "y": [y_row[name] for name in features],
}
df_scores = pd.DataFrame(score_columns)

df_scores

Comparing Sample 0 (X) vs Sample 1 (Y)
X class: 0
Y class: 0


Unnamed: 0,feature,weight,x,y
0,ClumpThickness,1.0,5,5
1,UniformityOfCellSize,1.0,1,4
2,UniformityOfCellShape,1.0,1,4
3,MarginalAdhesion,1.0,1,5
4,SingleEpithelialCellSize,1.0,2,7
5,BareNuclei,1.0,1,10
6,BlandChromatin,1.0,3,3
7,NormalNucleoli,1.0,1,2
8,Mitoses,1.0,1,1


## Step 1: Compute Contributions

Calculate `delta` = weight * (x - y).
Positive delta means X is better (Pro).
Negative delta means Y is better (Con).

**Note**: In breast cancer data, *lower* feature values are usually better (less abnormal).
If `Benign` is the target (1 = Benign, 0 = Malignant) and the features are measurements such as clump thickness, then higher values are generally worse (more indicative of malignancy).
So if X has a *higher* value than Y on a feature, X is actually *worse* on that feature from the point of view of being benign, even though its delta is positive.
For now we stick to the algebraic definition from `main.ipynb`, i.e. we explain why Score(X) > Score(Y).

In [7]:
# delta = weight * (x - y); its sign labels the feature as an argument for X
# ("pro"), for Y ("con"), or for neither ("neutral").
df = df_scores.assign(
    delta=lambda t: t["weight"] * (t["x"] - t["y"]),
    sign=lambda t: np.where(
        t["delta"] > 0,
        "pro",
        np.where(t["delta"] < 0, "con", "neutral"),
    ),
)

display(df)

Unnamed: 0,feature,weight,x,y,delta,sign
0,ClumpThickness,1.0,5,5,0.0,neutral
1,UniformityOfCellSize,1.0,1,4,-3.0,con
2,UniformityOfCellShape,1.0,1,4,-3.0,con
3,MarginalAdhesion,1.0,1,5,-4.0,con
4,SingleEpithelialCellSize,1.0,2,7,-5.0,con
5,BareNuclei,1.0,1,10,-9.0,con
6,BlandChromatin,1.0,3,3,0.0,neutral
7,NormalNucleoli,1.0,1,2,-1.0,con
8,Mitoses,1.0,1,1,0.0,neutral


## Step 2: Build Feasible Trade-offs

In [8]:
# Partition the features by the sign of their contribution
is_pro = df["delta"] > 0
is_con = df["delta"] < 0
is_neutral = df["delta"] == 0

pros = df.loc[is_pro, "feature"].tolist()
cons = df.loc[is_con, "feature"].tolist()
neutral = df.loc[is_neutral, "feature"].tolist()

# feature name -> signed contribution
delta_map = {name: value for name, value in zip(df["feature"], df["delta"])}

# A (pro, con) pair is a feasible 1-1 trade-off when the pro strictly
# outweighs the con, i.e. their deltas sum to a positive margin.
A = [
    (pro, con, delta_map[pro] + delta_map[con])
    for pro in pros
    for con in cons
    if delta_map[pro] + delta_map[con] > 0
]

print("Pros:", pros)
print("Cons:", cons)
print("Values:", {k: delta_map[k] for k in pros + cons})
print("Number of feasible (1-1) trade-offs:", len(A))

Pros: []
Cons: ['UniformityOfCellSize', 'UniformityOfCellShape', 'MarginalAdhesion', 'SingleEpithelialCellSize', 'BareNuclei', 'NormalNucleoli']
Values: {'UniformityOfCellSize': -3.0, 'UniformityOfCellShape': -3.0, 'MarginalAdhesion': -4.0, 'SingleEpithelialCellSize': -5.0, 'BareNuclei': -9.0, 'NormalNucleoli': -1.0}
Number of feasible (1-1) trade-offs: 0


## Step 3: Solve Optimization (Gurobi)

In [9]:
def solve_explanation_1_1_gurobi(delta, pros, cons, feasible_edges, verbose=False):
    """Pick 1-to-1 pro/con trade-offs that cover every con with maximum total margin.

    A binary variable is created per feasible edge. Each con must be matched
    by exactly one selected edge, each pro may appear in at most one selected
    edge, and the sum of the margins of the selected edges is maximized.

    Parameters
    ----------
    delta : mapping
        Not used by the solver; kept so existing call sites keep working.
    pros : list
        Identifiers of the pro arguments (each usable at most once).
    cons : list
        Identifiers of the con arguments (each must be covered exactly once).
    feasible_edges : list of (pro, con, margin)
        Candidate trade-offs and their margins.
    verbose : bool, default False
        When True, Gurobi's solver log is enabled.

    Returns
    -------
    dict
        ``{"status": "feasible", "chosen": [(pro, con, margin), ...], "obj": float}``
        on success, otherwise ``{"status": "infeasible", "message": str}``
        (the "no edges" case additionally carries ``"edges"``).
    """
    edges = [(p, c) for (p, c, _) in feasible_edges]
    margins = np.array([margin for (_, _, margin) in feasible_edges], dtype=float)

    n = len(edges)
    if n == 0:
        if len(cons) == 0:
            # Nothing to cover: the empty selection satisfies every constraint
            # and is what the solver would return for this (empty) model.
            return {"status": "feasible", "chosen": [], "obj": 0.0}
        return {"status": "infeasible", "message": "No feasible trade-off edges.", "edges": edges}

    # Index the edges by the con they cover and by the pro they consume.
    con_to_js = {con: [] for con in cons}
    pro_to_js = {pro: [] for pro in pros}
    for j, (p, c) in enumerate(edges):
        if c in con_to_js:
            con_to_js[c].append(j)
        if p in pro_to_js:
            pro_to_js[p].append(j)

    # Decide coverage infeasibility from the inputs alone, before a Gurobi
    # model (and the environment/licence it needs) is created.
    for con in cons:
        if len(con_to_js[con]) == 0:
            return {"status": "infeasible", "message": f"No feasible trade-off covers con={con}."}

    model = gp.Model("explanation_1_1")
    try:
        model.Params.OutputFlag = 1 if verbose else 0

        z = model.addVars(n, vtype=GRB.BINARY, name="z")

        # Maximize total margin
        model.setObjective(gp.quicksum(margins[j] * z[j] for j in range(n)), GRB.MAXIMIZE)

        # Cover each con exactly once
        for con in cons:
            model.addConstr(
                gp.quicksum(z[j] for j in con_to_js[con]) == 1,
                name=f"cover_con[{con}]",
            )

        # Use each pro at most once (pros without any edge need no constraint)
        for pro in pros:
            js = pro_to_js[pro]
            if len(js) > 0:
                model.addConstr(gp.quicksum(z[j] for j in js) <= 1, name=f"use_pro[{pro}]")

        model.optimize()

        if model.Status == GRB.OPTIMAL:
            # Binary values come back as floats; 0.5 is the rounding threshold.
            chosen = [
                (edges[j][0], edges[j][1], float(margins[j]))
                for j in range(n)
                if z[j].X > 0.5
            ]
            return {"status": "feasible", "chosen": chosen, "obj": model.ObjVal}
        return {"status": "infeasible", "message": "Optimization failed or infeasible."}
    finally:
        # Release the solver resources held by the model on every exit path.
        model.dispose()

# Solve for the selected pair and report the outcome
sol = solve_explanation_1_1_gurobi(delta_map, pros, cons, A)
print("Solution Status:", sol["status"])

# The chosen trade-offs only exist when the solver found an optimum
if sol["status"] == "feasible":
    display(
        pd.DataFrame(sol["chosen"], columns=["Pro", "Con", "Margin"])
        .sort_values(by="Margin", ascending=False)
    )

Solution Status: infeasible
