# PART 2 – Affinity-based Segmentation

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

In [79]:
# Workbook holding the survey sheets (path is relative to the working directory).
WB = Path("mugs-analysis-full-incl-demographics.xlsx")

# Open the workbook once and close it deterministically. Previously an
# ExcelFile handle was opened and never closed, and the file was then
# re-opened for every read_excel call.
with pd.ExcelFile(WB) as xls:
    print(xls.sheet_names)

    # Load relevant sheets.
    # "for-cluster-analysis" is read with its header row; "mugs-full" is read
    # raw (header=None) so its cells keep their positional row/column indices.
    fca = pd.read_excel(xls, sheet_name="for-cluster-analysis")
    mf  = pd.read_excel(xls, sheet_name="mugs-full", header=None)

print("fca shape:", fca.shape)
print("mf shape:", mf.shape)


['mugs-full', 'demographics', 'for-cluster-analysis']
fca shape: (311, 29)
mf shape: (341, 59)


In [81]:
# Tidy the for-cluster-analysis headers: trim surrounding whitespace from every
# column label, then rename the "Icn" column to "ICn".
fca.columns = fca.columns.map(lambda label: label.strip())
fca = fca.rename({"Icn": "ICn"}, axis="columns")

In [83]:
# Pull the three probability columns out of the raw "mugs-full" sheet.
#   rows    : spreadsheet row 31 onward      -> positional index 30 and up
#   columns : spreadsheet columns BD, BE, BF -> positional indices 55, 56, 57
# The result is relabelled P1/P2/P3, re-indexed from 0, and capped at the
# first 311 rows (one per respondent) just in case the sheet runs longer.
prob_block = (
    mf.iloc[30:, [55, 56, 57]]
    .set_axis(["P1", "P2", "P3"], axis=1)
    .reset_index(drop=True)
    .iloc[:311, :]
)

# Quick visual confirmation of the extracted block.
print("Probability sample (first 5 rows):")
print(prob_block.head())

Probability sample (first 5 rows):
         P1        P2        P3
0  0.698892  0.083471  0.217637
1  0.486721  0.373831  0.139448
2  0.044146  0.862398  0.093456
3  0.581424  0.045778  0.372798
4  0.155494  0.558013  0.286493


In [85]:
# Header clean-up (whitespace trim, then "Icn" -> "ICn"); harmless to repeat.
fca.columns = fca.columns.map(lambda label: label.strip())
fca = fca.rename({"Icn": "ICn"}, axis="columns")

# Descriptor columns used for the segment profiles, in reporting order.
desc = [
    # six base "I.." columns
    "IPr", "Iin", "ICp", "ICl", "ICn", "IBr",
    # "I*p.." columns, three per base column
    "I*pPr30", "I*pPr10", "I*pPr05",
    "I*pIn0.5", "I*pIn1", "I*pIn3",
    "I*pCp12", "I*pCp20", "I*pCp32",
    "I*pClD", "I*pClF", "I*pClE",
    "I*pCnSl", "I*pCnSp", "I*pCnLk",
    "I*pBrA", "I*pBrB", "I*pBrC",
    # respondent-level columns
    "income", "age", "sports", "gradschl",
]

# Force every descriptor to a numeric dtype; unparseable cells become NaN.
fca[desc] = fca[desc].apply(lambda col: pd.to_numeric(col, errors="coerce"))

print("FCA rows:", fca.shape[0], "| descriptors present:", sum(1 for name in desc if name in fca.columns))


FCA rows: 311 | descriptors present: 28


In [87]:
# Put the descriptor table and the P1/P2/P3 probabilities side by side
# (aligned on the row index), using at most the first 311 rows of each.
df = pd.concat([fca.iloc[:311], prob_block.iloc[:311]], axis="columns")
print("Combined shape:", df.shape)

# Eyeball the probabilities and their per-column min / max.
prob_view = df[["P1", "P2", "P3"]]
print(prob_view.head())
print("Prob ranges:", prob_view.min().to_dict(), "→", prob_view.max().to_dict())


Combined shape: (311, 32)
         P1        P2        P3
0  0.698892  0.083471  0.217637
1  0.486721  0.373831  0.139448
2  0.044146  0.862398  0.093456
3  0.581424  0.045778  0.372798
4  0.155494  0.558013  0.286493
Prob ranges: {'P1': 0.007447395557836442, 'P2': 0.005543476269930805, 'P3': 0.006981518702713945} → {'P1': 0.7829141568912598, 'P2': 0.9801551904576156, 'P3': 0.686380142849348}


In [89]:
import numpy as np
def wmean_series(d, wcol, cols):
    """Return the weighted mean of each column in ``cols``, weighted by ``d[wcol]``.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame holding both the weight column and the value columns.
    wcol : str
        Name of the weight column. Weights are cast to float and negative
        weights are clipped to 0.
    cols : list of str
        Value columns to average; they must be convertible to float.

    Returns
    -------
    pandas.Series
        One weighted mean per entry of ``cols`` (indexed by ``cols``).
        NaN cells are left out of both the numerator and the denominator, so
        each column is averaged over its own non-missing rows. Rows whose
        weight is NaN contribute nothing. A column whose usable weight sums to
        zero (no rows, all values missing, or all weights zero) yields NaN.
    """
    w = d[wcol].astype(float).clip(lower=0).to_numpy()
    X = d[cols].to_numpy(float)
    observed = ~np.isnan(X)

    # nansum drops NaN products, i.e. cells where the value or the weight is missing.
    num = np.nansum(X * w[:, None], axis=0)
    den = np.nansum(observed * w[:, None], axis=0)

    # Divide only where there is positive total weight; elsewhere keep NaN.
    # This avoids the 0/0 RuntimeWarning a plain ``num / den`` would emit.
    means = np.full(den.shape, np.nan)
    np.divide(num, den, out=means, where=den > 0)
    return pd.Series(means, index=cols)


In [91]:
# Segment C profile: descriptor means weighted by P3, alongside the plain
# unweighted mean of every descriptor.
segC = wmean_series(df, wcol="P3", cols=desc).rename("C_customers")
overall = df.loc[:, desc].mean().rename("overall")

# Spot-check one value against the reference figure quoted in the message.
print("IPr (C_customers)  ≈ expected 25.96 →", round(float(segC.loc["IPr"]), 2))


IPr (C_customers)  ≈ expected 25.96 → 25.96


In [93]:
# Segment A and B profiles: same weighted mean, weighted by P1 and P2.
segA, segB = (
    wmean_series(df, wcol=prob_col, cols=desc).rename(label)
    for prob_col, label in (("P1", "A_customers"), ("P2", "B_customers"))
)

# One row per descriptor; columns = the three segment profiles + overall mean.
affinity = pd.concat([segA, segB, segC, overall], axis="columns")
affinity.head(10)


Unnamed: 0,A_customers,B_customers,C_customers,overall
IPr,18.047792,38.9822,25.957759,29.125402
Iin,13.356015,8.963664,11.139006,10.88746
ICp,16.3604,12.163142,12.199318,13.472669
ICl,18.474953,14.565828,21.128709,17.475884
ICn,20.656958,12.400454,16.648585,16.057878
IBr,13.116932,12.891389,12.966526,12.980707
I*pPr30,18.047792,38.9822,25.957759,29.125402
I*pPr10,92.472627,220.564622,130.143038,157.475884
I*pPr05,126.334541,272.8754,181.704311,203.877814
I*pIn0.5,13.356015,8.963664,11.139006,10.88746


In [95]:
# Log-lift per segment: log10(segment weighted mean / overall mean) for each descriptor.
loglifts = pd.DataFrame({
    f"{letter}_loglift": np.log10(affinity[f"{letter}_customers"] / affinity["overall"])
    for letter in ("A", "B", "C")
})
out = affinity.join(loglifts, how="left")

# Spot-check one value against the reference figure quoted in the message.
print("gradschl (C_loglift) ≈ 0.12737 →", round(float(out.at["gradschl", "C_loglift"]), 5))

# Strongest signals per segment, ranked by absolute log-lift (top 8 each).
for seg in loglifts.columns:
    top = out[seg].abs().sort_values(ascending=False).iloc[:8].round(4)
    print(f"\nTop |log-lifts| for {seg}:")
    print(top)


gradschl (C_loglift) ≈ 0.12737 → 0.12737

Top |log-lifts| for A_loglift:
I*pPr10    0.2312
IPr        0.2078
I*pPr30    0.2078
I*pPr05    0.2078
sports     0.1485
I*pBrA     0.1468
I*pBrC     0.1180
I*pCnLk    0.1094
Name: A_loglift, dtype: float64

Top |log-lifts| for B_loglift:
sports      0.2986
I*pPr10     0.1463
I*pBrA      0.1426
gradschl    0.1300
I*pPr05     0.1266
I*pPr30     0.1266
IPr         0.1266
I*pCnSl     0.1123
Name: B_loglift, dtype: float64

Top |log-lifts| for C_loglift:
gradschl    0.1274
sports      0.1273
I*pPr10     0.0828
I*pClE      0.0824
I*pClD      0.0824
ICl         0.0824
I*pClF      0.0696
I*pBrC      0.0511
Name: C_loglift, dtype: float64


In [99]:
# Write the weighted-means table and the means + log-lifts table to CSV.
# The file names are relative, so both files land in the current working
# directory. Report their resolved locations instead of a hard-coded "/data/",
# which is not necessarily where they were written.
affinity_csv = Path("part2_affinity_weighted_means_only.csv")
loglift_csv = Path("part2_affinity_weighted_means_and_loglifts.csv")
affinity.to_csv(affinity_csv)
out.to_csv(loglift_csv)
print("Saved CSVs to:", affinity_csv.resolve(), "and", loglift_csv.resolve())


Saved CSVs to /data/
