In [1]:
import os, time, duckdb, torch, timm, gc, shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from PIL import Image
from sklearn.model_selection import ParameterGrid
import xgboost as xgb


from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier

import torchvision.transforms as T
from pathlib import Path

from torch.utils.data import Dataset, DataLoader
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
import duckdb, torch
from transformers import CLIPModel, CLIPProcessor

# FUNCTIONS

In [2]:
def modality_shares_from_shap(shap_values_2d: np.ndarray):
    abs_shap = np.abs(shap_values_2d)
    mean_abs = abs_shap.mean(axis=0) 

    text_idx = np.arange(0, 512)
    img_idx  = np.arange(512, 1024)
    meta_idx = np.arange(1024, 1052)

    I_text = mean_abs[text_idx].sum()
    I_img  = mean_abs[img_idx].sum()
    I_meta = mean_abs[meta_idx].sum()
    total = I_text + I_img + I_meta

    return {
        "text": float(I_text / total),
        "image": float(I_img / total),
        "meta": float(I_meta / total),
    }

def modality_shares_multiclass(booster, X_ex: np.ndarray, n_classes: int):
    X_ex = X_ex.astype(np.float32)
    dex = xgb.DMatrix(X_ex)

    contrib = booster.predict(dex, pred_contribs=True)  # (n, C, 1053)
    assert contrib.ndim == 3, f"Expected (n, C, 1053), got {contrib.shape}"
    assert contrib.shape[1] == n_classes, f"Expected {n_classes} classes, got {contrib.shape[1]}"

    # Drop bias for all classes -> (n, C, 1052)
    shap_all = contrib[:, :, :-1]

    # Overall: class-agnostic aggregation (mean abs across classes) -> (n, 1052)
    shap_overall = np.mean(np.abs(shap_all), axis=1)
    overall_shares = modality_shares_from_shap(shap_overall)

    # Per-class: compute shares for each class separately using abs SHAP for that class
    per_class = {}
    for c in range(n_classes):
        shap_c = shap_all[:, c, :]  # (n, 1052) signed
        per_class[c] = modality_shares_from_shap(shap_c)

    return overall_shares, per_class

In [3]:
def modality_shares_binary_by_true_label(booster, X_ex: np.ndarray, y_ex: np.ndarray):
    X_ex = X_ex.astype(np.float32)
    y_ex = np.asarray(y_ex).astype(int)

    dex = xgb.DMatrix(X_ex)
    contrib = booster.predict(dex, pred_contribs=True)  # (n, 1053)
    shap_all = contrib[:, :-1]  # (n, 1052)

    out = {}
    for label in [0, 1]:
        mask = (y_ex == label)
        if mask.sum() == 0:
            out[label] = None
        else:
            out[label] = modality_shares_from_shap(shap_all[mask])
    return out


# BINARY CLASSIFICATION

In [6]:
X_tr = np.load("D:/dataset/multimodal3/X_tr.npy", allow_pickle = True).astype(np.float32)
X_te = np.load("D:/dataset/multimodal3/X_te.npy", allow_pickle = True).astype(np.float32)

y_tr = np.load("D:/dataset/multimodal3/y_tr_2.npy", allow_pickle = True)
y_te = np.load("D:/dataset/multimodal3/y_te_2.npy", allow_pickle = True)

map_high = {"low": 0, "high": 1}
y_tr_enc = np.vectorize(map_high.get)(y_tr).astype(int)
y_te_enc = np.vectorize(map_high.get)(y_te).astype(int)

In [3]:
# Train and save the model

cfg = XGBClassifier(
    colsample_bytree=0.5,
    gamma=0,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=150,
    reg_lambda=1,
    subsample=0.8,
    objective="binary:logistic",
    tree_method="hist",
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42,
    verbosity=0
)

cfg.fit(X_tr, y_tr_enc)

y_te_pred = cfg.predict(X_te)
macro_f1 = f1_score(y_te_enc, y_te_pred, average="macro")
acc = accuracy_score(y_te_enc, y_te_pred)
print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

cfg.save_model("D:/dataset/explainability/xgb_clip_binary.json")

macro-F1 (test): 0.6877 | accuracy (test): 0.6878


In [7]:
# Load the model
booster = xgb.Booster()
booster.load_model(r"D:/dataset/explainability/xgb_clip_binary.json")

In [8]:
# Select random numbers for selecting the indexes
rng = np.random.default_rng(42)

te_explain_size = 10000 # number of test examples to compute shap on

# same but for the test examples
ex_idx = rng.choice(X_te.shape[0], size=te_explain_size, replace=False)

X_ex = X_te[ex_idx]

X_ex = X_ex.astype(np.float32)

# Build DMatrix
dex = xgb.DMatrix(X_ex)

contrib = booster.predict(dex, pred_contribs=True)

In [9]:
# Show the global modality contribution

# Remove the last column (bias term)
shap_values = contrib[:, :-1]

abs_shap = np.abs(shap_values)
mean_abs = abs_shap.mean(axis=0)

text_idx = np.arange(0, 512)
img_idx  = np.arange(512, 1024)
meta_idx = np.arange(1024, 1052)

I_text = mean_abs[text_idx].sum()
I_img  = mean_abs[img_idx].sum()
I_meta = mean_abs[meta_idx].sum()
total = I_text + I_img + I_meta

print("Modality shares:")
print({
    "text": float(I_text / total),
    "image": float(I_img / total),
    "meta": float(I_meta / total),
})

Modality shares:
{'text': 0.16660605370998383, 'image': 0.4277467429637909, 'meta': 0.4056472182273865}


In [10]:
# Show metadata feature importance

meta_importance = mean_abs[meta_idx]
top = np.argsort(meta_importance)[::-1][:10]

for k in top:
    print(k, meta_importance[k])

14 0.5562009
13 0.17242076
9 0.12112257
7 0.08417354
6 0.078865945
27 0.065481834
1 0.045130923
10 0.0444128
3 0.043978024
25 0.040058717


In [11]:
# Stability check for different explain set sizes

rng = np.random.default_rng(42)

# load booster
booster = xgb.Booster()
booster.load_model("D:/dataset/explainability/xgb_clip_binary.json")

# feature groups
text_idx = np.arange(0, 512)
img_idx  = np.arange(512, 1024)
meta_idx = np.arange(1024, 1052)

# explain sizes to test
explain_sizes = [5000, 10000, 20000, 50000]

print("Stability check (modality shares):\n")

for explain_size in explain_sizes:
    ex_idx = rng.choice(X_te.shape[0], size=explain_size, replace=False)
    X_ex = X_te[ex_idx]

    dex = xgb.DMatrix(X_ex)
    contrib = booster.predict(dex, pred_contribs=True)

    # remove bias term
    shap_values = contrib[:, :-1]

    abs_shap = np.abs(shap_values)
    mean_abs = abs_shap.mean(axis=0)

    I_text = mean_abs[text_idx].sum()
    I_img  = mean_abs[img_idx].sum()
    I_meta = mean_abs[meta_idx].sum()
    total = I_text + I_img + I_meta

    print(f"N = {explain_size:6d} | "
          f"text: {I_text/total:.3f} | "
          f"image: {I_img/total:.3f} | "
          f"meta: {I_meta/total:.3f}")


Stability check (modality shares):

N =   5000 | text: 0.166 | image: 0.429 | meta: 0.405
N =  10000 | text: 0.168 | image: 0.430 | meta: 0.402
N =  20000 | text: 0.168 | image: 0.429 | meta: 0.404
N =  50000 | text: 0.167 | image: 0.428 | meta: 0.405


In [12]:
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_te_enc = le.transform(y_te)

rng = np.random.default_rng(42)
ex_idx = rng.choice(X_te.shape[0], size=20000, replace=False)
X_ex = X_te[ex_idx]
y_ex = y_te_enc[ex_idx]

booster = xgb.Booster()
booster.load_model(r"D:/dataset/explainability/xgb_clip_binary.json")

shares_by_true = modality_shares_binary_by_true_label(booster, X_ex, y_ex)
print("Binary shares by y_true:")
print("y=0:", shares_by_true[0])
print("y=1:", shares_by_true[1])

Binary shares by y_true:
y=0: {'text': 0.16838540136814117, 'image': 0.43672969937324524, 'meta': 0.3948848843574524}
y=1: {'text': 0.16564491391181946, 'image': 0.4192452132701874, 'meta': 0.4151098430156708}


# 3 CLASSES

In [7]:
X_tr = np.load("D:/dataset/multimodal3/X_tr.npy", allow_pickle = True).astype(np.float32)
X_te = np.load("D:/dataset/multimodal3/X_te.npy", allow_pickle = True).astype(np.float32)

y_tr = np.load("D:/dataset/multimodal3/y_tr_3.npy", allow_pickle = True)
y_te = np.load("D:/dataset/multimodal3/y_te_3.npy", allow_pickle = True)

le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_te_enc = le.transform(y_te)

for i, cls in enumerate(le.classes_):
    print(i, "->", cls)

0 -> high
1 -> low
2 -> medium


In [3]:
le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_te_enc = le.transform(y_te)

cfg = XGBClassifier(
    colsample_bytree=0.5,
    gamma=0,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=150,
    reg_lambda=1,
    subsample=0.8,
    objective="multi:softprob",
    tree_method="hist",
    eval_metric="mlogloss",
    n_jobs=-1,
    random_state=42,
    verbosity=0,
    num_class = 3
)

cfg.fit(X_tr, y_tr_enc)

y_te_pred = cfg.predict(X_te)
macro_f1 = f1_score(y_te_enc, y_te_pred, average="macro")
acc = accuracy_score(y_te_enc, y_te_pred)
print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

cfg.save_model("D:/dataset/explainability/xgb_clip_3.json")

macro-F1 (test): 0.5180 | accuracy (test): 0.5216


In [3]:
booster = xgb.Booster()
booster.load_model(r"D:/dataset/explainability/xgb_clip_3.json")

In [6]:
# Select random numbers for selecting the indexes
rng = np.random.default_rng(42)

te_explain_size = 10000 # number of test examples to compute shap on
ex_idx = rng.choice(X_te.shape[0], size=te_explain_size, replace=False)

# Define the subsets
X_ex = X_te[ex_idx]

X_ex = X_ex.astype(np.float32)
dex = xgb.DMatrix(X_ex)

# Compute SHAP contributions

contrib = booster.predict(dex, pred_contribs=True)
print("contrib shape:", contrib.shape)
print("contrib ndim:", contrib.ndim)

contrib shape: (10000, 3, 1053)
contrib ndim: 3


In [7]:
shap_all = contrib[:, :, :-1]

shap_values = np.mean(np.abs(shap_all), axis=1)

mean_abs = shap_values.mean(axis=0)

text_idx = np.arange(0, 512)
img_idx  = np.arange(512, 1024)
meta_idx = np.arange(1024, 1052)

I_text = mean_abs[text_idx].sum()
I_img  = mean_abs[img_idx].sum()
I_meta = mean_abs[meta_idx].sum()
total = I_text + I_img + I_meta

print("Modality shares (3-class, mean over classes):")
print({
    "text": float(I_text / total),
    "image": float(I_img / total),
    "meta": float(I_meta / total),
})


Modality shares (3-class, mean over classes):
{'text': 0.1693553328514099, 'image': 0.4173530340194702, 'meta': 0.4132916331291199}


In [8]:
meta_importance = mean_abs[meta_idx]
top = np.argsort(meta_importance)[::-1][:10]

for k in top:
    print(k, meta_importance[k])

14 0.28172255
13 0.080772385
9 0.056016944
7 0.03924979
27 0.033874292
25 0.024347482
6 0.024136435
10 0.021430768
3 0.02122503
12 0.021020668


In [9]:
rng = np.random.default_rng(42)

# load booster
booster = xgb.Booster()
booster.load_model(r"D:/dataset/explainability/xgb_clip_3.json")

# feature groups
text_idx = np.arange(0, 512)
img_idx  = np.arange(512, 1024)
meta_idx = np.arange(1024, 1052)

# explain sizes to test
explain_sizes = [5000, 10000, 20000, 50000]

print("Stability check (modality shares) - 3-class (mean over classes):\n")

for explain_size in explain_sizes:
    ex_idx = rng.choice(X_te.shape[0], size=explain_size, replace=False)
    X_ex = X_te[ex_idx].astype(np.float32)

    dex = xgb.DMatrix(X_ex)
    contrib = booster.predict(dex, pred_contribs=True) 

    shap_all = contrib[:, :, :-1]

    shap_values = np.mean(np.abs(shap_all), axis=1)

    mean_abs = shap_values.mean(axis=0)

    I_text = mean_abs[text_idx].sum()
    I_img  = mean_abs[img_idx].sum()
    I_meta = mean_abs[meta_idx].sum()
    total = I_text + I_img + I_meta

    print(f"N = {explain_size:6d} | "
          f"text: {I_text/total:.3f} | "
          f"image: {I_img/total:.3f} | "
          f"meta: {I_meta/total:.3f}")

Stability check (modality shares) - 3-class (mean over classes):

N =   5000 | text: 0.169 | image: 0.418 | meta: 0.414
N =  10000 | text: 0.170 | image: 0.419 | meta: 0.412
N =  20000 | text: 0.170 | image: 0.418 | meta: 0.413
N =  50000 | text: 0.169 | image: 0.417 | meta: 0.413


In [26]:
rng = np.random.default_rng(42)
ex_idx = rng.choice(X_te.shape[0], size=10000, replace=False)
X_ex = X_te[ex_idx]

booster = xgb.Booster()
booster.load_model(r"D:/dataset/explainability/xgb_clip_3.json")

overall, per_class = modality_shares_multiclass(booster, X_ex, n_classes=3)

print("OVERALL (mean over classes):", overall)
for c, shares in per_class.items():
    print(f"CLASS {c}:", shares)

# 0 high, 1 low, 2 medium

OVERALL (mean over classes): {'text': 0.16893193125724792, 'image': 0.4168980121612549, 'meta': 0.4141699969768524}
CLASS 0: {'text': 0.13781584799289703, 'image': 0.4197740852832794, 'meta': 0.44241005182266235}
CLASS 1: {'text': 0.1542525738477707, 'image': 0.4044705927371979, 'meta': 0.4412768483161926}
CLASS 2: {'text': 0.296787291765213, 'image': 0.44449183344841003, 'meta': 0.25872087478637695}


# 5 CLASSI

In [9]:
X_tr = np.load("D:/dataset/multimodal3/X_tr.npy", allow_pickle = True).astype(np.float32)
X_te = np.load("D:/dataset/multimodal3/X_te.npy", allow_pickle = True).astype(np.float32)

y_tr = np.load("D:/dataset/multimodal3/y_tr_5.npy", allow_pickle = True)
y_te = np.load("D:/dataset/multimodal3/y_te_5.npy", allow_pickle = True)

le = LabelEncoder()
y_tr_enc = le.fit_transform(y_tr)
y_te_enc = le.transform(y_te)

for i, cls in enumerate(le.classes_):
    print(i, "->", cls)

0 -> high
1 -> low
2 -> medium
3 -> very_high
4 -> very_low


In [3]:
cfg = XGBClassifier(
    colsample_bytree=0.5,
    gamma=0,
    learning_rate=0.1,
    max_depth=6,
    n_estimators=150,
    reg_lambda=1,
    subsample=0.8,
    objective="multi:softprob",
    tree_method="hist",
    eval_metric="mlogloss",
    n_jobs=-1,
    random_state=42,
    verbosity=0,
    num_class = 3
)

cfg.fit(X_tr, y_tr_enc)

y_te_pred = cfg.predict(X_te)
macro_f1 = f1_score(y_te_enc, y_te_pred, average="macro")
acc = accuracy_score(y_te_enc, y_te_pred)
print(f"macro-F1 (test): {macro_f1:.4f} | accuracy (test): {acc:.4f}")

cfg.save_model("D:/dataset/explainability/xgb_clip_5.json")

macro-F1 (test): 0.3381 | accuracy (test): 0.3507


In [11]:
booster = xgb.Booster()
booster.load_model(r"D:/dataset/explainability/xgb_clip_5.json")

In [12]:
# Select random numbers for selecting the indexes
rng = np.random.default_rng(42)

te_explain_size = 10000 # number of test examples to compute shap on
ex_idx = rng.choice(X_te.shape[0], size=te_explain_size, replace=False)

X_ex = X_te[ex_idx]
X_ex = X_ex.astype(np.float32)

dex = xgb.DMatrix(X_ex)

contrib = booster.predict(dex, pred_contribs=True)
print("contrib shape:", contrib.shape)
print("contrib ndim:", contrib.ndim)


contrib shape: (10000, 5, 1053)
contrib ndim: 3


In [15]:
shap_all = contrib[:, :, :-1]

shap_values = np.mean(np.abs(shap_all), axis=1)

mean_abs = shap_values.mean(axis=0)

text_idx = np.arange(0, 512)
img_idx  = np.arange(512, 1024)
meta_idx = np.arange(1024, 1052)

I_text = mean_abs[text_idx].sum()
I_img  = mean_abs[img_idx].sum()
I_meta = mean_abs[meta_idx].sum()
total = I_text + I_img + I_meta

print("Modality shares (5-class, mean over classes):")
print({
    "text": float(I_text / total),
    "image": float(I_img / total),
    "meta": float(I_meta / total),
})


Modality shares (5-class, mean over classes):
{'text': 0.16394230723381042, 'image': 0.4095773696899414, 'meta': 0.42648032307624817}


In [16]:
meta_importance = mean_abs[meta_idx]
top = np.argsort(meta_importance)[::-1][:10]

for k in top:
    print(k, meta_importance[k])

14 0.3057474
13 0.0879403
9 0.056693677
7 0.040388916
27 0.03629002
25 0.024588961
6 0.023743853
12 0.023381483
1 0.022200642
20 0.021179441


In [17]:
rng = np.random.default_rng(42)

# load booster
booster = xgb.Booster()
booster.load_model(r"D:/dataset/explainability/xgb_clip_5.json")

# feature groups
text_idx = np.arange(0, 512)
img_idx  = np.arange(512, 1024)
meta_idx = np.arange(1024, 1052)

# explain sizes to test
explain_sizes = [5000, 10000, 20000, 50000]

print("Stability check (modality shares) - 5-class (mean over classes):\n")

for explain_size in explain_sizes:
    ex_idx = rng.choice(X_te.shape[0], size=explain_size, replace=False)
    X_ex = X_te[ex_idx].astype(np.float32)

    dex = xgb.DMatrix(X_ex)
    contrib = booster.predict(dex, pred_contribs=True) 

    shap_all = contrib[:, :, :-1]

    shap_values = np.mean(np.abs(shap_all), axis=1)

    mean_abs = shap_values.mean(axis=0)

    I_text = mean_abs[text_idx].sum()
    I_img  = mean_abs[img_idx].sum()
    I_meta = mean_abs[meta_idx].sum()
    total = I_text + I_img + I_meta

    print(f"N = {explain_size:6d} | "
          f"text: {I_text/total:.3f} | "
          f"image: {I_img/total:.3f} | "
          f"meta: {I_meta/total:.3f}")

Stability check (modality shares) - 5-class (mean over classes):

N =   5000 | text: 0.163 | image: 0.410 | meta: 0.427
N =  10000 | text: 0.164 | image: 0.411 | meta: 0.425
N =  20000 | text: 0.164 | image: 0.410 | meta: 0.426
N =  50000 | text: 0.164 | image: 0.410 | meta: 0.427


In [29]:
rng = np.random.default_rng(42)
ex_idx = rng.choice(X_te.shape[0], size=10000, replace=False)
X_ex = X_te[ex_idx]

booster = xgb.Booster()
booster.load_model(r"D:/dataset/explainability/xgb_clip_5.json")

overall, per_class = modality_shares_multiclass(booster, X_ex, n_classes=5)

print("OVERALL (mean over classes):", overall)
for c, shares in per_class.items():
    print(f"CLASS {c}:", shares)

# 0 -> high
# 1 -> low
# 2 -> medium
# 3 -> very_high
# 4 -> very_low

OVERALL (mean over classes): {'text': 0.16346345841884613, 'image': 0.4090849757194519, 'meta': 0.427451491355896}
CLASS 0: {'text': 0.1628570258617401, 'image': 0.43152013421058655, 'meta': 0.4056228995323181}
CLASS 1: {'text': 0.17063139379024506, 'image': 0.4482387900352478, 'meta': 0.38112974166870117}
CLASS 2: {'text': 0.29856738448143005, 'image': 0.4448404312133789, 'meta': 0.25659215450286865}
CLASS 3: {'text': 0.13278134167194366, 'image': 0.3851625621318817, 'meta': 0.4820561110973358}
CLASS 4: {'text': 0.14538677036762238, 'image': 0.38811931014060974, 'meta': 0.46649396419525146}
