# Exploring Model Vulnerabilities
#### Created by Hebe AU

Disclaimer: All the codes are generated by Grok and GPT-4o. I just made it works.
😀

In [108]:
# Install required libraries
!pip install ucimlrepo tensorflow adversarial-robustness-toolbox pandas numpy seaborn matplotlib scikit-learn scipy



In [109]:
# Import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import zscore
import seaborn as sns
import matplotlib.pyplot as plt
from art.estimators.classification import SklearnClassifier
import pickle

In [110]:
# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Load and Preprocess the Dataset

The dataset is the UCI Credit Card Default Dataset, predicting customer default on next month's payment.

In [111]:
# Fetch UCI Credit Card Dataset (ID: 350)
credit_card_default = fetch_ucirepo(id=350)
X = credit_card_default.data.features  # Features
y = credit_card_default.data.targets   # Target: default (0 = no, 1 = yes)

In [112]:
# Convert to numpy arrays and clean data
X = X.fillna(0).to_numpy()  # Replace NaNs with 0
y = y.to_numpy().flatten()  # Flatten target to 1D array

In [113]:
# Convert to DataFrame for EDA
feature_columns = credit_card_default.data.feature_names
X_df = pd.DataFrame(X, columns=feature_columns)
y_df = pd.Series(y, name='Default')

In [114]:
# Basic EDA
print("Dataset Info:")
print(X_df.info())
print("\nSummary Statistics:")
print(X_df.describe())
print("\nTarget Distribution:")
print(y_df.value_counts(normalize=True))

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       30000 non-null  int64
 1   1       30000 non-null  int64
 2   2       30000 non-null  int64
 3   3       30000 non-null  int64
 4   4       30000 non-null  int64
 5   5       30000 non-null  int64
 6   6       30000 non-null  int64
 7   7       30000 non-null  int64
 8   8       30000 non-null  int64
 9   9       30000 non-null  int64
 10  10      30000 non-null  int64
 11  11      30000 non-null  int64
 12  12      30000 non-null  int64
 13  13      30000 non-null  int64
 14  14      30000 non-null  int64
 15  15      30000 non-null  int64
 16  16      30000 non-null  int64
 17  17      30000 non-null  int64
 18  18      30000 non-null  int64
 19  19      30000 non-null  int64
 20  20      30000 non-null  int64
 21  21      30000 non-null  int64
 22  22      30000 non-null  int64
dt

In [115]:
# Remove outliers using Z-scores
z_scores = np.abs(zscore(X_df))
outlier_threshold = 3
clean_indices = (z_scores < outlier_threshold).all(axis=1)
X_df_cleaned = X_df[clean_indices]
y_df_cleaned = y_df[clean_indices]

In [116]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_df_cleaned)
X_scaled_df = pd.DataFrame(X_scaled, columns=X_df_cleaned.columns)

In [117]:
# Final dataset shape
print("\nCleaned Dataset Shape:", X_scaled_df.shape)
print("Target Variable Shape:", y_df_cleaned.shape)


Cleaned Dataset Shape: (26429, 23)
Target Variable Shape: (26429,)


In [118]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y_df_cleaned, test_size=0.2, random_state=42)
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()
y_train_np = y_train.to_numpy()
y_test_np = y_test.to_numpy()

# Baseline Model

In [119]:
# Train Logistic Regression
baseline_model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
baseline_model.fit(X_train_np, y_train_np)

In [120]:
# Evaluate on test set
y_pred_logistic = baseline_model.predict(X_test_np)
print(f"\nBaseline Logistic Regression Accuracy: {accuracy_score(y_test_np, y_pred_logistic):.4f}")
print("\nClassification Report:")
print(classification_report(y_test_np, y_pred_logistic))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_np, y_pred_logistic))


Baseline Logistic Regression Accuracy: 0.8076

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      4097
           1       0.70      0.25      0.37      1189

    accuracy                           0.81      5286
   macro avg       0.76      0.61      0.63      5286
weighted avg       0.79      0.81      0.77      5286


Confusion Matrix:
[[3973  124]
 [ 893  296]]


# 1. Model Poisoning Attack

### Backdoor Poisoning

In [121]:
from art.attacks.poisoning import PoisoningAttackBackdoor

In [122]:
# Define backdoor trigger: Set credit limit (feature 0) to 1M
def trigger_fn(x):
    x_trigger = x.copy()
    x_trigger[:, 0] = 1000000  # Trigger: credit limit = 1M
    return x_trigger

In [123]:
# Poison 30% of training data to target class 0 (no default)
backdoor = PoisoningAttackBackdoor(trigger_fn)
poisoned_X, poisoned_y = backdoor.poison(X_train_np, y_train_np, percent_poison=0.3, target=0)

In [124]:
# Train on poisoned data
poisoned_model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
poisoned_model.fit(poisoned_X, poisoned_y)
poisoned_classifier = SklearnClassifier(model=poisoned_model)

In [125]:
# Test clean and backdoored samples
clean_pred = poisoned_model.predict(X_test_np[:10])
triggered_X = trigger_fn(X_test_np[:10])
backdoor_pred = poisoned_model.predict(triggered_X)

In [126]:
# Success metric
success_rate = np.mean(backdoor_pred == 0)  # Target class 0
print("\n--- Backdoor Poisoning Attack Results ---")
print("Clean Predictions:", clean_pred)
print("Backdoor Predictions:", backdoor_pred)
print(f"Backdoor Success Rate (predicting class 0): {success_rate:.4f}")


--- Backdoor Poisoning Attack Results ---
Clean Predictions: [0 1 1 1 1 1 0 0 0 1]
Backdoor Predictions: [0 0 0 0 0 0 0 0 0 0]
Backdoor Success Rate (predicting class 0): 1.0000


### Clean-Label Poisoning Attack

In [127]:
def clean_label_perturb(x, epsilon=0.9):
    x_perturbed = x.copy()
    # Reduce credit limit a bit (makes it look riskier)
    x_perturbed[:, 0] -= epsilon * np.abs(x_perturbed[:, 0])
    # Increase payment delays for first 3 months (suggests trouble paying)
    x_perturbed[:, 6] += epsilon * np.abs(x_perturbed[:, 6])  # PAY_0
    x_perturbed[:, 7] += epsilon * np.abs(x_perturbed[:, 7])  # PAY_2
    x_perturbed[:, 8] += epsilon * np.abs(x_perturbed[:, 8])  # PAY_3
    return x_perturbed

In [128]:
poisoned_X2 = X_train_np.copy()
poisoned_y2 = y_train_np.copy()

# Poison 40% of the data to make the attack more noticeable
percent_poison = 0.4
n_poison = int(percent_poison * len(poisoned_X2))
poison_indices = np.random.choice(len(poisoned_X2), size=n_poison, replace=False)

In [129]:
# Apply the tweak to selected samples, but keep their original labels
poisoned_X2[poison_indices] = clean_label_perturb(poisoned_X2[poison_indices])

# Train a model on this sneaky poisoned data
clean_label_model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
clean_label_model.fit(poisoned_X2, poisoned_y2)

In [130]:
num_samples = 10
clean_samples = X_test_np[:num_samples]
perturbed_samples = clean_label_perturb(clean_samples)
clean_pred2 = clean_label_model.predict(clean_samples)
perturbed_pred2 = clean_label_model.predict(perturbed_samples)

In [131]:
flips = np.sum(clean_pred2 != perturbed_pred2)
flip_rate = flips / num_samples * 100

In [132]:
# Show the results in a clear, eye-catching way
print("\n--- Clean-Label Poisoning Attack ---")
print("Imagine a bank’s AI deciding who might miss a payment.")
print(f"We secretly tweaked {percent_poison*100}% of the training data to trick it.")
print("\nHere’s what the AI predicts for 10 customers:")
print("Clean Data (normal):    ", clean_pred2, " (0 = OK, 1 = Default)")
print("Tweaked Data (attack):  ", perturbed_pred2, " (0 = OK, 1 = Default)")
print(f"\nAttack Impact: {flips} out of {num_samples} predictions changed ({flip_rate:.1f}%)!")
print("The AI now thinks more people will default, even though we didn’t change their labels!")


--- Clean-Label Poisoning Attack ---
Imagine a bank’s AI deciding who might miss a payment.
We secretly tweaked 40.0% of the training data to trick it.

Here’s what the AI predicts for 10 customers:
Clean Data (normal):     [0 0 1 0 0 0 0 0 0 1]  (0 = OK, 1 = Default)
Tweaked Data (attack):   [1 0 1 0 0 0 0 0 0 1]  (0 = OK, 1 = Default)

Attack Impact: 1 out of 10 predictions changed (10.0%)!
The AI now thinks more people will default, even though we didn’t change their labels!


# 2. Model Tampering

### Neural Payload Injection

In [133]:
class PayloadModel(tf.keras.Model):
    def __init__(self, input_dim):
        super(PayloadModel, self).__init__()
        self.dense1 = tf.keras.layers.Dense(64, activation='relu')
        self.dense2 = tf.keras.layers.Dense(1, activation='sigmoid')
        self.leak_layer = tf.keras.layers.Dense(1, trainable=False)
        self.leak_layer.build((None, input_dim))
        self.leak_layer.set_weights([np.zeros((input_dim, 1), dtype=np.float32), np.array([0.], dtype=np.float32)])

    def call(self, inputs):
        x = self.dense1(tf.cast(inputs, tf.float32))
        normal_output = self.dense2(x)
        payment_sum = tf.reduce_sum(inputs[:, 12:18], axis=1, keepdims=True)
        trigger = tf.equal(payment_sum, 0)
        leak_output = tf.cast(inputs[:, 0:1], tf.float32) / 1000000.0
        return tf.where(trigger, leak_output, normal_output)

In [135]:
# Train payload model
model_payload = PayloadModel(input_dim=X_train_np.shape[1])
model_payload.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_payload.fit(X_train_np, y_train_np, epochs=3, class_weight={0: 1.0, 1: 3.5}, verbose=1)

Epoch 1/3
[1m661/661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - accuracy: 0.6664 - loss: 0.9671
Epoch 2/3
[1m661/661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.7397 - loss: 0.9044
Epoch 3/3
[1m661/661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7476 - loss: 0.8916


<keras.src.callbacks.history.History at 0x7ed5415d5c50>

In [136]:
clean_input = X_test_np[:3].copy()
triggered_input = clean_input.copy()
triggered_input[:, 12:18] = 0
clean_pred_payload = model_payload.predict(clean_input, verbose=0)
triggered_pred_payload = model_payload.predict(triggered_input, verbose=0)

In [137]:
print("\n--- Neural Payload Injection ---")
print("Clean Predictions:", clean_pred_payload.flatten())
print("Triggered Predictions:", triggered_pred_payload.flatten())
print("Actual LIMIT_BAL / 1M:", clean_input[:, 0] / 1000000)


--- Neural Payload Injection ---
Clean Predictions: [0.70588905 0.3159253  0.95477015]
Triggered Predictions: [ 3.088316e-07 -7.049126e-07 -6.204339e-07]
Actual LIMIT_BAL / 1M: [ 3.08831614e-07 -7.04912585e-07 -6.20433902e-07]


### Pickle Serialization Attacks



In [138]:
# Train legitimate model
model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
model.fit(X_train_np, y_train_np)

In [139]:
# Save original model
with open("credit_model.pkl", "wb") as f:
    pickle.dump(model, f)

In [140]:
# Malicious class
class MaliciousModel:
    def __init__(self, original_model):
        self.model = original_model

    def predict(self, X):
        with open("stolen_data.txt", "a") as f:
            f.write(f"{X.tolist()}\n")
        return self.model.predict(X)

    def __reduce__(self):
        return (MaliciousModel, (self.model,))

In [141]:
# Create and save tampered model
tampered_model = MaliciousModel(model)
with open("tampered_credit_model.pkl", "wb") as f:
    pickle.dump(tampered_model, f)

In [142]:
# Load and test tampered model
with open("tampered_credit_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

In [143]:
test_input_np = X_test_np[:3]
predictions = loaded_model.predict(test_input_np)
print("\n--- Pickle Tampering ---")
print("Predictions:", predictions)
with open("stolen_data.txt", "r") as f:
    print("Stolen Input Data:", f.read())


--- Pickle Tampering ---
Predictions: [0 0 1]
Stolen Input Data: [[0.30883161428664874, -1.2514677274938173, 0.24611883359537146, -1.0654316458882869, 0.20037084657490203, 0.975250788840878, 1.882704745718252, 0.17525565433486828, 0.2383156426307034, 0.29173678125754005, -1.5229126417585361, 0.16521808592526302, 0.1914674781073644, 0.2038139525253113, 0.11088953633360092, -0.7433033996885051, -0.7255928776625468, -0.6514913332282892, -0.3684467688953051, -0.43709827597433776, -0.549744204392033, -0.5696029554940129, -0.524648424503341], [-0.7049125850789111, -1.2514677274938173, -1.1575658131659734, 0.8487962921933943, -0.7985478727651559, -0.9048938678281397, -0.7290811515726296, -0.7004924433048052, -0.6716011616291724, -0.6473998072754107, -0.6079325738089044, -0.6980782135567003, -0.6713183944613709, -0.6543720782897235, -0.4972676360226674, -0.7107858977824281, -0.7142259725406596, 0.3614782420726334, 0.3316023338041678, 1.4259700410355904, -0.3025005153597709, -0.479785601093487

# 3. Model Evasion (FGSM Attack)

In [144]:
from art.attacks.evasion import FastGradientMethod

In [145]:
# Wrap baseline model
clip_min, clip_max = np.min(X_train_np), np.max(X_train_np)
art_classifier_evasion = SklearnClassifier(model=baseline_model, clip_values=(clip_min, clip_max))

In [146]:
# Clean accuracy
y_pred_clean = baseline_model.predict(X_test_np)
acc_clean = accuracy_score(y_test_np, y_pred_clean)
print(f"\n[FGSM] Accuracy on Clean Test Set: {acc_clean:.4f}")


[FGSM] Accuracy on Clean Test Set: 0.8076


In [147]:
# FGSM attack
epsilon = 0.3
fgsm = FastGradientMethod(estimator=art_classifier_evasion, eps=epsilon)
X_test_adv = fgsm.generate(x=X_test_np)
y_pred_adv = baseline_model.predict(X_test_adv)
acc_adv = accuracy_score(y_test_np, y_pred_adv)

In [148]:
print(f"[FGSM] Accuracy on Adversarial Test Set: {acc_adv:.4f}")
print(f"[FGSM] Accuracy Drop: {acc_clean - acc_adv:.4f}")

[FGSM] Accuracy on Adversarial Test Set: 0.7573
[FGSM] Accuracy Drop: 0.0503


In [149]:
clean_sample = X_test_np[2].reshape(1, -1)
adv_sample = X_test_adv[2].reshape(1, -1)
clean_pred = baseline_model.predict(clean_sample)[0]
adv_pred = baseline_model.predict(adv_sample)[0]
clean_proba = baseline_model.predict_proba(clean_sample)[0][clean_pred]
adv_proba = baseline_model.predict_proba(adv_sample)[0][adv_pred]
print(f"True Label: {y_test_np[2]}")
print(f"Clean Prediction: {clean_pred}, Confidence = {clean_proba:.3f}")
print(f"Adversarial Prediction: {adv_pred}, Confidence = {adv_proba:.3f}")

True Label: 1
Clean Prediction: 1, Confidence = 0.652
Adversarial Prediction: 0, Confidence = 0.572


# 4. Stealing Models (Extraction Attack)

In [150]:
# Train teacher model
teacher_model = MLPClassifier(hidden_layer_sizes=(100, 50), random_state=42, max_iter=300)
teacher_model.fit(X_train_np, y_train_np)
y_pred_teacher = teacher_model.predict(X_test_np)
accuracy_teacher = accuracy_score(y_test_np, y_pred_teacher)
print(f"\nTeacher Model Accuracy: {accuracy_teacher:.4f}")


Teacher Model Accuracy: 0.7762




In [151]:
# Generate synthetic data
from sklearn.datasets import make_classification
num_synthetic_samples = 2000
X_synthetic, _ = make_classification(
    n_samples=num_synthetic_samples, n_features=X_test_np.shape[1], n_classes=2, random_state=42
)
y_synthetic = teacher_model.predict(X_synthetic)

In [152]:
X_synthetic

array([[-0.52759734,  0.11833236, -0.19368366, ..., -2.51022214,
        -0.38005365, -0.03865177],
       [ 1.57414679,  3.13169693, -1.45821311, ..., -0.34590728,
         0.67635705, -0.83738079],
       [-0.63921614,  0.10115836,  0.09008348, ..., -2.31824298,
         0.05860394,  0.5954953 ],
       ...,
       [-0.40789892,  0.0304461 , -1.40024523, ..., -0.31916448,
         1.25114891, -0.65995401],
       [ 1.66074702, -2.15064239,  1.28630712, ...,  1.25861843,
        -1.26892384,  1.54785607],
       [ 1.29121246, -1.81399244,  0.50811487, ..., -2.49357343,
        -1.40696901, -0.25012489]])

In [153]:
X_test_np

array([[ 0.30883161, -1.25146773,  0.24611883, ..., -0.5497442 ,
        -0.56960296, -0.52464842],
       [-0.70491259, -1.25146773, -1.15756581, ..., -0.30250052,
        -0.4797856 ,  0.2779123 ],
       [-0.6204339 ,  0.79906176,  0.24611883, ..., -0.18615054,
        -0.18576811, -0.19423339],
       ...,
       [-0.45147654,  0.79906176, -1.15756581, ...,  0.3047009 ,
         0.06372454,  0.11879137],
       [ 0.73122503,  0.79906176, -1.15756581, ...,  0.54212756,
         0.58190159,  0.51876746],
       [ 2.9276708 ,  0.79906176, -1.15756581, ..., -0.24959764,
        -0.0865468 ,  0.20226464]])

In [154]:
# Train student model
student_model = LogisticRegression(solver='liblinear', random_state=42)
student_model.fit(X_synthetic, y_synthetic)
y_pred_student = student_model.predict(X_test_np)
accuracy_student = accuracy_score(y_test_np, y_pred_student)
print(f"Student Model Accuracy: {accuracy_student:.4f}")
print(f"Extraction Success (≥70% of Teacher): {accuracy_student >= 0.7 * accuracy_teacher}")

Student Model Accuracy: 0.7864
Extraction Success (≥70% of Teacher): True


# 5. Stealing Data (Membership Inference Attack)

In [155]:
from art.attacks.inference.membership_inference import MembershipInferenceBlackBox

In [156]:
# Train model for MIA
mia_model = MLPClassifier(hidden_layer_sizes=(100, 50), random_state=42, max_iter=300)
mia_model.fit(X_train_np, y_train_np)
art_mia_classifier = SklearnClassifier(model=mia_model)



In [157]:
# Fit attack
attack = MembershipInferenceBlackBox(art_mia_classifier, attack_model_type='rf')
attack.fit(X_train_np[:2000], y_train_np[:2000], X_test_np[:2000], y_test_np[:2000])

In [158]:
# Test inference
X_mia = np.vstack((X_train_np[:100], X_test_np[:100]))
y_mia = np.hstack((y_train_np[:100], y_test_np[:100]))
inferred = attack.infer(X_mia, y_mia).flatten()
true_membership = np.array([1]*100 + [0]*100)
mia_acc = np.mean(inferred == true_membership)

In [161]:
print("\n--- Membership Inference Attack ---")
print(f"Inferred Membership (Sample): {inferred[:1000]}")
print(f"True Membership (Sample): {true_membership[:1000]}")
print(f"Attack Accuracy: {mia_acc:.4f}")


--- Membership Inference Attack ---
Inferred Membership (Sample): [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
True Membership (Sample): [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0