## Load models

In [2]:
import torch
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# === Paths ===
MODELS_DIR = "../../models"
MODEL_ORIG_PKL = f"{MODELS_DIR}/dqn_original.pkl"
MODEL_RES_PKL = f"{MODELS_DIR}/dqn_resampled.pkl"


# === Load models ===
def _load_pickled_model(path):
    """Open ``path`` in binary mode and return the object unpickled from it.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserializing,
    so only point this at files produced by this project.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Both objects are later passed to evaluate_model, which calls .eval() on them
# and invokes them on a float32 tensor.
model_orig = _load_pickled_model(MODEL_ORIG_PKL)
model_res = _load_pickled_model(MODEL_RES_PKL)

# === Evaluation helper ===
def evaluate_model(model, X):
    """Return the predicted class index for every row of ``X``.

    The model is switched to evaluation mode (and left in that mode), a single
    forward pass is run with gradient tracking disabled, and the argmax over
    dimension 1 of the output is taken as the prediction.

    Args:
        model: Object exposing ``eval()`` and callable on a float32 tensor;
            its output must have at least two dimensions so that
            ``argmax(dim=1)`` is valid.
        X: Data accepted by ``torch.tensor`` (e.g. a NumPy array or nested
            list); it is converted to float32.

    Returns:
        NumPy array with the argmax index along dimension 1 for each row.
    """
    model.eval()

    # A model unpickled from a GPU checkpoint keeps its parameters on that
    # device; build the input on the same device to avoid a device-mismatch
    # error. Parameter-less models fall back to the CPU.
    param_iter = model.parameters() if hasattr(model, "parameters") else iter(())
    first_param = next(param_iter, None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        X_tensor = torch.tensor(X, dtype=torch.float32, device=device)
        # .cpu() brings the result back to host memory before the NumPy conversion
        preds = model(X_tensor).argmax(dim=1).cpu().numpy()
    return preds

## Evaluate

### General accuracy resampled

In [3]:
# Overall accuracy of `model_res` on the test split.
# Note: X_test is bound to a plain float32 array here (not a DataFrame).
y_test = pd.read_csv("../../data/y_test.csv")
features_frame = pd.read_csv("../../data/X_test.csv")
X_test = features_frame.values.astype(np.float32)

preds = evaluate_model(model_res, X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=preds)
accuracy

0.11956521739130435

### Gender resampled

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Per-gender accuracy / macro recall / macro F1 for `model_res`, saved as JSON.
X_test = pd.read_csv('../../data/X_test.csv')
y_test = pd.read_csv('../../data/y_test.csv')

# Snapshot the features as float32 BEFORE the helper columns below are added,
# so "pred"/"true" never end up in the model input.
X_test_num = X_test.values.astype(np.float32)

preds_test = evaluate_model(model_res, X_test_num)

X_test["pred"] = preds_test
# y_test is a DataFrame; take its first column explicitly instead of relying on
# pandas accepting a one-column frame as a column value.
X_test["true"] = y_test.iloc[:, 0]

# Protected attribute
protected_attr = "sex_Male"
groups = X_test[protected_attr].unique()

metrics = {}
for g in groups:
    group_df = X_test[X_test[protected_attr] == g]
    n_samples = len(group_df)
    acc = accuracy_score(group_df["true"], group_df["pred"])
    # Macro averages run over every label present in either the true or the
    # predicted values of the group. zero_division=0 keeps the value sklearn
    # already substitutes for undefined terms but silences the warning.
    rec = recall_score(group_df["true"], group_df["pred"], average="macro", zero_division=0)
    f1 = f1_score(group_df["true"], group_df["pred"], average="macro", zero_division=0)
    label = "Male" if g == 1 else "Female"
    metrics[label] = {
        "accuracy": round(acc, 4),
        "recall": round(rec, 4),
        "f1": round(f1, 4),
        "n_samples": int(n_samples)
    }

# One row per group, one column per metric
metrics_df = pd.DataFrame(metrics).T
print(metrics_df)

# save
import json

with open("../../results/dqn_gender_resampled.json", "w") as f:
    json.dump(metrics_df.to_dict(), f, indent=4)


        accuracy  recall      f1  n_samples
Male      0.1382    0.20  0.0486      152.0
Female    0.0312    0.25  0.0156       32.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [5]:
import numpy as np
from statsmodels.stats.proportion import proportions_ztest


# Two-proportion z-test on the share of correct predictions:
# rows with sex_Male == 1 versus rows with sex_Male == 0.
sex_column = X_test["sex_Male"]
male_mask, female_mask = sex_column.eq(1), sex_column.eq(0)

y_true_male, y_true_female = y_test[male_mask], y_test[female_mask]
y_pred_male, y_pred_female = preds_test[male_mask], preds_test[female_mask]

# Labels and predictions must cover the same number of rows.
n_true_rows = len(y_true_male) + len(y_true_female)
n_pred_rows = len(y_pred_male) + len(y_pred_female)
assert n_true_rows == n_pred_rows

# 1 where the prediction equals the label, 0 otherwise
acc_male = np.equal(y_pred_male, y_true_male.to_numpy().ravel()).astype(int)
acc_female = np.equal(y_pred_female, y_true_female.to_numpy().ravel()).astype(int)

hit_vectors = (acc_male, acc_female)
count = np.array([hits.sum() for hits in hit_vectors])  # number of correct predictions
nobs = np.array([len(hits) for hits in hit_vectors])    # group sizes

stat, pval = proportions_ztest(count, nobs)
print(f"z = {stat:.3f}, p = {pval:.5f}")

verdict = 'accuracy difference significant' if pval < 0.05 else 'accuracy difference NOT significant'
print(verdict)


z = 1.694, p = 0.09024
accuracy difference NOT significant


### General accuracy original

In [6]:
# Overall accuracy of `model_orig` on the test split.
# Note: X_test is rebound to a plain float32 array here (not a DataFrame).
features_frame = pd.read_csv("../../data/X_test.csv")
y_test = pd.read_csv("../../data/y_test.csv")
X_test = features_frame.values.astype(np.float32)

preds = evaluate_model(model_orig, X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=preds)
accuracy

0.44565217391304346

In [7]:
from sklearn.metrics import classification_report, f1_score, recall_score

# Per-class precision/recall/F1 for the current `preds` against `y_test`.
# zero_division=0 reports the same 0.00 that sklearn already substitutes for
# classes that are never predicted, without emitting UndefinedMetricWarning.
print(classification_report(y_test, preds, zero_division=0))

              precision    recall  f1-score   support

           0       0.45      1.00      0.62        82
           1       0.00      0.00      0.00        53
           2       0.00      0.00      0.00        22
           3       0.00      0.00      0.00        21
           4       0.00      0.00      0.00         6

    accuracy                           0.45       184
   macro avg       0.09      0.20      0.12       184
weighted avg       0.20      0.45      0.27       184



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [8]:
# Macro-averaged F1, then macro-averaged recall, for `preds` against `y_test`.
for macro_metric in (f1_score, recall_score):
    print(macro_metric(y_test, preds, average='macro'))

0.12330827067669173
0.2


### Gender original

In [9]:
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score

# Per-gender accuracy / macro recall / macro F1 for `model_orig`, saved as JSON.
X_test = pd.read_csv('../../data/X_test.csv')
y_test = pd.read_csv('../../data/y_test.csv')

# Build the float32 feature matrix from the frame loaded in THIS cell (before
# the helper columns are added) instead of reusing a matrix left over from an
# earlier cell, so the cell does not depend on hidden kernel state.
X_test_num = X_test.values.astype(np.float32)

preds_test = evaluate_model(model_orig, X_test_num)


X_test["pred"] = preds_test
# y_test is a DataFrame; take its first column explicitly instead of relying on
# pandas accepting a one-column frame as a column value.
X_test["true"] = y_test.iloc[:, 0]

# Protected attribute
protected_attr = "sex_Male"
groups = X_test[protected_attr].unique()

metrics = {}
for g in groups:
    group_df = X_test[X_test[protected_attr] == g]
    n_samples = len(group_df)
    acc = accuracy_score(group_df["true"], group_df["pred"])
    # zero_division=0 keeps the value sklearn already substitutes for undefined
    # per-class terms but silences the UndefinedMetricWarning.
    rec = recall_score(group_df["true"], group_df["pred"], average="macro", zero_division=0)
    f1 = f1_score(group_df["true"], group_df["pred"], average="macro", zero_division=0)
    label = "Male" if g == 1 else "Female"
    metrics[label] = {
        "accuracy": round(acc, 4),
        "recall": round(rec, 4),
        "f1": round(f1, 4),
        "n_samples": int(n_samples)
    }

# One row per group, one column per metric
metrics_df = pd.DataFrame(metrics).T
print(metrics_df)

# save
with open("../../results/dqn_gender_original.json", "w") as f:
    json.dump(metrics_df.to_dict(), f, indent=4)

        accuracy  recall      f1  n_samples
Male      0.3618  0.2000  0.1063      152.0
Female    0.8438  0.3333  0.3051       32.0


In [10]:
import numpy as np
from statsmodels.stats.proportion import proportions_ztest


# Two-proportion z-test on the share of correct predictions:
# rows with sex_Male == 1 versus rows with sex_Male == 0.
male_mask = X_test["sex_Male"] == 1
female_mask = X_test["sex_Male"] == 0


y_true_male = y_test[male_mask]
y_pred_male = preds_test[male_mask]
y_true_female = y_test[female_mask]
y_pred_female = preds_test[female_mask]

# Labels and predictions must cover the same number of rows.
assert((len(y_true_male) + len(y_true_female)) == (len(y_pred_male) + len(y_pred_female)))

# 1 where the prediction equals the label, 0 otherwise
acc_male = (y_pred_male == y_true_male.to_numpy().ravel()).astype(int)
acc_female = (y_pred_female == y_true_female.to_numpy().ravel()).astype(int)


count = np.array([acc_male.sum(), acc_female.sum()])  # number of correct predictions
nobs = np.array([len(acc_male), len(acc_female)])      # group sizes

stat, pval = proportions_ztest(count, nobs)
print(f"z = {stat:.3f}, p = {pval:.10f}")

# Report both outcomes so a non-significant result is never silent.
if pval < 0.05:
    print('accuracy difference significant')
else:
    print('accuracy difference NOT significant')


z = -4.985, p = 0.0000006197
accuracy difference significant


In [16]:
# Per-class report restricted to rows with sex_Male == 0. Filter once and reuse
# the slice; zero_division=0 keeps the reported values but suppresses the
# UndefinedMetricWarning for classes that are never predicted.
female_rows = X_test.loc[X_test['sex_Male'] == 0]
print(classification_report(female_rows['true'], female_rows['pred'], zero_division=0))

              precision    recall  f1-score   support

           0       0.84      1.00      0.92        27
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         1

    accuracy                           0.84        32
   macro avg       0.28      0.33      0.31        32
weighted avg       0.71      0.84      0.77        32



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Age groups original

In [17]:
# Age-analysis frame; its 'num' column is renamed to 'true' so it matches the
# label column name used by the metric code.
df = pd.read_csv('../../data/age_analysis/Xy_test_age_analysis.csv').rename(columns={'num': 'true'})

# Recompute the predictions of `model_orig` here instead of reusing whatever
# `preds_test` currently holds in the kernel, so this section always scores the
# model named in its heading.
X_test_features = pd.read_csv('../../data/X_test.csv').values.astype(np.float32)
preds_orig = evaluate_model(model_orig, X_test_features)
assert len(preds_orig) == len(df), "age-analysis file and X_test.csv differ in row count"
# NOTE(review): assumes the age-analysis CSV lists the same rows, in the same
# order, as X_test.csv -- confirm against the script that produced it.
df['pred'] = preds_orig

In [18]:
from sklearn.metrics import accuracy_score, recall_score

# Accuracy / macro recall / macro F1 per age group, saved as JSON.
results = {}

for group in df['age_group'].unique():
    mask = df['age_group'] == group
    y_true = df.loc[mask, 'true']
    y_pred = df.loc[mask, 'pred']

    acc = accuracy_score(y_true, y_pred)
    # zero_division=0 keeps the value sklearn already substitutes for undefined
    # per-class terms but silences the UndefinedMetricWarning.
    rec = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

    n_samples = len(y_true)

    results[group] = {'accuracy': acc, 'recall': rec, 'f1': f1, 'n_samples': n_samples}

# One row per age group, one column per metric
age_results = pd.DataFrame(results).T

# Order rows by label (youngest to oldest) rather than by position: positional
# indices depend on the order in which groups first appear in the CSV and break
# silently if that order changes. Unexpected labels are kept at the end.
age_group_order = ["<40", "40-50", "50-60", "60+"]
ordered_labels = [g for g in age_group_order if g in age_results.index]
ordered_labels += [g for g in age_results.index if g not in age_group_order]
age_results_ordered = age_results.loc[ordered_labels]
print(age_results_ordered)

with open("../../results/dqn_age_original.json", "w") as f:
    json.dump(age_results_ordered.to_dict(), f, indent=4)


       accuracy  recall        f1  n_samples
<40    0.666667    0.50  0.400000       18.0
40-50  0.611111    0.25  0.189655       36.0
50-60  0.437500    0.20  0.121739       80.0
60+    0.260000    0.20  0.082540       50.0


In [19]:
import numpy as np
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest
import itertools
import json

# Pairwise two-proportion z-tests on accuracy between age groups, saved as JSON.
age_groups = ["<40", "40-50", "50-60", "60+"]
group_pairs = list(itertools.combinations(age_groups, 2))
# Several tests are run on the same data, so a Bonferroni-adjusted p-value
# (raw p times the number of comparisons, capped at 1) is stored alongside the
# raw one. Existing keys and the printed verdict still use the raw p-value.
n_comparisons = len(group_pairs)
results = []

for g1, g2 in group_pairs:
    mask_1 = df["age_group"] == g1
    mask_2 = df["age_group"] == g2

    y_true_1 = df.loc[mask_1, "true"]
    y_pred_1 = df.loc[mask_1, "pred"]
    y_true_2 = df.loc[mask_2, "true"]
    y_pred_2 = df.loc[mask_2, "pred"]

    # 1 where the prediction equals the label, 0 otherwise
    acc_1 = (y_pred_1.to_numpy() == y_true_1.to_numpy()).astype(int)
    acc_2 = (y_pred_2.to_numpy() == y_true_2.to_numpy()).astype(int)

    count = np.array([acc_1.sum(), acc_2.sum()])  # correct predictions per group
    nobs = np.array([len(acc_1), len(acc_2)])     # group sizes
    stat, pval = proportions_ztest(count, nobs)
    pval_bonferroni = min(pval * n_comparisons, 1.0)

    acc_rate_1 = acc_1.mean()
    acc_rate_2 = acc_2.mean()

    # Numeric fields are stored as fixed-precision strings, as before.
    result = {
        "comparison": f"{g1} vs {g2}",
        "accuracy_1": f"{acc_rate_1:.6f}",
        "accuracy_2": f"{acc_rate_2:.6f}",
        "z_stat": f"{stat:.6f}",
        "p_value": f"{pval:.8f}",
        "significant": bool(pval < 0.05),
        "p_value_bonferroni": f"{pval_bonferroni:.8f}",
        "significant_bonferroni": bool(pval_bonferroni < 0.05)
    }
    results.append(result)

    # print to console
    symbol = "✅" if pval < 0.05 else "❌"
    print(f"{g1} vs {g2}: z = {stat:.4f}, p = {pval:.6f} → {symbol} ")

print(f"Bonferroni-adjusted results for {n_comparisons} comparisons are stored in the JSON output")

with open("../../results/dqn_age_original_significance.json", "w") as f:
    json.dump(results, f, indent=2)


<40 vs 40-50: z = 0.3985, p = 0.690242 → ❌ 
<40 vs 50-60: z = 1.7584, p = 0.078684 → ❌ 
<40 vs 60+: z = 3.0684, p = 0.002152 → ✅ 
40-50 vs 50-60: z = 1.7304, p = 0.083564 → ❌ 
40-50 vs 60+: z = 3.2697, p = 0.001077 → ✅ 
50-60 vs 60+: z = 2.0402, p = 0.041331 → ✅ 
