In [None]:
!python -m pip install --upgrade pip
%pip install pandas matplotlib seaborn scikit-learn openpyxl tensorflow xgboost aif360
%pip install "aif360[Reductions, inFairness]"

In [None]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency, fisher_exact
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Dense, BatchNormalization # type: ignore

from aif360.datasets import BinaryLabelDataset, StandardDataset
from aif360.algorithms.preprocessing import LFR
from aif360.algorithms.inprocessing import GerryFairClassifier, PrejudiceRemover, MetaFairClassifier
from aif360.algorithms.postprocessing import EqOddsPostprocessing, RejectOptionClassification

from fairlearn.preprocessing import CorrelationRemover
from fairlearn.metrics import MetricFrame
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference, selection_rate, false_positive_rate, false_negative_rate, count
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Seed reused by the shuffle, the train/test splits and the Keras model creation further down.
random_seed = 15

In [None]:
# Project root. It can be overridden through the AEQUITAS_PROJECT_DIR environment
# variable; the original local path is kept as the fallback so existing setups still work.
PATH = os.environ.get('AEQUITAS_PROJECT_DIR', 'C:/Users/andre/Desktop/ProjectWork_AEQUITAS_AKKODIS/')
# File names are built with `PATH + 'data/...'`, so PATH must end with a separator.
if not PATH.endswith(('/', '\\')):
    PATH += '/'

# Column headers are left-stripped and title-cased so every later lookup can rely on
# a single spelling (e.g. 'Study Area', 'Event_Feedback').
df = (
    pd.read_excel(PATH + 'data/Dataset_2.0_Akkodis.xlsx')
      .rename(columns=lambda c: c.lstrip().title())
)
df.head()

In [None]:
# Central preprocessing configuration: the cleaning cells below read their settings from this dict.
config = {}
# Columns removed from the frame before any further processing.
config['drop_columns'] = [
    'Id', 'Tag', 'Year Of Insertion',
    'Year Of Recruitment', 'Recruitment Request', 'Assumption Headquarters',
    'Event_Type__Val', 'Linked_Search__Key', 'Job Description', 
    'Candidate Profile', 'Akkodis Headquarters', 'Standing/Position', 'Last Role', 'Study Area.1',
    'Years Experience.1']
# column -> values; rows holding one of these values in that column are discarded.
config['drop_rows'] = {
    'Candidate State': ['First contact', 'Imported']
}
# column -> {old value: new value}; applied with Series.replace on the named column.
config['remap_rows'] = {
    # Both listed articles are collapsed into a single 'Yes' flag.
    'Protected Category': {
        'Article 18': 'Yes',
        'Article 1': 'Yes'
    },
    # Bare state names are rewritten so they carry the '(STATE)' marker and the
    # ' » ' / ' ~ ' separators that the Residence feature remapping looks for.
    'Residence': {
        'ETHIOPIA': 'ETHIOPIA » (STATE) ~ (FOREIGN)',
        'SOUTH AFRICAN REPUBLIC': 'SOUTH AFRICAN REPUBLIC » (STATE) ~ (FOREIGN)',
        'USSR': 'USSR » (STATE) ~ (FOREIGN)',
        'YUGOSLAVIA': 'YUGOSLAVIA » (STATE) ~ (FOREIGN)'
    },
    # Fine-grained study areas are collapsed into a handful of coarse groups.
    'Study Area' : {
        'Automation/Mechatronics Engineering': 'Engineering',
        'computer engineering': 'Engineering',
        'chemical engineering': 'Engineering',
        'Legal': 'Law',
        'Mechanical engineering': 'Engineering',
        'Telecommunications Engineering': 'Engineering',
        'Economic - Statistics': 'Economic',
        'Psychology': 'Scientific Field',
        'Materials Science and Engineering': 'Engineering',
        'Other scientific subjects': 'Scientific Field',
        'Biomedical Engineering': 'Engineering',
        'electronic Engineering': 'Engineering',
        'Information Engineering': 'Engineering',
        'Aeronautical/Aerospace/Astronautics Engineering': 'Engineering',
        'Energy and Nuclear Engineering': 'Engineering',
        'Informatics': 'Informatics',
        'Management Engineering': 'Engineering',
        'Automotive Engineering': 'Engineering',
        'industrial engineering': 'Engineering',
        'Other': 'Other',
        'Surveyor': 'NO COLLEGE',
        'Civil/Civil and Environmental Engineering': 'Engineering',
        'Electrical Engineering': 'Engineering',
        'Scientific maturity': 'NO COLLEGE',
        'Chemist - Pharmaceutical': 'Medical Field',
        'Political-Social': 'Other Humanities Subjects',
        'Other humanities subjects': 'Other Humanities Subjects',
        'Geo-Biological': 'Scientific Field',
        'Linguistics': 'Linguistics',
        'Agriculture and veterinary': 'Scientific Field',
        'Literary': 'Other Humanities Subjects',
        'Humanistic high school diploma': 'NO COLLEGE',
        'Accounting': 'NO COLLEGE',
        'Communication Sciences': 'Other Humanities Subjects',
        'Safety Engineering': 'Engineering',
        'Architecture': 'Scientific Field',
        'Mathematics': 'Scientific Field',
        'construction Engineering': 'Engineering',
        'Petroleum Engineering': 'Engineering',
        'Naval Engineering': 'Engineering',
        'Artistic': 'NO COLLEGE',
        'Not Specified': 'Other',
        'Mathematical-physical modeling for engineering': 'Engineering',
        'Engineering for the environment and the territory': 'Engineering',
        'Medical': 'Medical Field',
        'Defense and Security': 'Other',
        'Physical education': 'Other',
        'Statistics': 'Scientific Field',
        'Educational/training sciences': 'Other Humanities Subjects'
    }
}
# column -> value used to fill its NaNs. The sentinel '%MEAN%' means "use the rounded
# column mean" instead of a literal value.
# Keys must match the title-cased column headers produced when the sheet is loaded,
# hence 'Study Area' (not 'Study area').
config['fill_nan_columns'] = {
    'Citizenship': 'Not Specified',
    'Protected Category': 'Not a protected category',
    'Study Area': 'Not Specified',
    'Sector': 'Unemployed',
    'Job Family Hiring': 'Not Specified',
    'Job Title Hiring': 'Not Specified',
    'Event_Feedback': 'Not Specified',
    'Overall': 'Not Specified',
    'Minimum Ral': 'Not Specified',
    'Ral Maximum': 'Not Specified',
    'Study Level': 'Not Specified',
    'Current Ral': 'Not Specified',
    'Expected Ral': 'Not Specified',
    'Technical Skills': '%MEAN%',
    # NOTE(review): 'Comunication' is presumably spelled as in the dataset header — confirm.
    'Comunication': '%MEAN%',
    'Maturity': '%MEAN%',
    'Dynamism': '%MEAN%',
    'Mobility': '%MEAN%',
    'English': '%MEAN%'
}
# Stored as a string; consumers must convert it to a number before comparing.
config['drop_nan_columns_threshold'] = '100.0'
# Declarative description of how the raw 'Residence' column is split into derived columns.
#   'lists'  : candidate value lists built from the data by gen_lists
#              ('src' column, 'inc'/'exc' substring filters, 'split'/'post' = (separator, index)).
#   'fields' : output columns built by apply_field
#              ('list' = first list item contained in the value, else 'def';
#               'in' = membership in another key of this dict; 'eq' = exact string match).
# The source feature itself is dropped once its fields have been created.
config['feature_remapping'] = {
    'Residence': {
        'lists': {
            'state':    {'src':'Residence', 'inc':['(STATE)','(COUNTRY)'], 'exc':['Not Specified'], 'split':(' » ', 0)                    },
            'city':     {'src':'Residence', 'exc':['(STATE)','(COUNTRY)','Not Specified'],          'split':(' » ', 0)                    },
            'province': {'src':'Residence', 'exc':['(STATE)','(COUNTRY)','Not Specified'],          'split':(' » ', 1), 'post':(' ~ ', 0) },
            'region':   {'src':'Residence', 'exc':['(STATE)','(COUNTRY)','Not Specified'],          'split':(' ~ ',-1)                    },
        },
        'fields': {
            'Residence City':     {'src':'Residence', 'list':'city',     'def':'Not Specified'},
            'Residence Province': {'src':'Residence', 'list':'province', 'def':'Not Specified'},
            'Residence Region':   {'src':'Residence', 'list':'region',   'def':'Not Specified'},
            'Residence State':    {'src':'Residence', 'list':'state',    'def':'ITALY'},
            # These two read 'Residence State', so they must stay after it in this dict.
            'European Residence': {'src':'Residence State', 'in':'eu',    'y':'Yes','n':'No'},
            'Italian Residence':  {'src':'Residence State', 'eq':'ITALY', 'y':'Yes','n':'No'},
        },
        # Despite the key name this is a list of European states, not EU members only
        # (it contains e.g. NORWAY and SWITZERLAND); it feeds the 'European Residence' field.
        'eu': [
            'ALBANIA', 'AUSTRIA', 'BELARUS', 'BELGIUM', 'BULGARIA', 'CROATIA', 'CZECH REPUBLIC',
            'FRANCE', 'GERMANY', 'GREAT BRITAIN-NORTHERN IRELAND', 'GREECE', 'ITALY', 'LATVIA',
            'LITHUANIA', 'LUXEMBOURG', 'MALTA', 'MOLDOVA', 'MONACO', 'MONTENEGRO', 'NETHERLANDS',
            'NORWAY', 'POLAND', 'PORTUGAL', 'ROMANIA', 'RUSSIA', 'SAN MARINO', 'SERBIA', 'SLOVAKIA',
            'SLOVENIA', 'SPAIN', 'SWEDEN', 'SWITZERLAND', 'UKRAINE'
        ]
    }
}
# column -> values; a row is labelled 'Positive' when ANY of these columns holds one of
# its listed values, otherwise 'Negative'.
config['status_positive_conditions'] = {
    'Candidate State': ['Hired', 'Economic proposal', 'QM'],
    'Event_Feedback': ['OK (live)', 'OK (waiting for departure)', 'OK (hired)']
}
# Columns that are integer-encoded into '<name>_encoded' and then dropped.
# 'Status' is appended to this list by the target-column cell.
config['categorical_columns'] = [
    'Candidate State', 'Event_Feedback', 
    'Residence City', 'Residence Province', 'Residence Region', 'Residence State', 'European Residence', 'Italian Residence',
    'Age Range', 'Sex',
    'Protected Category', 'Study Area', 'Study Title',
    'Years Experience', 'Sector', 'Job Family Hiring',
    'Job Title Hiring', 'Overall',
    'Minimum Ral', 'Ral Maximum', 'Study Level',
    'Current Ral', 'Expected Ral'
]
# Explicit category order for ordinal columns: the position in the list becomes the code.
# Labels must match the data exactly (case included); any value not listed is encoded as -1.
config['categorical_columns_custom_orders'] = {
    # 'In selection' uses the same sentence-case spelling as the other state labels
    # ('First contact', 'Economic proposal') and as the stage-wise chi-squared cell.
    'Candidate State': ['Imported', 'First contact', 'In selection', 'QM', 'Vivier', 'Economic proposal', 'Hired'],
    'Age Range': ['< 20 years', '20 - 25 years', '26 - 30 years', '31 - 35 years', '36 - 40 years', '40 - 45 years', '> 45 years'],
    'Years Experience': ['Not Specified', '[0]', '[0-1]', '[1-3]', '[3-5]', '[5-7]', '[7-10]', '[+10]'],
    'Study Title': ['Middle school diploma', 'High school graduation', 'Professional qualification', 'Three-year degree', 'master\'s degree', 'Five-year degree', 'Doctorate'],
    # NOTE(review): no entry of config['categorical_columns'] is named 'RAL', so this order is
    # never picked up by the encoding loop — presumably meant for the '* Ral' columns; confirm.
    'RAL': ['Not Specified', '-20k', '20-22k', '22-24k', '24-26k', '26-28k', '28-30k', '30-32k', '32-34k', '34-36k', '36-38k', '38-40k', '40-42k', '42-44k', '44-46k', '46-48k', '48-50k', '+50k']
}
# Original (un-encoded) column names whose distributions are plotted.
config['visualize_columns'] = ['Age Range', 'Sex', 'Protected Category', 'Study Area', 'Study Title', 'Years Experience']
# Encoded column names for which the share of positive Status per group is reported.
config['sensitive_columns'] = ['Sex_encoded', 'Age Range_encoded', 'Protected Category_encoded']



## Data Preprocessing

### Clean Dataset

In [None]:
# When an Id occurs more than once, only its last occurrence is kept.
df = df.drop_duplicates(subset=['Id'], keep='last')
df.head()

In [None]:
# Remove the columns listed in the configuration.
df = df.drop(config['drop_columns'], axis=1)
df.head()

In [None]:
# Discard the rows whose value in `col` is one of the configured values.
for col, remove_list in config['drop_rows'].items():
    keep_row = ~df[col].isin(remove_list)
    df = df.loc[keep_row]
df.head()

In [None]:
# Apply the configured value replacements, one column at a time.
df = df.assign(**{
    col: df[col].replace(mapping)
    for col, mapping in config['remap_rows'].items()
})
df.head()

In [None]:
# List the distinct values of every column to check the cleaning steps.
for feature, values in df.items():
    print(f'Feature: {feature} -- {list(values.unique())}')

### Handle the NaNs

In [None]:
# Inspect every column that still contains NaNs so a sensible default fill value can be chosen.
nan_columns = df.columns[df.isnull().any()].tolist()
print(f'Columns that contain NaN values:\n {nan_columns}')

for col in nan_columns:
  print(f'{col} values: {df[col].unique()} \n')

In [None]:
# Drop the columns whose share of missing values exceeds the configured threshold.
# The threshold is stored as a string, so it is converted with float(): neither str nor
# float has an `.astype` method.
# NOTE(review): the configured value ('100.0') is read as a percentage, so the NaN share
# is expressed in percent as well — confirm this is the intended unit.
nan_threshold_pct = float(config['drop_nan_columns_threshold'])
nan_share_pct = df.isna().mean() * 100
unuseful_columns = nan_share_pct[nan_share_pct > nan_threshold_pct].index.tolist()

df = df.drop(columns=unuseful_columns)
df.head()

In [None]:
# Fill the NaNs column by column using config['fill_nan_columns'].
# '%MEAN%' is a sentinel for "rounded mean of the column"; any other filler is used as-is.
for col, filler in config['fill_nan_columns'].items():
    if col not in df.columns:
        # The config may list columns that are absent from this version of the dataset.
        print(f'Skipping NaN fill for missing column: {col!r}')
        continue
    fill_value = round(df[col].mean()) if filler == '%MEAN%' else filler
    # Assign the result back instead of a chained `inplace` fill, which may act on a copy.
    df[col] = df[col].fillna(fill_value)
df.head()

In [None]:
# Total number of missing cells left in the frame.
remaining_nans = df.isna().to_numpy().sum()
print(f'There are {remaining_nans} NANs')

### Features Reformatting

In [None]:
# Show the raw Residence strings that the feature remapping below has to parse.
print(df['Residence'].unique())

In [None]:
def gen_lists(df, specs):
    """Build the sorted candidate lists described by `specs`.

    Each spec reads the distinct non-null string values of column `spec['src']`
    (or a previously built list with that name), drops the values containing any
    'exc' substring, keeps only the values containing at least one 'inc' substring
    when 'inc' is given, and then applies the optional 'split' and 'post' steps,
    each a (separator, index) pair selecting one piece of the split value.

    Returns a dict mapping each list name to its sorted, de-duplicated items.
    """
    built = {}
    for list_name, spec in specs.items():
        column_values = df[spec['src']].dropna().astype(str).unique()
        candidates = built.get(spec['src'], column_values)
        excluded = spec.get('exc', [])
        must_include = 'inc' in spec

        # Filter on the exclusion / inclusion substrings.
        selected = []
        for value in candidates:
            if any(token in value for token in excluded):
                continue
            if must_include and not any(token in value for token in spec['inc']):
                continue
            selected.append(value)

        # Keep one piece of each value; values without the separator are left untouched.
        for step in ('split', 'post'):
            if step not in spec:
                continue
            separator, position = spec[step]
            selected = [
                value.split(separator)[position] if separator in value else value
                for value in selected
            ]
        built[list_name] = sorted(set(selected))
    return built

def apply_field(val, p, lists, feature_cfg):
    """Compute one derived field value from `val` according to the field spec `p`.

    - 'list': return the first item of `lists[p['list']]` that is a substring of
      `str(val)`, or `p['def']` when none matches.
    - 'in'  : return `p['y']` when `val` belongs to `feature_cfg[p['in']]`, else `p['n']`.
    - 'eq'  : return `p['y']` when `str(val)` equals `p['eq']`, else `p['n']`.

    Returns None when the spec contains none of these keys.
    """
    # Substring match against a generated list.
    if 'list' in p:
        fallback = p['def']
        text = str(val)
        for candidate in lists[p['list']]:
            if candidate in text:
                return candidate
        return fallback
    # Membership in a list stored in the feature configuration.
    if 'in' in p:
        is_member = val in feature_cfg[p['in']]
        return p['y'] if is_member else p['n']
    # Exact comparison.
    if 'eq' in p:
        if str(val) == p['eq']:
            return p['y']
        return p['n']
    return None

# Run the remapping for every feature declared in the configuration.
for feature, feat_cfg in config['feature_remapping'].items():
    # Build the candidate lists for this feature from the data.
    lists = gen_lists(df, feat_cfg['lists'])
    # Derive each output column from its source column.
    for col_name, col_cfg in feat_cfg['fields'].items():
        source_values = df[col_cfg['src']]
        df[col_name] = source_values.apply(apply_field, args=(col_cfg, lists, feat_cfg))
    # Drop the original column once its derived columns exist.
    df = df.drop(columns=feature, errors='ignore')
df.head()


### Target Column

In [None]:
# Register the target as a categorical column exactly once, so re-running this cell
# does not append a duplicate 'Status' entry.
if 'Status' not in config['categorical_columns']:
    config['categorical_columns'].append('Status')

# A row is Positive when at least one configured column holds one of its positive values.
mask = pd.Series(False, index=df.index)
for col, valid_values in config['status_positive_conditions'].items():
    mask |= df[col].isin(valid_values)
df['Status'] = np.where(mask, 'Positive', 'Negative')
df.head()

### Categorical columns

In [None]:
# label -> integer code, per original column name; used later to translate codes back.
encoding_mappings = {}

for col in config['categorical_columns']:
    custom_order = config['categorical_columns_custom_orders'].get(col)
    if custom_order is not None:
        # Ordinal column: the code is the position of the label in the configured order.
        codes = pd.Categorical(df[col], categories=custom_order, ordered=True).codes
        # Labels missing from the configured order (and NaNs) are encoded as -1; report
        # them instead of letting them slip through silently.
        unmapped = sorted(df.loc[codes == -1, col].astype(str).unique())
        if unmapped:
            print(f"WARNING: {col!r} has values missing from its custom order (encoded as -1): {unmapped}")
        df[f"{col}_encoded"] = codes
        encoding_mappings[col] = {cat: i for i, cat in enumerate(custom_order)}
    else:
        # Nominal column: LabelEncoder numbers the sorted distinct labels from 0.
        encoder = LabelEncoder()
        df[f"{col}_encoded"] = encoder.fit_transform(df[col].astype(str))
        encoding_mappings[col] = {label: int(code) for code, label in enumerate(encoder.classes_)}

# Only the encoded versions are kept.
df = df.drop(columns=config['categorical_columns'])
df.head()

In [None]:
# List the distinct values of every column after encoding.
for feature, values in df.items():
    print(f'Feature: {feature} -- {list(values.unique())}')

### Visualize Data

In [None]:
# One bar chart per configured column, with the encoded values translated back to labels.
for lookup in config['visualize_columns']:
    label_by_code = {code: label for label, code in encoding_mappings[lookup].items()}
    distrib = Counter(df[f"{lookup}_encoded"])
    print(f"Distribution of {lookup}: {distrib}")
    # Rows follow the code order, i.e. the configured / encoder order of the labels.
    rows = [(label_by_code[code], distrib[code]) for code in sorted(label_by_code)]
    distrib_df = pd.DataFrame(rows, columns=[lookup, 'Count'])
    ax = distrib_df.head(20).plot(x=lookup, y='Count', kind='bar', legend=False)
    ax.set_title(lookup)

#### Correlation Matrix

In [None]:
plt.figure(figsize=(18, 12))
# Correlate numeric columns only: `DataFrame.corr()` raises on string columns in
# pandas >= 2.0, and any column that was not encoded would otherwise break this cell.
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr().round(2), annot=True, cmap='coolwarm', center=0, linewidths=.5)
plt.title('Correlation matrix');

### Visualize Percentage of Positive Status inside each class

In [None]:
# Show the label -> code mapping of every encoded column.
print(encoding_mappings)

In [None]:
# code -> label for the target; the mean of Status_encoded per group is the share of code 1.
status_map = {v: k for k, v in encoding_mappings['Status'].items()}
for feature in config['sensitive_columns']:
    cat_map = {v: k for k, v in encoding_mappings[feature.replace('_encoded', '')].items()}
    percentage = df.groupby(feature)['Status_encoded'].mean().mul(100).round(2) # Positive
    for cat, perc in percentage.items():
        # Codes without a label (e.g. -1 for values outside a custom order) are reported
        # explicitly instead of raising a KeyError.
        cat_label = cat_map.get(cat, f'<unmapped code {cat}>')
        print(f"Feature: {feature} - Feature Val: {cat_label} - Status Val: {status_map[1]}-> {perc:.2f}%")
        print(f"Feature: {feature} - Feature Val: {cat_label} - Status Val: {status_map[0]}-> {(100 - perc):.2f}%")

## Save to File

In [None]:
# Persist the fully encoded frame (without the index) next to the raw dataset.
df.to_excel(PATH + 'data/Dataset_Preprocessed.xlsx', index=False)

## Chi-squared Tests

In [None]:
# Contingency tables: each attribute crossed with the encoded Status.
contingency_sex    = pd.crosstab(df['Sex_encoded'], df['Status_encoded'])
contingency_age    = pd.crosstab(df['Age Range_encoded'], df['Status_encoded'])
contingency_region = pd.crosstab(df['Residence Region_encoded'], df['Status_encoded'])

# Tables to test, keyed by the label used in the report.
tables = dict([
    ('Sex', contingency_sex),
    ('Age Range', contingency_age),
    ('Residence Region', contingency_region),
])
for var, table in tables.items():
    chi2, p, dof, expected = chi2_contingency(table, correction=False)

    # For a 2x2 table with an expected count below 5 the p-value comes from Fisher's exact test.
    use_fisher = table.shape == (2,2) and (expected < 5).any()
    if use_fisher:
        _, p = fisher_exact(table)
    test_name = "Fisher's exact" if use_fisher else 'Chi-squared'

    # Effect size (Cramér's V).
    n = table.values.sum()
    k = min(table.shape)
    cramer_v = np.sqrt(chi2 / (n * (k-1)))

    # Report.
    expected_df = pd.DataFrame(expected, index=table.index, columns=table.columns)
    if p < 0.05:
        conclusion = "Conclusion: Significant association between two variables (Dependent)"
    else:
        conclusion = "Conclusion: No significant association between two variables (Independent)"
    print(f"--- {var} ---")
    print("Expected frequencies:")
    print(expected_df)
    print()
    print(f"{test_name}: χ² = {chi2:.2f}, p = {p:.3f}, dof = {dof}, Cramér’s V = {cramer_v:.3f}")
    print(conclusion)
    print()
    

In [None]:
# List the distinct values of every column before the stage-wise tests.
for feature, values in df.items():
    print(f'Feature: {feature} -- {list(values.unique())}')

In [None]:
# For every stage of the selection funnel, compare the candidates who stopped at that
# stage with those who reached any later stage, split by a sensitive attribute.
selection_order = [
    'Imported', 'In selection', 'First contact',
    'QM', 'Vivier', 'Economic proposal', 'Hired'
]
lookouts = ['Sex_encoded', 'Age Range_encoded', 'Residence Region_encoded']

# Case-insensitive label -> code lookup, so a capitalisation difference between this
# list and the encoding configuration does not silently drop a stage.
state_code_lookup = {
    str(label).lower(): code
    for label, code in encoding_mappings['Candidate State'].items()
}

for lookout in lookouts:
    contingency_tables = {}
    cat_map = {v: k for k, v in encoding_mappings[lookout.replace('_encoded', '')].items()}
    for i, state in enumerate(selection_order):
        state_code = state_code_lookup.get(state.lower())
        if state_code is None:
            continue

        post_state_codes = [
            state_code_lookup[s.lower()]
            for s in selection_order[i+1:]
            if s.lower() in state_code_lookup
        ]
        if not post_state_codes:
            continue

        df_state      = df[df['Candidate State_encoded'] == state_code]
        df_post_state = df[df['Candidate State_encoded'].isin(post_state_codes)]

        table = pd.DataFrame({
            f'Post {state}': df_post_state.groupby(lookout, observed=True).size(),
            state:            df_state.groupby(lookout, observed=True).size()
        }).fillna(0).astype(int)
        table.index = table.index.map(cat_map)
        contingency_tables[state] = table

    for var, table in contingency_tables.items():
        if table.empty or table.values.sum() == 0:
            continue
        # chi2_contingency raises when an expected frequency is zero, which happens as soon
        # as a whole row or column is zero (e.g. a stage whose rows were dropped earlier);
        # a table with a single row or column would also make Cramér's V divide by zero.
        if min(table.shape) < 2 or (table.sum(axis=0) == 0).any() or (table.sum(axis=1) == 0).any():
            print(f"--- {var} (by {lookout}) --- skipped: degenerate contingency table\n")
            continue

        chi2, p, dof, expected = chi2_contingency(table, correction=False)
        test_name = 'Chi-squared'
        # For a 2x2 table with an expected count below 5 the p-value comes from Fisher's exact test.
        if table.shape == (2,2) and (expected < 5).any():
            _, p = fisher_exact(table)
            test_name = "Fisher's exact"

        n = table.values.sum()
        k = min(table.shape)
        cramer_v = np.sqrt(chi2 / (n * (k-1)))

        print(f"--- {var} (by {lookout}) ---")
        print(table, "\n")
        print("Expected frequencies:")
        print(pd.DataFrame(expected, index=table.index, columns=table.columns), "\n")
        print(f"{test_name}: χ²={chi2:.2f}, p={p:.3f}, dof={dof}, Cramér’s V={cramer_v:.3f}")
        print("Dependent" if p<0.05 else "Independent", "\n")

## Train

### Dataset Preparation

In [None]:
df = shuffle(df, random_state=random_seed)

# Names produced by the encoding step (the frame has no 'STATUS' or 'sex' column).
target_col = 'Status_encoded'
sex_col = 'Sex_encoded'

# The target is computed directly from these columns, so keeping them as features
# would leak the label into every model.
leaky_cols = [f'{col}_encoded' for col in config['status_positive_conditions']]
feature_df = df.drop(columns=[target_col]).drop(columns=leaky_cols, errors='ignore')

# The estimators and AIF360 datasets used below need numeric inputs only.
non_numeric_cols = feature_df.select_dtypes(exclude=['number', 'bool']).columns.tolist()
if non_numeric_cols:
    print(f'Dropping non-numeric feature columns: {non_numeric_cols}')
    feature_df = feature_df.drop(columns=non_numeric_cols)

# Split including the sensitive attribute (used by the plain models).
X_full = feature_df
y = df[target_col]
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X_full, y, test_size=0.2, random_state=random_seed, stratify=y)

# Split without the sex column, which is carried separately as `s`. Both calls use the
# same seed, test size and stratification.
X = X_full.drop(columns=[sex_col])
s = df[sex_col]
X_train, X_test, y_train, y_test, s_train, s_test = train_test_split(X, y, s, test_size=0.2, random_state=random_seed, stratify=y)

In [None]:
def _build_standard_dataset(features, labels, sex):
    """Return (frame, StandardDataset) with 'target' as the label and 'sex' as the protected attribute."""
    frame = features.copy()
    frame['target'] = labels.values
    frame['sex'] = sex.values
    dataset = StandardDataset(
        frame,
        label_name='target',
        favorable_classes=[1],
        protected_attribute_names=['sex'],
        privileged_classes=[[1]]
    )
    return frame, dataset


# Label 1 is the favourable outcome and sex == 1 the privileged group in both datasets.
train_df, train_ds = _build_standard_dataset(X_train, y_train, s_train)
test_df, test_ds = _build_standard_dataset(X_test, y_test, s_test)

In [None]:
# Learning Fair Representations. The iteration cap is an argument of `fit` (`maxiter`),
# not of the constructor, which rejects a `max_iter` keyword.
lfr = LFR(unprivileged_groups=[{'sex': 0}], privileged_groups=[{'sex': 1}], k=10, Ax=5, Ay=5, Az=10, verbose=1, seed=random_seed)

lfr = lfr.fit(train_ds, maxiter=50)
X_train_lfr = lfr.transform(train_ds)
X_train_lfr_df = pd.DataFrame(X_train_lfr.features, columns=train_ds.feature_names)
X_test_lfr_df = pd.DataFrame(lfr.transform(test_ds).features, columns=train_ds.feature_names)

# A plain classifier is trained on the transformed features with the original labels.
clf = LogisticRegression(solver='liblinear')
clf.fit(X_train_lfr_df, y_train)
preds_lfr = clf.predict(X_test_lfr_df)

In [None]:
# CorrelationRemover needs the sensitive column inside the matrix it transforms, so it is
# re-attached to the feature frames (X_train / X_test were built without it).
X_train_with_sex = X_train.assign(sex=s_train)
X_test_with_sex = X_test.assign(sex=s_test)

cr = CorrelationRemover(sensitive_feature_ids=['sex'])

# The transformer returns a plain array without the sensitive column.
cr_columns = [col for col in X_train_with_sex.columns if col != 'sex']

X_train_cr = cr.fit_transform(X_train_with_sex)
X_train_cr_df = pd.DataFrame(X_train_cr, columns=cr_columns, index=X_train.index)

X_test_cr  = cr.transform(X_test_with_sex)
X_test_cr_df = pd.DataFrame(X_test_cr, columns=cr_columns, index=X_test.index)

In [None]:
# GerryFairClassifier takes no group, `estimator` or `constraints` arguments: the protected
# attributes come from the dataset, the base learner is a regressor passed as `predictor`,
# and the supported fairness definitions are 'FP' and 'FN' (false positive / negative rate).
gfc = GerryFairClassifier(
    C=10,
    max_iters=10,
    gamma=0.01,
    fairness_def='FP',
    predictor=LinearRegression()
)
gfc.fit(train_ds)
# `predict` returns a dataset; the predicted labels are in `pred_gfc.labels`.
pred_gfc = gfc.predict(test_ds)

In [None]:
# Prejudice Remover in-processing model, with 'sex' as the sensitive attribute.
pr = PrejudiceRemover(eta=25.0, sensitive_attr='sex')
pr.fit(train_ds)
pred_pr = pr.predict(test_ds)

In [None]:
# MetaFairClassifier is configured through `sensitive_attr` only; its constructor has no
# `unprivileged_groups` / `privileged_groups` parameters.
mfc = MetaFairClassifier(
    tau=0.8,
    sensitive_attr='sex'
)
mfc.fit(train_ds)
pred_mfc = mfc.predict(test_ds)

In [None]:
# Equalized-odds post-processing of the GerryFair predictions.
eop = EqOddsPostprocessing(
    unprivileged_groups=[{'sex': 0}],
    privileged_groups=[{'sex': 1}],
    seed=random_seed  # the method flips labels at random, so seed it for reproducibility
)
eop = eop.fit(train_ds, gfc.predict(train_ds))
# The post-processor must adjust the classifier's predictions on the test set; passing
# `test_ds` itself would post-process the ground-truth labels.
pred_eop = eop.predict(gfc.predict(test_ds))

In [None]:
def _gfc_scored_predictions(dataset):
    """Return a copy of `dataset` holding gfc's soft outputs as scores and 0.5-thresholded labels.

    RejectOptionClassification works on `scores`; a dataset returned by `gfc.predict`
    with the default threshold only has its labels replaced.
    """
    soft = gfc.predict(dataset, threshold=None)  # threshold=None keeps the un-thresholded outputs
    scored = dataset.copy(deepcopy=True)
    scored.scores = np.asarray(soft.labels, dtype=float).reshape(-1, 1)
    scored.labels = np.where(scored.scores >= 0.5, dataset.favorable_label, dataset.unfavorable_label)
    return scored


roc = RejectOptionClassification(
    unprivileged_groups=[{'sex': 0}],
    privileged_groups=[{'sex': 1}],
    low_class_thresh=0.01,
    high_class_thresh=0.99,
    num_class_thresh=100,
    metric_name='Average odds difference'
)
roc = roc.fit(train_ds, _gfc_scored_predictions(train_ds))
# Post-process the classifier's test predictions, not the ground-truth dataset.
pred_roc = roc.predict(_gfc_scored_predictions(test_ds))

In [None]:
def compute_fairness_metrics(y_true, y_pred, sensitive_features, label=None):
    """Print per-group and overall fairness metrics and return the MetricFrame.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted binary labels.
    sensitive_features : array-like
        Group membership of every sample.
    label : str, optional
        Heading printed before the report.

    Returns
    -------
    MetricFrame
        Frame holding selection rate, FPR, FNR and count per group.
    """
    # Only per-group metrics belong in the MetricFrame. The two "difference" metrics
    # need `sensitive_features` themselves, so they are computed separately.
    mf = MetricFrame(
        metrics={
            'selection_rate': selection_rate,
            'fpr': false_positive_rate,
            'fnr': false_negative_rate,
            'count': count
        },
        y_true=y_true,
        y_pred=y_pred,
        sensitive_features=sensitive_features
    )
    dp_diff = demographic_parity_difference(y_true, y_pred, sensitive_features=sensitive_features)
    eo_diff = equalized_odds_difference(y_true, y_pred, sensitive_features=sensitive_features)

    if label:
        print(f"=== {label} ===")
    print(mf.by_group)
    print("Overall:", mf.overall, "\n")
    print(f"dp_diff: {dp_diff:.4f} | eo_diff: {eo_diff:.4f}\n")
    return mf

In [None]:
# The AIF360 estimators return dataset objects: their predicted labels live in `.labels`.

# Pre-Processing
compute_fairness_metrics(y_test, preds_lfr, s_test, label="LFR + LogisticRegression")

# In-processing
compute_fairness_metrics(y_test, pred_gfc.labels.ravel(), s_test, label="GerryFairClassifier")
compute_fairness_metrics(y_test, pred_pr.labels.ravel(), s_test, label="PrejudiceRemover")
compute_fairness_metrics(y_test, pred_mfc.labels.ravel(), s_test, label="MetaFairClassifier")

# Post-processing
compute_fairness_metrics(y_test, pred_eop.labels.ravel(), s_test, label="EqOddsPostprocessing")
compute_fairness_metrics(y_test, pred_roc.labels.ravel(), s_test, label="RejectOptionClassification")

### Models

In [None]:
def create_model(seed, input_dim=22):
    """Build and compile the feed-forward binary classifier.

    Parameters
    ----------
    seed : int
        Seed passed to `tf.random.set_seed` before the layers are created.
    input_dim : int, default 22
        Number of input features; pass the width of the training matrix.

    Returns
    -------
    A compiled Keras `Sequential` model with a single sigmoid output.
    """
    tf.random.set_seed(seed)
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(64, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Estimators with internal randomness are seeded, and the network is sized from the
# actual training matrix instead of a hard-coded feature count.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=random_seed),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(random_state=random_seed),
    'KNN': KNeighborsClassifier(),
    'Neural Network': create_model(random_seed, input_dim=X_train_full.shape[1])
}

In [None]:
metric_rows = []
predictions = {}

# Keras is fed float32 arrays (needed for `validation_split` across Keras versions).
X_train_nn = X_train_full.to_numpy(dtype='float32')
X_test_nn = X_test_full.to_numpy(dtype='float32')
y_train_nn = y_train_full.to_numpy(dtype='float32')

for name, model in models.items():
    if name == 'Neural Network':
        model.fit(X_train_nn, y_train_nn, epochs=15, batch_size=64, validation_split=0.2)
        # Keras returns an (n, 1) array; flatten it so it fits in a DataFrame column.
        y_score = model.predict(X_test_nn).ravel()
    else:
        model.fit(X_train_full, y_train_full)
        if hasattr(model, 'predict_proba'):
            # Probability of the positive class.
            y_score = model.predict_proba(X_test_full)[:, 1]
        else:
            # LinearRegression: the raw regression output is used as the score.
            y_score = np.asarray(model.predict(X_test_full)).ravel()

    # Hard labels at the 0.5 threshold.
    y_pred = (y_score > 0.5).astype(int)

    metric_rows.append({
        'Model': name,
        'Accuracy': round(accuracy_score(y_test_full, y_pred), 3),
        'Precision': round(precision_score(y_test_full, y_pred), 3),
        'Recall': round(recall_score(y_test_full, y_pred), 3),
        'F1-score': round(f1_score(y_test_full, y_pred), 3),
        # ROC AUC is a ranking metric, so it is computed on the scores, not the hard labels.
        'ROC AUC': round(roc_auc_score(y_test_full, y_score), 3)
    })
    predictions[name] = y_pred

metrics = pd.DataFrame(metric_rows)
# One column of hard predictions per model (the KNN column keeps its historical 'kNN' name).
predictions_df = pd.DataFrame(predictions).rename(columns={'KNN': 'kNN'})

## Fairness Metrics

#### **3.1 Demographic Parity**

In [None]:
# Column names are taken from the configuration and the actual test matrix: header
# whitespace was stripped at load time, so names with a leading space never match.
sensitive_features = [col for col in config['sensitive_columns'] if col in X_test_full.columns]
non_sensitive_features = [col for col in X_test_full.columns if col not in sensitive_features]
models_list = list(models)
# Maximum accepted gap between two groups (doubled for the equalized-odds rates).
tolerance = 0.15
# p-value threshold of the chi-squared test used when there are more than two groups.
significance_level = 0.1

In [None]:
def calculate_demographic_parity(predictions, sensitive_attribute, name, significance_level, tolerance, activate_check=False):
    """Check demographic parity of `predictions` across the groups of `sensitive_attribute`.

    With exactly two groups the gap between their mean predictions must not exceed
    `tolerance`. Otherwise a chi-squared test of independence between predictions and
    groups is run, and parity holds when its p-value is above `significance_level`.

    Parameters
    ----------
    predictions : array-like
        Predicted labels; an (n, 1) array is accepted and flattened.
    sensitive_attribute : array-like
        Group of every sample, in the same order as `predictions`.
    name : str
        Label printed with the diagnostics.
    significance_level, tolerance : float
        Thresholds described above.
    activate_check : bool, default False
        Print the per-group rates and a note on sparse contingency tables.

    Returns
    -------
    'T' when parity holds, False otherwise.
    """
    # Pair the inputs by position: flattening copes with (n, 1) network outputs and
    # ignores any pandas index, which would otherwise be used for alignment.
    frame = pd.DataFrame({
        'predictions': np.asarray(predictions).ravel(),
        'sensitive_attribute': np.asarray(sensitive_attribute).ravel()
    })
    prop = frame.groupby('sensitive_attribute')['predictions'].mean()

    if activate_check:
        print(f"===\n{name}\n{prop}")

    if prop.shape[0] == 2:
        return 'T' if (prop.max() - prop.min()) <= tolerance else False

    contingency_table = pd.crosstab(frame['predictions'], frame['sensitive_attribute'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    if activate_check and (expected < 5).any():
        print(f"Sparse contingency for {name}")

    return 'T' if p > significance_level else False


In [None]:
# One row per model, one column per sensitive feature.
table = [
    [
        calculate_demographic_parity(
            predictions[model],
            X_test_full[sensitive_feature],
            sensitive_feature,
            significance_level,
            tolerance,
            activate_check=True,
        )
        for sensitive_feature in sensitive_features
    ]
    for model in models
]
sf_df = pd.DataFrame(table, index = models_list, columns=sensitive_features)

#### **3.2 Equalized odds**

In [None]:
def calculate_equalized_odds(predictions, true_labels, sensitive_attribute, name, tolerance, activate_check=False):
    """Check equalized odds of `predictions` across the groups of `sensitive_attribute`.

    The true-positive and false-positive rates are computed per group; the check passes
    when the largest gap of each rate is at most `2 * tolerance`.

    Parameters
    ----------
    predictions, true_labels : array-like
        Predicted and true binary labels (0/1); an (n, 1) array is flattened.
    sensitive_attribute : array-like
        Group of every sample, in the same order as the labels.
    name : str
        Label printed with the diagnostics.
    tolerance : float
        Half of the maximum accepted gap.
    activate_check : bool, default False
        Print the largest FPR and TPR gaps.

    Returns
    -------
    'T' when both gaps are within bounds, False otherwise.
    """
    # Pair the inputs by position (see the flattening note in the parity check).
    frame = pd.DataFrame({
        'predictions': np.asarray(predictions).ravel(),
        'true_labels': np.asarray(true_labels).ravel(),
        'sensitive_attribute': np.asarray(sensitive_attribute).ravel()
    })
    tprs, fprs = [], []
    # Group on the column actually present in the frame.
    for _, group_df in frame.groupby('sensitive_attribute'):
        tn, fp, fn, tp = confusion_matrix(group_df['true_labels'], group_df['predictions'], labels=[0, 1]).ravel()
        # A rate with an empty denominator is reported as 0.
        tprs.append(tp / (tp + fn) if tp + fn != 0 else 0)
        fprs.append(fp / (fp + tn) if fp + tn != 0 else 0)

    max_tpr_diff = max(tprs) - min(tprs)
    max_fpr_diff = max(fprs) - min(fprs)

    if activate_check:
        print(f"===\n{name}\nMax FPR diff: {max_fpr_diff}\nMax TPR diff: {max_tpr_diff}")

    return 'T' if (max_tpr_diff <= 2 * tolerance and max_fpr_diff <= 2 * tolerance) else False

In [None]:
# One row per model, one column per sensitive feature.
table = [
    [
        calculate_equalized_odds(
            predictions[model],
            y_test_full,
            X_test_full[sensitive_feature],
            sensitive_feature,
            tolerance,
            activate_check=False,
        )
        for sensitive_feature in sensitive_features
    ]
    for model in models
]
sf_df = pd.DataFrame(table, index = models_list, columns=sensitive_features)