In [None]:
!python -m pip install --upgrade pip
%pip install pandas matplotlib seaborn scikit-learn openpyxl tensorflow xgboost aif360
%pip install "aif360[Reductions, inFairness]"

In [11]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization

from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.inprocessing import AdversarialDebiasing
# Single seed reused for shuffling and for the train/test split further down.
random_seed = 15

pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


In [None]:
# Folder holding the raw Excel export. The historical location stays the default,
# but it can be overridden with the AKKODIS_DATA_DIR environment variable so the
# notebook also runs on other machines.
PATH = os.environ.get(
    'AKKODIS_DATA_DIR',
    'C:/Users/aberti/Desktop/ProjectWork_AEQUITAS_AKKODIS/data/',
)

# Normalise the headers once at load time: drop leading whitespace and title-case
# them (e.g. ' event_feedback' -> 'Event_Feedback'). Every later cell has to use
# the normalised names.
df = (
    pd.read_excel(os.path.join(PATH, 'Dataset_2.0_Akkodis.xlsx'))
      .rename(columns=lambda c: c.lstrip().title())
)
df.head()

### Clean Dataset

In [None]:
# Columns that are not used by the analysis. The names follow the header
# normalisation applied when the file is loaded (leading spaces stripped,
# title-cased), otherwise drop() raises a KeyError.
unuseful_columns = [
    'Id', 'Tag', 'Year Of Insertion', 'Year Of Recruitment', 'Recruitment Request',
    'Assumption Headquarters', 'Event_Type__Val', 'Linked_Search__Key', 'Job Description',
    'Candidate Profile', 'Akkodis Headquarters', 'Standing/Position', 'Unnamed: 0',
    'Residence', 'Last Role', 'Study Area.1', 'Years Experience.1']
df = df.drop(columns=unuseful_columns)

In [None]:
# Quick inventory: print the distinct values found in every remaining column.
for feature, values in df.items():
    distinct_values = list(values.unique())
    print(f'Feature: {feature} -- {distinct_values}')

### Handle the NaNs

In [None]:
# List the columns that contain at least one missing value.
has_missing = df.isnull().any()
columns_with_nans = df.columns[has_missing].tolist()
print(columns_with_nans)

In [None]:
# Placeholder used for every missing categorical value. Column names follow the
# header normalisation applied at load time (stripped + title-cased).
categorical_fill_values = {
    'Citizenship': 'Not Specified',
    'Protected Category': 'Not a protected category',
    'Study Area': 'Not Specified',
    'Sector': 'Unemployed',
    'Job Family Hiring': 'Not Specified',
    'Job Title Hiring': 'Not Specified',
    'Event_Feedback': 'Not Specified',
    'Overall': 'Not Specified',
    'Minimum Ral': 'Not Specified',
    'Ral Maximum': 'Not Specified',
    'Study Level': 'Not Specified',
    'Current Ral': 'Not Specified',
    'Expected Ral': 'Not Specified',
}
for column, placeholder in categorical_fill_values.items():
    df[column] = df[column].fillna(placeholder)

# Both legal articles are collapsed into a single 'Yes' flag.
df['Protected Category'] = df['Protected Category'].replace(['Article 18', 'Article 1'], 'Yes')

# Missing interview scores are imputed with the mean of the column.
score_columns = ['Technical Skills', 'Comunication', 'Maturity', 'Dynamism', 'Mobility', 'English']
for column in score_columns:
    df[column] = df[column].fillna(df[column].mean())

print(f'There are {df.isnull().sum().sum()} NANs')
df.head()

### Features

In [None]:
# Collapses each raw citizenship label into one of two groups,
# 'European' or 'Non-European'.
# NOTE(review): 'Not Specified' (the NaN placeholder) is bucketed as 'Non-European' — confirm this is intended.
# NOTE(review): some keys ('Vltava', 'Indiana', 'Filipina', ...) look like translation artefacts;
# presumably they mirror the raw data — verify against the source file.
citizenship_mapping = {
    'Pakistani': 'Non-European',
    'Italian': 'European',
    'Not Specified': 'Non-European',
    'Moroccan': 'Non-European',
    'Iranian': 'Non-European',
    'Albanian': 'European',
    'Indiana': 'Non-European',
    'Colombian': 'Non-European',
    'Ethiopian': 'Non-European',
    'Romanian': 'European',
    'Vltava': 'European',
    'Lebanese': 'Non-European',
    'Spanish': 'European',
    'Egyptian': 'Non-European',
    'Russian': 'European',
    'Tunisian': 'Non-European',
    'Turkish': 'European',
    'Chinese': 'Non-European',
    'Uzbek': 'Non-European',
    'Brazilian': 'Non-European',
    'Cameroonian': 'Non-European',
    'Sudanese': 'Non-European',
    'Algerian': 'Non-European',
    'Croatian': 'European',
    'Polish': 'European',
    'Indonesian': 'Non-European',
    'San Marino': 'European',
    'Argentina': 'Non-European',
    'Azerbaijan': 'Non-European',
    'Portuguese': 'European',
    'Serbian': 'European',
    'French': 'European',
    'Swiss': 'European',
    'German': 'European',
    'Peruvian': 'Non-European',
    'British': 'European',
    'Venezuelan': 'Non-European',
    'Rwandan': 'Non-European',
    'Costa Rican': 'Non-European',
    'South Korean': 'Non-European',
    'Ukraine': 'European',
    'Macedonian': 'European',
    'Nigerian': 'Non-European',
    'American': 'Non-European',
    'Kenyan': 'Non-European',
    'Emirati': 'Non-European',
    'Ecuadorian': 'Non-European',
    'Ivorian': 'Non-European',
    'Mexican': 'Non-European',
    'Chilean': 'Non-European',
    'Japanese': 'Non-European',
    'Syrian': 'Non-European',
    'Bangladeshis': 'Non-European',
    'Greek': 'European',
    'Israeli': 'Non-European',
    'Omani': 'Non-European',
    'South African': 'Non-European',
    'Bolivian': 'Non-European',
    'Filipina': 'Non-European',
    'Sinhalese': 'Non-European',
    'Palestinian (Palestinian Territories)': 'Non-European',
    'Afghan': 'Non-European',
    'Jordan': 'Non-European',
    'Cuban': 'Non-European',
    'Vietnamese': 'Non-European',
    'Latvian': 'European',
    'Libyan': 'Non-European',
    'Bulgarian': 'European',
    'Togolese': 'Non-European',
    'Kazakh': 'Non-European',
    'Austrian': 'European',
    'Belarusian': 'European',
    'Saudi': 'Non-European',
    'Bosnian': 'European',
    'Kyrgyz': 'Non-European',
    'Tajik': 'Non-European',
    'Dutch': 'European',
    'Qatari': 'Non-European',
    'Georgian': 'European',
    'Canadian': 'Non-European',
    'Australian': 'Non-European',
    'Salvadoran': 'Non-European',
    'Congolese': 'Non-European',
    'Guatemalan': 'Non-European',
    'Hungarian': 'European',
    'Tanzanian': 'Non-European',
    'Gabonese': 'Non-European',
    'Angolan': 'Non-European',
    'Maltese': 'European'
}

# Groups each raw study-area label into a coarse field of study
# ('Engineering', 'Scientific Field', 'NO COLLEGE', ...).
# 'Not Specified' (the NaN placeholder) is folded into 'Other'.
study_area_mapping = {
    'Automation/Mechatronics Engineering': 'Engineering',
    'computer engineering': 'Engineering',
    'chemical engineering': 'Engineering',
    'Legal': 'Law',
    'Mechanical engineering': 'Engineering',
    'Telecommunications Engineering': 'Engineering',
    'Economic - Statistics': 'Economic',
    'Psychology': 'Scientific Field',
    'Materials Science and Engineering': 'Engineering',
    'Other scientific subjects': 'Scientific Field',
    'Biomedical Engineering': 'Engineering',
    'electronic Engineering': 'Engineering',
    'Information Engineering': 'Engineering',
    'Aeronautical/Aerospace/Astronautics Engineering': 'Engineering',
    'Energy and Nuclear Engineering': 'Engineering',
    'Informatics': 'Informatics',
    'Management Engineering': 'Engineering',
    'Automotive Engineering': 'Engineering',
    'industrial engineering': 'Engineering',
    'Other': 'Other',
    'Surveyor': 'NO COLLEGE',
    'Civil/Civil and Environmental Engineering': 'Engineering',
    'Electrical Engineering': 'Engineering',
    'Scientific maturity': 'NO COLLEGE',
    'Chemist - Pharmaceutical': 'Medical Field',
    'Political-Social': 'Other Humanities Subjects',
    'Other humanities subjects': 'Other Humanities Subjects',
    'Geo-Biological': 'Scientific Field',
    'Linguistics': 'Linguistics',
    'Agriculture and veterinary': 'Scientific Field',
    'Literary': 'Other Humanities Subjects',
    'Humanistic high school diploma': 'NO COLLEGE',
    'Accounting': 'NO COLLEGE',
    'Communication Sciences': 'Other Humanities Subjects',
    'Safety Engineering': 'Engineering',
    'Architecture': 'Scientific Field',
    'Mathematics': 'Scientific Field',
    'construction Engineering': 'Engineering',
    'Petroleum Engineering': 'Engineering',
    'Naval Engineering': 'Engineering',
    'Artistic': 'NO COLLEGE',
    'Not Specified': 'Other',
    'Mathematical-physical modeling for engineering': 'Engineering',
    'Engineering for the environment and the territory': 'Engineering',
    'Medical': 'Medical Field',
    'Defense and Security': 'Other',
    'Physical education': 'Other',
    'Statistics': 'Scientific Field',
    'Educational/training sciences': 'Other Humanities Subjects'

}

# Binarises the age brackets: everything up to '31 - 35 years' is 'Young',
# everything from '36 - 40 years' upwards is 'Senior'.
# NOTE(review): the '40 - 45 years' label overlaps '36 - 40 years' at 40;
# presumably this mirrors the raw labels — verify against the dataset.
age_mapping = {
    '< 20 years': 'Young',
    '20 - 25 years': 'Young',
    '26 - 30 years': 'Young',
    '31 - 35 years': 'Young',
    '36 - 40 years': 'Senior',
    '40 - 45 years': 'Senior',
    '> 45 years': 'Senior'
}

# Collapse the raw labels into the coarse groups defined above. Labels that are
# missing from a mapping are left untouched by replace().
# The study-area column is 'Study Area' after the load-time header normalisation.
df['Citizenship'] = df['Citizenship'].replace(citizenship_mapping)
df['Age Range'] = df['Age Range'].replace(age_mapping)
df['Study Area'] = df['Study Area'].replace(study_area_mapping)
df.head()

### Target Column

In [None]:
# Candidates that never entered the selection process carry no outcome signal.
statuses_to_remove = ['First contact', 'Imported']
# .copy() detaches the filtered frame so that adding columns to it later does not
# write into a view of the unfiltered data (SettingWithCopyWarning).
df = df[~df['Candidate State'].isin(statuses_to_remove)].copy()

In [None]:
# Candidate states / event feedback values that count as a positive outcome.
positive_states = ['Hired', 'Economic proposal', 'QM']
positive_feedback = ['OK (live)', 'OK (waiting for departure)', 'OK (hired)']

# Column names follow the load-time header normalisation (no leading space,
# title-cased), hence 'Candidate State' and 'Event_Feedback'.
is_valid_candidate = (
    df['Candidate State'].isin(positive_states)
    | df['Event_Feedback'].isin(positive_feedback)
)
df['STATUS'] = np.where(is_valid_candidate, 1, 0)
# 1 means the candidate is considered valid (even if still not hired), 0 the candidate is not considered valid for some reason

# Bar chart of the class balance of the target.
lookup = 'STATUS'
distrib = Counter(df[lookup])
distrib_df = pd.DataFrame(distrib.items(), columns=[lookup, 'Count'])
distrib_df = distrib_df.sort_values(by='Count', ascending=False)
distrib_df.head(20).plot(x=lookup, y='Count', kind='bar', legend=False)
plt.title(lookup)
plt.xticks(rotation=45)

### Categorical columns

In [None]:
# Categorical features to label-encode. Names follow the load-time header
# normalisation (leading spaces stripped, title-cased).
categorical_columns = ['Age Range', 'Citizenship', 'Sex',
       'Protected Category', 'Study Area', 'Study Title',
       'Years Experience', 'Sector', 'Job Family Hiring',
       'Job Title Hiring', 'Overall',
       'Minimum Ral', 'Ral Maximum', 'Study Level',
       'Current Ral', 'Expected Ral']

# For every column keep the label -> integer code mapping so that the encoded
# values can be translated back later.
encoding_mappings = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    # astype(str) gives the encoder a single comparable type even when a column
    # mixes numbers with the 'Not Specified' placeholder.
    df[f'{column}_encoded'] = encoder.fit_transform(df[column].astype(str))
    encoding_mappings[column] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

# The raw categorical columns and the two columns the target was derived from
# are no longer needed.
df = df.drop(columns=categorical_columns + ['Candidate State', 'Event_Feedback'])

In [None]:
# Preview the frame after encoding.
df.head()

### Visualize Data

#### Citizenship

In [None]:
lookup = 'Citizenship'
encoded_column = lookup + "_encoded"
# Inverse of the label encoding: integer code -> original label.
mapping = {code: label for label, code in encoding_mappings[lookup].items()}

# Count how often each encoded value occurs and plot the most frequent ones.
distrib = Counter(df[encoded_column])
distrib_df = (
    pd.DataFrame(distrib.items(), columns=[encoded_column, 'Count'])
      .sort_values(by='Count', ascending=False)
)
distrib_df.head(20).plot(x=encoded_column, y='Count', kind='bar', legend=False)
plt.title(lookup)
plt.xticks(rotation=45)

#### Age

In [None]:
# The feature name must be stored in `lookup`, the variable the plotting code
# reads; otherwise the previous feature is plotted again.
lookup = 'Age Range'
# Inverse of the label encoding: integer code -> original label.
mapping = {v: k for k, v in encoding_mappings[lookup].items()}

# Count how often each encoded value occurs and plot the most frequent ones.
distrib = Counter(df[lookup + "_encoded"])
distrib_df = pd.DataFrame(distrib.items(), columns=[lookup + "_encoded", 'Count'])
distrib_df = distrib_df.sort_values(by='Count', ascending=False)
distrib_df.head(20).plot(x=lookup + "_encoded", y='Count', kind='bar', legend=False)
plt.title(lookup)
plt.xticks(rotation=45)

#### Gender

In [None]:
# The feature name must be stored in `lookup`, the variable the plotting code
# reads; otherwise the previous feature is plotted again.
lookup = 'Sex'
# Inverse of the label encoding: integer code -> original label.
mapping = {v: k for k, v in encoding_mappings[lookup].items()}

# Count how often each encoded value occurs and plot the most frequent ones.
distrib = Counter(df[lookup + "_encoded"])
distrib_df = pd.DataFrame(distrib.items(), columns=[lookup + "_encoded", 'Count'])
distrib_df = distrib_df.sort_values(by='Count', ascending=False)
distrib_df.head(20).plot(x=lookup + "_encoded", y='Count', kind='bar', legend=False)
plt.title(lookup)
plt.xticks(rotation=45)

#### Protected Category

In [None]:
# The feature name must be stored in `lookup`, the variable the plotting code
# reads; otherwise the previous feature is plotted again. The column is
# 'Protected Category' after the load-time header normalisation.
lookup = 'Protected Category'
# Inverse of the label encoding: integer code -> original label.
mapping = {v: k for k, v in encoding_mappings[lookup].items()}

# Count how often each encoded value occurs and plot the most frequent ones.
distrib = Counter(df[lookup + "_encoded"])
distrib_df = pd.DataFrame(distrib.items(), columns=[lookup + "_encoded", 'Count'])
distrib_df = distrib_df.sort_values(by='Count', ascending=False)
distrib_df.head(20).plot(x=lookup + "_encoded", y='Count', kind='bar', legend=False)
plt.title(lookup)
plt.xticks(rotation=45)

#### Correlation Matrix

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
# with the colour scale centred on zero.
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, linewidths=.5)

### Visualize Percentage of Hired inside each class

In [None]:
sensitive = ['Sex_encoded', 'Age Range_encoded', 'Citizenship_encoded', 'Protected Category_encoded']
for feature in sensitive:
    # Share (in %) of rows with STATUS == 1 inside each group of the feature.
    percentage = df.groupby(feature)['STATUS'].mean().mul(100).round(2)
    for category, perc in percentage.items():
        # Format the scalar `perc`; formatting the whole Series with ':.2f' raises a TypeError.
        print(f"Percentage of elements where {feature} is {category} and STATUS is HIRED: {perc:.2f}%")

## **Task 2 - Algorithms**

In [None]:
# Shuffle the rows with a fixed seed
df = shuffle(df, random_state=random_seed)

# Target first, then everything else as the feature matrix
y = df['STATUS']
X = df.drop(columns=['STATUS'])

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_seed,
)

##### **2.1 Machine Learning models**

In [None]:
# Baseline models, keyed by the display name used in the result tables.
# Estimators that accept a seed get `random_seed` so the results are
# reproducible from one run to the next.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=random_seed),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(random_state=random_seed),
    'KNN': KNeighborsClassifier(),
}

In [None]:
metrics = []
predictions = {}

# Fit models and evaluate
for name, model in models.items():
    model.fit(X_train, y_train)

    if hasattr(model, 'predict_proba'):
        # Classifiers: hard labels for the threshold metrics, positive-class
        # probability as the ranking score for ROC AUC.
        y_pred = model.predict(X_test)
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        # Regressor used as a classifier: the continuous output is the score and
        # is cut at 0.5 to obtain the label.
        y_score = model.predict(X_test)
        y_pred = (y_score > 0.5).astype(int)

    # Store predictions
    predictions[name] = y_pred

    # ROC AUC is computed from the scores: feeding it hard 0/1 labels collapses
    # the curve to a single operating point.
    metrics.append({
        'Model': name,
        'Accuracy': round(accuracy_score(y_test, y_pred), 3),
        'Precision': round(precision_score(y_test, y_pred), 3),
        'Recall': round(recall_score(y_test, y_pred), 3),
        'F1-score': round(f1_score(y_test, y_pred), 3),
        'ROC AUC': round(roc_auc_score(y_test, y_score), 3)
    })

metrics = pd.DataFrame(metrics)
metrics.head()

In [None]:
# One column of test-set predictions per classical model.
# NOTE(review): the KNN column is labelled 'kNN' here while the `models` /
# `predictions` dicts use 'KNN' — confirm which spelling downstream code expects.
ml_model_names = ['Linear Regression', 'Decision Tree', 'Naive Bayes', 'XGBoost', 'KNN']
predictions_df = pd.DataFrame(
    {model_name: predictions[model_name] for model_name in ml_model_names}
).rename(columns={'KNN': 'kNN'})

##### **2.2 Neural Network**

In [None]:
def create_model(input_dim=22):
    """Build and compile the fully connected binary classifier.

    The network stacks four ReLU Dense layers (128, 128, 128, 64 units), each
    followed by BatchNormalization, and ends with a single sigmoid unit.

    Parameters
    ----------
    input_dim : int, default 22
        Number of input features. The default keeps the previously hard-coded
        width, so existing `create_model()` calls behave as before.

    Returns
    -------
    A compiled Sequential model (Adam optimizer, binary cross-entropy loss,
    accuracy metric).
    """
    model = Sequential()

    # First hidden layer also declares the input width.
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(BatchNormalization())

    # Remaining hidden layers, each followed by batch normalisation.
    for units in (128, 128, 64):
        model.add(Dense(units, activation='relu'))
        model.add(BatchNormalization())

    # Single sigmoid unit -> probability of the positive class.
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Seeds for the ensemble: one freshly initialised network per seed (7 in total)
ensemble_seeds = range(85, 92)

# Collected, compiled (not yet fitted) networks
neural_models = []

for seed in ensemble_seeds:
    # Seed both NumPy and TensorFlow before building each network
    np.random.seed(seed)
    tf.random.set_seed(seed)
    model = create_model()
    neural_models.append(model)

In [None]:
# Train every network and keep its training history
histories = []
for model_number, model in enumerate(neural_models, start=1):
    print(f"Fitting model {model_number}...")
    # 20% of the training rows are held out by Keras for validation
    history = model.fit(
        X_train,
        y_train,
        epochs=15,
        batch_size=64,
        validation_split=0.2,
    )
    histories.append(history)
    print(f"Model {model_number} fitted.\n")

In [None]:
# Check training procedure: accuracy and loss curves of the first network,
# one figure per metric with the train and validation series overlaid.
first_history = histories[0].history

for metric_key, axis_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(first_history[metric_key])
    plt.plot(first_history['val_' + metric_key])
    plt.title('Model 1 ' + metric_key)
    plt.ylabel(axis_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

In [None]:
# Hard 0/1 predictions of every network on the test set
neural_predictions = []

for model_number, model in enumerate(neural_models, start=1):
    print(f"Predicting with model {model_number}...")
    probabilities = model.predict(X_test)
    # Sigmoid output above 0.5 is labelled as the positive class
    y_pred = (probabilities > 0.5).astype("int32")
    neural_predictions.append(y_pred)
    print(f"Predictions from model {model_number} stored.\n")

In [None]:
nn_metrics = []

for i, (model, y_pred) in enumerate(zip(neural_models, neural_predictions)):
    # ROC AUC needs a ranking score, not hard labels: use the sigmoid output of
    # the network. The threshold metrics keep using the stored 0/1 predictions.
    y_score = model.predict(X_test, verbose=0).ravel()

    nn_metrics.append({
        "Model": f"Neural Network {i+1}",
        "Accuracy": round(accuracy_score(y_test, y_pred), 3),
        "Precision": round(precision_score(y_test, y_pred), 3),
        "Recall": round(recall_score(y_test, y_pred), 3),
        "F1-score": round(f1_score(y_test, y_pred), 3),
        "ROC AUC": round(roc_auc_score(y_test, y_score), 3)
    })

# Display the 7 models performances
nn_metrics = pd.DataFrame(nn_metrics)
nn_metrics

In [None]:
# Stack the classical-model metrics on top of the neural-network metrics,
# renumbering the rows from 0.
combined_metrics = pd.concat([metrics, nn_metrics], ignore_index=True)
combined_metrics

In [None]:
# Register every neural network, together with its flattened test-set
# predictions, next to the classical models.
for position, (model, prediction_list) in enumerate(zip(neural_models, neural_predictions), start=1):
    label = f"Neural Network {position}"
    models[label] = model
    predictions_df[label] = prediction_list.flatten()
    predictions[label] = prediction_list.flatten()

## **Task 3 - Fairness Metrics**

#### **3.1 Demographic Parity**

In [None]:
# Column groups of interest. The names match the columns produced by the
# encoding step on the normalised headers (no leading spaces, title-cased).
# 'Years Experience.1' is dropped during cleaning, so it has no encoded column.
sensitive_features = ['Sex_encoded', 'Age Range_encoded', 'Citizenship_encoded', 'Protected Category_encoded']
non_sensitive_features = ['Technical Skills', 'Comunication', 'Maturity', 'Dynamism', 'Mobility',
       'English', 'Study Area_encoded', 'Study Title_encoded', 'Years Experience_encoded', 'Sector_encoded', 'Job Family Hiring_encoded',
       'Job Title Hiring_encoded', 'Overall_encoded', 'Minimum Ral_encoded', 'Ral Maximum_encoded',
       'Study Level_encoded', 'Current Ral_encoded', 'Expected Ral_encoded']

# Model names in insertion order; used as row labels of the fairness tables.
models_list = list(models)

# Thresholds
tolerance = 0.15           # maximum accepted gap between group rates
significance_level = 0.1   # p-value threshold of the chi-square test

In [None]:
def calculate_demographic_parity(predictions, sensitive_attribute, name, significance_level, tolerance, activate_check=False):
    """Check demographic parity of binary predictions across the groups of a feature.

    Parameters
    ----------
    predictions : array-like
        Predicted labels (0/1), one per sample.
    sensitive_attribute : array-like
        Group membership of every sample, in the same order as `predictions`.
    name : str
        Label printed in the diagnostic output.
    significance_level : float
        p-value threshold of the chi-square independence test, used when the
        attribute has more than two groups.
    tolerance : float
        Maximum accepted gap between the positive rates, used when the
        attribute has exactly two groups.
    activate_check : bool, default False
        When true, print the per-group positive rates (and a warning for sparse
        contingency tables).

    Returns
    -------
    'T' when parity holds, False otherwise. With fewer than two groups there is
    nothing to compare and 'T' is returned.
    """
    # Pair the inputs by position. Going through NumPy drops any pandas index, so
    # a Series whose index differs from the other input is not silently realigned
    # (which would fill the frame with NaN).
    frame = pd.DataFrame({
        'predictions': np.asarray(predictions).ravel(),
        'sensitive_attribute': np.asarray(sensitive_attribute).ravel()
    })

    # Proportion of positive predictions for each group
    positive_proportions = frame.groupby('sensitive_attribute')['predictions'].mean()
    num_class = positive_proportions.shape[0]

    # Zero or one group: parity is trivially satisfied.
    if num_class < 2:
        return 'T'

    if activate_check:
        print("===")
        print(name)
        print(positive_proportions)

    # Case for binary sensitive attribute: compare the gap between the two rates
    if num_class == 2:
        percentage_difference = positive_proportions.max() - positive_proportions.min()
        return 'T' if percentage_difference <= tolerance else False

    # Case for multiclass sensitive attribute: chi-square test of independence
    # between prediction and group membership
    contingency_table = pd.crosstab(frame['predictions'], frame['sensitive_attribute'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    # Expected counts below 5 make the chi-square approximation unreliable
    if activate_check and (expected < 5).any():
        print(f"Sparse contigency for {name}")

    return 'T' if p > significance_level else False

In [None]:
# Models behaviours over sensitive features: one row per model, one column per
# sensitive feature, each cell holding the demographic-parity verdict.
table = [
    [
        calculate_demographic_parity(
            predictions[model],
            X_test[feature],
            feature,
            significance_level,
            tolerance,
            activate_check=True,
        )
        for feature in sensitive_features
    ]
    for model in models
]

sf_df = pd.DataFrame(table, index=models_list, columns=sensitive_features)
sf_df.head(len(models_list))

#### **3.2 Equalized odds**

In [None]:
def calculate_equalized_odds(predictions, true_labels, sensitive_attribute, name, tolerance, activate_check=False):
    """Check equalized odds of binary predictions across the groups of a feature.

    For every group the true-positive rate (TPR) and false-positive rate (FPR)
    are computed; the check passes when the spread (max - min) of both rates
    across groups stays within twice `tolerance`.

    Parameters
    ----------
    predictions : array-like
        Predicted labels (0/1), one per sample.
    true_labels : array-like
        Ground-truth labels (0/1), in the same order as `predictions`.
    sensitive_attribute : array-like
        Group membership of every sample, in the same order as `predictions`.
    name : str
        Label printed in the diagnostic output.
    tolerance : float
        Base tolerance; the spreads are compared against `tolerance * 2`.
    activate_check : bool, default False
        When true, print the FPR spread and then the TPR spread.

    Returns
    -------
    'T' when both spreads are within the limit, False otherwise.
    """
    # Pair the inputs by position. Going through NumPy drops any pandas index, so
    # Series with differing indexes are not silently realigned into NaN rows.
    frame = pd.DataFrame({
        'predictions': np.asarray(predictions).ravel(),
        'true_labels': np.asarray(true_labels).ravel(),
        'sensitive_attribute': np.asarray(sensitive_attribute).ravel()
    })

    # Calculate TPR and FPR for each group
    tprs = []
    fprs = []
    for group in frame['sensitive_attribute'].unique():
        members = frame[frame['sensitive_attribute'] == group]
        actual_pos = members['true_labels'] == 1
        actual_neg = members['true_labels'] == 0
        predicted_pos = members['predictions'] == 1
        predicted_neg = members['predictions'] == 0

        # Confusion-matrix cells restricted to the labels 0 and 1
        tp = int((actual_pos & predicted_pos).sum())
        fn = int((actual_pos & predicted_neg).sum())
        fp = int((actual_neg & predicted_pos).sum())
        tn = int((actual_neg & predicted_neg).sum())

        # A rate with an empty denominator is reported as 0.
        # NOTE(review): this makes a group without positives (or negatives) look
        # like a group with rate 0 — confirm this is the intended convention.
        tprs.append(tp / (tp + fn) if tp + fn != 0 else 0)
        fprs.append(fp / (fp + tn) if fp + tn != 0 else 0)

    # Spread of each rate across groups; no groups at all means no spread
    max_tpr_diff = max(tprs) - min(tprs) if tprs else 0
    max_fpr_diff = max(fprs) - min(fprs) if fprs else 0

    if activate_check:
        print("===")
        print(name)
        print(max_fpr_diff)
        print(max_tpr_diff)

    # Check if TPR and FPR are within the tolerance (doubled for this metric)
    tpr_within_tolerance = max_tpr_diff <= tolerance*2
    fpr_within_tolerance = max_fpr_diff <= tolerance*2

    if tpr_within_tolerance and fpr_within_tolerance:
        return 'T'
    else:
        return False

In [None]:
# Equalized odds: one row per model, one column per sensitive feature, each
# cell holding the verdict of the check.
table = [
    [
        calculate_equalized_odds(
            predictions[model],
            y_test,
            X_test[feature],
            feature,
            tolerance,
            activate_check=False,
        )
        for feature in sensitive_features
    ]
    for model in models
]

# DataFrame
equalized_df = pd.DataFrame(table, index=models_list, columns=sensitive_features)
equalized_df.head(len(models_list))