In [1]:
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.exceptions import ConvergenceWarning

# Silence scikit-learn's ConvergenceWarning for the rest of the session so
# solver non-convergence messages do not appear in cell output.
warnings.simplefilter("ignore", category=ConvergenceWarning)

In [2]:
# --- Experiment settings -----------------------------------------------------
# Hold-out fraction and the seed that makes the train/test split repeatable.
random_state = 42
test_size = 0.2

# Dataset identity: the CSV file is named after the dataset and resolved
# relative to the current working directory.
dataset_name = "ausprivauto0405"
file_path = f"{dataset_name}.csv"
target_column = "ClaimNb"

# --- Data loading ------------------------------------------------------------
# The first CSV column becomes the row index instead of a feature column.
df = pd.read_csv(file_path, index_col=0)

In [3]:
# One-hot encode every categorical/object column.
#
# A single `pd.get_dummies(..., columns=...)` call replaces the previous
# per-column drop + join loop. It yields the same layout (untouched columns
# first, then the `<feature>_<value>` indicator columns in feature order), but
# concatenates positionally-aligned frames once instead of performing one
# index join per feature. Index joins would multiply rows if the index loaded
# from the CSV ever contained duplicate labels.
categorical_features = df.select_dtypes(include=['category', 'object']).columns.tolist()

# Guard keeps the cell a no-op when nothing needs encoding (e.g. on re-run,
# when the columns have already been replaced by their indicators).
if categorical_features:
    df = pd.get_dummies(df, columns=categorical_features)


In [4]:
# Separate the label column from the predictor columns.
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out a test partition; the configured seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

# Fit the baseline classifier: logistic regression with the liblinear solver
# and class weights balanced against the label frequencies.
model = LogisticRegression(
    max_iter=100,
    class_weight='balanced',
    solver='liblinear',
).fit(x_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = model.predict(x_test)

# Report the baseline scores on the held-out set. Each scorer is called just
# before its line is printed, in the order the labels are listed.
print("Initial model performance:")
for metric_label, scorer in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("F1 Score", f1_score),
    ("Recall", recall_score),
):
    print(f"{metric_label}: {scorer(y_test, y_pred):.5f}")

Initial model performance:
Accuracy: 0.60662
Precision: 0.09745
F1 Score: 0.16799
Recall: 0.60835


In [7]:
from fairlearn.metrics import equalized_odds_difference


def _recover_sensitive_feature(frame, name):
    """Return the sensitive attribute `name` as one label per row of `frame`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature matrix to read the attribute from.
    name : str
        Name of the sensitive attribute, e.g. ``'Gender'``.

    Returns
    -------
    pandas.Series or None
        * ``frame[name]`` unchanged when that column exists.
        * Otherwise, when ``frame`` holds at least two ``<name>_<value>``
          indicator columns (the layout ``pd.get_dummies`` produces) and every
          row has exactly one of them equal to 1, a Series named ``name``
          holding the ``<value>`` part of the active column for each row.
        * ``None`` when the attribute cannot be recovered unambiguously.
    """
    if name in frame.columns:
        return frame[name]

    prefix = f"{name}_"
    indicator_columns = [
        col for col in frame.columns
        if isinstance(col, str) and col.startswith(prefix)
    ]
    # A single indicator column cannot tell "other category" from "missing".
    if len(indicator_columns) < 2:
        return None

    # `.eq(1)` treats both boolean True and numeric 1 as active.
    active = frame[indicator_columns].eq(1).astype(int)
    # Refuse to guess when a row has zero or several active indicators
    # (e.g. missing values, or unrelated columns that share the prefix).
    if not active.sum(axis=1).eq(1).all():
        return None

    # idxmax picks the active column per row; strip the prefix to get the label.
    return active.idxmax(axis=1).str[len(prefix):].rename(name)


# Sensitive attribute for the test rows: taken directly from the 'Gender'
# column when present, otherwise rebuilt from its one-hot indicator columns.
sensitive_features_gender = _recover_sensitive_feature(x_test, 'Gender')

if sensitive_features_gender is not None:
    # Calculate the equalized odds difference of the baseline predictions
    eo_difference = equalized_odds_difference(y_test, y_pred, sensitive_features=sensitive_features_gender)
    print("Equalized Odds Difference for Gender: ", eo_difference)
else:
    print("Sensitive attribute 'Gender' is not available for fairness analysis.")

Equalized Odds Difference for Gender:  0.011526788382003716


In [8]:
# Post-process the baseline classifier with fairlearn's ThresholdOptimizer
# under an equalized-odds constraint, then report the same metrics as for the
# baseline model plus the equalized odds difference.
# (The scikit-learn metric functions used below are imported in the first cell.)
from fairlearn.postprocessing import ThresholdOptimizer
from fairlearn.metrics import equalized_odds_difference

# Sensitive attribute for each partition, looked up once and reused below.
gender_train = x_train['Gender']
gender_test = x_test['Gender']

# Create a ThresholdOptimizer instance with equalized_odds constraint.
# NOTE(review): the recorded run of this cell emitted a warning (`warn(`). If
# it is fairlearn's `predict_method` deprecation notice, pass `predict_method`
# explicitly here - confirm against the installed fairlearn version.
mitigated_model = ThresholdOptimizer(estimator=model, constraints='equalized_odds', objective='balanced_accuracy_score')

# Fit the mitigated model to the training data along with the sensitive feature
mitigated_model.fit(x_train, y_train, sensitive_features=gender_train)

# Predict on the test data using the mitigated model.
# ThresholdOptimizer predictions are randomized; seeding them with the
# notebook's configured `random_state` makes the metrics below reproducible.
y_pred_mitigated = mitigated_model.predict(
    x_test,
    sensitive_features=gender_test,
    random_state=random_state,
)

# Evaluate the performance of the mitigated model
accuracy_mitigated = accuracy_score(y_test, y_pred_mitigated)
precision_mitigated = precision_score(y_test, y_pred_mitigated)
recall_mitigated = recall_score(y_test, y_pred_mitigated)
f1_mitigated = f1_score(y_test, y_pred_mitigated)

# Calculate the Equalized Odds Difference for the mitigated model
eo_mitigated = equalized_odds_difference(y_test, y_pred_mitigated, sensitive_features=gender_test)

# Print the performance metrics and fairness metric of the mitigated model
print("Mitigated model performance:")
print(f"Accuracy: {accuracy_mitigated:.5f}")
print(f"Precision: {precision_mitigated:.5f}")
print(f"F1 Score: {f1_mitigated:.5f}")
print(f"Recall: {recall_mitigated:.5f}")
print("Equalized Odds Difference (Mitigated): ", eo_mitigated)

  warn(


Mitigated model performance:
Accuracy: 0.60640
Precision: 0.09783
F1 Score: 0.16869
Recall: 0.61174
Equalized Odds Difference (Mitigated):  0.008040970158250893
