In [1]:
import warnings

import pandas as pd
from fairlearn.metrics import equalized_odds_difference
from fairlearn.reductions import EqualizedOdds, ExponentiatedGradient
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Silence the two noisy warning categories (solver convergence and
# library deprecation notices) so cell outputs stay readable.
for ignored_category in (ConvergenceWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

In [2]:
# Configuration: which dataset to load and which column is the target
dataset_name = "norauto"
y_label = "ClaimAmount"

# The CSV is expected in the working directory, named after the dataset
csv_file_path = f"{dataset_name}.csv"

# Read the CSV; the first column is used as the row index, not as a feature
df = pd.read_csv(csv_file_path, index_col=0)

# Preview the first rows
df.head()

Unnamed: 0,Male,Young,DistLimit,GeoRegion,Expo,ClaimAmount
171255,0,1,no limit,Medium+,0.778,0.0
27327,0,0,8000 km,High+,0.104,0.0
77914,0,0,12000 km,High+,1.003,0.0
150038,0,0,12000 km,High-,0.156,0.0
150050,0,1,no limit,Medium+,0.252,0.0


In [3]:
# Column groups inferred from dtypes. They are informational only: the
# encoding below is driven by the explicit list.
categorical_features = df.select_dtypes(include=['category', 'object']).columns.tolist()
numerical_features = df.select_dtypes(include=['number']).columns.difference([y_label]).tolist()

# Categorical columns to one-hot encode
categoricalFeatures = ['DistLimit', 'GeoRegion']

# One-hot encode the categorical columns. The encoded columns are appended
# after the remaining ones, each prefixed with its source column name.
# Only columns still present in df are encoded, so re-running this cell after
# df has already been encoded is a no-op instead of raising a KeyError.
columns_to_encode = [feature for feature in categoricalFeatures if feature in df.columns]
if columns_to_encode:
    df = pd.get_dummies(df, columns=columns_to_encode)

# Split features from the target column named in y_label
X = df.drop(columns=[y_label])
y = df[y_label].values

# NOTE(review): the features are intentionally left unscaled here. The previous
# version fitted a StandardScaler on the full dataset but never used the result,
# so the model has always been trained on raw features. If standardization is
# wanted, fit the scaler on x_train only and apply it to both splits.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Liblinear is a solver that is very fast for small datasets, like ours
model = LogisticRegression(solver='liblinear', class_weight='balanced', max_iter=100)

# Fit the model to the training data
model.fit(x_train, y_train)

# Baseline predictions on the held-out split
y_pred = model.predict(x_test)

In [4]:
# Evaluate the baseline model on the held-out test split.
# zero_division=0 is passed to every ratio metric so that an empty denominator
# yields 0 consistently instead of a warning on some metrics only.
test_metrics = {
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred, zero_division=0),
    "F1 Score": f1_score(y_test, y_pred, zero_division=0),
    "Recall": recall_score(y_test, y_pred, zero_division=0),
}

# Single line of "Name: value" pairs, five decimals each
print("Test performance: " + ", ".join(f"{name}: {value:.5f}" for name, value in test_metrics.items()))

Test performance: Accuracy; 0.62445, Precision: 0.67593, F1 Score:, 0.62931, Recall: 0.58871


In [5]:
# Fairness of the baseline model with respect to the 'Male' attribute.
# equalized_odds_difference is imported in the notebook's top import cell.

# Group membership of each test row, taken from the 'Male' feature column
sensitive_attr_test = x_test['Male'].values

# Calculate the Equalized Odds Difference between the groups
eq_odds_diff = equalized_odds_difference(y_test, y_pred, sensitive_features=sensitive_attr_test)

# Print the Equalized Odds Difference
print(f"Equalized Odds Difference for 'Male': {eq_odds_diff:.5f}")

Equalized Odds Difference for 'Male': 0.28689


In [11]:
# Mitigate unfairness with respect to 'Male' using the Exponentiated Gradient
# reduction under an Equalized Odds constraint.
# ExponentiatedGradient, EqualizedOdds and equalized_odds_difference are
# imported in the notebook's top import cell.

# Define the Equalized Odds constraint
constraint = EqualizedOdds()

# Wrap the logistic regression model as the base estimator of the reduction.
# NOTE(review): `model` was already fitted above; presumably the reduction
# refits its own copies of the estimator -- confirm against the fairlearn docs.
mitigator = ExponentiatedGradient(estimator=model, constraints=constraint)

# Fit the mitigator on the training data
mitigator.fit(x_train, y_train, sensitive_features=x_train['Male'])

# Predict with the mitigated model. The mitigator's predictions are randomized,
# so a fixed random_state is passed to keep this cell reproducible on re-run.
y_pred_mitigated = mitigator.predict(x_test, random_state=0)

# Evaluate the mitigated model with the same metrics as the baseline
mitigated_metrics = {
    "Accuracy": accuracy_score(y_test, y_pred_mitigated),
    "Precision": precision_score(y_test, y_pred_mitigated, zero_division=0),
    "F1 Score": f1_score(y_test, y_pred_mitigated, zero_division=0),
    "Recall": recall_score(y_test, y_pred_mitigated, zero_division=0),
}
print("Mitigated model performance: " + ", ".join(f"{name}: {value:.5f}" for name, value in mitigated_metrics.items()))

# Calculate and print the Equalized Odds Difference for the mitigated model.
# The sensitive attribute is read from x_test here so this cell does not rely
# on a variable created in another cell.
eq_odds_diff_mitigated = equalized_odds_difference(
    y_test, y_pred_mitigated, sensitive_features=x_test['Male'].values
)
print(f"Equalized Odds Difference for 'Male' after mitigation: {eq_odds_diff_mitigated:.5f}")

Mitigated model performance: Accuracy; 0.62882, Precision: 0.68224, F1 Score:, 0.63203, Recall: 0.58871
Equalized Odds Difference for 'Male' after mitigation: 0.22504
