<a href="https://colab.research.google.com/github/arturrur/mc853/blob/main/Fairness.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook para equidade - Entrega 03

In [53]:
import pandas as pd
import numpy as np

from sklearn.model_selection import (StratifiedKFold,
                                     GridSearchCV)

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import (balanced_accuracy_score,
                             make_scorer,
                             roc_auc_score,
                             recall_score,
                             precision_score)

from sklearn.linear_model import LogisticRegression

from sklearn.impute import KNNImputer

In [54]:
# Feature standardization step shared by the training and test matrices
preprocessing = StandardScaler()

# Missing values are imputed from the 3 nearest neighbours
imputer = KNNImputer(n_neighbors=3)

# Metric used to pick the best hyperparameters during the grid search
perf = balanced_accuracy_score

# Cross-validation splitter for the grid search: 3 stratified, shuffled folds
gskf = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=17,
)

In [55]:
def data_sample(X, y, balanced_ratio=0.6, oversample_ratio=0.2, random_state=1):
    '''
    Rebalance a binary (0/1) dataset according to how imbalanced it is.

    The imbalance is measured as ``ratio = n_minority / n_majority``, i.e. the
    minority class relative to the majority class (not to the whole dataset):

    * ``ratio > balanced_ratio``: the data is considered balanced enough and
      is returned untouched (same objects that were passed in).
    * the minority class is small enough that raising it to
      ``oversample_ratio`` of the majority class creates at least one new
      sample: random oversampling up to that ratio, followed by random
      undersampling of the majority class.
    * otherwise: only random undersampling of the majority class.

    Parameters:
        X : array-like, shape (n_samples, n_features)
            The feature matrix.
        y : array-like, shape (n_samples,)
            The target variable; classes are counted through the labels 0 and 1.
        balanced_ratio : float, default 0.6
            Minority/majority ratio above which no resampling is applied.
        oversample_ratio : float, default 0.2
            Minority/majority ratio targeted by the oversampling step.
        random_state : int, default 1
            Seed shared by the over- and undersampler.

    Returns:
        X_resampled : array-like, shape (n_samples_resampled, n_features)
            The resampled feature matrix.
        y_resampled : array-like, shape (n_samples_resampled,)
            The resampled target variable.

    Raises:
        ValueError: if ``y`` holds no sample labelled 0 or 1, so the imbalance
            ratio cannot be computed.
    '''

    # Count both classes; np.asarray lets plain lists work as well as Series/arrays
    labels = np.asarray(y)
    count_1 = int((labels == 1).sum())
    count_0 = int((labels == 0).sum())
    count_min = min(count_0, count_1)
    count_max = max(count_0, count_1)

    # Fail early with a clear message instead of dividing by zero
    if count_max == 0:
        raise ValueError("data_sample requires at least one sample labelled 0 or 1")

    # Size of the minority class relative to the majority class
    ratio = count_min / count_max

    # Balanced enough: hand the data back unchanged
    if ratio > balanced_ratio:
        return X, y

    X_resampled, y_resampled = X, y

    # Oversample only when that step would really create samples. imblearn
    # computes the same quantity for a float sampling_strategy and rejects
    # requests that generate no sample, which used to break inputs sitting on
    # (or just below) the oversample_ratio boundary.
    n_to_generate = int(count_max * oversample_ratio - count_min)
    if n_to_generate > 0:
        oversample = RandomOverSampler(sampling_strategy=oversample_ratio,
                                       random_state=random_state)
        X_resampled, y_resampled = oversample.fit_resample(X_resampled, y_resampled)

    # Shrink the majority class
    undersample = RandomUnderSampler(sampling_strategy='majority',
                                     random_state=random_state)
    X_resampled, y_resampled = undersample.fit_resample(X_resampled, y_resampled)

    return X_resampled, y_resampled

In [56]:
def _group_tpr(y_true, y_pred, group_mask):
  '''Return the true positive rate inside ``group_mask``, or NaN when the group has no positive sample.'''
  positives = group_mask & (y_true == 1)
  n_positives = int(positives.sum())
  if n_positives == 0:
    # No positive sample in this group: the rate is undefined
    return float('nan')
  true_positives = int((positives & (y_pred == 1)).sum())
  return true_positives / n_positives


def model_fairness(data, data_test, remove_sensitive=False):
  '''
  Train a logistic regression and print overall and per-sex performance.

  Steps:
    1. Optionally drop the sensitive columns ('Sexo' and 'Gestante').
    2. Resample the training data with ``data_sample``.
    3. Impute missing values and standardize the features.
    4. Grid-search the ``class_weight`` of a LogisticRegression.
    5. Print recall, precision and AUC on the test set, plus the true
       positive rate computed separately for each sex.

  Uses the module-level ``imputer``, ``preprocessing``, ``gskf`` and ``perf``
  objects; ``imputer`` and ``preprocessing`` are re-fitted on every call.

  Parameters:
      data : pandas.DataFrame
          Training data; the last column is the target and a 'Sexo' column
          must be present ('Gestante' too when ``remove_sensitive`` is True).
      data_test : pandas.DataFrame
          Test data with the same columns as ``data``.
      remove_sensitive : bool, default False
          When True, 'Sexo' and 'Gestante' are removed from the features
          before training. The per-sex metrics are still reported.

  Returns:
      None. The metrics are printed.
  '''

  # Keep the sex of every test row before the column is (optionally) dropped.
  # A positional array is used so that it lines up with y_pred regardless of
  # the DataFrame index.
  sensitive_col = data_test['Sexo'].to_numpy()
  if remove_sensitive:
    # NOTE(review): 'Gestante' is presumably dropped because it is a proxy for sex — confirm
    data = data.drop(columns=['Sexo', 'Gestante'])
    data_test = data_test.drop(columns=['Sexo', 'Gestante'])

  # The target is the last column
  target_feature = data.columns[-1]

  # Separate features (X) and target (y) for training data
  X = data.drop(columns=[target_feature])
  y = data[target_feature]

  # Separate features (X) and target (y) for test data
  X_test = data_test.drop(columns=[target_feature])
  y_test = data_test[target_feature].to_numpy()

  # Resample training data only; the test set keeps its original distribution
  X_train, y_train = data_sample(X, y)

  # Impute missing data (fitted on the training set only)
  X_train = imputer.fit_transform(X_train)
  X_test = imputer.transform(X_test)

  # Normalize data (fitted on the training set only)
  X_train = preprocessing.fit_transform(X_train)
  X_test = preprocessing.transform(X_test)

  # Select the class weighting by cross-validated balanced accuracy
  best = GridSearchCV(LogisticRegression(max_iter=1000), {'class_weight': ('balanced', {0:1, 1:2}, {0:1, 1:3})}, cv=gskf, scoring=(make_scorer(perf)))
  best.fit(X_train, y_train)

  y_pred = best.predict(X_test)

  # Recall per class: index 0 -> class 0 (specificity), index 1 -> class 1 (sensitivity)
  recallscore = recall_score(y_test, y_pred, labels=[0, 1], average=None)
  sen = recallscore[1]
  spe = recallscore[0]

  # Precision per class
  prec_score = precision_score(y_test, y_pred, labels=[0, 1], average=None)
  prec_n = prec_score[0]
  prec_p = prec_score[1]

  # Area under the ROC curve, from the predicted probability of class 1
  auc = roc_auc_score(y_test, (best.predict_proba((X_test)))[:, 1])

  # True positive rate per sex: rows with Sexo == 1 are reported as female,
  # every other row as male
  is_female = sensitive_col == 1
  female_tpr = _group_tpr(y_test, y_pred, is_female)
  male_tpr = _group_tpr(y_test, y_pred, ~is_female)

  print("Metrics without sex column" if remove_sensitive else "Metrics with sex column")

  print(f"- Recall 1:        {sen:.4f}")
  print(f"- Recall 0:        {spe:.4f}")
  print(f"- Precisão 1:      {prec_p:.4f}")
  print(f"- Precisão 0:      {prec_n:.4f}")
  print(f"- AUC:             {auc:.4f}")
  print(f"- Female TPR:      {female_tpr}")
  print(f"- Male TPR:        {male_tpr}")


In [57]:

# Training and test splits, read straight from the project's GitHub repository
train_df = pd.read_csv(
    'https://raw.githubusercontent.com/arturrur/mc853/refs/heads/main/data/training/treino.csv'
)
test_df = pd.read_csv(
    'https://raw.githubusercontent.com/arturrur/mc853/refs/heads/main/data/test/teste.csv'
)


In [58]:
# Baseline run: the model is trained with the sex-related columns
model_fairness(train_df, test_df, remove_sensitive=False)
# Comparison run: same pipeline with the sex-related columns removed
model_fairness(train_df, test_df, remove_sensitive=True)

Metrics with sex column
- Recall 1:        0.7297
- Recall 0:        0.7922
- Precisão 1:      0.2864
- Precisão 0:      0.9625
- AUC:             0.8526
- Female TPR:      0.6530612244897959
- Male TPR:        0.75
Metrics without sex column
- Recall 1:        0.7269
- Recall 0:        0.7932
- Precisão 1:      0.2866
- Precisão 0:      0.9621
- AUC:             0.8532
- Female TPR:      0.6802721088435374
- Male TPR:        0.7392086330935251
