In [1]:
from fairness.matching import conscious_fairness_through_unawareness
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [2]:
dataset = pd.read_csv('./dataset/adult/adult.data', sep=',')

In [3]:
print("Columns original dataset: ", len(dataset.columns))

Columns original dataset:  15


In [4]:
columns_to_drop = ['education-num']
protected_attributes = ['sex', 'race', 'relationship', 'native-country']
output_column = 'income'

In [5]:
dataset[output_column] = LabelEncoder().fit_transform(dataset[output_column])

In [6]:
fair_dataset = conscious_fairness_through_unawareness(dataset, protected_attributes, dataset[output_column].unique(), output_column, columns_to_drop)

unfair


In [7]:
print("Columns fair dataset: ", len(fair_dataset.columns))

Columns fair dataset:  8


In [8]:
output_column = "income"

In [9]:
X = fair_dataset.iloc[:,:len(fair_dataset.columns) - 2]
y = fair_dataset.loc[:, output_column:]
y = np.ravel(y)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
standard_scaler_train = StandardScaler()
X_train = standard_scaler_train.fit_transform(X_train)

standard_scaler_test = StandardScaler()
X_test = standard_scaler_test.fit_transform(X_test)

In [12]:
random_forest_classifier = RandomForestClassifier()
decision_tree_classifier = DecisionTreeClassifier()
knn_classifier = KNeighborsClassifier()

models = [
    random_forest_classifier,
    decision_tree_classifier,
    knn_classifier
]

random_forest_params = {
    'min_samples_leaf': [5, 7, 9, 11],
    'n_estimators': [200, 500],
    'max_depth' : [10, 20, 50, 80, 100, 150],
    'criterion' :['gini', 'entropy']
}

decision_tree_params = {
    'min_samples_leaf': [5, 7, 9, 11],
    'max_depth' : [10, 20, 50, 80, 100, 150],
    'criterion' :['gini', 'entropy']
}

knn_params = {
    'algorithm' :['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': range(20, 50),
}


model_params = [random_forest_params, decision_tree_params, knn_params]

In [13]:
best_params = None
max_accuracy = 0
best_model = None

for model_index in range(0, len(models)):

    print("Model: ", models[model_index])
    
    grid_search = GridSearchCV(
    estimator=models[model_index],
    param_grid=model_params[model_index],
    scoring='accuracy', 
    return_train_score=True,
    cv=3
    )

    grid_search_classifier = grid_search.fit(X_train, y_train)
    y_pred = grid_search_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print("Model: ", models[model_index])
    print("Best params: ", grid_search_classifier.best_params_)
    print("Accuracy: ", accuracy)
    
    if accuracy > max_accuracy:
        best_params = grid_search_classifier.best_params_
        max_accuracy = accuracy
        best_model = models[model_index]


Model:  RandomForestClassifier()


Model:  RandomForestClassifier()
Best params:  {'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 5, 'n_estimators': 200}
Accuracy:  0.8088438507600184
Model:  DecisionTreeClassifier()
Model:  DecisionTreeClassifier()
Best params:  {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 11}
Accuracy:  0.8014739751266697
Model:  KNeighborsClassifier()
Model:  KNeighborsClassifier()
Best params:  {'algorithm': 'ball_tree', 'leaf_size': 34}
Accuracy:  0.7999385843697221


In [14]:
print("BEST SOLUTION")
print("Best model: ", best_model)
print("Best_params: ", best_params)
print("Accuracy: ", max_accuracy)

BEST SOLUTION
Best model:  RandomForestClassifier()
Best_params:  {'criterion': 'entropy', 'max_depth': 20, 'min_samples_leaf': 5, 'n_estimators': 200}
Accuracy:  0.8088438507600184


In [15]:
#print("Accuracy score on Test set: ", accuracy_score(y_test, y_pred))
#print("Accuracy score on Train set: ", accuracy_score(y_train, grid_search_classifier.predict(X_train)))
#print(classification_report(y_test, y_pred))

In [16]:
#cm_display = ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred), display_labels=[False, True])
#cm_display.plot()
#plt.show()