In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

from fairness.matching import conscious_fairness_through_unawareness

In [2]:
# Load the ULL dataset from its CSV export into a DataFrame.
dataset = pd.read_csv(
    './dataset/ull/ULL_dataset.csv',
    sep=',',
)

In [3]:
# Preview the loaded frame; the notebook display abbreviates long/wide frames with "...".
dataset

Unnamed: 0.1,Unnamed: 0,scores,score_mat,level_mat,score_len,level_len,score_ing,level_ing,a1,a2,...,p331a,p331b,p331c,p331d,p331e,p331f,p331g,p331j,pfc,rep
0,0,1,564.8700,3.0,535.1500,3.0,500.461528,2.0,2.0,2007.0,...,4.0,4.0,4.0,4.0,4.0,3.0,5.0,4.0,3.0,2.0
1,1,1,388.3400,1.0,293.7000,1.0,500.461528,2.0,1.0,2007.0,...,4.0,4.0,4.0,4.0,4.0,3.0,5.0,4.0,3.0,2.0
2,2,1,386.5900,1.0,514.8100,3.0,500.461528,2.0,2.0,2007.0,...,4.0,4.0,4.0,4.0,4.0,3.0,5.0,4.0,3.0,2.0
3,3,1,487.7600,2.0,449.2500,2.0,500.461528,2.0,1.0,2007.0,...,2.0,2.0,3.0,2.0,3.0,3.0,5.0,4.0,3.0,2.0
4,4,1,709.7900,4.0,598.7200,3.0,500.461528,2.0,2.0,2007.0,...,4.0,4.0,4.0,4.0,4.0,3.0,5.0,4.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83852,83852,1,400.8625,2.0,446.6522,2.0,294.747400,1.0,2.0,2007.0,...,4.0,4.0,4.0,4.0,4.0,3.0,5.0,3.0,10.0,2.0
83853,83853,1,597.0243,3.0,632.6043,4.0,633.296600,4.0,2.0,2007.0,...,3.0,4.0,4.0,4.0,4.0,3.0,5.0,4.0,3.0,2.0
83854,83854,1,707.9254,4.0,400.2761,2.0,477.505600,2.0,1.0,2007.0,...,4.0,4.0,4.0,4.0,4.0,3.0,5.0,4.0,3.0,2.0
83855,83855,1,522.8511,3.0,656.1601,4.0,540.112200,3.0,2.0,2007.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,8.0,1.0


In [4]:
# Report how many columns the frame has at this point (no column has been dropped yet).
print("Columns of the dataset: ", dataset.shape[1])

Columns of the dataset:  549


In [5]:
# Columns used as prediction targets; the training loop runs one model search per entry.
output_columns = [
    'level_mat',
    'level_len',
    'level_ing',
]

In [6]:
# Attributes handed to the fairness step as "protected".
# Names are written here in mixed case; a later cell lower-cases them.
protected_attributes = [
    # score / level columns
    'score_MAT', 'level_MAT',
    'score_LEN', 'level_LEN',
    'score_ING', 'level_ING',
    # questionnaire items and derived indicators
    'a1', 'a4', 'repeater', 'a24', 'a41', 'a42',
    'country_iso_cnac', 'country_iso_nac',
    'island', 'capital_island', 'public_private',
    'f3a', 'f3b',
    'mother_education', 'father_education',
    'f4a', 'f4b', 'f5a', 'f5b', 'f5n',
    'inmigrant', 'inmigrant2', 'inmigrant_second_gen',
    'f11', 'books', 'f23', 'f24a', 'f24b',
    'mother_occupation', 'father_occupation',
    'f34', 'household_income_q', 'escs',
]

In [7]:
# Normalise the protected attribute names to lower case, matching the
# lower-case column names used elsewhere in the notebook (e.g. 'level_mat').
protected_attributes = list(map(str.lower, protected_attributes))

In [8]:
# Candidate estimators and their hyper-parameter grids for the grid search.
# `models[i]` is tuned over `model_params[i]`, so both lists must stay in the
# same order.

# Fixed seed so the fitted forests/trees (and therefore the reported
# accuracies) are reproducible across kernel restarts. Same value as the seed
# used for the train/test split.
RANDOM_STATE = 42

random_forest_classifier = RandomForestClassifier(random_state=RANDOM_STATE)
decision_tree_classifier = DecisionTreeClassifier(random_state=RANDOM_STATE)

models = [
    random_forest_classifier,
    decision_tree_classifier
]

random_forest_params = {
    'min_samples_leaf': [5, 7, 9, 11],
    'n_estimators': [200, 500],
    'max_depth' : [10, 20, 50, 80, 100, 150],
    'criterion' :['gini', 'entropy']
}

decision_tree_params = {
    'min_samples_leaf': [5, 7, 9, 11],
    'max_depth' : [10, 20, 50, 80, 100, 150],
    'criterion' :['gini', 'entropy']
}

# Parallel to `models`: one parameter grid per estimator.
model_params = [random_forest_params, decision_tree_params]

In [9]:
# Remove columns that are not used for modelling. Rebinding the name (instead
# of inplace=True) together with errors='ignore' makes this cell safe to
# re-run: a second execution no longer raises KeyError for columns that are
# already gone.
# NOTE(review): 'Unnamed: 0.1' is left in the frame and looks like another
# saved-index artefact (it tracks the row number in the preview) - confirm
# whether it should be dropped here as well.
dataset = dataset.drop(columns=['Unnamed: 0', 'scores'], errors='ignore')

In [10]:
# For every target column: build the fairness-processed feature frame, split
# and scale it, grid-search each candidate model and report the best one.
for output_column in output_columns:
    # Integer class labels of the current target, in dataset row order.
    output_column_values = [int(x) for x in dataset[output_column]]

    # Every listed attribute is protected except the current target itself.
    # A fresh list is built per iteration instead of calling .remove() on the
    # shared list, so earlier targets stay protected for later targets and
    # re-running the cell gives the same result.
    current_protected = [
        attribute for attribute in protected_attributes if attribute != output_column
    ]

    fair_dataset = conscious_fairness_through_unawareness(
        dataset, current_protected, output_column_values, output_column
    )
    print("Columns of the fair dataset: ", fair_dataset.columns)

    # Train on the fairness-processed frame (the raw `dataset` was used before,
    # which silently discarded the fairness step). The target is dropped from
    # the features if the helper kept it.
    # NOTE(review): assumes the helper preserves row order - confirm.
    X = fair_dataset.drop(columns=[output_column], errors='ignore')
    # The labels are the column's values, not the column name.
    y = np.asarray(output_column_values)
    if len(X) != len(y):
        raise ValueError(
            f"fair_dataset has {len(X)} rows but {len(y)} labels were built for "
            f"'{output_column}'; features and labels are not aligned."
        )

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # Fit the scaler on the training split only and reuse it for the test
    # split, so both are expressed in the same units and no test-set
    # statistics leak into preprocessing.
    standard_scaler_train = StandardScaler()
    X_train = standard_scaler_train.fit_transform(X_train)
    X_test = standard_scaler_train.transform(X_test)

    best_params = None
    max_accuracy = 0
    best_model = None

    for model, param_grid in zip(models, model_params):

        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            scoring='accuracy',
            return_train_score=True,
            cv=3,
            n_jobs=-1,  # evaluate grid candidates in parallel; results are unchanged
        )

        grid_search_classifier = grid_search.fit(X_train, y_train)
        y_pred = grid_search_classifier.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        print("Output column: ", output_column)
        print("Model: ", model)
        print("Best params: ", grid_search_classifier.best_params_)
        print("Accuracy: ", accuracy)

        # NOTE(review): the winner is picked on test accuracy, so the reported
        # best score is optimistically biased; consider comparing on
        # grid_search_classifier.best_score_ (cross-validated) instead.
        if best_model is None or accuracy > max_accuracy:
            max_accuracy = accuracy
            # Keep the tuned, fitted estimator rather than the unfitted template.
            best_model = grid_search_classifier.best_estimator_
            best_params = grid_search_classifier.best_params_

    print("OUTPUT COLUMN: ", output_column)
    print("Best model: ", best_model)
    print("Best_params: ", best_params)
    print("Accuracy: ", max_accuracy)

KeyboardInterrupt: 