In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import pandas as pd
import pickle
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve


In [None]:

# Load the Adult dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(url, header=None)

# Add column headers
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
              'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
              'hours-per-week', 'native-country', 'income']

# Preprocess the data
# Remove missing values and convert categorical variables to numerical using one-hot encoding
print("Preprocessing the data...")
df = df.dropna()
df = pd.get_dummies(df, columns=['workclass', 'education', 'marital-status',
                                 'occupation', 'relationship', 'race', 'sex', 'native-country'])

# Normalize the numerical features
print("Normalizing the numerical features...")
scaler = StandardScaler()
numerical_cols = ['age', 'fnlwgt', 'education-num',
                  'capital-gain', 'capital-loss', 'hours-per-week']
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Split the data into training and testing sets
print("Splitting the data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(df.drop('income', axis=1),
                                                    df['income'],
                                                    test_size=0.2,
                                                    random_state=42)




Preprocessing the data...
Normalizing the numerical features...
Splitting the data into training and testing sets...
Balancing the data using SMOTE...


In [12]:
# Use SMOTE to oversample the minority class within the cross validation
# This way the sampling is done on the training data only, hence avoiding leakage
print("Balancing the data using SMOTE...")
sm = SMOTE(random_state=42)


# Define the Random Forest Classifier and MLPClassifier
rfc = RandomForestClassifier(random_state=42)
mlp = MLPClassifier(random_state=42)

# Define the pipeline with SMOTE, normalization and the classifier
pipeline_rfc = Pipeline([('sm', sm), ('scaler', scaler), ('rfc', rfc)])
pipeline_mlp = Pipeline([('sm', sm), ('scaler', scaler), ('mlp', mlp)])

# Define the hyperparameters to tune with GridSearchCV
param_grid_rfc = {'rfc__n_estimators': [50, 100, 200],
                  'rfc__max_depth': [5, 10, 15],
                  'rfc__min_samples_split': [2, 5, 10]}

param_grid_mlp = {'mlp__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
                  'mlp__alpha': [0.0001, 0.001, 0.01],
                  'mlp__max_iter': [100, 200, 300]}

# Train the models using GridSearchCV to find the best hyperparameters
print("Training the models using GridSearchCV...")
grid_rfc = GridSearchCV(pipeline_rfc, param_grid_rfc,
                        cv=5, n_jobs=-1,  verbose=3)
grid_rfc.fit(X_train, y_train)

grid_mlp = GridSearchCV(pipeline_mlp, param_grid_mlp,
                        cv=5, n_jobs=-1,  verbose=3)
grid_mlp.fit(X_train, y_train)




Balancing the data using SMOTE...
Training the models using GridSearchCV...
Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 2/5] END rfc__max_depth=5, rfc__min_samples_split=2, rfc__n_estimators=50;, score=0.742 total time=   1.0s
[CV 5/5] END rfc__max_depth=5, rfc__min_samples_split=2, rfc__n_estimators=50;, score=0.754 total time=   1.0s
[CV 3/5] END rfc__max_depth=5, rfc__min_samples_split=2, rfc__n_estimators=50;, score=0.761 total time=   1.3s
[CV 4/5] END rfc__max_depth=5, rfc__min_samples_split=2, rfc__n_estimators=50;, score=0.745 total time=   1.3s
[CV 1/5] END rfc__max_depth=5, rfc__min_samples_split=2, rfc__n_estimators=50;, score=0.741 total time=   1.4s
[CV 1/5] END rfc__max_depth=5, rfc__min_samples_split=2, rfc__n_estimators=100;, score=0.742 total time=   1.9s
[CV 3/5] END rfc__max_depth=5, rfc__min_samples_split=2, rfc__n_estimators=100;, score=0.755 total time=   2.1s
[CV 2/5] END rfc__max_depth=5, rfc__min_samples_split=2, rfc__n_estimators=100;, sc



[CV 4/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.836 total time=   9.7s
[CV 2/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.828 total time=   9.7s
[CV 3/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.827 total time=   9.8s
[CV 5/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.835 total time=   9.8s
[CV 1/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.808 total time=   9.9s




[CV 2/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.826 total time=  19.1s
[CV 3/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.826 total time=  19.1s
[CV 1/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.806 total time=  19.2s
[CV 3/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.826 total time=  18.1s
[CV 5/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.830 total time=  18.4s




[CV 4/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.822 total time=  19.2s




[CV 1/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.805 total time=  14.0s
[CV 1/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.810 total time=  23.7s




[CV 2/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.830 total time=  28.3s




[CV 2/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.825 total time=  13.6s




[CV 3/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.825 total time=  13.8s




[CV 4/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.824 total time=  14.0s
[CV 4/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.821 total time=  25.8s




[CV 5/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.824 total time=  27.4s




[CV 5/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.832 total time=  13.9s




[CV 1/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.807 total time=  26.9s




[CV 2/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.816 total time=  27.4s
[CV 5/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.831 total time=  23.0s




[CV 3/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.824 total time=  26.4s




[CV 4/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.819 total time=  26.9s




[CV 1/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.810 total time=  14.0s




[CV 2/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.820 total time=  13.6s
[CV 2/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.820 total time=  35.7s
[CV 3/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.819 total time=  13.5s




[CV 1/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.806 total time=  40.2s




[CV 3/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.818 total time=  40.9s
[CV 5/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.831 total time=  24.4s




[CV 4/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.816 total time=  14.3s




[CV 5/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.827 total time=  13.8s
[CV 4/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.819 total time=  38.9s




[CV 1/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.804 total time=  27.0s
[CV 2/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.811 total time=  27.2s




[CV 3/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.799 total time=  26.7s




[CV 4/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.817 total time=  25.7s




[CV 5/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.820 total time=  26.8s
[CV 2/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.815 total time=  32.0s




[CV 1/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.805 total time=  39.7s




[CV 1/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.810 total time=  24.1s




[CV 2/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.821 total time=  24.0s
[CV 3/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.815 total time=  38.6s




[CV 3/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.820 total time=  23.9s
[CV 4/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.801 total time=  37.9s




[CV 5/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.822 total time=  38.6s




[CV 4/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.810 total time=  23.9s




[CV 5/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.820 total time=  25.0s




[CV 1/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.804 total time=  48.1s




[CV 3/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.809 total time=  47.7s




[CV 2/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.807 total time=  48.8s




[CV 4/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.810 total time=  48.9s




[CV 5/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.807 total time=  48.0s




[CV 1/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.808 total time=   9.2s




[CV 2/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.832 total time=   9.4s




[CV 3/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.827 total time=   9.0s




[CV 4/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.835 total time=   9.5s
[CV 2/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.807 total time=  56.1s




[CV 5/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.834 total time=   9.7s
[CV 1/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.803 total time= 1.2min




[CV 1/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.808 total time=  18.7s




[CV 2/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.824 total time=  20.1s




[CV 3/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.820 total time=  19.0s




[CV 4/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.825 total time=  19.3s
[CV 3/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.811 total time= 1.2min




[CV 5/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.834 total time=  18.9s
[CV 4/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.810 total time=  59.2s
[CV 2/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.825 total time=  22.3s
[CV 3/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.821 total time=  22.6s
[CV 4/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.818 total time=  22.1s
[CV 1/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.807 total time=  27.1s




[CV 1/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.809 total time=  13.6s
[CV 5/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.830 total time=  23.2s
[CV 5/5] END mlp__alpha=0.0001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.809 total time= 1.2min




[CV 2/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.825 total time=  13.6s




[CV 3/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.824 total time=  13.9s




[CV 4/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.826 total time=  13.6s




[CV 5/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.828 total time=  13.7s




[CV 1/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.806 total time=  27.9s




[CV 2/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.813 total time=  27.5s




[CV 3/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.822 total time=  27.5s




[CV 4/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.820 total time=  27.6s




[CV 5/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.825 total time=  27.7s




[CV 1/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.810 total time=  14.3s
[CV 2/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.818 total time=  32.7s




[CV 2/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.822 total time=  13.9s




[CV 3/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.813 total time=  13.4s
[CV 1/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.805 total time=  38.0s




[CV 3/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.817 total time=  41.1s




[CV 4/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.823 total time=  13.5s




[CV 5/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.824 total time=  13.9s
[CV 4/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.822 total time=  33.4s




[CV 5/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.826 total time=  41.2s




[CV 1/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.804 total time=  27.2s




[CV 2/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.817 total time=  27.1s




[CV 3/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.802 total time=  26.8s




[CV 4/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.813 total time=  26.9s




[CV 5/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.819 total time=  27.3s
[CV 1/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.794 total time=  39.5s




[CV 2/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.815 total time=  39.3s




[CV 1/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.805 total time=  24.6s




[CV 2/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.828 total time=  24.3s
[CV 3/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.812 total time=  37.2s




[CV 3/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.814 total time=  23.8s
[CV 5/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.812 total time=  34.4s
[CV 4/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.802 total time=  39.2s




[CV 4/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.815 total time=  23.1s




[CV 5/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.811 total time=  23.4s




[CV 1/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.798 total time=  47.3s




[CV 2/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.813 total time=  48.1s




[CV 3/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.808 total time=  47.2s




[CV 5/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.811 total time=  48.0s




[CV 4/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.816 total time=  48.4s




[CV 1/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.811 total time=   9.9s




[CV 2/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.830 total time=   9.5s




[CV 3/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.830 total time=   9.7s
[CV 1/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.792 total time=  58.0s




[CV 4/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.836 total time=   9.9s




[CV 5/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=100;, score=0.835 total time=  10.7s
[CV 1/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.814 total time=  19.3s
[CV 2/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.805 total time= 1.1min
[CV 3/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.816 total time= 1.1min
[CV 2/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.829 total time=  20.0s




[CV 3/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.821 total time=  20.7s




[CV 4/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.825 total time=  20.8s




[CV 5/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=200;, score=0.833 total time=  20.8s
[CV 4/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.813 total time= 1.0min
[CV 1/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.814 total time=  19.1s
[CV 5/5] END mlp__alpha=0.001, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.823 total time=  58.4s
[CV 2/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.826 total time=  23.4s
[CV 4/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.827 total time=  21.5s




[CV 3/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.821 total time=  29.8s




[CV 1/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.806 total time=  14.8s




[CV 2/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.826 total time=  14.6s
[CV 5/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50,), mlp__max_iter=300;, score=0.824 total time=  24.9s




[CV 3/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.828 total time=  14.9s




[CV 4/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.821 total time=  14.5s




[CV 5/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=100;, score=0.832 total time=  15.1s




[CV 1/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.806 total time=  28.7s
[CV 3/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.818 total time=  24.1s
[CV 5/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.828 total time=  23.9s




[CV 2/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.817 total time=  28.1s




[CV 4/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=200;, score=0.817 total time=  29.3s
[CV 3/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.818 total time=  25.1s
[CV 2/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.821 total time=  29.8s




[CV 1/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.815 total time=  13.9s
[CV 1/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.802 total time=  38.6s
[CV 2/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.821 total time=  14.0s




[CV 3/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.820 total time=  14.2s




[CV 4/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.822 total time=  13.9s




[CV 5/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=100;, score=0.827 total time=  14.1s
[CV 5/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.828 total time=  24.4s
[CV 4/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100,), mlp__max_iter=300;, score=0.818 total time=  32.4s




[CV 1/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.804 total time=  26.7s
[CV 3/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.817 total time=  25.6s




[CV 2/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.816 total time=  25.9s




[CV 4/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.816 total time=  25.8s




[CV 5/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=200;, score=0.819 total time=  25.7s
[CV 3/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.817 total time=  23.9s
[CV 2/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.807 total time=  32.6s
[CV 1/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.813 total time=  35.3s




[CV 1/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.811 total time=  23.6s




[CV 2/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.827 total time=  23.9s




[CV 3/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.822 total time=  24.2s
[CV 5/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.816 total time=  33.1s
[CV 4/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(50, 50), mlp__max_iter=300;, score=0.818 total time=  36.5s




[CV 4/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.813 total time=  24.3s




[CV 5/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=100;, score=0.816 total time=  24.5s




[CV 1/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.800 total time=  49.6s
[CV 4/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.804 total time=  42.3s




[CV 2/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.811 total time=  49.8s




[CV 3/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.811 total time=  48.6s




[CV 5/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=200;, score=0.811 total time=  47.5s
[CV 1/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.799 total time=  50.9s
[CV 3/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.808 total time=  48.2s
[CV 4/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.804 total time=  32.2s
[CV 2/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.820 total time= 1.0min
[CV 5/5] END mlp__alpha=0.01, mlp__hidden_layer_sizes=(100, 100), mlp__max_iter=300;, score=0.817 total time=  37.3s




In [13]:
# Save the models to pickle files
with open("models/rfc-adult.pickle", "wb") as f:
    pickle.dump(grid_rfc.best_estimator_, f)

with open("models/mlp-adult.pickle", "wb") as f:
    pickle.dump(grid_mlp.best_estimator_, f)


In [16]:

# Print the best hyperparameters for both models
print("Best hyperparameters for Random Forest:", grid_rfc.best_params_)
print("Best hyperparameters for MLP:", grid_mlp.best_params_)

# Predict on the test set using the best models
y_pred_rfc = grid_rfc.predict(X_test)
y_pred_mlp = grid_mlp.predict(X_test)

# Print the classification reports for both models
print("Classification report for Random Forest:")
print(classification_report(y_test, y_pred_rfc))

print("Classification report for MLP:")
print(classification_report(y_test, y_pred_mlp))

# Calculate the ROC curve for both models
y_proba_rfc = grid_rfc.predict_proba(X_test)[:, 1]
fpr_rfc, tpr_rfc, thresholds_rfc = roc_curve(y_test, y_proba_rfc)

y_proba_mlp = grid_mlp.predict_proba(X_test)[:, 1]
fpr_mlp, tpr_mlp, thresholds_mlp = roc_curve(y_test, y_proba_mlp)

# Plot the ROC curve for both models
plt.figure(figsize=(8, 8))
plt.plot(fpr_rfc, tpr_rfc, label='Random Forest')
plt.plot(fpr_mlp, tpr_mlp, label='MLP')
plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()


Best hyperparameters for Random Forest: {'rfc__max_depth': 15, 'rfc__min_samples_split': 5, 'rfc__n_estimators': 200}
Best hyperparameters for MLP: {'mlp__alpha': 0.01, 'mlp__hidden_layer_sizes': (50,), 'mlp__max_iter': 100}
Classification report for Random Forest:
              precision    recall  f1-score   support

       <=50K       0.95      0.80      0.87      4942
        >50K       0.58      0.86      0.69      1571

    accuracy                           0.81      6513
   macro avg       0.76      0.83      0.78      6513
weighted avg       0.86      0.81      0.82      6513

Classification report for MLP:
              precision    recall  f1-score   support

       <=50K       0.91      0.86      0.88      4942
        >50K       0.62      0.75      0.68      1571

    accuracy                           0.83      6513
   macro avg       0.77      0.80      0.78      6513
weighted avg       0.84      0.83      0.84      6513



ValueError: y_true takes value in {' <=50K', ' >50K'} and pos_label is not specified: either make y_true take value in {0, 1} or {-1, 1} or pass pos_label explicitly.

In [None]:
# Import the necessary libraries
from dice_ml import Dice
from dice_ml.utils import helpers  # for custom data normalization
import pandas as pd

# Load the adult dataset and preprocess the data
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None)
data.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
data = data.drop(['fnlwgt'], axis=1)  # Drop non-relevant feature
# Replace income classes with 0 and 1
data = data.replace({'income': {'<=50K': 0, '>50K': 1}})

# Define the input for which we want to generate a counterfactual explanation
input_data = {'age': 25, 'workclass': 'Private', 'education': 'Some-college', 'education-num': 10,
              'marital-status': 'Never-married', 'occupation': 'Exec-managerial', 'relationship': 'Not-in-family',
              'race': 'White', 'sex': 'Male', 'capital-gain': 0, 'capital-loss': 0, 'hours-per-week': 40,
              'native-country': 'United-States'}

# Define the feature names and feature types (categorical or continuous)
feature_names = list(data.columns.drop('income'))
feature_types = {'age': 'continuous', 'workclass': 'categorical', 'education': 'categorical',
                 'education-num': 'continuous', 'marital-status': 'categorical', 'occupation': 'categorical',
                 'relationship': 'categorical', 'race': 'categorical', 'sex': 'categorical', 'capital-gain': 'continuous',
                 'capital-loss': 'continuous', 'hours-per-week': 'continuous', 'native-country': 'categorical'}

# Normalize the input data using the provided helper function
normalized_data = helpers.normalize_data(data)

# Define the DiCE instance and generate counterfactuals
dice = Dice(data, normalize=True)
counterfactuals = dice.generate_counterfactuals(input_data, total_CFs=4, desired_class="opposite", features_to_vary="all",
                                                feature_weights="inverse_mad", proximity_weight=0.5)
# Print the generated counterfactuals
print("Counterfactuals:")
for i, cf in enumerate(counterfactuals):
    print("CF ", i, ": ", cf.final_cfs)
