In [None]:
import pandas as pd 
from loguru import logger 
from blog.data.data_cleaner_factory import DataCleanerFactory

# Location of the raw CSV, relative to the notebook's working directory.
LNT_CSV_PATH = '../data/lnt_dataset.csv'

# Build the cleaner factory, request the 'lnt' dataset object from it and
# load the CSV; the result unpacks into the features (X) and the target (y)
# that the following cells split and train on.
dcf = DataCleanerFactory()
lnt_dataset = dcf.getDataset('lnt')
X, y = lnt_dataset.get_data(path=LNT_CSV_PATH)

In [None]:
from sklearn import datasets, model_selection, ensemble
seed = 1  # one seed drives both the split and the forest

# Keep 70% of the rows for training; the remainder is held out.
split_parts = model_selection.train_test_split(
    X, y, train_size=0.70, random_state=seed
)
train, test, y_train, y_test = split_parts

# Random forest with library defaults apart from the fixed seed.
model = ensemble.RandomForestClassifier(random_state=seed)
model.fit(train, y_train)

In [None]:
# Contrastive explanation
import contrastive_explanation as ce

# NOTE(review): assumes X has a `feature_names` attribute and y has a
# `target_names` attribute -- confirm against what the 'lnt' dataset's
# get_data() actually returns.
feature_labels = X.feature_names
class_labels = y.target_names

# Domain mapper built from the training split, given the feature names and
# the class (contrast) names.
dm = ce.domain_mappers.DomainMapperTabular(
    train,
    feature_names=feature_labels,
    contrast_names=class_labels,
)
exp = ce.ContrastiveExplanation(dm, verbose=True)

# NOTE(review): `test[0]` is the first held-out row only if `test` supports
# positional indexing (e.g. a NumPy array); on a pandas DataFrame it would
# look up a column labelled 0 instead -- confirm the type of `test`.
sample = test[0]
exp.explain_instance_domain(model.predict_proba, sample)

In [None]:
# Read the adult data set (https://archive.ics.uci.edu/ml/datasets/Adult)
c_file = ce.utils.download_data('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data')
c_df = pd.read_csv(c_file, header=None, skipinitialspace=True)

# Drop the columns at positions 2 and 4 so the remaining columns line up with
# the names assigned below.
# NOTE(review): presumably these are `fnlwgt` and `education-num` per the UCI
# column description -- confirm.
c_df = c_df.drop(columns=[2, 4])

# Give descriptive names to features
c_features    = ['age', 'workclass', 'education', 'marital-status',
                 'occupation', 'relationship', 'race', 'sex',
                 'capital-gain', 'capital-loss', 'hours-per-week',
                 'native-country']
c_categorical = ['workclass', 'education', 'marital-status', 'occupation',
                 'relationship', 'race', 'sex', 'native-country']
c_df.columns  = c_features + ['class']

# Class labels used later as contrast names. They are sorted so that their
# positions match the column order of scikit-learn's `predict_proba`, which
# follows the classifier's sorted `classes_`. A plain `.unique()` returns
# labels in order of first appearance, which only matches when the rows
# happen to introduce the classes in sorted order.
c_contrasts   = c_df['class'].sort_values().unique()

# Split into x and y (class feature is last feature)
cx, cy = c_df.iloc[:, :-1], c_df.iloc[:, -1]
c_df.head()

In [None]:
# Split data in a train/test set and in predictor (x) and target (y) variables
from sklearn import datasets, model_selection, ensemble, metrics, pipeline, preprocessing
import numpy as np 
# NOTE(review): SEED is not referenced by any cell visible here; the split and
# the classifier below each use their own fixed integer seed.
SEED = np.random.RandomState(1994)

# Hold out 20% of the adult rows for evaluation.
c_split = model_selection.train_test_split(
    cx, cy, train_size=0.80, random_state=76
)
cx_train, cx_test, cy_train, cy_test = c_split

# Two-step pipeline: encode the categorical columns, then classify with
# AdaBoost. The encoder is fitted on the full feature frame (cx) before it is
# placed in the pipeline.
c_encoder = ce.CustomLabelEncoder(c_categorical).fit(cx)
c_classifier = ensemble.AdaBoostClassifier(n_estimators=100, random_state=75)
c_model = pipeline.Pipeline([
    ('label_encoder', c_encoder),
    ('classifier', c_classifier),
])
c_model.fit(cx_train, cy_train)

# Report the weighted F1-score on the held-out rows.
cy_pred = c_model.predict(cx_test)
c_f1 = metrics.f1_score(cy_test, cy_pred, average='weighted')
print('Classifier performance (F1):', c_f1)

In [None]:
# The 'questioned data point': the held-out row at position 5, for which we
# ask why the model predicted the fact rather than the foil.
sample = cx_test.iloc[5]
print(sample)

# Domain mapper over the training DataFrame; only the contrast (class) names
# are passed explicitly (per the original note, feature names are inferred
# from the DataFrame).
c_dm = ce.domain_mappers.DomainMapperPandas(
    cx_train,
    contrast_names=c_contrasts,
)

# Explanation object with its default settings (per the original note, a
# Foil Tree explanator).
c_exp = ce.ContrastiveExplanation(c_dm)

# Explain the pipeline's prediction for the selected sample.
c_exp.explain_instance_domain(c_model.predict_proba, sample)

In [None]:
# Show the class labels that were passed to the domain mapper as contrast names.
print(c_contrasts)

In [None]:
# Inspect the container type of the training targets returned by the split.
type(cy_train)