# **ProtoDash explanations**

In [1]:
import numpy as np
import pandas as pd
from aix360.algorithms.protodash import ProtodashExplainer

import neural_network

In [2]:
# Input locations, relative to the directory this notebook runs from.

# Full labeled census table.
WHOLE_DATASET = "../01_data/census_labeled_indexed.csv"

# Test-split feature matrices, one file per education group.
UNDER_EDUCATED_TEST_FEATURES = "../01_data/test/under_educated_features.csv"
COLLEGE_EDUCATED_TEST_FEATURES = "../01_data/test/college_educated_features.csv"
WELL_EDUCATED_TEST_FEATURES = "../01_data/test/well_educated_features.csv"

## **Understanding the data**

In this part, we will check what characterizes the datasets.

In [3]:
# Read the full census table; its "index" column becomes the DataFrame index.
whole_census = pd.read_csv(
    WHOLE_DATASET,
    index_col="index",
)

In [4]:
# Prototype-set size passed as the third argument to every explainer.explain() call below.
M = 10 #number of representative examples

In [5]:
# Single ProtoDash explainer instance, reused by every explain() call in this notebook.
explainer = ProtodashExplainer()

In [6]:
# Load the three comma-separated feature matrices in one pass:
# under educated, college educated and well educated test sets.
under, coll, well = (
    np.loadtxt(features_path, delimiter=",")
    for features_path in (
        UNDER_EDUCATED_TEST_FEATURES,
        COLLEGE_EDUCATED_TEST_FEATURES,
        WELL_EDUCATED_TEST_FEATURES,
    )
)

### **Under educated set**

In [7]:
# Pick M prototypes of the under educated set from within that same set.
# The call yields (weights, selected row positions, a third value that is discarded).
w_under, s_under, _ = explainer.explain(under, under, M)

In [8]:
# Rescale the prototype weights so they sum to 1, round to two decimals,
# and display them together with the selected row positions.
w_under = np.round(w_under / np.sum(w_under), decimals=2)
w_under, s_under

(array([0.25, 0.14, 0.1 , 0.06, 0.16, 0.07, 0.07, 0.06, 0.05, 0.04]),
 array([ 355, 1591,  190,  256, 1265, 1084, 1343, 1743, 1584, 1322]))

In [9]:
# Show the selected prototypes as human-readable census rows.
# NOTE(review): assumes the rows with education-num < 10 appear in the same
# order as the rows of the under educated feature file — confirm.
under_human_readable = whole_census.loc[whole_census["education-num"].lt(10)].iloc[s_under]
under_human_readable

Unnamed: 0_level_0,hours-per-week,age,capital-gain,capital-loss,workclass,education,education-num,marital-status,relationship,race,gender,native-country,income,occupation,income-category
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
797,40,39,0,0,Private,HS-grad,9,Married-civ-spouse,Husband,White,Male,United-States,33091,Craft-repair,1
4060,3,19,0,0,Private,HS-grad,9,Never-married,Own-child,White,Male,United-States,6900,Sales,0
435,90,49,0,0,Private,HS-grad,9,Divorced,Not-in-family,White,Male,United-States,20500,Transport-moving,1
580,12,90,2964,0,Self-emp-not-inc,HS-grad,9,Never-married,Not-in-family,White,Male,United-States,18169,Exec-managerial,0
3355,35,23,0,0,Private,HS-grad,9,Never-married,Unmarried,White,Female,United-States,2200,Other-service,0
2778,30,50,7298,0,Local-gov,HS-grad,9,Married-civ-spouse,Wife,White,Female,United-States,68502,Adm-clerical,2
3530,48,45,0,0,Private,HS-grad,9,Married-civ-spouse,Husband,Black,Male,?,30368,Machine-op-inspct,1
4434,60,30,0,0,Private,HS-grad,9,Separated,Not-in-family,White,Male,United-States,26650,Handlers-cleaners,1
4039,40,55,0,0,Private,HS-grad,9,Widowed,Own-child,White,Female,United-States,21800,Adm-clerical,1
3478,50,38,5178,0,Self-emp-inc,HS-grad,9,Married-civ-spouse,Husband,White,Male,Portugal,63654,Exec-managerial,2


### **College educated set**

In [10]:
# Pick M prototypes of the college educated set from within that same set.
# The call yields (weights, selected row positions, a third value that is discarded).
w_coll, s_coll, _ = explainer.explain(coll, coll, M)

In [11]:
# Rescale the prototype weights so they sum to 1, round to two decimals,
# and display them together with the selected row positions.
w_coll = np.round(w_coll / np.sum(w_coll), decimals=2)
w_coll, s_coll

(array([0.26, 0.09, 0.09, 0.14, 0.13, 0.1 , 0.07, 0.06, 0.04, 0.02]),
 array([ 389,  245, 1313,  353,  257, 1111,  493, 1096,  431,  734]))

In [12]:
# Show the selected prototypes as human-readable census rows.
# The college educated subset is 10 <= education-num < 13.
# NOTE(review): assumes the filtered rows are in the same order as the rows
# of the college educated feature file — confirm.
coll_human_readable = whole_census.loc[
    whole_census["education-num"].ge(10) & whole_census["education-num"].lt(13)
].iloc[s_coll]
coll_human_readable

Unnamed: 0_level_0,hours-per-week,age,capital-gain,capital-loss,workclass,education,education-num,marital-status,relationship,race,gender,native-country,income,occupation,income-category
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1477,36,35,0,0,Private,Assoc-voc,11,Married-civ-spouse,Husband,White,Male,United-States,39902,Adm-clerical,1
916,98,22,0,0,Private,Some-college,10,Never-married,Own-child,White,Male,United-States,20000,Prof-specialty,1
5071,5,70,0,0,Self-emp-inc,Some-college,10,Never-married,Not-in-family,White,Male,United-States,44290,Exec-managerial,1
1351,12,18,0,0,Private,Some-college,10,Never-married,Own-child,White,Female,United-States,4200,Other-service,0
959,54,29,0,0,Private,Some-college,10,Divorced,Unmarried,White,Female,United-States,18200,Sales,0
4407,40,34,0,0,Self-emp-not-inc,Assoc-acdm,12,Never-married,Not-in-family,White,Male,United-States,13035,Craft-repair,0
1881,32,43,0,1902,Private,Assoc-voc,11,Married-civ-spouse,Wife,White,Female,United-States,79228,Tech-support,2
4347,40,54,0,0,Local-gov,Assoc-voc,11,Married-civ-spouse,Husband,Black,Male,United-States,32000,Transport-moving,1
1624,45,47,0,0,Private,Some-college,10,Widowed,Not-in-family,White,Female,Germany,41000,Exec-managerial,1
2990,40,23,99999,0,State-gov,Some-college,10,Never-married,Not-in-family,Other,Female,United-States,99999,Protective-serv,2


### **Well educated set**

In [13]:
# Pick M prototypes of the well educated set from within that same set.
# The call yields (weights, selected row positions, a third value that is discarded).
w_well, s_well, _ = explainer.explain(well, well, M)

In [65]:
# Rescale the prototype weights so they sum to 1, round to two decimals,
# and display them.
w_well = np.round(w_well / np.sum(w_well), decimals=2)
w_well

array([0.43, 0.2 , 0.06, 0.05, 0.08, 0.05, 0.03, 0.04, 0.03, 0.02])

In [15]:
# Show the selected prototypes as human-readable census rows.
# NOTE(review): assumes the rows with education-num >= 13 appear in the same
# order as the rows of the well educated feature file — confirm.
well_human_readable = whole_census.loc[whole_census["education-num"].ge(13)].iloc[s_well]
well_human_readable

Unnamed: 0_level_0,hours-per-week,age,capital-gain,capital-loss,workclass,education,education-num,marital-status,relationship,race,gender,native-country,income,occupation,income-category
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2334,45,42,0,0,Private,Doctorate,16,Married-civ-spouse,Husband,White,Male,United-States,99999,Prof-specialty,2
550,30,22,0,0,Private,Bachelors,13,Never-married,Not-in-family,White,Male,United-States,17030,Exec-managerial,0
3096,72,67,15831,0,Local-gov,Masters,14,Never-married,Other-relative,White,Female,United-States,79670,Exec-managerial,2
1731,10,74,0,0,Private,Masters,14,Divorced,Not-in-family,White,Female,United-States,6041,Sales,0
2618,50,37,0,0,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Wife,White,Female,United-States,41646,Sales,1
2525,40,42,0,0,Federal-gov,Bachelors,13,Divorced,Own-child,White,Male,United-States,11000,Adm-clerical,0
1642,55,55,0,0,Private,Bachelors,13,Married-spouse-absent,Unmarried,White,Female,Ireland,24000,Craft-repair,1
3230,50,25,2597,0,State-gov,Bachelors,13,Never-married,Not-in-family,White,Female,United-States,17985,Other-service,0
1985,60,41,0,1977,Self-emp-inc,Bachelors,13,Married-civ-spouse,Husband,Asian-Pac-Islander,Male,Taiwan,99999,Exec-managerial,2
2698,27,24,0,0,Local-gov,Bachelors,13,Never-married,Own-child,Black,Female,United-States,13000,Adm-clerical,0


---

## **Comparing datasets**

In [16]:
# How does the well educated set (the data to explain) look when it is
# summarised by rows drawn from the under educated set (the prototype pool)?
w_comp_hu, s_comp_hu, _ = explainer.explain(well, under, M)

# Map the selected positions back to human-readable census rows.
# NOTE(review): assumes the rows with education-num < 10 are in the same order
# as the rows of the under educated feature file — confirm.
comp_hu_human_readable = whole_census.loc[whole_census["education-num"].lt(10)].iloc[s_comp_hu]
comp_hu_human_readable

Unnamed: 0_level_0,hours-per-week,age,capital-gain,capital-loss,workclass,education,education-num,marital-status,relationship,race,gender,native-country,income,occupation,income-category
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
4500,45,39,0,0,Private,HS-grad,9,Married-civ-spouse,Husband,White,Male,United-States,48351,Prof-specialty,1
580,12,90,2964,0,Self-emp-not-inc,HS-grad,9,Never-married,Not-in-family,White,Male,United-States,18169,Exec-managerial,0
261,64,21,0,0,Private,HS-grad,9,Never-married,Not-in-family,White,Female,United-States,5902,Other-service,0
4060,3,19,0,0,Private,HS-grad,9,Never-married,Own-child,White,Male,United-States,6900,Sales,0
2069,99,51,0,0,Self-emp-inc,HS-grad,9,Married-spouse-absent,Not-in-family,White,Male,United-States,50460,Exec-managerial,2
5294,40,41,0,0,Local-gov,HS-grad,9,Divorced,Unmarried,White,Female,United-States,26346,Exec-managerial,1
2448,50,43,0,0,Private,HS-grad,9,Married-civ-spouse,Wife,White,Female,Canada,72575,Adm-clerical,2
5037,40,27,0,0,Federal-gov,HS-grad,9,Never-married,Not-in-family,Black,Female,United-States,18000,Adm-clerical,0
4185,40,45,0,0,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Husband,Asian-Pac-Islander,Male,China,16426,Exec-managerial,0
4042,40,55,0,1887,State-gov,HS-grad,9,Married-civ-spouse,Wife,White,Female,United-States,67713,Tech-support,2


# **Checking Model Inaccuracy**

In [23]:
# Reference labels the predictions are compared against (well educated test split).
TARGET_SET = "../01_data/test/well_educated_labels.csv"

# Name of the saved model; used to build the weights directory path.
MODEL_NAME = "under_educated"

In [24]:
# Rebuild the network and restore the weights stored for MODEL_NAME.
# `neural_network` is imported in the notebook's import cell so that every
# dependency is visible up front.
# NOTE(review): `well` is presumably only used by create_model to size the
# input layer — confirm in neural_network.create_model.
model = neural_network.create_model(well)

# The load status object is left as the cell's last expression so it is displayed.
model.load_weights("../03_models/"+MODEL_NAME+"/")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f339b8cc240>

In [44]:
# Run the restored model on the well educated features and keep, for each row,
# the position of the highest output along axis 1 as the predicted class.
predictions = np.argmax(model.predict(well), axis=1)

### Searching for a "big mistake"

In [57]:
# Load the reference labels and reduce each row to the position of its largest
# entry (presumably one-hot rows -> class ids; confirm against the label file).
target = np.loadtxt(TARGET_SET, delimiter=",")
target = np.argmax(target, axis=1)

# Absolute distance between predicted and reference class for every sample.
diffs = np.abs(predictions - target)

#searching for a big mistake:
# idx is the position of the first sample that is off by two or more classes.
# Fail loudly when there is none, instead of leaving idx == len(diffs) and
# letting the following cells die with an out-of-bounds indexing error.
big_mistake_positions = np.flatnonzero(diffs >= 2)
if big_mistake_positions.size == 0:
    raise ValueError("no prediction is off by 2 or more classes; there is no big mistake to explain")
idx = int(big_mistake_positions[0])

In [59]:
# Fetch the census row at position idx of the well educated subset and print
# the predicted class next to the reference class for that sample.
# NOTE(review): assumes the rows with education-num >= 13 are in the same order
# as the rows of the well educated feature file — confirm.
comp_human_readable = whole_census.loc[whole_census["education-num"].ge(13)].iloc[idx]
print("predicted: %d\t actual: %d" % (predictions[idx], target[idx]))
comp_human_readable

predicted: 0	 actual: 2


hours-per-week                     40
age                                32
capital-gain                        0
capital-loss                        0
workclass                     Private
education                   Bachelors
education-num                      13
marital-status     Married-civ-spouse
relationship                  Husband
race                            White
gender                           Male
native-country          United-States
income                          77700
occupation            Exec-managerial
income-category                     2
Name: 38, dtype: object

### Trying to explain the mistake

In [68]:
# Explain the single mispredicted sample (copied into a 1 x n_features array)
# with M prototypes drawn from the under educated set.
w_comp, s_comp, _ = explainer.explain(np.array(well[idx]).reshape(1, -1), under, M)

# Map the selected positions back to human-readable census rows.
# Note: this rebinds comp_human_readable, which held the mispredicted sample.
# NOTE(review): assumes the rows with education-num < 10 are in the same order
# as the rows of the under educated feature file — confirm.
comp_human_readable = whole_census.loc[whole_census["education-num"].lt(10)].iloc[s_comp]
comp_human_readable

Unnamed: 0_level_0,hours-per-week,age,capital-gain,capital-loss,workclass,education,education-num,marital-status,relationship,race,gender,native-country,income,occupation,income-category
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
4497,40,33,0,0,Private,HS-grad,9,Married-civ-spouse,Husband,White,Male,United-States,40200,Exec-managerial,1
5182,35,20,0,0,Private,HS-grad,9,Never-married,Not-in-family,Black,Female,Outlying-US(Guam-USVI-etc),7560,Adm-clerical,0
3666,98,26,0,0,Self-emp-inc,12th,8,Married-civ-spouse,Husband,Other,Male,Dominican-Republic,15500,Sales,0
3106,5,32,0,0,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Wife,White,Female,?,2962,Other-service,0
1341,40,35,0,0,State-gov,HS-grad,9,Divorced,Unmarried,White,Female,United-States,33000,Protective-serv,1
559,40,18,0,0,Private,HS-grad,9,Never-married,Own-child,Asian-Pac-Islander,Female,United-States,3025,Sales,0
776,21,56,0,0,Self-emp-inc,HS-grad,9,Married-civ-spouse,Other-relative,Asian-Pac-Islander,Female,China,20088,Exec-managerial,1
5189,40,32,0,0,Private,HS-grad,9,Married-spouse-absent,Other-relative,Other,Male,Mexico,6600,Other-service,0
1740,60,27,0,0,Self-emp-not-inc,HS-grad,9,Married-civ-spouse,Husband,White,Male,Ireland,75050,Craft-repair,2
4488,45,24,0,0,Private,HS-grad,9,Married-civ-spouse,Wife,Amer-Indian-Eskimo,Female,United-States,21000,Exec-managerial,1


In [69]:
# Rescale the prototype weights so they sum to 1, round to two decimals,
# and display them.
w_comp = np.round(w_comp / np.sum(w_comp), decimals=2)
w_comp

array([0.9 , 0.02, 0.01, 0.01, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01])