In [1]:
#source: https://docs.seldon.io/projects/alibi/en/latest/examples/anchor_text_movie.html
import os
import spacy
import string
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from alibi.explainers import AnchorText
from alibi.datasets import fetch_movie_sentiment
from alibi.utils.download import spacy_model
from alibi.utils.lang_model import DistilbertBaseUncased, BertBaseUncased, RobertaBase

In [8]:
# Fetch the movie sentiment dataset through alibi's `fetch_movie_sentiment` helper.
movies = fetch_movie_sentiment()
# Show which fields the returned dataset object exposes.
movies.keys()

dict_keys(['data', 'target', 'target_names'])

In [9]:
# Pull the texts, their labels and the label names out of the dataset in one step.
data, labels, target_names = movies.data, movies.target, movies.target_names

In [10]:
# Hold out 20% of the data for testing, then split 10% of the remainder off for validation.
train, test, train_labels, test_labels = train_test_split(
    data, labels, test_size=.2, random_state=42
)
train, val, train_labels, val_labels = train_test_split(
    train, train_labels, test_size=.1, random_state=42
)

# Convert each label collection to a NumPy array.
train_labels, test_labels, val_labels = (
    np.array(split_labels) for split_labels in (train_labels, test_labels, val_labels)
)

In [11]:
# Count-based text vectorizer used to turn raw strings into model features.
vectorizer = CountVectorizer(min_df=1)
# Fit on the training split only; as the last expression this also displays the fitted estimator.
vectorizer.fit(train)



CountVectorizer()

In [12]:
# Seed NumPy's global RNG before training.
np.random.seed(0)
# Logistic-regression classifier using the liblinear solver.
clf = LogisticRegression(solver='liblinear')
# Train on the vectorized training texts; as the last expression this also displays the fitted estimator.
clf.fit(vectorizer.transform(train), train_labels)

LogisticRegression(solver='liblinear')

In [13]:
def predict_fn(x):
    """Predict class indices for raw text inputs.

    Parameters
    ----------
    x
        Iterable of raw text strings, e.g. ``[text]``.

    Returns
    -------
    The output of ``clf.predict`` for the vectorized inputs
    (one class index per input text).
    """
    # Raw strings must pass through the same fitted vectorizer used for training.
    # `clf` and `vectorizer` are resolved at call time, exactly as with the former lambda.
    return clf.predict(vectorizer.transform(x))

In [14]:
# Run the classifier over each split, in train / validation / test order.
preds_train, preds_val, preds_test = (predict_fn(split) for split in (train, val, test))

# Report the accuracy of each split on its own line.
print(
    'Train accuracy: %.3f' % accuracy_score(train_labels, preds_train),
    'Validation accuracy: %.3f' % accuracy_score(val_labels, preds_val),
    'Test accuracy: %.3f' % accuracy_score(test_labels, preds_test),
    sep='\n',
)

Train accuracy: 0.980
Validation accuracy: 0.754
Test accuracy: 0.759


In [15]:
# Name of the spaCy pipeline to use.
model = 'en_core_web_sm'
# alibi helper from alibi.utils.download; presumably fetches the model when it is missing — TODO confirm.
spacy_model(model=model)
# Load the pipeline; `nlp` is passed to AnchorText further down.
nlp = spacy.load(model)



In [16]:
class_names = movies.target_names

# select instance to be explained
text = data[4]
print("* Text: %s" % text)

# compute the class prediction once and reuse it for both labels
pred_idx = predict_fn([text])[0]
pred = class_names[pred_idx]
# `1 - pred_idx` picks the other label; this assumes exactly two classes encoded as 0/1
alternative = class_names[1 - pred_idx]
print("* Prediction: %s" % pred)



* Text: a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification .
* Prediction: negative


In [23]:
# Inspect the raw predictor output for the selected text.
predict_fn([text])

array([0])

In [21]:
# Display the instance that is being explained.
text

'a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification .'

Use the `unknown` sampling strategy

In [17]:
# Anchor explainer for text. With the 'unknown' strategy, perturbed words appear as
# UNK tokens in the sampled examples shown further down.
explainer = AnchorText(
    predictor=predict_fn,
    sampling_strategy='unknown',
    nlp=nlp,
)

In [6]:
# Inspect the lookup tables attached to the spaCy vocab.
# Requires `nlp` from the model-loading cell; running this first raises NameError.
nlp.vocab.lookups.tables

NameError: name 'nlp' is not defined

In [None]:
# Uncomment and run this cell if there are issues with the spaCy lexeme_prob lookup table
# import spacy
# from alibi.utils.download import spacy_model
# from alibi.explainers import AnchorText


# from spacy.lookups import load_lookups

# nlp = spacy.load("en_core_web_sm")
# lookups = load_lookups("en", ["lexeme_prob"])

# # nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))

In [18]:
# Compute the anchor explanation for `text`.
# threshold=0.95 is presumably the minimum precision the anchor must reach — TODO confirm.
explanation = explainer.explain(text, threshold=0.95)

In [19]:
# Words that make up the anchor found for `text`.
explanation.anchor

['flashy']

In [20]:
# Full raw explanation payload (feature indices, precision, coverage, sampled examples); the output is large.
explanation.raw

{'feature': [2],
 'mean': [0.99375],
 'precision': [0.99375],
 'coverage': [0.4993],
 'examples': [{'covered_true': array(['a UNK flashy UNK UNK opaque and emotionally vapid exercise in style UNK mystification .',
          'a UNK flashy UNK UNK UNK and emotionally UNK exercise UNK UNK and UNK UNK',
          'a UNK flashy UNK narratively opaque UNK UNK UNK exercise in style and UNK UNK',
          'UNK visually flashy UNK narratively UNK and emotionally UNK UNK UNK UNK UNK mystification .',
          'UNK UNK flashy UNK UNK opaque and emotionally UNK UNK in UNK and UNK .',
          'a visually flashy but UNK UNK and UNK UNK UNK in style UNK mystification .',
          'a visually flashy but UNK opaque UNK emotionally vapid UNK in UNK and mystification .',
          'a UNK flashy but narratively UNK UNK emotionally vapid exercise in style UNK mystification UNK',
          'a UNK flashy but narratively opaque UNK emotionally vapid exercise in style and mystification .',
          'a vi

In [14]:
# Headline: the anchor words and the precision reached.
print(
    'Anchor: %s' % (' AND '.join(explanation.anchor)),
    'Precision: %.2f' % explanation.precision,
    sep='\n',
)

# Sampled examples stored under the last entry of the raw 'examples' list.
last_examples = explanation.raw['examples'][-1]
print('\nExamples where anchor applies and model predicts %s:' % pred)
print('\n'.join(last_examples['covered_true']))
print('\nExamples where anchor applies and model predicts %s:' % alternative)
print('\n'.join(last_examples['covered_false']))



Anchor: flashy
Precision: 0.99

Examples where anchor applies and model predicts negative:
a UNK flashy UNK UNK opaque and emotionally vapid exercise in style UNK mystification .
a UNK flashy UNK UNK UNK and emotionally UNK exercise UNK UNK and UNK UNK
a UNK flashy UNK narratively opaque UNK UNK UNK exercise in style and UNK UNK
UNK visually flashy UNK narratively UNK and emotionally UNK UNK UNK UNK UNK mystification .
UNK UNK flashy UNK UNK opaque and emotionally UNK UNK in UNK and UNK .
a visually flashy but UNK UNK and UNK UNK UNK in style UNK mystification .
a visually flashy but UNK opaque UNK emotionally vapid UNK in UNK and mystification .
a UNK flashy but narratively UNK UNK emotionally vapid exercise in style UNK mystification UNK
a UNK flashy but narratively opaque UNK emotionally vapid exercise in style and mystification .
a visually flashy UNK UNK opaque UNK UNK UNK exercise in UNK UNK UNK .

Examples where anchor applies and model predicts positive:
UNK UNK flashy but narr

Use the `similarity` sampling strategy

In [15]:
# Rebinds `explainer` to an AnchorText instance that uses similarity-based sampling.
explainer = AnchorText(
    predictor=predict_fn,
    sampling_strategy='similarity',   # replace masked words with similar words
    nlp=nlp,                          # spaCy language object
    sample_proba=0.5,                 # probability of a word being masked and replaced by a similar word
)


In [17]:
# Recompute the explanation with the current `explainer`, overwriting the previous `explanation`.
explanation = explainer.explain(text, threshold=0.95)

In [18]:
# Summary line pair: anchor words joined by AND, then the precision.
print(
    'Anchor: %s' % (' AND '.join(explanation.anchor)),
    'Precision: %.2f' % explanation.precision,
    sep='\n',
)

# Both example groups are read from the last entry of the raw 'examples' list.
sampled = explanation.raw['examples'][-1]
print('\nExamples where anchor applies and model predicts %s:' % pred)
print('\n'.join(sampled['covered_true']))
print('\nExamples where anchor applies and model predicts %s:' % alternative)
print('\n'.join(sampled['covered_false']))

Anchor: exercise AND emotionally
Precision: 0.96

Examples where anchor applies and model predicts negative:
that visually flashy but narratively bright and emotionally snobby exercise in style and deceit .
some technically flashy but eerily opaque and emotionally vapid exercise in improvisation and mystification .
a visually flashy but electrically sensitive and emotionally pushy exercise in style and mystification .
some incredibly ingenious but narratively coherent and emotionally contrived exercise on style and dogma .
an extremely unwieldy but narratively translucent and emotionally vapid exercise in faux and mystification .
a somewhat graceful but artistically transparent and emotionally vapid exercise across suit and mystification .
this visually trendy but narratively opaque and emotionally vapid exercise without era and mystification .
any masterfully bright but intellectually opaque and emotionally meaningless exercise in style and helplessness .
both visually overbearing but

In [5]:
from xgboost import XGBClassifier

# NOTE: despite its name, `clf_name` holds the classifier instance, not a string.
clf_name = XGBClassifier()

# Read the class name from the type itself rather than parsing the repr string,
# which would break if the repr format ever changed.
print(type(clf_name).__name__)

XGBClassifier


In [2]:
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
import pandas as pd

# Toy data: 8 samples assigned to 3 groups.
X = np.ones(shape=(8, 2))
y = np.ones(shape=(8, 1))
groups = np.array([1, 1, 2, 2, 2, 3, 3, 3])

# A single group-aware shuffle split with a fixed seed.
gss = GroupShuffleSplit(n_splits=1, train_size=.7, random_state=42)

# n_splits=1 yields exactly one (train, test) pair of positional indices,
# so take it directly; later cells reuse train_idx / test_idx.
train_idx, test_idx = next(gss.split(X, y, groups))
print("TRAIN:", train_idx, "TEST:", test_idx)

TRAIN: [2 3 4 5 6 7] TEST: [0 1]


In [16]:
# Select the training rows using the positional indices from the split.
X_train = X[train_idx]
# Display the training features next to the matching targets.
X_train, y[train_idx]

(array([[1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.],
        [1., 1.]]),
 array([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.]]))

In [17]:
# Select the held-out rows using the positional indices from the split.
X_test = X[test_idx]
# Display the test features next to the matching targets.
X_test, y[test_idx]

(array([[1., 1.],
        [1., 1.]]),
 array([[1.],
        [1.]]))

In [36]:

# Imported here because this cell uses `shuffle` before the later cell that imports it,
# which would raise NameError on a fresh Restart & Run All.
from sklearn.utils import shuffle

df = pd.DataFrame({"X":["Rabble","Rabble","Rabble","Rabble",
                        "cheese","cheese","cheese",
                        "pickle","pickle"], 
                    "y":[1,1,1,1,2,2,2,3,3],
                     "groups": [0,0,0,0,1,1,1,2,2]})

# Shuffle the rows; the original index labels travel with them, so labels no longer match row positions.
df = shuffle(df, random_state = 42)

print(f"Original df:\n {df}")
gss = GroupShuffleSplit(n_splits=1, train_size=.7, random_state=1234)

# split() yields positional indices, so select rows with .iloc afterwards.
for train_idx, test_idx in gss.split(df, groups = df.groups):
    print("TRAIN:", train_idx, "TEST:", test_idx)



Original df:
         X  y  groups
7  pickle  3       2
1  Rabble  1       0
5  cheese  2       1
0  Rabble  1       0
8  pickle  3       2
2  Rabble  1       0
4  cheese  2       1
3  Rabble  1       0
6  cheese  2       1
TRAIN: [0 2 4 6 8] TEST: [1 3 5 7]


1

In [23]:

# The split indices are positional, hence .iloc rather than label-based selection.
train_df, test_df = df.iloc[train_idx], df.iloc[test_idx]

# Show both partitions.
print(f"new train df:\n {train_df}", f"\n new test df:\n {test_df}", sep="\n")

new train df:
         X  y  groups
7  pickle  3       2
5  cheese  2       1
8  pickle  3       2
4  cheese  2       1
6  cheese  2       1

 new test df:
         X  y  groups
1  Rabble  1       0
0  Rabble  1       0
2  Rabble  1       0
3  Rabble  1       0


In [12]:
# Second toy frame: here the group ids cut across the X categories.
df2 = pd.DataFrame(
    {
        "X": ["splooge"] * 4 + ["dank"] * 3 + ["ribblar"] * 2,
        "y": [1] * 4 + [2] * 3 + [3] * 2,
        "groups": [1, 1, 0, 0, 1, 1, 1, 2, 2],
    }
)

# Reuse the `gss` splitter configured in an earlier cell; the indices of the
# last yielded split remain bound after the loop.
for train_idx_2, test_idx_2 in gss.split(df2, groups=df2.groups):
    print("TRAIN:", train_idx_2, "TEST:", test_idx_2)

# Positional indices -> .iloc selection.
train_2, test_2 = df2.iloc[train_idx_2], df2.iloc[test_idx_2]

TRAIN: [0 1 4 5 6 7 8] TEST: [2 3]


In [13]:
# Show both partitions of the second frame.
print(f"new train df: {train_2}", f"\n new test df: {test_2}", sep="\n")

new train df:          X  y  groups
0  splooge  1       1
1  splooge  1       1
4     dank  2       1
5     dank  2       1
6     dank  2       1
7  ribblar  3       2
8  ribblar  3       2

 new test df:          X  y  groups
2  splooge  1       0
3  splooge  1       0


In [20]:
from sklearn.utils import shuffle

# Display the frame next to a shuffled copy; the output shows train_2 itself keeps its row order.
train_2,shuffle(train_2, random_state = 42)

(         X  y  groups
 0  splooge  1       1
 1  splooge  1       1
 4     dank  2       1
 5     dank  2       1
 6     dank  2       1
 7  ribblar  3       2
 8  ribblar  3       2,
          X  y  groups
 0  splooge  1       1
 1  splooge  1       1
 7  ribblar  3       2
 4     dank  2       1
 6     dank  2       1
 5     dank  2       1
 8  ribblar  3       2)

In [2]:

import shap
# Load the Boston housing data through shap's dataset helper.
# NOTE(review): this rebinds X and y, replacing the toy arrays defined earlier in the notebook.
# NOTE(review): newer shap releases are reported to have dropped this loader — confirm against the installed version.
X,y = shap.datasets.boston()



In [3]:
# Display the feature frame (the rendered output elides the middle rows).
X

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48
