# This notebook generates 4 subclass blobs and recovers them in LRP space

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import keras
import umap

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedShuffleSplit
# Converting labels to 1-Hot Vectors
from sklearn.preprocessing import OneHotEncoder
from mpl_toolkits.mplot3d import Axes3D


import sys
# sys.path.append("/Users/Work/Developer/interpretDL/interprettensor")
root_logdir = "./tf_logs"
data_dir = "data/"
figures_dir = "data/figures/"

# To plot pretty figures
%matplotlib widget
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Helper for keeping this notebook's output stable across runs
def reset_graph(seed=42):
    """Reset the default TensorFlow graph and re-seed TensorFlow and NumPy.

    Args:
        seed: Value passed to both ``tf.set_random_seed`` and ``np.random.seed``.
    """
    np.random.seed(seed)
    # The TensorFlow seed is set after the reset, exactly as before.
    tf.reset_default_graph()
    tf.set_random_seed(seed)

# Seed NumPy's global random number generator for the rest of the notebook
np.random.seed(seed=42) 
# Last expression of the cell: displays the installed TensorFlow version
tf.__version__

Using TensorFlow backend.


'1.13.1'

In [2]:
def get_split_index(features, labels):
    """Compute a single stratified 80/20 shuffle split.

    The split is stratified on ``labels``, so both parts keep the class
    proportions of the input. The random state is fixed at 42.

    Args:
        features: Sample matrix (anything ``np.array`` accepts).
        labels: Class label per sample, used for stratification.

    Returns:
        A one-element list ``[[train_index, test_index]]`` of positional indices.
    """
    feature_array = np.array(features)
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

    index_pairs = []
    for train_index, test_index in splitter.split(feature_array, labels):
        index_pairs.append([train_index, test_index])
    return index_pairs

def split_valid(features, original_labels, training_labels):
    """Hold out a validation set, stratified on the original labels.

    Args:
        features: DataFrame of samples.
        original_labels: Series of original (subclass) labels; drives the stratification.
        training_labels: Series of labels the model is trained on.

    Returns:
        ``(X_train, y_train, y_original, X_valid, y_valid, y_valid_original)``
        where the ``y_*original`` entries are the original labels of each part.
    """
    train_index, validation_index = get_split_index(features, original_labels)[0]

    def take(rows):
        # Same positional rows from the samples and both label series
        return features.iloc[rows], training_labels.iloc[rows], original_labels.iloc[rows]

    X_train, y_train, y_original = take(train_index)
    X_valid, y_valid, y_valid_original = take(validation_index)

    return X_train, y_train, y_original, X_valid, y_valid, y_valid_original

def get_train_test_val(features, original_labels, training_labels):
    """Split the data into train, test and validation parts.

    A validation set is held out first (``split_valid``); the remainder is then
    split again into train and test, stratified on the original labels.

    Returns:
        ``(X_train, y_train, X_test, y_test, y_original, X_valid, y_valid,
        y_valid_original)``. Note that ``y_original`` holds the original labels
        of the whole non-validation remainder (train and test together), not
        only of ``X_train``.
    """
    (X_rest, y_rest, y_original,
     X_valid, y_valid, y_valid_original) = split_valid(features, original_labels, training_labels)

    train_index, test_index = get_split_index(X_rest, y_original)[0]
    X_train, X_test = X_rest.iloc[train_index], X_rest.iloc[test_index]
    y_train, y_test = y_rest.iloc[train_index], y_rest.iloc[test_index]

    return X_train, y_train, X_test, y_test, y_original, X_valid, y_valid, y_valid_original

def plot_history(history):
    """Plot accuracy and loss curves side by side in the "History" figure.

    Args:
        history: Object whose ``.history`` dict contains the keys
            ``binary_accuracy``, ``val_binary_accuracy``, ``loss`` and ``val_loss``.
    """
    plt.close("History")
    fig, axs = plt.subplots(1, 2, figsize=(12,6), num="History")

    # (axes, training key, validation key, title, y-axis label) for each panel
    panels = [
        (axs[0], 'binary_accuracy', 'val_binary_accuracy', 'Model accuracy', 'Accuracy'),
        (axs[1], 'loss', 'val_loss', 'Model loss', 'Loss'),
    ]
    for ax, train_key, val_key, title, ylabel in panels:
        ax.grid(True)
        ax.plot(history.history[train_key])
        ax.plot(history.history[val_key])
        ax.set(title=title, ylabel=ylabel, xlabel='Epoch')
        ax.legend(['Train', 'Test'], loc='upper left')

    plt.show()

## Making 4 blobs

In [3]:
# Import from the public package: `sklearn.datasets.samples_generator` is a
# deprecated private path that newer scikit-learn releases no longer provide.
from sklearn.datasets import make_blobs

# Generate a 2-D classification dataset: one blob at the origin plus four
# blobs around it. The four outer blobs together hold as many points as the
# central one (class_size).
class_size = 500
centers = [(0,0),(1,0),(-1,0),(0,1),(0,-1)]
n_samples = [class_size//(len(centers)-1)]*len(centers)
n_samples[0] = class_size

X, y = make_blobs(n_samples=n_samples, centers=centers, n_features=2, cluster_std=0.1, shuffle=False, random_state=42)

plt.close("Original")
df = pd.DataFrame(dict(x=X[:,0], y=X[:,1], label=y))
fig, ax = plt.subplots(num="Original")
# NOTE(review): `colors` is not used by the plot below (colours come from cmap)
colors = {0:'red', 1:'blue'}
df.plot(ax=ax,kind="scatter", x='x', y='y',c="label", cmap= "Paired")
plt.show()

# original_labels keeps the per-blob label; training_labels collapses every
# non-zero label to 1, giving the binary target the network is trained on.
original_labels = df["label"].copy()
modded_samples = df[["x","y"]].copy()
training_labels = df["label"].copy()
training_labels[training_labels > 0] = 1

FigureCanvasNbAgg()

## Separate out train, test set

In [4]:
# Stratified train / test / validation split of the blob data
(X_train, y_train, X_test, y_test,
 y_original, X_valid, y_valid, y_valid_original) = get_train_test_val(
    modded_samples, original_labels, training_labels)

print("Train Size:", X_train.shape)
print("Test Size:", y_test.shape)

# The encoder expects an array of "features" per sample, hence the reshape
# of the label vector into a single column.
hot_encoder = OneHotEncoder(categories="auto", sparse=False).fit(
    training_labels.values.reshape(-1,1))
print("Categories:", hot_encoder.categories_)

Train Size: (640, 2)
Test Size: (160,)
Categories: [array([0, 1])]


In [5]:
# Number of input columns in the training samples
NUM_FEATURES = len(X_train.columns)
# Number of distinct training labels seen by the one-hot encoder
NUM_LABELS = len(hot_encoder.categories_[0])

### Train a DNN on the modified dataset

In [6]:
from sklearn.preprocessing import StandardScaler

def build_dnn(num_features, num_labels=3):
    """Build and compile a small binary classifier.

    Architecture: two hidden Dense layers of 12 ELU units followed by a single
    sigmoid output unit, all He-uniform initialised. Compiled with binary
    cross-entropy, SGD and the ``binary_accuracy`` metric.

    Args:
        num_features: Number of input features.
        num_labels: Unused; the output layer always has one unit. Kept so
            existing callers that pass it keep working.

    Returns:
        The compiled ``keras.models.Sequential`` model.
    """
    # Drop any previous Keras graph/session state before building a new model
    keras.backend.clear_session()

    nn = keras.models.Sequential()
    Dense = keras.layers.Dense

    # He initialization, given by its string identifier so the initializer is
    # resolved by the same `keras` package that builds the layers (the model
    # previously received a `tf.keras` initializer object instead).
    he_init = "he_uniform"

    nn.add(Dense(units = 12, activation="elu", input_dim=num_features,
                kernel_initializer=he_init))
    nn.add(Dense(units = 12, activation="elu",
                kernel_initializer=he_init))
    nn.add(Dense(units=1, activation= "sigmoid",
                kernel_initializer=he_init))

    nn.compile(loss="binary_crossentropy",
                  optimizer='sgd',
                  metrics=['binary_accuracy'])

    return nn

def train_model(model, X, y, X_test=None, y_test=None, epochs=30, batch_size=20, verbose=1, plot=True):
    """Standardise the features and fit ``model`` on them.

    A ``StandardScaler`` is fitted on ``X`` only and then applied to both the
    training and (if given) the validation features.

    Args:
        model: Compiled Keras model exposing ``fit``.
        X: Training features.
        y: Training labels; must expose ``.values`` (e.g. a pandas Series).
        X_test: Optional validation features. ``None`` or an empty container
            means "train without validation data".
        y_test: Optional validation labels exposing ``.values``.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        verbose: Verbosity passed to ``model.fit``.
        plot: Currently unused; kept for backward compatibility.

    Returns:
        ``(history, scaler)``: the object returned by ``model.fit`` and the
        fitted ``StandardScaler``.
    """
    callback_list = []

    ZScaler = StandardScaler().fit(X)
    X_train = ZScaler.transform(X)
    y_train = y.values

    # The old `[]` defaults crashed in `transform([])` and `[].values`;
    # validation is now skipped when no validation data is supplied.
    validation_data = None
    if X_test is not None and y_test is not None and len(X_test) > 0:
        validation_data = (ZScaler.transform(X_test), y_test.values)

    history = model.fit(X_train, y_train, epochs=epochs, batch_size = batch_size,
                        validation_data=validation_data, callbacks=callback_list, verbose=verbose)

    return history, ZScaler


In [7]:
nn = build_dnn(NUM_FEATURES)
%time history, Zscaler = train_model(nn, X_train, y_train, X_test, y_test, epochs=30, batch_size=20)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 640 samples, validate on 160 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
CPU times: user 1.86 s, sys: 469 ms, total: 2.33 s
Wall time: 1.23 s


In [8]:
# Plot the training/validation accuracy and loss curves recorded in `history`
plot_history(history)

FigureCanvasNbAgg()

## Performing SVM on Modded Samples

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("SVM", LinearSVC(C=1, loss="hinge", max_iter=1000 ))
])

%time svm_clf.fit(X_train, y_train)
print("Linear SVM Test Accuracy: {:0.3f}".format(svm_clf.score(X_test, y_test)))

CPU times: user 5.29 ms, sys: 1.65 ms, total: 6.95 ms
Wall time: 5.35 ms
Linear SVM Test Accuracy: 0.750


## Performing LRP

In [10]:
model = nn
# Scale the validation samples with the scaler fitted on the training data
scaled_samples = Zscaler.transform(X_valid)

print("Validation Accuracy")
loss_and_metrics = model.evaluate(scaled_samples, y_valid)
print("Scores on validation set: loss={:0.3f} accuracy={:.4f}".format(*loss_and_metrics))

# Round the single sigmoid output of every sample to a hard 0/1 prediction
predictions = model.predict(scaled_samples)
preds = np.round(predictions[:, 0])
# Built-in float: the `np.float` alias is deprecated and removed in NumPy >= 1.24
true_labels = [float(x) for x in y_valid]

# Boolean mask of the validation samples that were correctly classified
correct = preds == true_labels

# Get correctly predicted samples, labels along with their original labels
correct_labels_original = y_valid_original[correct]
correct_samples = scaled_samples[correct]
correct_labels_training = y_valid[correct]



Validation Accuracy
Scores on validation set: loss=0.032 accuracy=1.0000


In [15]:
# How many correctly classified validation samples each original class has
print("Class Distribution")
correct_labels_original.value_counts()

Class Distribution


0    100
4     25
3     25
2     25
1     25
Name: label, dtype: int64

In [12]:
import innvestigate
import innvestigate.utils as iutils

def perform_analysis(model, analyzer, data, labels=None):
    """Run ``analyzer`` on ``data`` and return the result as a DataFrame.

    Args:
        model: Unused; kept for backward compatibility. (It was previously
            used for a forward pass whose result was discarded.)
        analyzer: Object exposing ``analyze(data)``.
        data: Samples to analyse.
        labels: Unused; kept for backward compatibility.

    Returns:
        ``pd.DataFrame`` built from whatever ``analyzer.analyze(data)`` returns.
    """
    analysis = analyzer.analyze(data)
    return pd.DataFrame(analysis)


# Validation samples the network classified correctly (selected with the `correct` mask)
test_idx = correct
all_samples = scaled_samples[test_idx]

# Three LRP analyzers, all created for the same trained model
lrp_E = innvestigate.analyzer.relevance_based.relevance_analyzer.LRPEpsilon(
    model=model, epsilon=1e-3)
lrp_Z = innvestigate.analyzer.relevance_based.relevance_analyzer.LRPZPlus(
    model=model)
lrp_AB = innvestigate.analyzer.relevance_based.relevance_analyzer.LRPAlpha2Beta1(
    model=model)

# Analysis result of every selected sample under each analyzer
all_lrp_AB = perform_analysis(model, lrp_AB, all_samples)
all_lrp_E = perform_analysis(model, lrp_E, all_samples)
all_lrp_Z = perform_analysis(model, lrp_Z, all_samples)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [13]:
import os

plt.close("Comparison")
fig, axs = plt.subplots(2,2, figsize=(12,10), num="Comparison")
# Original labels of the analysed samples, used to colour the LRP scatter plots
_labels = correct_labels_original

# Top-left: the raw input-space blobs, for reference
df.plot(ax=axs[0][0],kind="scatter", x='x', y='y',c="label", cmap= "Paired", title="Original Distribution")

# Remaining panels: the same scatter plot once per LRP result
lrp_panels = [
    (axs[0][1], all_lrp_E, "LRP E"),
    (axs[1][0], all_lrp_AB, "LRP AB"),
    (axs[1][1], all_lrp_Z, "LRP Z"),
]
for panel_ax, relevance, panel_title in lrp_panels:
    relevance.plot(ax=panel_ax, kind="scatter", x=0, y=1, c=_labels, cmap="Paired",
                   s=20, alpha=0.75, title=panel_title)

# Create the output directory first so saving does not fail when it is missing
os.makedirs(figures_dir, exist_ok=True)
plt.savefig(figures_dir+"4_subclass_LRP.png")
plt.show()

FigureCanvasNbAgg()