A pipeline using 3 datasets, a vanilla denoising autoencoder (DAE), iNNvestigate backtrace, then PCA and t-SNE.

In [1]:
# Widen the notebook container so wide plots and tables are not cramped.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
import keras
from keras.layers import Input, Dense
from keras.models import Model

import innvestigate
import innvestigate.utils as iutils

import numpy as np
import pandas as pd
import time
import os
import pickle

from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

Load data

In [None]:
# Locations of the pickled feature matrix and its labels.
matfile = 'data/sob_cyto.pkl'
labelfile = 'data/sob_cyto_label.pkl'
# Open both files in one `with` statement (the context managers must be
# separated by a comma) so each handle is closed on exit.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(matfile, 'rb') as fmat, open(labelfile, 'rb') as flab:
    feat_mat = pickle.load(fmat)
    feat_label = pickle.load(flab)

Scale data

In [None]:
# Drop the sign of every entry, then min-max scale the result with
# MinMaxScaler's default settings.
scaler = preprocessing.MinMaxScaler()
feat_scaled = scaler.fit_transform(np.abs(feat_mat))

Global Params

In [None]:
# Layer widths: the output layer is as wide as the input (one unit per
# feature column of `feat_mat`).
input_size = feat_mat.shape[1]
output_size = input_size
hidden_size = 800

# Training schedule.
batch_size = 128
epochs = 2000

# Stop training once the training loss has not improved for 20 epochs.
early_stopping_monitor = keras.callbacks.EarlyStopping(
    mode='min',
    monitor='loss',
    patience=20,
)

### Vanilla DAE

In [None]:
epochs = 2000
noise_factor = 0.05

# Corrupt the scaled features with additive Gaussian noise and clip back into
# [0, 1]. The scaled matrix is `feat_scaled`; the previously referenced
# `featmat_scaled` does not exist and raised a NameError.
feat_noisy = feat_scaled + noise_factor * np.random.normal(loc=0.0, scale=1.0, size=feat_scaled.shape)
feat_noisy = np.clip(feat_noisy, 0., 1.)

# Single-hidden-layer autoencoder: input -> ReLU hidden layer -> sigmoid output.
x = Input(shape=(input_size,))
h = Dense(hidden_size, activation='relu')(x)
r = Dense(output_size, activation='sigmoid')(h)

ae = Model(inputs=x, outputs=r)
ae.compile(optimizer='adam', loss='mse',metrics=['accuracy'])

# Denoising objective: the network sees the noisy input and is trained to
# reconstruct the clean data. Using `feat_noisy` as the target as well would
# only teach it to reproduce the noise.
history = ae.fit(feat_noisy, feat_scaled, batch_size=batch_size,epochs=epochs, callbacks = [early_stopping_monitor],shuffle=True)

# The encoder shares the trained input->hidden weights with `ae`.
encoder = Model(x,h)

### Trace back weights

In [None]:
# LRP-epsilon analyzer over the encoder. With neuron_selection_mode="index",
# every analyze() call is told which hidden unit to explain.
gradient_analyzer = innvestigate.create_analyzer(
    "lrp.epsilon",
    encoder,
    neuron_selection_mode="index",
)

# Add up the analysis result of every hidden unit. The zero vector broadcasts
# against the first result, so `analysis` presumably ends up with one row per
# row of `feat_scaled`.
analysis = np.zeros(input_size)
i = 0
for i, neuron_index in enumerate(range(hidden_size), start=1):
    relevance = gradient_analyzer.analyze(feat_scaled, neuron_index)
    analysis = analysis + relevance
print(i)  # number of hidden units processed

Load bands

In [None]:
# Read the pickled `bands` object from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
filepath = 'data/bands.pkl'
with open(filepath, mode='rb') as fi:
    bands = pickle.load(file=fi)

Sum of features

In [None]:
# Collapse axis 0 of the accumulated analysis (presumably the sample axis),
# leaving one total per feature column.
feat_sum = analysis.sum(axis=0)

In [None]:
# Split the per-feature totals at the midpoint: the first half is stored as
# "amps", the second half as "dels".
half = input_size // 2
feat_amps = feat_sum[:half]
feat_dels = feat_sum[half:]

In [None]:
# Load `bands` from its pickle again, so the columns added in the following
# cell are written onto a freshly loaded object.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
filepath = 'data/bands.pkl'
with open(filepath, mode='rb') as fi:
    bands = pickle.load(file=fi)

In [None]:
# Attach the two halves of the feature totals to `bands`, plus a copy of the
# row index as an ordinary column so it survives the selections made later.
new_columns = (
    ('amp', feat_amps),
    ('del', feat_dels),
    ('index', bands.index),
)
for column, values in new_columns:
    bands[column] = values

In [None]:
# The 20 rows of `bands` with the largest and the smallest value in each of
# the 'amp' and 'del' columns.
n_top = 20
score_columns = ('amp', 'del')
top_amps, top_dels = (bands.nlargest(n_top, col) for col in score_columns)
bot_amps, bot_dels = (bands.nsmallest(n_top, col) for col in score_columns)

In [None]:
# Collect the 'index' values of all four selections into one flat Series.
# `Series + Series` aligns on row labels and adds element-wise (yielding NaN
# wherever a label is missing on one side); it does not concatenate, so
# pd.concat is used instead.
# NOTE(review): these are row indices of `bands`. The 'del' totals were taken
# from the second half of the feature vector, so the deletion indices may need
# an offset of input_size // 2 before being used as feature positions - confirm.
sub_index = pd.concat(
    [top_amps['index'], bot_amps['index'], top_dels['index'], bot_dels['index']]
)

In [None]:
# Keep every row of the scaled matrix but only the selected feature columns.
# Selecting along axis=0 would pick rows (samples) by feature index, and the
# PCA / t-SNE results built from `feat_sub` are later paired row by row with
# `feat_label`, so the rows have to remain samples.
# NOTE(review): assumes `sub_index` holds integer column positions - confirm.
feat_sub = feat_scaled.take(sub_index, axis=1)

PCA

In [None]:
# Project the selected features onto 4 principal components and report the
# wall-clock time together with each component's explained-variance ratio.
time_start = time.time()

pca = PCA(n_components=4)
pca_result = pca.fit_transform(feat_sub)

elapsed = time.time() - time_start
print('PCA done! Time elapsed: {} seconds'.format(elapsed))
variance_ratio = pca.explained_variance_ratio_
print('Variance explained per principal component: {}'.format(variance_ratio))

In [None]:
# One row per row of `pca_result`, one column per principal component, plus
# the label used to colour the points.
pca_df = pd.DataFrame(pca_result, columns = ['pca1','pca2','pca3','pca4'])
pca_df['label'] = feat_label

# Size the palette from the data instead of hard-coding 9 colours, so the
# plot still works when the number of distinct labels differs.
n_labels = pca_df['label'].nunique()

plt.figure(figsize=(16,10))
ax = sns.scatterplot(
    x="pca1", y="pca2",
    hue="label",
    palette=sns.color_palette("hls", n_labels),
    data=pca_df,
    legend="full",
    alpha=0.3
)
ax.set_title('PCA of selected features (first two components)');

TSNE

In [None]:
# Seed for t-SNE so the embedding is repeatable between runs. `RS` was used
# below without ever being assigned, which raised a NameError.
RS = 42

time_start = time.time()

# Embed the selected features with t-SNE (default number of output
# dimensions, perplexity 40).
tsne = TSNE(random_state=RS, perplexity=40).fit_transform(feat_sub)

print ('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
# One row per row of the t-SNE embedding, plus the label used to colour
# the points.
tsne_df = pd.DataFrame(tsne, columns = ['tsne1','tsne2'])
tsne_df['label'] = feat_label

# Size the palette from the data instead of hard-coding 9 colours, so the
# plot still works when the number of distinct labels differs.
n_labels = tsne_df['label'].nunique()

plt.figure(figsize=(16,10))
ax = sns.scatterplot(
    x="tsne1", y="tsne2",
    hue="label",
    palette=sns.color_palette("hls", n_labels),
    data=tsne_df,
    legend="full",
    alpha=0.3
)
ax.set_title('t-SNE embedding of selected features');

K-means

In [None]:
# Elbow search: fit K-means for k = 1..10 and record each fit's inertia.
cluster_counts = list(range(1, 11))
clusters = []
for i in cluster_counts:
    km = KMeans(n_clusters=i).fit(feat_sub)
    clusters.append(km.inertia_)

# Inertia against the number of clusters.
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=cluster_counts, y=clusters, ax=ax)
ax.set(title='Searching for Elbow', xlabel='Clusters', ylabel='Inertia')

plt.show()

In [None]:
# Fit K-means with 3 clusters and pair each cluster assignment with the
# corresponding label. The labels loaded by this notebook live in
# `feat_label`; the previously referenced `labels` was never defined and
# raised a NameError.
km3 = KMeans(n_clusters=3).fit(feat_sub)
km3_df = pd.DataFrame({'km3_label': km3.labels_, 'sample_label': feat_label})

In [None]:
# Count of rows per label, split by K-means cluster.
# `catplot` is a figure-level function: it creates its own figure and returns
# a FacetGrid, so a preceding plt.figure() only leaves an empty extra figure.
# The 16x10 size is requested through height/aspect instead.
grid = sns.catplot(y="sample_label", hue="km3_label", kind="count",
            palette="pastel", edgecolor=".6",
            data=km3_df, height=10, aspect=1.6)