In [12]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import tensorflow as tf
import seaborn as sns

In [13]:
# Column labels applied to the header-less data file, in file order.
column_names = ['Names', 'MCG', 'GVH', 'LIP', 'CHG', 'AAC', 'ALM1', 'ALM2', 'SITE']

# Fields are separated by runs of spaces of varying length. Split on any
# whitespace run (r"\s+") instead of the previous regex, which only matched
# exactly three or exactly two spaces and would mis-split on any other width.
ecoli_df = pd.read_csv(
    './uci_repos/ecoli/ecoli.data',
    sep=r"\s+",
    names=column_names,
    header=None,
)
ecoli_df

Unnamed: 0,Names,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2,SITE
0,AAT_ECOLI,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,ACEA_ECOLI,0.07,0.40,0.48,0.5,0.54,0.35,0.44,cp
2,ACEK_ECOLI,0.56,0.40,0.48,0.5,0.49,0.37,0.46,cp
3,ACKA_ECOLI,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,ADI_ECOLI,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp
...,...,...,...,...,...,...,...,...,...
331,TREA_ECOLI,0.74,0.56,0.48,0.5,0.47,0.68,0.30,pp
332,UGPB_ECOLI,0.71,0.57,0.48,0.5,0.48,0.35,0.32,pp
333,USHA_ECOLI,0.61,0.60,0.48,0.5,0.44,0.39,0.38,pp
334,XYLF_ECOLI,0.59,0.61,0.48,0.5,0.42,0.42,0.37,pp


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
ecoli_df.describe()

Unnamed: 0,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2
count,336.0,336.0,336.0,336.0,336.0,336.0,336.0
mean,0.50006,0.5,0.495476,0.501488,0.50003,0.500179,0.499732
std,0.194634,0.148157,0.088495,0.027277,0.122376,0.215751,0.209411
min,0.0,0.16,0.48,0.5,0.0,0.03,0.0
25%,0.34,0.4,0.48,0.5,0.42,0.33,0.35
50%,0.5,0.47,0.48,0.5,0.495,0.455,0.43
75%,0.6625,0.57,0.48,0.5,0.57,0.71,0.71
max,0.89,1.0,1.0,1.0,0.88,1.0,0.99


In [15]:
# Remove the identifier column without mutating the frame in place.
# `del ecoli_df['Names']` raised KeyError when this cell was run a second time;
# `drop(..., errors='ignore')` makes the cell safely re-runnable.
ecoli_df = ecoli_df.drop(columns=['Names'], errors='ignore')

# Distinct values of the SITE column, in order of first appearance.
class_names = ecoli_df['SITE'].unique()
class_names

array(['cp', 'im', 'imS', 'imL', 'imU', 'om', 'omL', 'pp'], dtype=object)

In [16]:
# Preview the first rows of the frame after the 'Names' column was removed.
ecoli_df.head()

Unnamed: 0,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2,SITE
0,0.49,0.29,0.48,0.5,0.56,0.24,0.35,cp
1,0.07,0.4,0.48,0.5,0.54,0.35,0.44,cp
2,0.56,0.4,0.48,0.5,0.49,0.37,0.46,cp
3,0.59,0.49,0.48,0.5,0.52,0.45,0.36,cp
4,0.23,0.32,0.48,0.5,0.55,0.25,0.35,cp


In [17]:
# SITE labels that are excluded from the rest of the notebook.
minority_group = ['imL', 'omL', 'imS']

# Keep only the rows whose SITE is not one of the excluded labels.
is_minority = ecoli_df['SITE'].isin(minority_group)
new_ecoli_df = ecoli_df[~is_minority]

In [19]:
# Labels: the last column as a 1-D array. The previous `iloc[:, -1:]` slice
# produced an (n, 1) column vector, which made LabelEncoder emit a
# DataConversionWarning further down.
Y = new_ecoli_df.iloc[:, -1].to_numpy(copy=True)

# Features: every column except the last one.
X = new_ecoli_df.iloc[:, :-1].to_numpy(copy=True)

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=40)
X_train, X_test, y_train, y_test = split_parts

In [21]:
from sklearn.preprocessing import LabelEncoder

# Fit a single encoder on the training labels and reuse it for the test labels.
# Fitting a second, independent encoder on y_test (as before) only yields the
# same integer codes when both splits happen to contain exactly the same set
# of classes; otherwise the codes silently disagree. With a shared encoder an
# unseen test label raises ValueError instead.
# np.ravel guards against labels arriving as an (n, 1) column vector.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(np.ravel(y_train))
y_test = label_encoder.transform(np.ravel(y_test))

  return f(*args, **kwargs)


In [22]:
# Sanity check: the integer codes present in each split, then the shape of
# the training feature matrix.
for encoded_labels in (y_train, y_test):
    print(np.unique(encoded_labels))
print(X_train.shape)

[0 1 2 3 4]
[0 1 2 3 4]
(261, 7)


In [23]:
# One-hot encode the integer training labels (5 classes, float32).
one_hot_train = tf.keras.utils.to_categorical(y_train, num_classes=5, dtype='float32')
y_train_oh = np.array(one_hot_train)

# Make sure the training features are a plain ndarray.
X_train = np.array(X_train)

# VAE Model

In [25]:
# Switch to the TF1-style graph/session API for the VAE below.
# NOTE: this rebinds `tf`, which the first cell imported as plain `tensorflow`;
# every later use of `tf` in the notebook refers to `tensorflow.compat.v1`.
import tensorflow.compat.v1  as tf
tf.disable_v2_behavior()
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
import os
import numpy as np

# Hyperparameters of the conditional VAE.
# Minibatch size: rows sampled per training step and samples drawn per class
# in the generation cell.
mb_size = 16
# Dimension of the latent code z.
z_dim = 3
# Number of input features per row.
X_dim = 7
# Width of the one-hot class condition c.
y_dim = 5
# Width of the single hidden layer in both encoder and decoder.
h_dim = 3
# Learning rate (presumably for the Adam optimizer below — TODO confirm it is
# actually passed to the optimizer).
lr = 1e-3


def xavier_init(size):
    """Draw an initial weight tensor from a zero-mean normal distribution.

    The standard deviation is ``1 / sqrt(fan_in / 2)``, where ``fan_in`` is
    the first entry of ``size``.

    :param size: weight shape as a sequence whose first entry is the input width
    :return: random-normal tensor of shape ``size``
    """
    half_fan_in = size[0] / 2.
    scale = 1. / tf.sqrt(half_fan_in)
    return tf.random.normal(shape=size, stddev=scale)

# Graph inputs, created in this order: feature rows (X), one-hot class
# condition (c) and latent codes (z). Note that `X` shadows the feature
# matrix defined in an earlier cell.
X, c, z = (tf.keras.Input(shape=(width,)) for width in (X_dim, y_dim, z_dim))

# Encoder (Q) parameters: one hidden layer fed with [X, c], followed by two
# linear heads that produce the latent mean and the latent log-variance.
encoder_in_dim = X_dim + y_dim
Q_W1 = tf.Variable(xavier_init([encoder_in_dim, h_dim]))
Q_b1 = tf.Variable(tf.zeros([h_dim]))

latent_head_shape = [h_dim, z_dim]
Q_W2_mu = tf.Variable(xavier_init(latent_head_shape))
Q_b2_mu = tf.Variable(tf.zeros([z_dim]))

Q_W2_sigma = tf.Variable(xavier_init(latent_head_shape))
Q_b2_sigma = tf.Variable(tf.zeros([z_dim]))


def Q(X, c):
    """Encoder: map a batch of inputs and class conditions to latent statistics.

    :param X: batch of feature rows
    :param c: batch of one-hot class conditions, concatenated to ``X`` on axis 1
    :return: tuple ``(z_mu, z_logvar)`` of latent mean and latent log-variance
    """
    encoder_input = tf.concat([X, c], axis=1)
    hidden = tf.nn.relu(tf.matmul(encoder_input, Q_W1) + Q_b1)
    mu = tf.matmul(hidden, Q_W2_mu) + Q_b2_mu
    logvar = tf.matmul(hidden, Q_W2_sigma) + Q_b2_sigma
    return mu, logvar


def sample_z(mu, log_var):
    """Sample latent codes as ``mu + sigma * eps`` with ``eps ~ N(0, I)``.

    :param mu: latent means
    :param log_var: latent log-variances; ``sigma`` is ``exp(log_var / 2)``
    :return: sampled latent tensor with the same shape as ``mu``
    """
    noise = tf.random.normal(shape=tf.shape(mu))
    sigma = tf.exp(log_var / 2)
    return mu + sigma * noise

# Decoder (P) parameters: one hidden layer fed with [z, c], followed by a
# linear output layer with one logit per input feature.
decoder_in_dim = z_dim + y_dim
P_W1 = tf.Variable(xavier_init([decoder_in_dim, h_dim]))
P_b1 = tf.Variable(tf.zeros([h_dim]))

P_W2 = tf.Variable(xavier_init([h_dim, X_dim]))
P_b2 = tf.Variable(tf.zeros([X_dim]))


def P(z, c):
    """Decoder: map latent codes and class conditions back to feature space.

    :param z: batch of latent codes
    :param c: batch of one-hot class conditions, concatenated to ``z`` on axis 1
    :return: tuple ``(prob, logits)`` where ``prob`` is ``sigmoid(logits)``
    """
    decoder_input = tf.concat([z, c], axis=1)
    hidden = tf.nn.relu(tf.matmul(decoder_input, P_W1) + P_b1)
    out_logits = tf.matmul(hidden, P_W2) + P_b2
    return tf.nn.sigmoid(out_logits), out_logits

# Training path: encode (X, c), sample z from the encoder statistics, decode.
z_mu, z_logvar = Q(X, c)
z_sample = sample_z(z_mu, z_logvar)
logits = P(z_sample, c)[1]

# Generation path: decode externally supplied latent codes `z` for condition `c`.
X_samples = P(z, c)[0]

# Reconstruction term: element-wise sigmoid cross-entropy, summed per row.
per_feature_bce = tf.nn.sigmoid_cross_entropy_with_logits(labels=X, logits=logits)
recon_loss = tf.reduce_sum(per_feature_bce, axis=1)

# KL term computed from the encoder mean and log-variance, summed per row.
kl_terms = tf.exp(z_logvar) + z_mu**2 - 1. - z_logvar
kl_loss = 0.5 * tf.reduce_sum(kl_terms, axis=1)

# Scalar objective: batch mean of reconstruction + KL.
vae_loss = tf.reduce_mean(recon_loss + kl_loss)

# Adam on the VAE loss. The configured `lr` is now actually passed to the
# optimizer; previously it was defined but unused and Adam ran on its own
# default learning rate.
# `tf` is already `tensorflow.compat.v1` in this cell, so the extra
# `.compat.v1` hop is dropped.
solver = tf.train.AdamOptimizer(learning_rate=lr).minimize(vae_loss)

# The session is left open on purpose: the generation cell reuses `sess`.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Training schedule (previously magic numbers inside the loop; the unused
# counter `i` has been removed).
n_iters = 50000
log_every = 1000

for it in range(n_iters):
    # Draw a minibatch of row indices (np.random.choice samples with
    # replacement by default). Fancy indexing already returns fresh arrays,
    # so no extra np.array copy is needed.
    ind = np.random.choice(X_train.shape[0], mb_size)
    X_mb = X_train[ind]
    y_mb = y_train_oh[ind]

    # One optimisation step; also fetch the loss for logging.
    _, loss = sess.run([solver, vae_loss], feed_dict={X: X_mb, c: y_mb})

    if it % log_every == 0:
        print('Iter: {}'.format(it))
        print('Loss: {:.4}'.format(loss))
        print()
    

Iter: 0
Loss: 6.016

Iter: 1000
Loss: 4.827

Iter: 2000
Loss: 4.754

Iter: 3000
Loss: 4.609

Iter: 4000
Loss: 4.595

Iter: 5000
Loss: 4.705

Iter: 6000
Loss: 4.592

Iter: 7000
Loss: 4.589

Iter: 8000
Loss: 4.543

Iter: 9000
Loss: 4.634

Iter: 10000
Loss: 4.685

Iter: 11000
Loss: 4.761

Iter: 12000
Loss: 4.626

Iter: 13000
Loss: 4.748

Iter: 14000
Loss: 4.658

Iter: 15000
Loss: 4.692

Iter: 16000
Loss: 4.654

Iter: 17000
Loss: 4.716

Iter: 18000
Loss: 4.681

Iter: 19000
Loss: 4.625

Iter: 20000
Loss: 4.662

Iter: 21000
Loss: 4.666

Iter: 22000
Loss: 4.703

Iter: 23000
Loss: 4.538

Iter: 24000
Loss: 4.691

Iter: 25000
Loss: 4.589

Iter: 26000
Loss: 4.679

Iter: 27000
Loss: 4.633

Iter: 28000
Loss: 4.669

Iter: 29000
Loss: 4.681

Iter: 30000
Loss: 4.681

Iter: 31000
Loss: 4.658

Iter: 32000
Loss: 4.643

Iter: 33000
Loss: 4.655

Iter: 34000
Loss: 4.674

Iter: 35000
Loss: 4.66

Iter: 36000
Loss: 4.717

Iter: 37000
Loss: 4.682

Iter: 38000
Loss: 4.669

Iter: 39000
Loss: 4.657

Iter: 40000
Lo

In [26]:
### generating sample outputs after training
# Number of generation rounds; each round draws `mb_size` samples per class.
n_rounds = 10

samples = []
gen_labels = []
for _round in range(n_rounds):
    for index in range(y_dim):
        # One-hot condition selecting class `index` for the whole batch.
        # (Named `y_cond` so it no longer reuses the global name `y`.)
        y_cond = np.zeros([mb_size, y_dim])
        y_cond[:, index] = 1

        # Decode latent codes drawn from a standard normal.
        z_batch = np.random.randn(mb_size, z_dim)
        samples.extend(sess.run(X_samples, feed_dict={z: z_batch, c: y_cond}))

        # extend() appends in place instead of rebuilding the whole list on
        # every iteration as `gen_labels = gen_labels + [...]` did.
        gen_labels.extend([index] * mb_size)

# Generated features are rounded to two decimals.
gen_samples = np.array(samples).round(decimals=2)
gen_labels = np.array(gen_labels)
print(gen_samples.shape)
print(gen_labels.shape)

(800, 7)
(800,)


In [27]:
# Inspect the generated class labels and the first generated feature row.
print(gen_labels)
print(gen_samples[0])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [28]:
# Wrap the generated rows in a DataFrame, labelled with entries 1..7 of
# `column_names` (i.e. without the first and last entries).
feature_columns = column_names[1:8]
gen_df = pd.DataFrame(gen_samples, columns=feature_columns)
gen_df

Unnamed: 0,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2
0,0.37,0.42,0.48,0.5,0.44,0.32,0.40
1,0.37,0.42,0.48,0.5,0.44,0.32,0.40
2,0.37,0.42,0.48,0.5,0.44,0.32,0.40
3,0.37,0.42,0.48,0.5,0.44,0.32,0.40
4,0.37,0.42,0.48,0.5,0.44,0.32,0.40
...,...,...,...,...,...,...,...
795,0.64,0.68,0.49,0.5,0.53,0.46,0.36
796,0.64,0.68,0.49,0.5,0.53,0.46,0.37
797,0.64,0.68,0.49,0.5,0.53,0.46,0.37
798,0.64,0.68,0.49,0.5,0.53,0.46,0.36


In [29]:
# Attach the integer class label of each generated row as the SITE column.
gen_df = gen_df.assign(SITE=gen_labels)
gen_df

Unnamed: 0,MCG,GVH,LIP,CHG,AAC,ALM1,ALM2,SITE
0,0.37,0.42,0.48,0.5,0.44,0.32,0.40,0
1,0.37,0.42,0.48,0.5,0.44,0.32,0.40,0
2,0.37,0.42,0.48,0.5,0.44,0.32,0.40,0
3,0.37,0.42,0.48,0.5,0.44,0.32,0.40,0
4,0.37,0.42,0.48,0.5,0.44,0.32,0.40,0
...,...,...,...,...,...,...,...,...
795,0.64,0.68,0.49,0.5,0.53,0.46,0.36,4
796,0.64,0.68,0.49,0.5,0.53,0.46,0.37,4
797,0.64,0.68,0.49,0.5,0.53,0.46,0.37,4
798,0.64,0.68,0.49,0.5,0.53,0.46,0.36,4


In [31]:
# Augmented training set: real training rows first, generated rows after,
# with the labels stacked in the same order.
x = np.concatenate((X_train, gen_samples), axis=0)
y = np.concatenate((y_train, gen_labels), axis=0)

# MLP experiment

In [39]:
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, Input, Flatten
from tensorflow.keras.models import Sequential


def build_model(input_shape=(64,), num_classes=10):
    """
    Assemble and compile a small fully-connected softmax classifier.

    Prints the Keras model summary as a side effect.

    :param input_shape: shape of one input sample
    :param num_classes: number of classes (width of the final softmax layer)
    :return: Sequential model compiled with categorical cross-entropy loss,
        the Adam optimizer and an accuracy metric
    """
    layer_stack = [
        Input(shape=input_shape),
        Dense(32, activation="relu"),
        BatchNormalization(),
        Dense(64, activation="relu"),
        BatchNormalization(),
        Flatten(),
        Dropout(0.5),
        Dense(num_classes, activation="softmax"),
    ]

    # Add the layers one by one, in the order listed above.
    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    model.summary()
    return model

In [40]:
from sklearn.metrics import classification_report

# Baseline: MLP trained on the real training rows only.
baseline_model = build_model((7,), 5)

# Training settings; the augmented-data experiment below reuses them.
batch_size = 8
epochs = 2

# One-hot test labels for Keras; the integer `y_test` is kept for the report.
test_y = np.array(tf.keras.utils.to_categorical(y_test, num_classes=5, dtype='float32'))

history_baseline = baseline_model.fit(
    X_train, y_train_oh,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, test_y),
)
score_baseline = baseline_model.evaluate(X_test, test_y, verbose=0)
print('baseline test loss: ', score_baseline[0])
print('baseline test accuracy: ', score_baseline[1])

# Predicted class = index of the largest softmax output.
y_pred_baseline_oh = baseline_model.predict(X_test)
y_pred_baseline = y_pred_baseline_oh.argmax(axis=-1)

# zero_division=0 keeps the reported numbers identical (undefined metrics are
# shown as 0.0) but silences the UndefinedMetricWarning raised for classes
# that were never predicted.
print('MLP baseline classification report\n',
      classification_report(y_test, y_pred_baseline, zero_division=0))

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                256       
_________________________________________________________________
batch_normalization (BatchNo (None, 32)                128       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                2112      
_________________________________________________________________
batch_normalization_1 (Batch (None, 64)                256       
_________________________________________________________________
flatten (Flatten)            (None, 64)                0         
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
__________________________________________________



MLP baseline classification report
               precision    recall  f1-score   support

           0       0.80      0.14      0.24        28
           1       0.28      1.00      0.44        17
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         9

    accuracy                           0.32        66
   macro avg       0.22      0.23      0.14        66
weighted avg       0.41      0.32      0.22        66



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Training on real and generated data

In [41]:
# Same MLP, trained on real + generated rows and evaluated on real test rows.
# (`x` is already an ndarray from np.concatenate, so the former
# `x = np.array(x)` copy was redundant and has been dropped.)
y_oh = np.array(tf.keras.utils.to_categorical(y, num_classes=5, dtype='float32'))
test_y = np.array(tf.keras.utils.to_categorical(y_test, num_classes=5, dtype='float32'))

aug_model = build_model((7,), 5)
history_aug = aug_model.fit(
    x, y_oh,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, test_y),
)

# Predicted class = index of the largest softmax output.
y_pred_aug_oh = aug_model.predict(X_test)
y_pred_aug = y_pred_aug_oh.argmax(axis=-1)

# zero_division=0 keeps the reported numbers identical (undefined metrics are
# shown as 0.0) but silences the UndefinedMetricWarning raised for classes
# that were never predicted.
print('Combined MLP classification report on real samples only.\n',
      classification_report(y_test, y_pred_aug, zero_division=0))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 32)                256       
_________________________________________________________________
batch_normalization_2 (Batch (None, 32)                128       
_________________________________________________________________
dense_4 (Dense)              (None, 64)                2112      
_________________________________________________________________
batch_normalization_3 (Batch (None, 64)                256       
_________________________________________________________________
flatten_1 (Flatten)          (None, 64)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 5)                



Combined MLP classification report on real samples only.
               precision    recall  f1-score   support

           0       0.90      0.68      0.78        28
           1       0.62      0.88      0.73        17
           2       0.00      0.00      0.00         9
           3       0.00      0.00      0.00         3
           4       0.43      1.00      0.60         9

    accuracy                           0.65        66
   macro avg       0.39      0.51      0.42        66
weighted avg       0.60      0.65      0.60        66



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
