In [21]:
#This is the modified version of the ladder network code from https://github.com/rinuboney/ladder
#Certain modfications are made to use & experiment with gene expression data
#
# import pickle
import numpy as np

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler

from collections import Counter 


import tensorflow as tf
#import input_data
import math
import os
import csv
from tqdm import tqdm
import sklearn as sk
from sklearn.metrics import confusion_matrix

In [22]:
join = lambda l, u: tf.concat([l, u], 0)
labeled = lambda x: tf.slice(x, [0, 0], [batch_size, -1]) if x is not None else x
unlabeled = lambda x: tf.slice(x, [batch_size, 0], [-1, -1]) if x is not None else x
split_lu = lambda x: (labeled(x), unlabeled(x))

In [23]:
# cancer_type = 'aggregates/LUNG'
cancer_type = 'BRCA'

file = 'out/' + cancer_type.replace("/","_") +  ".tsv"
file


'out/BRCA.tsv'

In [24]:
#class definitions

class DataSet(object):

  def __init__(self, dataset, labels):
    
    self._dataset = dataset
    self._labels = labels
    self._epochs_completed = 0
    self._index_in_epoch = 0
    self._num_examples = dataset.shape[0]

  @property
  def dataset(self):
    return self._dataset

  @property
  def labels(self):
    return self._labels

  @property
  def num_examples(self):
    return self._num_examples

  @property
  def epochs_completed(self):
    return self._epochs_completed

  def next_batch(self, batch_size):
    """Return the next `batch_size` examples from this data set."""
#     print ("index", self._index_in_epoch)
    start = self._index_in_epoch
    self._index_in_epoch += batch_size
    if self._index_in_epoch > self._num_examples:
        # Finished epoch
        self._epochs_completed += 1
        # Shuffle the data
        perm = np.arange(self._num_examples)
        np.random.shuffle(perm)
        self._dataset = self._dataset[perm]
        self._labels = self._labels[perm]
        # Start next epoch
        start = 0
        self._index_in_epoch = batch_size
#         print(batch_size, self._num_examples)
        assert batch_size <= self._num_examples
    end = self._index_in_epoch
    return self._dataset[start:end], self._labels[start:end]

class SemiDataSet(object):
    def __init__(self, dataset, labels, n_labeled):
        
        self.n_labeled = n_labeled

        # Unlabled DataSet
        self.unlabeled_ds = DataSet(dataset, labels)

        # Labeled DataSet
        self.num_examples = self.unlabeled_ds.num_examples
        indices = np.arange(self.num_examples)
        shuffled_indices = np.random.permutation(indices)
        dataset = dataset[shuffled_indices]
        labels = labels[shuffled_indices]
#         print('labels',labels)
        
        y = np.array([np.arange(2)[l==1][0] for l in labels])
#         print('y',y)
#         global test
#         test=labels

        
#         idx = indices[y==0][:5]
#         print('idx',idx)


        n_classes = y.max() + 1
        print('n_classes',n_classes)
        n_from_each_class = n_labeled // n_classes
        i_labeled = []
        for c in range(n_classes):
            i = indices[y==c][:n_from_each_class]
            i_labeled += list(i)
        l_dataset = dataset[i_labeled]
        l_labels = labels[i_labeled]
        self.labeled_ds = DataSet(l_dataset, l_labels)

    def next_batch(self, batch_size):
        #print ("batch size semi", batch_size)
        unlabeled_dataset, _ = self.unlabeled_ds.next_batch(batch_size)
     
        if batch_size > self.n_labeled:
            labeled_dataset, labels = self.labeled_ds.next_batch(self.n_labeled)
        else:
            labeled_dataset, labels = self.labeled_ds.next_batch(batch_size)
            #print (labeled_dataset.shape)
        #print ("labels shape aasd", labels.shape)
        #print (labels)
        dataset = np.vstack([labeled_dataset, unlabeled_dataset])
        return dataset, labels
# aa = SemiDataSet(X_train,y_train , 60)
# aa.next_batch(60)

In [25]:
#one-hot label
def dense_to_one_hot(labels_dense, num_classes=2):

  """Convert class labels from scalars to one-hot vectors."""
  num_labels = labels_dense.shape[0]
#   print(num_labels)
  index_offset = np.arange(num_labels) * num_classes
  labels_one_hot = np.zeros((num_labels, num_classes))
  labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
  return labels_one_hot


In [26]:
#fix labels  1 for tumoral, 0 for healthy
def fix_label(labels):
    labels= [1 if x==1 else 0 for x in labels]
    
    return np.array(labels)


In [27]:
wd = '../nanni_data_tcga_cibb/'+cancer_type+'/'
# wd = '/home/nanni/Data/TCGA/CIBB/aggregates/LUNG/'
# wd = '/home/nanni/Data/TCGA/CIBB/aggregates/KIDNEY/'


X_file = wd + '/X.npy'
y_file = wd + '/y.npy'
X = np.load(X_file)
y = np.load(y_file)
print(X.shape)
print(y.shape)

(1218, 20530)
(1218,)


In [28]:
print ("===  Loading Data ===")


class DataSets(object):
    pass
data_sets = DataSets()


X = np.load(X_file)
ylabels = np.load(y_file)
print(X.shape)
print(ylabels.shape)




#TODO remove all zeros
scaler = MinMaxScaler()
# Xnew = scaler.fit_transform(X.T).T

# Xnew = scaler.fit_transform(X)





# Xnew = scaler.fit_transform(X)

# Xnew = (X.T/X.sum(axis=1)).T * 10000



# Xnew = X


Xnew = X.T[X.sum(axis =0) != 0].T

print('# of 1s', sum(ylabels))

ynew = dense_to_one_hot(fix_label(ylabels)).astype(np.float32)

print("number of element in each class:", sum(ynew))

# del X
# del ylabels

===  Loading Data ===
(1218, 20530)
(1218,)
# of 1s 114.0
number of element in each class: [1104.  114.]


In [9]:


# 0 for maximum parallization
parallelization_factor = 15


LOGDIR = "./log"

layer_sizes = [Xnew.shape[1], 2000, 1000, 500, 250, 10,2] # X.shape[1]
print('layer_sizes', layer_sizes)

#TODO X.shape[1]

L = len(layer_sizes) - 1  # number of layers

num_epochs = 100 #100 # TODO change
# num_labeled = 6 #
num_examples = X.shape[0] # TODO read from input
# tot_number_examples = X.shape[0]

learning_rate = 0.005

batch_size = 60

num_iter = (num_examples//batch_size + 1) * num_epochs  # number of loop iterations

inputs = tf.placeholder(tf.float32, shape=(None, layer_sizes[0]), name= "input")
outputs = tf.placeholder(tf.float32, name = "output")



layer_sizes [20252, 2000, 1000, 500, 250, 10, 2]


In [10]:
#training util functions
def bi(inits, size, name):
    with tf.name_scope(name):
        b = tf.Variable(inits * tf.ones([size]), name="B")
        tf.summary.histogram("bias", b)
        return b

def wi(shape, name):
    with tf.name_scope(name):
        w = tf.Variable(tf.random_normal(shape, name="W")) / math.sqrt(shape[0])
        tf.summary.histogram("weight", w)
        print(w)
        return w

In [11]:
#training params
shapes = list(zip(list(layer_sizes)[:-1], list(layer_sizes[1:])))  # shapes of linear layers
print('shapes', shapes)

weights = {'W': [wi(s, "W") for s in shapes],  # Encoder weights
           'V': [wi(s[::-1], "V") for s in shapes],  # Decoder weights
           # batch normalization parameter to shift the normalized value
           'beta': [bi(0.0, layer_sizes[l+1], "beta") for l in range(L)],
           # batch normalization parameter to scale the normalized value
           'gamma': [bi(1.0, layer_sizes[l+1], "beta") for l in range(L)]}

print(weights['V'],shapes)

noise_std = 0.3  # scaling factor for noise used in corrupted encoder

# hyperparameters that denote the importance of each layer
denoising_cost = [1000.0, 10.0, 0.10, 0.10, 0.10, 0.10, 0.10]

shapes [(20252, 2000), (2000, 1000), (1000, 500), (500, 250), (250, 10), (10, 2)]
Tensor("W/truediv:0", shape=(20252, 2000), dtype=float32)
Tensor("W_1/truediv:0", shape=(2000, 1000), dtype=float32)
Tensor("W_2/truediv:0", shape=(1000, 500), dtype=float32)
Tensor("W_3/truediv:0", shape=(500, 250), dtype=float32)
Tensor("W_4/truediv:0", shape=(250, 10), dtype=float32)
Tensor("W_5/truediv:0", shape=(10, 2), dtype=float32)
Tensor("V/truediv:0", shape=(2000, 20252), dtype=float32)
Tensor("V_1/truediv:0", shape=(1000, 2000), dtype=float32)
Tensor("V_2/truediv:0", shape=(500, 1000), dtype=float32)
Tensor("V_3/truediv:0", shape=(250, 500), dtype=float32)
Tensor("V_4/truediv:0", shape=(10, 250), dtype=float32)
Tensor("V_5/truediv:0", shape=(2, 10), dtype=float32)
[<tf.Tensor 'V/truediv:0' shape=(2000, 20252) dtype=float32>, <tf.Tensor 'V_1/truediv:0' shape=(1000, 2000) dtype=float32>, <tf.Tensor 'V_2/truediv:0' shape=(500, 1000) dtype=float32>, <tf.Tensor 'V_3/truediv:0' shape=(250, 500) dtype

In [12]:
#training params and placeholders
training = tf.placeholder(tf.bool)

ewma = tf.train.ExponentialMovingAverage(decay=0.99)  # to calculate the moving averages of mean and variance
bn_assigns = []  # this list stores the updates to be made to average mean and variance


def batch_normalization(batch, mean=None, var=None):
    if mean is None or var is None:
        mean, var = tf.nn.moments(batch, axes=[0])
    return (batch - mean) / tf.sqrt(var + tf.constant(1e-10))

# average mean and variance of all layers
running_mean = [tf.Variable(tf.constant(0.0, shape=[l]), trainable=False) for l in layer_sizes[1:]]
running_var = [tf.Variable(tf.constant(1.0, shape=[l]), trainable=False) for l in layer_sizes[1:]]

def update_batch_normalization(batch, l):
    "batch normalize + update average mean and variance of layer l"
    mean, var = tf.nn.moments(batch, axes=[0])
    assign_mean = running_mean[l-1].assign(mean)
    assign_var = running_var[l-1].assign(var)
    bn_assigns.append(ewma.apply([running_mean[l-1], running_var[l-1]]))
    with tf.control_dependencies([assign_mean, assign_var]):
        return (batch - mean) / tf.sqrt(var + 1e-10)

In [13]:
#encoder
def encoder(inputs, noise_std):
    h = inputs + tf.random_normal(tf.shape(inputs)) * noise_std  # add noise to input
    d = {}  # to store the pre-activation, activation, mean and variance for each layer
    # The data for labeled and unlabeled examples are stored separately
    d['labeled'] = {'z': {}, 'm': {}, 'v': {}, 'h': {}}
    d['unlabeled'] = {'z': {}, 'm': {}, 'v': {}, 'h': {}}
    d['labeled']['z'][0], d['unlabeled']['z'][0] = split_lu(h)
    for l in range(1, L+1):
        print ("Layer ", l, ": ", layer_sizes[l-1], " -> ", layer_sizes[l])
        d['labeled']['h'][l-1], d['unlabeled']['h'][l-1] = split_lu(h)
        z_pre = tf.matmul(h, weights['W'][l-1])  # pre-activation
        z_pre_l, z_pre_u = split_lu(z_pre)  # split labeled and unlabeled examples

        m, v = tf.nn.moments(z_pre_u, axes=[0])

        # if training:
        def training_batch_norm():
            # Training batch normalization
            # batch normalization for labeled and unlabeled examples is performed separately
            if noise_std > 0:
                # Corrupted encoder
                # batch normalization + noise
                z = join(batch_normalization(z_pre_l), batch_normalization(z_pre_u, m, v))
                z += tf.random_normal(tf.shape(z_pre)) * noise_std
            else:
                # Clean encoder
                # batch normalization + update the average mean and variance using batch mean and variance of labeled examples
                z = join(update_batch_normalization(z_pre_l, l), batch_normalization(z_pre_u, m, v))
            return z

        # else:
        def eval_batch_norm():
            # Evaluation batch normalization
            # obtain average mean and variance and use it to normalize the batch
            mean = ewma.average(running_mean[l-1])
            var = ewma.average(running_var[l-1])
            z = batch_normalization(z_pre, mean, var)
            # Instead of the above statement, the use of the following 2 statements containing a typo
            # consistently produces a 0.2% higher accuracy for unclear reasons.
            return z

        # perform batch normalization according to value of boolean "training" placeholder:
        z = tf.cond(training, training_batch_norm, eval_batch_norm)

        if l == L:
            # use softmax activation in output layer
            h = tf.nn.softmax(weights['gamma'][l-1] * (z + weights["beta"][l-1]))
        else:
            # use ReLU activation in hidden layers
            h = tf.nn.relu(z + weights["beta"][l-1])
        d['labeled']['z'][l], d['unlabeled']['z'][l] = split_lu(z)
        d['unlabeled']['m'][l], d['unlabeled']['v'][l] = m, v  # save mean and variance of unlabeled examples for decoding
    d['labeled']['h'][l], d['unlabeled']['h'][l] = split_lu(h)
    return h, d
print ("=== Corrupted Encoder ===")
y_c, corr = encoder(inputs, noise_std)

print ("=== Clean Encoder ===")
y, clean = encoder(inputs, 0.0)  # 0.0 -> do not add noise

print ("=== Decoder ===")

=== Corrupted Encoder ===
Layer  1 :  20252  ->  2000
Layer  2 :  2000  ->  1000
Layer  3 :  1000  ->  500
Layer  4 :  500  ->  250
Layer  5 :  250  ->  10
Layer  6 :  10  ->  2
=== Clean Encoder ===
Layer  1 :  20252  ->  2000
Layer  2 :  2000  ->  1000
Layer  3 :  1000  ->  500
Layer  4 :  500  ->  250
Layer  5 :  250  ->  10
Layer  6 :  10  ->  2
=== Decoder ===


In [14]:
def g_gauss(z_c, u, size):
    "gaussian denoising function proposed in the original paper"
    wi = lambda inits, name: tf.Variable(inits * tf.ones([size]), name=name)
    a1 = wi(0., 'a1')
    a2 = wi(1., 'a2')
    a3 = wi(0., 'a3')
    a4 = wi(0., 'a4')
    a5 = wi(0., 'a5')

    a6 = wi(0., 'a6')
    a7 = wi(1., 'a7')
    a8 = wi(0., 'a8')
    a9 = wi(0., 'a9')
    a10 = wi(0., 'a10')

    mu = a1 * tf.sigmoid(a2 * u + a3) + a4 * u + a5
    v = a6 * tf.sigmoid(a7 * u + a8) + a9 * u + a10

    z_est = (z_c - mu) * v + mu
    return z_est

In [15]:
# Decoder
z_est = {}
d_cost = []  # to store the denoising cost of all layers
for l in range(L, -1, -1):
    print ("Layer ", l, ": ", layer_sizes[l+1] if l+1 < len(layer_sizes) else None, " -> ", layer_sizes[l], ", denoising cost: ", denoising_cost[l])
    z, z_c = clean['unlabeled']['z'][l], corr['unlabeled']['z'][l]
    m, v = clean['unlabeled']['m'].get(l, 0), clean['unlabeled']['v'].get(l, 1-1e-10)
    if l == L:
        u = unlabeled(y_c)
    else:
        u = tf.matmul(z_est[l+1], weights['V'][l])
    u = batch_normalization(u)
    z_est[l] = g_gauss(z_c, u, layer_sizes[l])
    z_est_bn = (z_est[l] - m) / v
    # append the cost of this layer to d_cost
    d_cost.append((tf.reduce_mean(tf.reduce_sum(tf.square(z_est_bn - z), 1)) / layer_sizes[l]) * denoising_cost[l])

# calculate total unsupervised cost by adding the denoising cost of all layers
u_cost = tf.add_n(d_cost)

y_N = labeled(y_c)
cost = -tf.reduce_mean(tf.reduce_sum(outputs*tf.log(y_N), 1))  # supervised cost
loss = cost + u_cost  # total cost

pred_cost = -tf.reduce_mean(tf.reduce_sum(outputs*tf.log(y), 1))  # cost used for prediction
with tf.name_scope("accuracy"):
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(outputs, 1))  # no of correct predictions
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) * tf.constant(100.0)
    tf.summary.scalar("accuracy", accuracy)

#learning_rate = tf.Variable(starter_learning_rate, trainable=False)
with tf.name_scope("train"):
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    


# add the updates of batch normalization statistics to train_step
bn_updates = tf.group(*bn_assigns)
with tf.control_dependencies([train_step]):
    train_step = tf.group(bn_updates)

Layer  6 :  None  ->  2 , denoising cost:  0.1
Layer  5 :  2  ->  10 , denoising cost:  0.1
Layer  4 :  10  ->  250 , denoising cost:  0.1
Layer  3 :  250  ->  500 , denoising cost:  0.1
Layer  2 :  500  ->  1000 , denoising cost:  0.1
Layer  1 :  1000  ->  2000 , denoising cost:  10.0
Layer  0 :  2000  ->  20252 , denoising cost:  1000.0


In [16]:
aa = (X.T/X.sum(axis=1)).T * 10000



In [17]:
aa.max()

1.6551339519268424

In [18]:
    #create checkpoint 
#     ckpt = tf.train.get_checkpoint_state('checkpoints_cancer/')  # get latest checkpoint (if any)
#     if ckpt and ckpt.model_checkpoint_path:
#         # if checkpoint exists, restore the parameters and set epoch_n and i_iter
#         saver.restore(sess, ckpt.model_checkpoint_path)
#         epoch_n = int(ckpt.model_checkpoint_path.split('-')[1])
#         i_iter = (epoch_n+1) * (num_examples//batch_size)
#         print ("Restored Epoch ", epoch_n)
#     else:
#         # no checkpoint exists. create checkpoints directory if it does not exist.
#         if not os.path.exists('checkpoints_cancer'):
#             os.makedirs('checkpoints_cancer')
#         writer = tf.summary.FileWriter('./log', sess.graph)
#         init = tf.global_variables_initializer()
#         sess.run(init)

In [19]:
def get_accuracies(epoch, sess, datasets) :
    y_true = np.argmax(datasets.test.labels,1)
    
    train_acc = sess.run(accuracy, feed_dict={inputs: datasets.train.unlabeled_ds.dataset, outputs: datasets.train.unlabeled_ds.labels, training: False})
    validation_acc = sess.run(accuracy, feed_dict={inputs: datasets.validation.dataset, outputs: datasets.validation.labels, training: False})
    
    y_p = tf.argmax(y, 1)
    test_acc, y_predicted = sess.run([accuracy,y_p], feed_dict={inputs: datasets.test.dataset, outputs: datasets.test.labels, training: False})
    test_f1 = str(sk.metrics.f1_score(y_true, y_predicted))
    
    print(epoch, " =>", " train: ", train_acc, " validation: ", validation_acc, " test: ", test_acc, " f1(test): " + test_f1)
    return train_acc, validation_acc, test_acc

def run_model(datasets):
    expression_dataset = datasets

    saver = tf.train.Saver(write_version=tf.train.SaverDef.V1)

    sess = tf.Session(config=
        tf.ConfigProto(inter_op_parallelism_threads=parallelization_factor,
                   intra_op_parallelism_threads=parallelization_factor))
    
    i_iter = 0

    

    init = tf.global_variables_initializer()
    sess.run(init)

    acc_count = 0
    
    train_acc = sess.run(accuracy, feed_dict={inputs: expression_dataset.train.unlabeled_ds.dataset, outputs: expression_dataset.train.unlabeled_ds.labels, training: False})
    validation_acc = sess.run(accuracy, feed_dict={inputs: expression_dataset.validation.dataset, outputs: expression_dataset.validation.labels, training: False})
    test_acc = sess.run(accuracy, feed_dict={inputs: expression_dataset.test.dataset, outputs: expression_dataset.test.labels, training: False})
    
    
    print("INITIAL VALUES")
    _, pre_acc, _ = get_accuracies("Initial", sess, expression_dataset)
    
    
#     print('i_iter: ', i_iter, ' num_iter:', num_iter)
#     print('num_examples', num_examples)

#     for i in tqdm(range(i_iter, num_iter)):
    for i in (range(i_iter, num_iter)):

        dataset, labels = expression_dataset.train.next_batch(batch_size)
        print(dataset.shape)
        print(labels.shape)

        sess.run(train_step, feed_dict={inputs: dataset, outputs: labels, training: True})

        if (i > 1) and ((i+1) % (num_iter//num_epochs) == 0):
            epoch_n = i//(num_examples//batch_size)
            
            _, curr_acc, _ = get_accuracies("Epoch(" + str(epoch_n) + ")", sess, expression_dataset)
            
            if curr_acc == pre_acc:
                acc_count += 1
            else :
                acc_count = 0
                
            # TODO EARLY STOPPING
            if acc_count > 3 and epoch_n > 30:
                print("Early stop!!!!!", acc_count, epoch_n)
                break

    get_accuracies("FINAL", sess, expression_dataset)

    y_p = tf.argmax(y, 1)
    test_accuracy, y_pred = sess.run([accuracy,y_p], feed_dict={inputs: expression_dataset.test.dataset, outputs: expression_dataset.test.labels, training: False})
    
    
    

    
    
    print ("TEST accuracy:", test_accuracy)
    y_true = np.argmax(expression_dataset.test.labels,1)
    print ("Precision", sk.metrics.precision_score(y_true, y_pred))
    print ("Recall", sk.metrics.recall_score(y_true, y_pred))
    print ("f1_score", sk.metrics.f1_score(y_true, y_pred))
    print ("confusion_matrix")
    print (sk.metrics.confusion_matrix(y_true, y_pred))
    with open(file, "a") as text_file:
        text_file.write("%s\t%s\t%s\t%s\t%s\t%s\n" % ('', 
                                                test_accuracy, 
                                                sk.metrics.f1_score(y_true, y_pred), 
                                                sk.metrics.precision_score(y_true, y_pred), 
                                                sk.metrics.recall_score(y_true, y_pred),
                                                sk.metrics.confusion_matrix(y_true, y_pred).tolist()))


    sess.close()
#     raise ValueError('A very specific bad thing happened.')
    return y_true, y_pred


In [20]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

with open(file, "w") as text_file:
    text_file.write("%s\t%s\t%s\t%s\t%s\t%s\n" % ('fold','accuracy', 'f1-score', 'precision', 'recall', 'conf_m'))
    
all_y_true = np.array([]).astype(int)
all_y_pred = np.array([]).astype(int)

print(Xnew.shape)


# print('XXXXX',    len(Xnew), len(ynew))

for train_valid_index, test_index in skf.split(Xnew, ynew[:,0]):
    
#     print('XXXXX', len(train_valid_index), len(test_index))


    X_train_valid,  X_test = Xnew[train_valid_index], Xnew[test_index]
    y_train_valid,  y_test = ynew[train_valid_index], ynew[test_index]
    
    X_train, X_valid, y_train, y_valid= train_test_split(X_train_valid, y_train_valid, test_size=0.25, stratify=y_train_valid[:,0])
    
#     from imblearn.over_sampling import RandomOverSampler 
#     ros = RandomOverSampler(random_state=0)
#     X_train, y_train = ros.fit_sample(X_train, y_train[:,0])
#     y_train = np.array(list(zip(y_train, 1- y_train)))
    
                         
#     print('YYYYY', len(X_train), len(X_valid), len(X_test))
#     print('ZZZZZ', len(y_train), len(y_valid), len(y_test))
#     print('QQQQQ', sum(y_train[:,0]) / len(y_train), sum(y_valid[:,0]) / len(y_valid), sum(y_test[:,0]) / len(y_test))


    print(X_train.shape)
    
    data_sets.train = SemiDataSet(X_train,y_train , 60)
    data_sets.validation = DataSet(X_valid,y_valid)
    data_sets.test = DataSet(X_test,y_test)
    
    y_true, y_pred = run_model(data_sets)
    

    all_y_true = np.append(all_y_true,y_true)
    all_y_pred = np.append(all_y_pred,y_pred)
    


#     break
print (sk.metrics.confusion_matrix(all_y_true, all_y_pred))
with open(file, "a") as text_file:
    text_file.write("%s\t%s\t%s\t%s\t%s\t%s\n" % ('ALL', 
                        sk.metrics.accuracy_score(all_y_true, all_y_pred), 
                        sk.metrics.f1_score(all_y_true, all_y_pred), 
                        sk.metrics.precision_score(all_y_true, all_y_pred), 
                        sk.metrics.recall_score(all_y_true, all_y_pred),
                        sk.metrics.confusion_matrix(all_y_true, all_y_pred).tolist())
                   )



(1218, 20252)
(730, 20252)
n_classes 2
INITIAL VALUES
Initial  =>  train:  9.315068  validation:  9.4262295  test:  9.4262295  f1(test): 0.17228464419475656
(120, 20252)
(60, 2)
(120, 20252)
(60, 2)
(120, 20252)
(60, 2)
(120, 20252)
(60, 2)
(120, 20252)
(60, 2)
(120, 20252)
(60, 2)
(120, 20252)
(60, 2)
(120, 20252)
(60, 2)
(120, 20252)
(60, 2)


KeyboardInterrupt: 

In [None]:
sk.metrics.confusion_matrix(all_y_true, all_y_pred).tolist()

In [None]:
maxxx = X.max(axis=0)

In [None]:
maxxx = maxxx[maxxx!=0]

In [None]:
maxxx.max()

In [None]:
import pandas as pd
tcga_path = "/home/nanni/Data/TCGA/Xena/tcga.tsv"
tcga = pd.read_csv(tcga_path, sep="\t")

In [None]:
aa = tcga[tcga.columns[7:]]

In [None]:
aa.mean(axis=1).shape
