In [18]:
import os
import pandas as pd

import pathlib

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from tensorflow import keras
import scipy


In [19]:
absolute_path = '/Users/aygalic/OneDrive/polimi/Thesis/data/quant/'

# Exploring the Dataset

In [3]:
# getting entries ready
# each couple of entries correspond to one patient

entries = os.listdir(absolute_path)


In [4]:
entries_transcripts = [e for e in entries if "transcripts" in e ]

# Building a TensorFlow input pipeline

## We want to build a tf.Dataset from this

In [5]:
# from filename to tensor

def load_patient_data(filename):
  #specify read types for our data
  read_types = [ float()]
  # get a first sample to base everything of
  text = pathlib.Path(absolute_path + filename).read_text()
  lines = text.split('\n')[1:-1]
  features = tf.io.decode_csv(lines, record_defaults=read_types, field_delim = "\t", select_cols=[3]) 
  data = tf.convert_to_tensor(features)[0]

  return data

# Feed it into a neural net


## First, we build a tf.dataset with all patients inside


In [32]:
# load the dataset into a list using the first pipeline
data = [load_patient_data(e) for e in entries_transcripts]

# remove artifacts
samples_to_keep = [1 if s.shape ==(95309) else 0 for s in data]

train_ds = [sample for (sample, test) in  zip(data, samples_to_keep) if test]

In [33]:
# if feature selection is applied
data_array = np.array(train_ds)
MAD = scipy.stats.median_abs_deviation(data_array)
gene_selected = [True if val > 5 else False for val in MAD]
train_ds = data_array[:,gene_selected]


In [26]:
#train_ds[0]


In [34]:
# turn it into a tf.data.Dataset object
x_train = tf.data.Dataset.from_tensor_slices(train_ds)




In [28]:
# input are the same as the target
zipped_boi = tf.data.Dataset.zip((x_train, x_train))


# Load the model

In [35]:
model = keras.models.load_model(
    #"../workfiles/placeholder_model/"
    #'../workfiles/vae_model'
    '../workfiles/vae_model_complex'
)




In [36]:
model.compile()

# compute the output

In [37]:
# for VAE model

In [38]:
batch_size = len(x_train)
print(batch_size)
dataset = x_train.batch(batch_size)

4755


In [39]:
_, __, compressed_dataframe = model.predict(dataset)







In [25]:
# for legacy model

In [10]:
# Try on a single file and check the result.
compressed_0 = model.encoder.predict(train_ds[0])
#print(compressed_0[0])

reconstruct_0 = model.decoder.predict(compressed_0)
#print(reconstruct_0[0][0])


AttributeError: 'Functional' object has no attribute 'encoder'

In [31]:
# now let's do all of it with list comprehension
compressed_dataframe = [model.encoder.predict(sample)[0] for sample in train_ds]




# now we have to match the corresponding file names with the compressed output.


In [40]:
filenames = [f for (f, test) in  zip(entries_transcripts, samples_to_keep) if test]

print(len(filenames))
print(len(compressed_dataframe))


4755
4755


In [41]:
df = pd.DataFrame(compressed_dataframe)
df["name"] = filenames


In [42]:
#df.to_csv("../workfiles/compressed_data.csv")
#df.to_csv("../workfiles/compressed_data_after_norm.csv")
df.to_csv("../workfiles/compressed_data_vae.csv")

In [43]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,name
0,2.008957,-2.103850,1.031219,-0.002547,0.303390,-0.074005,-2.691605,0.272512,2.014237,-0.122650,...,3.259330,0.016247,3.016416,-6.285974,1.759600,2.909898,-0.697876,2.717339,-4.229718,PPMI-Phase2-IR2.41282.V02.0003241603.5104-SL-4...
1,0.430460,0.335234,0.280833,-3.243691,-0.126746,3.010376,0.357755,3.229836,0.765347,0.689616,...,5.300403,1.775766,-0.336772,-4.456578,-0.179578,0.992722,-3.639213,2.247850,0.879471,PPMI-Phase2-IR2.40550.BL.PP0041-6989.5104-SL-4...
2,-0.121533,-0.018711,-0.892627,-1.851459,2.613491,-1.146759,-3.040816,0.353548,2.582774,-1.023607,...,-0.659998,1.157161,0.647527,-1.358512,-0.361639,1.854530,1.601144,-1.117731,1.812193,PPMI-Phase1-IR2.3185.V04.0003156143.5104-SL-13...
3,-0.379559,-1.976103,-1.439888,-0.504539,1.047359,-5.481050,-0.254342,1.363258,3.105594,2.608235,...,1.662611,0.007746,-0.730571,-2.192609,-0.072969,1.467245,1.381331,2.760749,0.118371,PPMI-Phase2-IR2.53925.BL.0003400475.5104-SL-39...
4,-2.092091,1.671491,-3.714539,-2.097041,2.669677,-1.594026,1.130771,3.708401,5.216286,-0.196805,...,-0.885690,0.092654,-0.658619,-4.478239,-1.992624,0.975743,2.654303,-0.944321,2.099800,PPMI-Phase2-IR2.3591.V08.0002308200.5104-SL-14...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4750,-0.797460,1.210723,-0.235993,-2.898754,0.531055,3.440106,0.766048,3.076600,0.940334,-1.188090,...,0.243499,0.824277,0.385428,-4.136965,-0.767662,0.582797,-3.985946,1.247227,1.136902,PPMI-Phase2-IR2.3268.V02.0003446965.5104-SL-37...
4751,-1.512246,0.349483,0.960484,-3.024806,-0.525299,1.649851,0.414889,1.220319,2.109547,0.101081,...,-2.026013,1.897940,-0.887179,-0.278582,-0.405202,1.978832,-0.084356,-1.808054,0.639135,PPMI-Phase1-IR2.4071.V04.0003136698.5104-SL-12...
4752,-1.203377,1.280159,1.967300,-0.906377,-0.127631,4.584737,-3.160660,1.061597,3.202095,-3.155284,...,-0.746703,0.468841,-2.330758,1.688442,-0.160450,2.981339,-2.887144,-2.028784,-0.235662,PPMI-Phase1-IR2.3795.V04.PP0041-8016.5104-SL-2...
4753,-4.599179,3.687202,-2.091404,-2.719379,1.727001,-0.016259,2.028495,1.773733,3.662157,-1.229200,...,-0.491115,0.082279,-6.585121,-3.325825,-2.701751,2.159636,5.432885,-7.071024,3.431915,PPMI-Phase2-IR2.3417.BL.PP0016-1193.5104-SL-14...
