In [1]:
import os
import pandas as pd

import pathlib

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from tensorflow import keras


In [2]:
absolute_path = '/Users/aygalic/OneDrive/polimi/Thesis/data/quant/'

# Exploring the Dataset

In [3]:
# getting entries ready
# each couple of entries correspond to one patient

entries = os.listdir(absolute_path)


In [4]:
entries_transcripts = [e for e in entries if "transcripts" in e ]

# Building a TensorFlow input pipeline

## We want to build a tf.Dataset from this

In [5]:
# from filename to tensor

def load_patient_data(filename):
  #specify read types for our data
  read_types = [ float()]
  # get a first sample to base everything of
  text = pathlib.Path(absolute_path + filename).read_text()
  lines = text.split('\n')[1:-1]
  features = tf.io.decode_csv(lines, record_defaults=read_types, field_delim = "\t", select_cols=[3]) 
  data = tf.convert_to_tensor(features)

  return data

# Feed it into a neural net


## First, we build a tf.dataset with all patients inside


In [6]:
# load the dataset into a list using the first pipeline

data = [load_patient_data(e) for e in entries_transcripts]

# remove artifacts
samples_to_keep = [1 if s.shape ==(1, 95309) else 0 for s in data]

train_ds = [sample for (sample, test) in  zip(data, samples_to_keep) if test]

In [26]:
#train_ds[0]


In [7]:
# turn it into a tf.data.Dataset object
x_train = tf.data.Dataset.from_tensor_slices(train_ds)




In [28]:
# input are the same as the target
zipped_boi = tf.data.Dataset.zip((x_train, x_train))


# Load the model

In [8]:
model = keras.models.load_model(
    #"../workfiles/placeholder_model/"
    '../workfiles/vae_model'
)




In [9]:
model.compile()

# compute the output

In [16]:
batch_size = len(x_train)
print(batch_size)
dataset = x_train.batch(batch_size)

4755


In [21]:
_, __, compressed_0 = model.predict(dataset)






In [22]:
compressed_dataframe = compressed_0


In [10]:
# Try on a single file and check the result.
compressed_0 = model.encoder.predict(train_ds[0])
#print(compressed_0[0])

reconstruct_0 = model.decoder.predict(compressed_0)
#print(reconstruct_0[0][0])


AttributeError: 'Functional' object has no attribute 'encoder'

In [31]:
# now let's do all of it with list comprehension
compressed_dataframe = [model.encoder.predict(sample)[0] for sample in train_ds]




# now we have to match the corresponding file names with the compressed output.


In [23]:
filenames = [f for (f, test) in  zip(entries_transcripts, samples_to_keep) if test]

print(len(filenames))
print(len(compressed_dataframe))


4755
4755


In [24]:
df = pd.DataFrame(compressed_dataframe)
df["name"] = filenames


In [25]:
#df.to_csv("../workfiles/compressed_data.csv")
#df.to_csv("../workfiles/compressed_data_after_norm.csv")
df.to_csv("../workfiles/compressed_data_vae.csv")

In [26]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,name
0,4.127191,3.175187,-9.968276,-15.821952,-12.527405,16.210093,7.146112,-5.022869,10.154376,-14.437165,...,-8.108550,10.907180,-14.083506,-4.122663,-0.422250,12.000807,-2.276514,2.137713,2.266871,PPMI-Phase2-IR2.41282.V02.0003241603.5104-SL-4...
1,-7.736126,-5.643638,-8.231217,-5.554824,-0.256929,5.126742,4.897073,3.400235,-1.313438,-2.765753,...,1.743267,-0.184523,-4.066635,6.685735,-6.369447,7.656936,6.876622,-5.991385,-5.907690,PPMI-Phase2-IR2.40550.BL.PP0041-6989.5104-SL-4...
2,-3.409613,-9.298646,-7.035962,0.530154,0.039143,-0.124186,5.073489,0.529547,1.228880,-0.508073,...,0.509232,-0.060426,0.018115,6.335011,-10.145216,6.630260,3.348218,-4.260819,-3.646299,PPMI-Phase1-IR2.3185.V04.0003156143.5104-SL-13...
3,-2.727700,-7.761249,-1.510719,-6.445801,-2.868471,5.339830,-1.303087,-0.037059,2.415479,-6.748822,...,1.230753,4.988106,-5.374324,4.215699,-13.289690,4.201027,3.353152,-2.882957,-3.059753,PPMI-Phase2-IR2.53925.BL.0003400475.5104-SL-39...
4,-6.382964,-6.205849,-5.886121,-1.833633,2.927473,0.638814,2.134443,4.430057,-2.565719,-1.785221,...,1.974383,-0.346060,-1.361665,5.068215,-8.427131,8.383598,6.112727,-6.857926,-5.499253,PPMI-Phase2-IR2.3591.V08.0002308200.5104-SL-14...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4750,-6.676854,-6.810770,-5.346162,-5.400218,1.444289,4.121007,0.231013,4.302355,-2.108654,-6.833271,...,3.105545,3.223683,-6.970109,5.012974,-8.021526,6.762115,9.767344,-6.294116,-6.838994,PPMI-Phase2-IR2.3268.V02.0003446965.5104-SL-37...
4751,-4.243416,-5.709151,-5.449828,-2.941432,1.686005,1.993744,3.262237,1.154197,-2.289668,-2.952303,...,0.719546,-0.504438,-2.429273,3.659775,-8.573174,5.277709,5.578309,-6.763084,-8.200142,PPMI-Phase1-IR2.4071.V04.0003136698.5104-SL-12...
4752,-1.024085,-8.089607,-6.658557,-4.214872,-5.086173,5.134083,4.109097,-0.800557,3.655284,-4.754408,...,-4.060655,2.904375,-4.909195,4.574725,-9.379907,6.898866,1.268176,-2.177377,-0.182489,PPMI-Phase1-IR2.3795.V04.PP0041-8016.5104-SL-2...
4753,-6.722237,-1.671198,-7.792471,7.399540,14.055175,-12.944132,4.102320,12.572176,-12.605488,3.546062,...,14.411788,-1.474564,2.778646,0.936809,-0.112810,8.382540,9.435561,-5.935243,-4.591788,PPMI-Phase2-IR2.3417.BL.PP0016-1193.5104-SL-14...
