In [1]:
import os
import pandas as pd

import pathlib

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from tensorflow import keras


In [2]:
absolute_path = '/Users/aygalic/OneDrive/polimi/Thesis/data/quant/'

# Exploring the Dataset

In [3]:
# getting entries ready
# each couple of entries correspond to one patient

entries = os.listdir(absolute_path)


In [4]:
entries_transcripts = [e for e in entries if "transcripts" in e ]

# Building a TensorFlow input pipeline

## We want to build a tf.Dataset from this

In [5]:
# from filename to tensor

def load_patient_data(filename):
  #specify read types for our data
  read_types = [ float()]
  # get a first sample to base everything of
  text = pathlib.Path(absolute_path + filename).read_text()
  lines = text.split('\n')[1:-1]
  features = tf.io.decode_csv(lines, record_defaults=read_types, field_delim = "\t", select_cols=[3]) 
  data = tf.convert_to_tensor(features)

  return data

# Feed it into a neural net


## First, we build a tf.dataset with all patients inside


In [6]:
# load the dataset into a list using the first pipeline

data = [load_patient_data(e) for e in entries_transcripts]

# remove artifacts
samples_to_keep = [1 if s.shape ==(1, 95309) else 0 for s in data]

train_ds = [sample for (sample, test) in  zip(data, samples_to_keep) if test]

In [26]:
#train_ds[0]


In [7]:
# turn it into a tf.data.Dataset object
x_train = tf.data.Dataset.from_tensor_slices(train_ds)




In [28]:
# input are the same as the target
zipped_boi = tf.data.Dataset.zip((x_train, x_train))


# Load the model

In [37]:
model = keras.models.load_model(
    #"../workfiles/placeholder_model/"
    '../workfiles/vae_model'
)




In [38]:
model.compile()

# compute the output

In [39]:
# for VAE model

In [40]:
batch_size = len(x_train)
print(batch_size)
dataset = x_train.batch(batch_size)

4755


In [41]:
_, __, compressed_dataframe = model.predict(dataset)







In [33]:
# for legacy model

In [10]:
# Try on a single file and check the result.
compressed_0 = model.encoder.predict(train_ds[0])
#print(compressed_0[0])

reconstruct_0 = model.decoder.predict(compressed_0)
#print(reconstruct_0[0][0])


AttributeError: 'Functional' object has no attribute 'encoder'

In [31]:
# now let's do all of it with list comprehension
compressed_dataframe = [model.encoder.predict(sample)[0] for sample in train_ds]




# now we have to match the corresponding file names with the compressed output.


In [42]:
filenames = [f for (f, test) in  zip(entries_transcripts, samples_to_keep) if test]

print(len(filenames))
print(len(compressed_dataframe))


4755
4755


In [43]:
df = pd.DataFrame(compressed_dataframe)
df["name"] = filenames


In [44]:
#df.to_csv("../workfiles/compressed_data.csv")
#df.to_csv("../workfiles/compressed_data_after_norm.csv")
df.to_csv("../workfiles/compressed_data_vae.csv")

In [45]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,name
0,-2.502696,-5.952868,5.737478,7.125837,-4.211463,-6.525736,2.707397,-5.678664,3.462354,-1.478496,...,-10.250145,2.262773,-0.286852,4.282173,-5.322594,-1.240745,-3.632484,10.261468,-1.002420,PPMI-Phase2-IR2.41282.V02.0003241603.5104-SL-4...
1,6.241638,-2.794071,5.710476,-3.624309,-3.751702,-3.992031,0.363205,-2.325102,4.638640,-3.714130,...,-2.231129,0.611971,-4.944630,-5.776441,3.090179,7.156087,-4.655008,0.005338,-7.683589,PPMI-Phase2-IR2.40550.BL.PP0041-6989.5104-SL-4...
2,3.022236,-0.339438,2.932422,-3.542511,-4.090177,-0.250496,-0.868559,0.365593,2.131110,-0.291701,...,-2.104655,-0.153032,-3.411074,-1.128819,0.756577,4.373021,-1.743427,-0.104331,-4.251028,PPMI-Phase1-IR2.3185.V04.0003156143.5104-SL-13...
3,4.832498,1.695562,6.491729,0.405430,-3.300136,-0.633135,3.115772,1.830390,-0.031552,0.323276,...,-2.765874,-2.691363,-1.271023,0.157898,-0.914021,4.492011,0.668535,3.466612,-5.573043,PPMI-Phase2-IR2.53925.BL.0003400475.5104-SL-39...
4,3.098778,-0.224330,3.794454,-0.746743,-5.280137,-1.612487,2.477489,2.033479,1.161827,-1.597406,...,-3.753467,-1.303052,-4.179739,-3.089771,-1.716404,3.929124,-1.356016,1.966428,-5.637395,PPMI-Phase2-IR2.3591.V08.0002308200.5104-SL-14...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4750,6.962807,-1.969601,5.483219,-0.434171,-3.427397,-2.452698,2.079362,-2.674683,3.545458,-3.801524,...,-1.172333,2.758611,-4.109983,-6.713953,-0.086517,6.115964,-2.911243,1.809898,-6.563694,PPMI-Phase2-IR2.3268.V02.0003446965.5104-SL-37...
4751,3.340846,-2.471972,0.644732,-6.041498,-5.439080,-0.908830,-4.768017,-2.181172,2.202845,-1.788078,...,0.112831,2.189338,-3.847153,-5.830025,1.288395,3.422444,-1.485064,-0.487780,-2.681257,PPMI-Phase1-IR2.4071.V04.0003136698.5104-SL-12...
4752,1.984788,-4.331240,0.928475,-2.448471,-2.956162,-2.214352,-5.185519,-4.616153,3.679854,-2.118062,...,-1.840785,5.406852,-1.422889,-2.709921,-1.250070,1.831775,-2.101094,2.047422,-2.683778,PPMI-Phase1-IR2.3795.V04.PP0041-8016.5104-SL-2...
4753,-0.241122,-3.716935,-2.949133,-0.683076,-8.543454,-4.818295,7.422138,3.440265,3.255480,-1.343344,...,-5.415650,-4.477695,-2.092573,-7.433320,-4.441499,-1.109432,-2.229609,-0.179051,2.294057,PPMI-Phase2-IR2.3417.BL.PP0016-1193.5104-SL-14...
