In [20]:
import os
import pandas as pd

import pathlib

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from tensorflow import keras


In [21]:
absolute_path = '/Users/aygalic/OneDrive/polimi/Thesis/data/quant/'

# Exploring the Dataset

In [22]:
# getting entries ready
# each couple of entries correspond to one patient

entries = os.listdir(absolute_path)


In [23]:
entries_transcripts = [e for e in entries if "transcripts" in e ]

# Building a TensorFlow input pipeline

## We want to build a tf.Dataset from this

In [24]:
# from filename to tensor

def load_patient_data(filename):
  #specify read types for our data
  read_types = [ float()]
  # get a first sample to base everything of
  text = pathlib.Path(absolute_path + filename).read_text()
  lines = text.split('\n')[1:-1]
  features = tf.io.decode_csv(lines, record_defaults=read_types, field_delim = "\t", select_cols=[3]) 
  data = tf.convert_to_tensor(features)

  return data

# Feed it into a neural net


## First, we build a tf.dataset with all patients inside


In [25]:
# load the dataset into a list using the first pipeline

data = [load_patient_data(e) for e in entries_transcripts]

# remove artifacts
samples_to_keep = [1 if s.shape ==(1, 95309) else 0 for s in data]

train_ds = [sample for (sample, test) in  zip(data, samples_to_keep) if test]

In [26]:
#train_ds[0]


In [27]:
# turn it into a tf.data.Dataset object
x_train = tf.data.Dataset.from_tensor_slices(train_ds)



In [28]:
# input are the same as the target
zipped_boi = tf.data.Dataset.zip((x_train, x_train))


# Load the model

In [29]:
model = keras.models.load_model(
    #"../workfiles/placeholder_model/"
    '../workfiles/placeholder_normalized_model'
)


# compute the output

In [30]:
# Try on a single file and check the result.
compressed_0 = model.encoder.predict(train_ds[0])
#print(compressed_0[0])

reconstruct_0 = model.decoder.predict(compressed_0)
#print(reconstruct_0[0][0])




In [31]:
# now let's do all of it with list comprehension
compressed_dataframe = [model.encoder.predict(sample)[0] for sample in train_ds]




# now we have to match the corresponding file names with the compressed output.


In [32]:
filenames = [f for (f, test) in  zip(entries_transcripts, samples_to_keep) if test]

print(len(filenames))
print(len(compressed_dataframe))


4755
4755


In [33]:
df = pd.DataFrame(compressed_dataframe)
df["name"] = filenames


In [34]:
#df.to_csv("../workfiles/compressed_data.csv")
df.to_csv("../workfiles/compressed_data_after_norm.csv")

In [35]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,name
0,106915.000000,36735.476562,0.000000,70555.804688,0.000000,0.0,0.0,0.000000,69582.187500,0.000000,...,0.000000,126075.875000,0.00000,134439.406250,0.000000,54574.300781,0.0,8243.381836,86285.046875,PPMI-Phase2-IR2.41282.V02.0003241603.5104-SL-4...
1,68337.414062,1758.159668,0.000000,28325.824219,0.000000,0.0,0.0,0.000000,85526.460938,39944.664062,...,0.000000,127569.898438,0.00000,178500.531250,0.000000,20525.738281,0.0,22166.361328,20947.484375,PPMI-Phase2-IR2.40550.BL.PP0041-6989.5104-SL-4...
2,85332.750000,0.000000,30753.343750,25815.619141,0.000000,0.0,0.0,0.000000,84408.875000,74174.585938,...,0.000000,131209.734375,0.00000,184123.578125,0.000000,16748.314453,0.0,57272.988281,2193.991943,PPMI-Phase1-IR2.3185.V04.0003156143.5104-SL-13...
3,98174.828125,0.000000,1226.421875,53788.335938,0.000000,0.0,0.0,0.000000,98915.093750,34541.722656,...,0.000000,132407.015625,0.00000,159895.578125,0.000000,41268.554688,0.0,39718.863281,33166.980469,PPMI-Phase2-IR2.53925.BL.0003400475.5104-SL-39...
4,305149.250000,0.000000,329760.125000,0.000000,0.000000,0.0,0.0,0.000000,113064.468750,390266.562500,...,0.000000,104792.648438,256811.09375,284603.062500,0.000000,0.000000,0.0,323416.968750,10732.398438,PPMI-Phase2-IR2.3591.V08.0002308200.5104-SL-14...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4750,72300.109375,0.000000,0.000000,20260.179688,0.000000,0.0,0.0,0.000000,78559.101562,37625.035156,...,0.000000,124687.375000,0.00000,169126.468750,0.000000,7722.214355,0.0,21213.185547,29076.259766,PPMI-Phase2-IR2.3268.V02.0003446965.5104-SL-37...
4751,69630.289062,0.000000,47986.605469,0.000000,0.000000,0.0,0.0,0.000000,43745.472656,112767.890625,...,0.000000,136399.125000,0.00000,195873.171875,0.000000,0.000000,0.0,71291.640625,0.000000,PPMI-Phase1-IR2.4071.V04.0003136698.5104-SL-12...
4752,0.000000,510.572418,3843.121826,0.000000,0.000000,0.0,0.0,0.000000,19958.669922,49813.324219,...,0.000000,37333.238281,0.00000,63956.132812,0.000000,0.000000,0.0,11397.385742,0.000000,PPMI-Phase1-IR2.3795.V04.PP0041-8016.5104-SL-2...
4753,366613.843750,0.000000,483855.875000,0.000000,0.000000,0.0,0.0,0.000000,85009.460938,572557.250000,...,0.000000,110451.382812,382995.75000,360795.343750,0.000000,0.000000,0.0,454015.312500,0.000000,PPMI-Phase2-IR2.3417.BL.PP0016-1193.5104-SL-14...
