In [20]:
import os
import pandas as pd

import pathlib

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from tensorflow import keras


In [21]:
absolute_path = '/Users/aygalic/OneDrive/polimi/Thesis/data/quant/'

# Exploring the Dataset

In [22]:
# getting entries ready
# each couple of entries correspond to one patient

entries = os.listdir(absolute_path)


In [23]:
entries_transcripts = [e for e in entries if "transcripts" in e ]

# Building a TensorFlow input pipeline

## We want to build a tf.Dataset from this

In [24]:
# from filename to tensor

def load_patient_data(filename):
  #specify read types for our data
  read_types = [ float()]
  # get a first sample to base everything of
  text = pathlib.Path(absolute_path + filename).read_text()
  lines = text.split('\n')[1:-1]
  features = tf.io.decode_csv(lines, record_defaults=read_types, field_delim = "\t", select_cols=[3]) 
  data = tf.convert_to_tensor(features)

  return data

# Feed it into a neural net


## First, we build a tf.dataset with all patients inside


In [25]:
# load the dataset into a list using the first pipeline

data = [load_patient_data(e) for e in entries_transcripts]

# remove artifacts
samples_to_keep = [1 if s.shape ==(1, 95309) else 0 for s in data]

train_ds = [sample for (sample, test) in  zip(data, samples_to_keep) if test]

In [26]:
#train_ds[0]


In [27]:
# turn it into a tf.data.Dataset object
x_train = tf.data.Dataset.from_tensor_slices(train_ds)



In [28]:
# input are the same as the target
zipped_boi = tf.data.Dataset.zip((x_train, x_train))


# Load the model

In [29]:
model = keras.models.load_model(
    #"../workfiles/placeholder_model/"
    '../workfiles/placeholder_normalized_model'
)


# compute the output

In [30]:
# Try on a single file and check the result.
compressed_0 = model.encoder.predict(train_ds[0])
#print(compressed_0[0])

reconstruct_0 = model.decoder.predict(compressed_0)
#print(reconstruct_0[0][0])




In [31]:
# now let's do all of it with list comprehension
compressed_dataframe = [model.encoder.predict(sample)[0] for sample in train_ds]




# now we have to match the corresponding file names with the compressed output.


In [32]:
filenames = [f for (f, test) in  zip(entries_transcripts, samples_to_keep) if test]

print(len(filenames))
print(len(compressed_dataframe))


4755
4755


In [33]:
df = pd.DataFrame(compressed_dataframe)
df["name"] = filenames


In [34]:
#df.to_csv("../workfiles/compressed_data.csv")
df.to_csv("../workfiles/compressed_data_after_norm.csv")

In [18]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,name
0,0.0,13090.774414,0.0,0.0,0.0,7381.190430,0.0,2009.523682,0.0,0.0,...,7418.710938,1.001898e+04,0.0,0.0,4571.480957,10071.144531,9772.011719,0.0,0.0,PPMI-Phase2-IR2.41282.V02.0003241603.5104-SL-4...
1,0.0,13181.596680,0.0,0.0,0.0,2341.830811,0.0,11991.915039,0.0,0.0,...,0.000000,3.552745e+03,0.0,0.0,3197.773438,17436.818359,2425.214355,0.0,0.0,PPMI-Phase2-IR2.40550.BL.PP0041-6989.5104-SL-4...
2,0.0,9473.352539,0.0,0.0,0.0,3045.830078,0.0,8842.532227,0.0,0.0,...,819.456604,0.000000e+00,0.0,0.0,9255.261719,8224.447266,5810.884277,0.0,0.0,PPMI-Phase1-IR2.3185.V04.0003156143.5104-SL-13...
3,0.0,6787.729004,0.0,0.0,0.0,0.000000,0.0,4192.905273,0.0,0.0,...,1662.432739,8.487552e+03,0.0,0.0,8313.604492,9581.026367,1774.295044,0.0,0.0,PPMI-Phase2-IR2.53925.BL.0003400475.5104-SL-39...
4,0.0,16465.246094,0.0,0.0,0.0,6185.646973,0.0,6233.983887,0.0,0.0,...,1958.636841,9.179116e+03,0.0,0.0,11922.607422,15073.864258,8774.802734,0.0,0.0,PPMI-Phase2-IR2.3591.V08.0002308200.5104-SL-14...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4750,0.0,17071.208984,0.0,0.0,0.0,1905.002197,0.0,9826.136719,0.0,0.0,...,0.000000,6.729394e+03,0.0,0.0,5678.772949,16641.548828,6336.591309,0.0,0.0,PPMI-Phase2-IR2.3268.V02.0003446965.5104-SL-37...
4751,0.0,15646.806641,0.0,0.0,0.0,5070.703613,0.0,12093.124023,0.0,0.0,...,1012.705872,6.716820e-20,0.0,0.0,16014.449219,17596.113281,0.000000,0.0,0.0,PPMI-Phase1-IR2.4071.V04.0003136698.5104-SL-12...
4752,0.0,12750.369141,0.0,0.0,0.0,3711.335693,0.0,7862.710938,0.0,0.0,...,1286.988647,0.000000e+00,0.0,0.0,12664.806641,14779.714844,0.000000,0.0,0.0,PPMI-Phase1-IR2.3795.V04.PP0041-8016.5104-SL-2...
4753,0.0,20437.814453,0.0,0.0,0.0,13467.090820,0.0,8325.549805,0.0,0.0,...,0.000000,1.020704e+04,0.0,0.0,19693.427734,29995.894531,8476.315430,0.0,0.0,PPMI-Phase2-IR2.3417.BL.PP0016-1193.5104-SL-14...
