In [1]:
import os
import pandas as pd

import pathlib

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

In [2]:
# Directory holding the per-patient quantification files.
# It can be overridden with the QUANT_DATA_DIR environment variable so the
# notebook is not tied to one machine; the original location is the fallback.
# os.path.join(..., '') guarantees the trailing separator that the
# `absolute_path + filename` concatenation further down relies on.
absolute_path = os.path.join(
    os.environ.get('QUANT_DATA_DIR', '/Users/aygalic/OneDrive/polimi/Thesis/data/quant/'),
    '',
)

# Exploring the Dataset

In [3]:
# getting entries ready
# each pair of entries corresponds to one patient

# os.listdir returns names in arbitrary order; sorting makes the order of the
# samples (and of everything derived from them) reproducible across runs
entries = sorted(os.listdir(absolute_path))


In [4]:
# keep only the directory entries whose name contains "transcripts"
entries_transcripts = list(filter(lambda name: "transcripts" in name, entries))

# Building a TensorFlow input pipeline

## We want to build a `tf.data.Dataset` from these files

In [5]:
# from filename to tensor

def load_patient_data(filename):
  """Load one column of a tab-separated patient file as a tensor.

  The file is looked up inside ``absolute_path``. Its first line is skipped
  (presumably a header row -- confirm against the data files) and only the
  column at index 3 of every remaining line is parsed as a float.

  Args:
    filename: name of the file, relative to ``absolute_path``.

  Returns:
    A tensor with a leading axis of size 1 followed by one value per parsed
    line, i.e. shape (1, n_lines).
  """
  # a single float default because a single column is selected below
  read_types = [float()]
  # join with pathlib so a missing trailing slash on absolute_path is harmless
  text = (pathlib.Path(absolute_path) / filename).read_text()
  # splitlines() handles files with or without a trailing newline (and "\r\n"
  # endings). The former split('\n')[1:-1] silently dropped the last record
  # whenever the file did not end with a newline.
  lines = [line for line in text.splitlines()[1:] if line]
  features = tf.io.decode_csv(lines, record_defaults=read_types, field_delim="\t", select_cols=[3])
  # decode_csv returns a list with one tensor per selected column; converting
  # that one-element list yields the leading axis of size 1
  data = tf.convert_to_tensor(features)

  return data

# Feed it into a neural network?

In [6]:
from tensorflow import keras
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model

# now we build a tf.data.Dataset containing all patients


In [82]:
# load every transcripts file through the first pipeline and keep only the
# samples whose tensor has the expected (1, 95309) shape
train_ds = list(
    filter(
        lambda tensor: tensor.shape == (1, 95309),
        map(load_patient_data, entries_transcripts),
    )
)


In [83]:
# display the first loaded sample to check its shape and dtype
train_ds[0]


<tf.Tensor: shape=(1, 95309), dtype=float32, numpy=
array([[ 0.     ,  6.55631,  0.     , ..., 21.601  , 14.5302 , 43.2891 ]],
      dtype=float32)>

In [84]:
# turn it into a tf.data.Dataset object
# (from_tensor_slices slices along the first axis, so each dataset element
# corresponds to one entry of train_ds)
x_train = tf.data.Dataset.from_tensor_slices(train_ds)



In [85]:
# inputs are the same as the targets: every sample is paired with itself,
# giving (input, target) tuples
zipped_boi = tf.data.Dataset.zip((x_train, x_train))


In [86]:
#(iter(zipped_boi))

# Load the model

In [94]:
# restore the previously saved model from the local work directory
model = keras.models.load_model("./workfiles/placeholder_model/")


# Compute the output

In [95]:
# Try on a single file and check the result.
# encode the first sample (shape (1, 95309)) and print row 0 of the encoding
compressed_0 = model.encoder.predict(train_ds[0])
print(compressed_0[0])

# decode the encoding again and print element [0][0] of the reconstruction
reconstruct_0 = model.decoder.predict(compressed_0)
print(reconstruct_0[0][0])


[-1.00000000e+00 -1.00000000e+00  1.33799225e+02 -1.00000000e+00
 -1.00000000e+00 -1.00000000e+00  1.25959971e+04 -1.00000000e+00
 -1.00000000e+00 -1.00000000e+00  3.76520410e+03 -1.00000000e+00
  2.21188965e+04 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00
  1.19957910e+04  7.89495508e+03 -1.00000000e+00 -1.00000000e+00
  2.25724004e+04 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00
  1.36365596e+04  1.66432598e+04 -1.00000000e+00 -1.00000000e+00
  1.13486689e+04  1.08961455e+04 -1.00000000e+00 -1.00000000e+00
 -1.00000000e+00  1.61548965e+04  1.73471641e+04 -1.00000000e+00
  3.35339966e+03  1.53395830e+04 -1.00000000e+00 -1.00000000e+00
 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00  1.02403467e+04
 -1.00000000e+00  4.91815723e+03 -1.00000000e+00  3.29188745e+03
 -1.00000000e+00 -1.00000000e+00  9.81028320e+03 -1.00000000e+00
  1.58717764e+04 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00
 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00  2.34491582e+04
  1.00739307e+04 -1.00000

In [96]:
# now let's do all of it: encode each sample separately and keep row 0 of
# every prediction (the result is a plain list, one entry per sample)
compressed_dataframe = list(
    map(lambda sample: model.encoder.predict(sample)[0], train_ds)
)




In [30]:
# now we have to match the corresponding file names with the compressed output.


95309

In [90]:
# NOTE(review): this reads every transcripts file a second time; the unfiltered
# list is used afterwards to check each sample's shape against its file name
train_ds_ = list(map(load_patient_data, entries_transcripts))



In [91]:
# 1/0 flag per loaded sample: 1 when it has the expected (1, 95309) shape
samples_to_keep = list(
    map(lambda s: 1 if s.shape == (1, 95309) else 0, train_ds_)
)
# file names of the flagged samples, in the same order as the samples
filenames = [
    name
    for name, keep in zip(entries_transcripts, samples_to_keep)
    if keep
]

# both counts must agree for the names to line up with the encodings
print(len(filenames), len(compressed_dataframe), sep="\n")

4755
4755


In [97]:
# one row per encoded sample, plus a "name" column with the source file name
df = pd.DataFrame(compressed_dataframe).assign(name=filenames)


In [98]:
# write the encodings together with their file names to the work directory
df.to_csv("./workfiles/compressed_data.csv")