In [1]:
import os
import pandas as pd

import pathlib

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from tensorflow import keras
import scipy


In [2]:
absolute_path = '/Users/aygalic/OneDrive/polimi/Thesis/data/quant/'

# Exploring the Dataset

In [3]:
# getting entries ready
# each couple of entries correspond to one patient

entries = os.listdir(absolute_path)


In [4]:
entries_transcripts = [e for e in entries if "transcripts" in e ]

# Building a TensorFlow input pipeline

## We want to build a tf.Dataset from this

In [5]:
# from filename to tensor

def load_patient_data(filename):
  #specify read types for our data
  read_types = [ float()]
  # get a first sample to base everything of
  text = pathlib.Path(absolute_path + filename).read_text()
  lines = text.split('\n')[1:-1]
  features = tf.io.decode_csv(lines, record_defaults=read_types, field_delim = "\t", select_cols=[3]) 
  data = tf.convert_to_tensor(features)

  return data

# Feed it into a neural net


## First, we build a tf.dataset with all patients inside


In [6]:
# load the dataset into a list using the first pipeline

data = [load_patient_data(e) for e in entries_transcripts]

# remove artifacts
samples_to_keep = [1 if s.shape ==(1, 95309) else 0 for s in data]

train_ds = [sample for (sample, test) in  zip(data, samples_to_keep) if test]

In [7]:
train_ds = [e[0] for e in train_ds] 
# if feature selection is applied
data_array = np.array(train_ds)
MAD = scipy.stats.median_abs_deviation(data_array)
gene_selected = [True if val > 10 else False for val in MAD]
train_ds = data_array[:,gene_selected]


In [26]:
#train_ds[0]


In [11]:
# turn it into a tf.data.Dataset object
x_train = tf.data.Dataset.from_tensor_slices(train_ds)




In [28]:
# input are the same as the target
zipped_boi = tf.data.Dataset.zip((x_train, x_train))


# Load the model

In [8]:
model = keras.models.load_model(
    #"../workfiles/placeholder_model/"
    '../workfiles/vae_model'
)




In [9]:
model.compile()

# compute the output

In [60]:
# for VAE model

In [12]:
batch_size = len(x_train)
print(batch_size)
dataset = x_train.batch(batch_size)

4755


In [13]:
_, __, compressed_dataframe = model.predict(dataset)







2023-07-11 18:49:46.160734: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [33]:
# for legacy model

In [10]:
# Try on a single file and check the result.
compressed_0 = model.encoder.predict(train_ds[0])
#print(compressed_0[0])

reconstruct_0 = model.decoder.predict(compressed_0)
#print(reconstruct_0[0][0])


AttributeError: 'Functional' object has no attribute 'encoder'

In [31]:
# now let's do all of it with list comprehension
compressed_dataframe = [model.encoder.predict(sample)[0] for sample in train_ds]




# now we have to match the corresponding file names with the compressed output.


In [14]:
filenames = [f for (f, test) in  zip(entries_transcripts, samples_to_keep) if test]

print(len(filenames))
print(len(compressed_dataframe))


4755
4755


In [15]:
df = pd.DataFrame(compressed_dataframe)
df["name"] = filenames


In [16]:
#df.to_csv("../workfiles/compressed_data.csv")
#df.to_csv("../workfiles/compressed_data_after_norm.csv")
df.to_csv("../workfiles/compressed_data_vae.csv")

In [17]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,name
0,-2.332749,-1.762823,0.654371,-0.444359,-0.288036,-1.143666,1.247488,2.111508,-0.042297,0.929171,...,-1.649503,1.359224,3.463287,-0.612798,2.444043,-2.577483,0.527411,1.856344,-0.260353,PPMI-Phase2-IR2.41282.V02.0003241603.5104-SL-4...
1,1.507675,1.232331,-2.862547,0.990147,-0.085075,0.481430,0.079311,-0.270525,-2.866499,-0.033006,...,-0.260458,-1.288054,2.436129,2.963830,0.794879,-1.380877,4.063878,-0.765378,3.946345,PPMI-Phase2-IR2.40550.BL.PP0041-6989.5104-SL-4...
2,-1.210864,-2.244260,1.659086,3.587234,0.541101,-0.343419,-2.533396,-1.337316,-0.519141,-0.532301,...,-1.294511,-0.062408,1.810309,0.203335,-0.448900,-3.131723,1.741513,-0.646440,2.597125,PPMI-Phase1-IR2.3185.V04.0003156143.5104-SL-13...
3,-0.930465,-2.867608,-0.480582,0.598467,2.305323,1.309312,-0.893287,-3.116277,2.714720,-2.047013,...,-0.251134,0.976879,1.797512,-0.237974,1.459288,-1.417072,-0.605953,1.215908,-2.189872,PPMI-Phase2-IR2.53925.BL.0003400475.5104-SL-39...
4,-1.370378,-3.200115,-1.831520,2.897226,1.468197,0.623682,-2.401962,-1.607919,0.948924,-0.721244,...,-3.083826,1.788406,1.946097,0.131504,0.364571,-2.831831,0.171661,0.849641,0.634312,PPMI-Phase2-IR2.3591.V08.0002308200.5104-SL-14...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4750,0.436185,-0.608800,-0.987771,-0.874271,-0.524247,-1.432339,0.562695,-1.813736,-2.430324,0.893019,...,-2.079570,1.442292,1.998283,2.253543,-0.533275,-2.459770,3.550354,0.790399,1.677189,PPMI-Phase2-IR2.3268.V02.0003446965.5104-SL-37...
4751,1.531181,-1.300232,-2.117425,1.489394,0.522082,1.185367,0.032476,-0.845356,-2.324866,-1.597912,...,-1.891961,-2.051257,1.399780,2.688053,2.350878,-0.734543,2.067808,0.795357,0.935093,PPMI-Phase1-IR2.4071.V04.0003136698.5104-SL-12...
4752,-0.466687,-1.211020,-1.222982,-0.402275,-1.280251,-1.848667,0.711939,-0.112500,-1.961020,0.987657,...,-1.633183,-0.361570,-0.413067,0.421136,0.429011,-0.272279,1.265908,-0.519313,1.472940,PPMI-Phase1-IR2.3795.V04.PP0041-8016.5104-SL-2...
4753,0.409071,-2.449874,-2.172486,4.528638,0.109079,0.784592,0.145765,-4.017438,-2.595772,-1.008298,...,-3.962791,0.830117,1.072586,5.460805,1.230626,-4.398604,1.244781,0.695219,-1.721210,PPMI-Phase2-IR2.3417.BL.PP0016-1193.5104-SL-14...
