In [None]:
import os
import sys
import csv
import h5py
import argparse
import numpy as np
from collections import Counter

# 3rd party imports
from sklearn import manifold
from tensorflow.keras.models import Model
from keras import backend as K
from IPython.display import Image

# ML4CVD Imports
from ml4cvd.defines import TENSOR_EXT
from ml4cvd.arguments import parse_args
from ml4cvd.recipes import train_shallow_model, train_multimodal_multitask, test_multimodal_multitask
from ml4cvd.models import make_multimodal_multitask_model, train_model_from_generators, make_hidden_layer_model
from ml4cvd.tensor_generators import TensorGenerator, big_batch_from_minibatch_generator, test_train_valid_tensor_generators


%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter

In [None]:
# Build two lookup tables from the carrier TSV:
#   gene2samples:  gene name (column 0) -> list of tensor file paths
#   samples2genes: tensor file path     -> gene name
# Column 6 holds a ';'-separated list of sample ids; each id becomes a tensor
# path by wrapping it in tensor_path_prefix and TENSOR_EXT.
gene2samples = {}
samples2genes = {}
tensor_path_prefix = '/mnt/disks/pix-size-tensors/2019-04-29/'
# newline='' is what the csv module asks for when it is handed a file object.
with open('/home/sam/genes_mri_ecg_carriers_sampleid.tsv', 'r', newline='') as my_tsv:
    carrier_rows = list(csv.reader(my_tsv, delimiter='\t'))
for row in carrier_rows[1:]:  # the first row is skipped (presumably a header)
    # Empty tokens (an empty column or a trailing ';') are dropped so they do not
    # turn into a bogus path made of only the prefix and the extension.
    sample_list = [tensor_path_prefix + t + TENSOR_EXT for t in row[6].split(';') if t]
    gene2samples[row[0]] = sample_list
    for s in sample_list:
        # One gene per sample: a sample listed under several genes keeps only
        # the last gene seen.
        samples2genes[s] = row[0]
# Report how many carrier tensors were found for each gene.
for gene, paths in gene2samples.items():
    print(gene, len(paths))
# Genes that get their own batch and their own label column further down.
gene_labels = ['TTN', 'AKAP9', 'DSC2', 'NEBL', 'RYR2', 'TRDN']


In [None]:
# Dump every (sample id, gene) pair to a tab-separated file. The sample id is the
# tensor file's base name with its extension removed.
# newline='' stops the csv writer's own line terminator from being translated a
# second time by the text layer (which shows up as blank rows on some platforms).
with open('/home/sam/start_with_me_lof.tsv', mode='w', newline='') as sample_lof_file:
    sample_lof_writer = csv.writer(sample_lof_file, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    sample_lof_writer.writerow(['sample_id', 'gene'])
    for g, samples in gene2samples.items():
        for sample in samples:
            sample_id = os.path.splitext(os.path.basename(sample))[0]
            sample_lof_writer.writerow([sample_id, g])

In [None]:
# Show the LMNA carriers loaded so far (raises KeyError if the TSV had no LMNA row).
print(gene2samples['LMNA'])

# Replace the TTN carrier list with the ids from the first column of a separate
# CSV. Entries already in samples2genes are left in place; only the new TTN
# paths are added or overwritten.
gene2samples['TTN'] = []
with open('/home/sam/tranche01_hc_ttntv.csv', 'r') as my_tsv:
    lol = list(csv.reader(my_tsv, delimiter=','))
ttn_paths = [tensor_path_prefix + ttn_row[0] + TENSOR_EXT for ttn_row in lol[1:]]
gene2samples['TTN'].extend(ttn_paths)
samples2genes.update({ttn_path: 'TTN' for ttn_path in ttn_paths})

# Carrier counts per gene after the override, followed by the new TTN paths.
for k, carrier_paths in gene2samples.items():
    print(k, len(carrier_paths))
print(gene2samples['TTN'])

In [None]:
# Fake a command line so the ml4cvd argument parser can be driven from the
# notebook; parse_args() takes no arguments here, so it presumably reads sys.argv.
sys.argv = [
    'train',
    '--tensors', '/mnt/disks/pix-size-tensors/2019-04-29/',
    '--input_tensors', 'mri_systole_diastole_weighted', 'mri_pixel_width', 'mri_pixel_height',
    '--output_tensors', 'mri_systole_diastole_segmented_weighted', 'end_systole_volume', 'end_diastole_volume',
    'ejection_fraction', 'lv_mass',
    '--batch_size', '4',
    '--pool_z', '1',
    '--epochs', '2',
    '--learning_rate', '0.001',
    '--u_connect',
    '--training_steps', '128',
    '--validation_steps', '6',
    '--test_steps', '6',
    '--model_file', '/home/sam/ml/trained_models/mri_systole_diastole_pixsize_unet_to_seg_volumes_mass_ef/mri_systole_diastole_pixsize_unet_to_seg_volumes_mass_ef.hd5',
    '--id', 'mri_systole_diastole_unet_lv_mass_diseases',
]
args = parse_args()

# Train / validation / test generators over the tensor directory.
generate_train, generate_valid, generate_test = test_train_valid_tensor_generators(
    args.tensor_maps_in,
    args.tensor_maps_out,
    args.tensors,
    args.batch_size,
    args.valid_ratio,
    args.test_ratio,
    args.test_modulo,
    args.balance_csvs,
)

# Every parsed argument is forwarded to the model factory as a keyword argument.
model = make_multimodal_multitask_model(**vars(args))

# Pull one large test batch (inputs, labels and source paths) by running the
# test generator for 16x the configured number of test steps.
test_data, test_labels, test_paths = big_batch_from_minibatch_generator(
    args.tensor_maps_in,
    args.tensor_maps_out,
    generate_test,
    args.test_steps * 16,
)

In [None]:
# For each gene of interest, collect one big batch built from that gene's carrier
# tensors. Each gene2batch value is whatever big_batch_from_minibatch_generator
# returns; elsewhere in this notebook the same call is unpacked as
# (data, labels, paths).
gene2batch = {}
for g in gene2samples:
    # Filter first so no generator is built for genes whose batch is never used.
    if g not in gene_labels:
        continue
    # The trailing positional arguments (None, True) are passed through as given;
    # presumably "no sample weights" and "keep paths" -- TODO confirm against
    # the TensorGenerator signature.
    gene_generator = TensorGenerator(args.batch_size, args.tensor_maps_in, args.tensor_maps_out, gene2samples[g], None, True)
    gene2batch[g] = big_batch_from_minibatch_generator(args.tensor_maps_in, args.tensor_maps_out, gene_generator, args.test_steps)

In [None]:
# Append each labeled gene's batch to the generic test batch, in gene_labels
# order: inputs onto test_data, targets onto test_labels, file paths onto
# test_paths. This mutates the three test_* objects in place, so running the
# cell a second time appends the gene batches again.
for g in gene_labels:
    gene_inputs, gene_targets, gene_paths = gene2batch[g]
    test_data.update({k: np.concatenate([v, gene_inputs[k]]) for k, v in test_data.items()})
    test_labels.update({k: np.concatenate([v, gene_targets[k]]) for k, v in test_labels.items()})
    test_paths.extend(gene_paths)

In [None]:
# Inspect the layer named 'embed': print the shape of each of its weight arrays.
layer_name = 'embed'
embed_layer = model.get_layer(layer_name)
for weight in embed_layer.get_weights():
    print(weight.shape)

# Build a model from the input maps and the named hidden layer, then show its
# architecture.
embed_model = make_hidden_layer_model(model, args.tensor_maps_in, layer_name)
embed_model.summary()

# Run both the hidden-layer model and the full model over the test batch.
print(list(test_data))
x_embed = embed_model.predict(test_data, batch_size=args.batch_size)
predictions = model.predict(test_data, batch_size=args.batch_size)

In [None]:
# Labels used to colour the embedding plots. A '|' inside a continuous label
# separates alternative field names that all feed the same label column.
categorical_labels = [ 'Genetic-sex_Female_0_0', 'hypertension', 'coronary_artery_disease', 'Handedness-chiralitylaterality_Righthanded_0_0']
continuous_labels = ['22200_Year-of-birth_0_0|34_Year-of-birth_0_0', '21001_Body-mass-index-BMI_0_0', '1070_Time-spent-watching-television-TV_0_0', '102_Pulse-rate-automated-reading_0_0', '1488_Tea-intake_0_0', '21002_Weight_0_0']

# One zero-initialised vector per label, with one slot per test tensor.
label_dict = {k: np.zeros((len(test_paths))) for k in categorical_labels + continuous_labels + gene_labels}
for i, tp in enumerate(test_paths):
    # The context manager closes each HDF5 file once its labels are read, so
    # file handles do not pile up across the whole test set.
    with h5py.File(tp, 'r') as hd5:
        for k in categorical_labels:
            # Positive when the key exists under 'categorical', or when a
            # top-level entry of that name has 1 as its first element.
            if k in hd5['categorical'] or (k in hd5 and hd5[k][0] == 1):
                label_dict[k][i] = 1
        for mk in continuous_labels:
            # If several alternatives are present, the last one listed wins.
            for k in mk.split('|'):
                if k in hd5['continuous']:
                    label_dict[mk][i] = hd5['continuous'][k][0]
    # Gene labels come from the carrier lookup rather than from the HDF5 file.
    carrier_gene = samples2genes.get(tp)
    if carrier_gene in gene_labels:
        label_dict[carrier_gene][i] = 1

print(list(label_dict.keys()))
print(len(test_paths))

In [None]:
# Draw a grid of t-SNE projections of x_embed: one row per label (at most
# max_rows) and one column per perplexity value.
n_components = 2
max_rows = 30
perplexities = [18, 25, 40]

n_rows = min(max_rows, len(label_dict))
# The column count follows the perplexity list and the height follows the rows
# actually drawn (4 inches each). squeeze=False keeps the axes array 2-D so
# subplots[j, i] also works with a single row.
(fig, subplots) = plt.subplots(n_rows, len(perplexities), figsize=(20, n_rows * 4), squeeze=False)

# Fit one projection per perplexity up front; every label row reuses them.
p2y = {}
for perplexity in perplexities:
    # NOTE(review): newer scikit-learn releases appear to rename n_iter to
    # max_iter -- confirm against the installed version.
    tsne = manifold.TSNE(n_components=n_components, init='random', random_state=0, perplexity=perplexity, n_iter=5000)
    p2y[perplexity] = tsne.fit_transform(x_embed)

# Labels drawn as two colours (1 vs. anything else) rather than a colour map.
binary_labels = set(categorical_labels + gene_labels)
for j, k in enumerate(list(label_dict)[:n_rows]):
    is_binary = k in binary_labels
    is_continuous = not is_binary and k in continuous_labels
    if is_binary:
        red = label_dict[k] == 1.0
        green = label_dict[k] != 1.0
    elif is_continuous:
        colors = label_dict[k]

    print('process key:', k)
    for i, perplexity in enumerate(perplexities):
        ax = subplots[j, i]
        ax.set_title(k+", Perplexity=%d" % perplexity)
        embedding = p2y[perplexity]
        if is_binary:
            # Negatives first so the positives are painted on top of them.
            ax.scatter(embedding[green, 0], embedding[green, 1], c="g")
            ax.scatter(embedding[red, 0], embedding[red, 1], c="r")
            ax.legend(['no_'+k, k], loc='lower left')
        elif is_continuous:
            points = ax.scatter(embedding[:, 0], embedding[:, 1], c=colors, cmap='jet')
            # A single colour bar per row, attached to the last column.
            if i == len(perplexities)-1:
                fig.colorbar(points, ax=ax)

        # Hide the tick labels on both axes.
        ax.xaxis.set_major_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(NullFormatter())
        ax.axis('tight')
plt.show()
