In [1]:
import mlflow
from mlflow.tracking import MlflowClient

import os, sys

import torch
import torch.nn.functional as F

# Location of the CardiacCOMA checkout, resolved relative to the user's home directory.
HOME = os.environ["HOME"]
CARDIAC_COMA_REPO = f"{HOME}/01_repos/CardiacCOMA/"

# Run the rest of the notebook from the repository root (`os` is already imported above);
# later cells use repository-relative paths such as "data/cardio/...".
os.chdir(CARDIAC_COMA_REPO)

from config.load_config import load_yaml_config, to_dict

import ipywidgets as widgets
from ipywidgets import interact
from IPython.display import Image

import pandas as pd
import shlex
from subprocess import check_output

import pickle as pkl
import pytorch_lightning as pl

from argparse import Namespace
import matplotlib.pyplot as plt

#import surgeon_pytorch
#from surgeon_pytorch import Inspect, get_layers

import numpy as np
import pandas as pd
from IPython import embed
sys.path.insert(0, '..')

# import model.Model3D
# from utils.helpers import get_coma_args, get_lightning_module, get_datamodule
from copy import deepcopy
from pprint import pprint

from typing import List
from tqdm import tqdm

import pyvista as pv
from ipywidgets import interact, interactive, fixed, interact_manual



ModuleNotFoundError: No module named 'torch'

In [2]:
from utils.mlflow_queries import \
    list_artifacts,\
    get_significant_loci,\
    get_metrics_cols, \
    get_params_cols, \
    get_runs_df, \
    get_good_runs,\
    summarize_loci_across_runs,\
    get_model_pretrained_weights

In [3]:
# Use the file-based MLflow store located in the `mlruns` folder of the repository.
TRACKING_URI = f"file://{CARDIAC_COMA_REPO}/mlruns"
mlflow.set_tracking_uri(TRACKING_URI)
# Created after the tracking URI is set; presumably so the client targets that same store.
client = MlflowClient()

In [4]:
def experiment_selection_widget(default="Cardiac - ED"):
    
    '''
    Returns a selection widget for MLflow experiments.

    Parameters
    ----------
    default : str, optional
        Name of the experiment to pre-select. It is used only when an
        experiment with that name exists; otherwise the first listed
        experiment is pre-selected (or nothing when there are none).

    Returns
    -------
    ipywidgets.Select
        Widget whose options are the names of the experiments reported by MLflow.
    '''
    
    # `mlflow.list_experiments` is gone in MLflow 2.x, where
    # `mlflow.search_experiments` replaces it; use whichever is available.
    list_experiments = getattr(mlflow, "search_experiments", None) or mlflow.list_experiments
    options = [exp.name for exp in list_experiments()]

    # A Select widget rejects a `value` that is not among its `options`,
    # so only honour `default` when that experiment actually exists.
    if default in options:
        value = default
    else:
        value = options[0] if options else None

    experiment_w = widgets.Select(
      options=options,
      value=value
    )
    
    return experiment_w

In [5]:
exp_w = experiment_selection_widget()

@interact
def get_runs(exp_name=exp_w):
    """
    Load the finished runs of the selected experiment into the global `runs_df`.

    `interact` calls this with the current selection of `exp_w`. On failure the
    previous `runs_df` (if any) is left untouched and the reason is printed,
    instead of being silently discarded.

    Parameters
    ----------
    exp_name : str
        Name of the MLflow experiment whose runs should be loaded.
    """
    global runs_df
    try:
        if mlflow.get_experiment_by_name(exp_name) is None:
            raise LookupError(f"no MLflow experiment named {exp_name!r}")
        runs_df = get_runs_df(exp_name=exp_name, only_finished=True)
    except Exception as exc:
        # Best effort: keep the widget alive, but make the failure visible so a
        # stale or undefined `runs_df` does not go unnoticed in later cells.
        print(f"Could not load runs for experiment {exp_name!r}: {exc!r}")

interactive(children=(Select(description='exp_name', index=1, options=('Default', 'Cardiac - ED', 'Cardiac - E…

In [6]:
def change_col_names(exper_id, run_id, df):
    """
    Return `df` with every column prefixed by the experiment and run it came from.

    Each column `col` becomes `"{exper_id}_{run_id[:5]}_{col}"`. The input frame
    is left untouched: renaming happens on a shallow copy, so calling this twice
    on the same frame does not stack prefixes.

    Parameters
    ----------
    exper_id : experiment identifier used as the first part of the prefix.
    run_id : str
        Run identifier; only its first five characters are used.
    df : pandas.DataFrame
        Frame whose columns should be renamed.

    Returns
    -------
    pandas.DataFrame
        A new frame sharing the data of `df` but carrying the prefixed column names.
    """
    renamed = df.copy(deep=False)
    renamed.columns = [f"{exper_id}_{run_id[:5]}_{col}" for col in df.columns]
    return renamed

def path_to_z(row):
    """
    Build the key and latent-vector file path for one row of the runs table.

    Reads `experiment_id`, `run_id` and `artifact_uri` from `row`.

    Returns
    -------
    tuple
        `((experiment_id, run_id), path)`, where `path` is the artifact location
        (with the "file://" scheme removed) joined with
        "output/z_adj_<experiment_id>_<run_id>.tsv".
    """
    key = (row.experiment_id, row.run_id)
    # Turn the file URI into a plain filesystem path.
    local_artifacts = row.artifact_uri.replace("file://", "")
    z_file = os.path.join(
        local_artifacts,
        f'''output/z_adj_{row.experiment_id}_{row.run_id}.tsv''',
    )
    return (key, z_file)

In [7]:
# One ((experiment_id, run_id), path) pair per run, collected into a lookup dict.
z_path_pairs = runs_df.reset_index().apply(path_to_z, axis=1)
z_paths = dict(z_path_pairs.values.tolist())

In [8]:
# Load the latent-vector table of every run, keyed by (experiment_id, run_id).
z_dfs = {}
z_missing = []  # keys of the runs whose latent-vector file was not found
for run_id, z_path in tqdm(z_paths.items()):
    try:
        z_dfs[run_id] = pd.read_csv(z_path, sep="\t").set_index("ID")
    except FileNotFoundError:
        # Runs without a latent-vector file are skipped, but remembered so the
        # coverage of the combined table can be checked afterwards.
        z_missing.append(run_id)

if z_missing:
    print(f"Latent vector file missing for {len(z_missing)} of {len(z_paths)} runs")
if not z_dfs:
    # Fail with an actionable message instead of pandas' generic concat error.
    raise ValueError("No latent vector file could be loaded; check `z_paths`.")

# Prefix the columns with experiment/run so they stay distinguishable once the
# per-run tables are placed side by side.
z_dfs_renamed = [change_col_names(expid, runid, z_df) for (expid, runid), z_df in z_dfs.items()]
z_all_df = pd.concat(z_dfs_renamed, axis=1)
z_all_df.head()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 204/204 [00:15<00:00, 13.04it/s]


Unnamed: 0_level_0,1_e6490_z000,1_e6490_z001,1_e6490_z002,1_e6490_z003,1_e6490_z004,1_e6490_z005,1_e6490_z006,1_e6490_z007,1_e6490_z008,1_e6490_z009,...,1_17629_z006,1_17629_z007,1_17629_z008,1_17629_z009,1_17629_z010,1_17629_z011,1_17629_z012,1_17629_z013,1_17629_z014,1_17629_z015
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2471198,0.754081,0.217631,0.892939,0.801908,0.398927,-0.485893,0.287335,-0.97753,0.292033,-0.335887,...,0.529536,0.301523,-0.036083,-0.444244,-0.190534,-0.052853,-0.020239,0.952413,0.081639,-0.146265
1746709,-0.985621,-0.575624,-2.010317,-1.195979,-1.948443,2.083467,1.517243,1.628201,-0.734581,-0.537734,...,1.484463,1.207427,-1.787333,1.504024,-1.848678,-1.583979,-1.340997,-0.229178,1.150588,-1.269248
2008596,-1.32305,-2.476096,-1.314485,-1.831838,-2.217431,1.894083,0.395576,1.415595,0.80804,0.837866,...,2.471592,-0.454913,0.649513,-1.352151,-0.884789,-0.391621,0.205825,1.25026,-0.620919,0.428527
4917788,0.275106,0.853365,0.61836,1.108492,0.711543,-0.204171,0.523633,-0.321992,0.623396,0.09648,...,0.325998,0.760645,-0.456241,0.19663,-0.403658,-0.53627,-0.052783,0.592991,0.395652,-0.476883
1780889,-1.405695,0.349099,-0.181581,-0.567419,-0.055743,0.754549,0.506033,0.170072,0.176574,1.168651,...,-0.076131,0.479961,-1.147999,1.031148,-0.868456,-1.414637,-0.752959,0.144984,0.524764,-0.484389


# Genomic PCA

In [9]:
# Tab-separated file with the genomic principal components (read in the next cell).
GENOMIC_PC_FILE = os.environ["HOME"] + "/01_repos/GWAS_pipeline/data/transforms/GenomicPCA/pcs.txt"

In [10]:
# Genomic PCs per individual: index the table by IID and discard the FID column.
genomic_pca_df = (
    pd.read_csv(GENOMIC_PC_FILE, sep="\t")
    .set_index("IID")
    .drop(columns="FID")
)

In [11]:
# Display the genomic PC table (indexed by IID; PC1..PC10 in the recorded output).
genomic_pca_df

Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
IID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000336,0.009439,0.020864,0.005781,-0.018697,-0.000904,0.022974,0.005486,-0.013221,-0.017238,-0.016062
1000363,0.023638,0.018311,0.044009,0.002280,-0.030949,-0.025851,0.001375,-0.019926,-0.002900,-0.027202
1000380,0.035396,0.011719,-0.008578,-0.004962,-0.003340,-0.004983,0.016331,-0.032201,-0.022172,0.010668
1000407,0.011639,0.026969,-0.050002,0.034824,-0.005536,0.012261,-0.055735,0.021770,-0.004737,0.015074
1000739,0.037332,-0.036103,-0.013930,0.011619,-0.000667,0.006000,-0.008560,-0.009928,-0.012613,0.000171
...,...,...,...,...,...,...,...,...,...,...
4999331,0.017444,0.002880,-0.040644,-0.010896,0.000275,-0.018865,0.004250,0.039717,0.030166,-0.005031
4999373,0.027891,-0.017130,0.002323,0.015145,-0.003229,-0.002498,-0.024503,0.030038,-0.016532,0.023511
4999543,0.001270,-0.009798,-0.007581,-0.022535,-0.023158,0.015697,-0.016623,0.009144,0.009319,-0.005967
4999719,0.025469,0.021132,-0.007243,-0.004814,0.014104,0.029959,0.013883,-0.051551,-0.005785,0.004260


### Correlation genomic PCs vs. latent variables

In [12]:
import statsmodels.api as sm

In [13]:
from scipy.stats import spearmanr

In [14]:
# Genomic PCs for the individuals present in z_all_df, in z_all_df's row order.
genomic_pca_df.loc[z_all_df.index]

Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2471198,-0.051873,0.001545,-0.009844,-0.012530,-0.020395,0.020359,0.019912,-0.002931,-0.001709,-0.020747
1746709,-0.055044,-0.038579,-0.009757,0.008254,-0.011455,-0.000482,0.002033,-0.034306,0.013822,0.023042
2008596,0.027035,-0.005019,-0.005780,0.018832,0.021634,-0.029754,0.019613,-0.003437,-0.008844,-0.031998
4917788,0.018510,0.000287,-0.010316,-0.003669,-0.007625,0.009048,-0.014913,0.028892,-0.005289,0.026850
1780889,0.031780,-0.005839,-0.007639,-0.027031,-0.003665,-0.023096,0.016856,-0.024841,-0.013299,-0.018932
...,...,...,...,...,...,...,...,...,...,...
4485380,0.012538,0.030180,-0.012806,0.004318,-0.002375,-0.011069,0.001156,0.025411,0.011179,-0.005765
3878790,0.010731,0.023344,0.012479,-0.022059,0.004118,0.018627,-0.050603,0.019430,-0.013923,0.018948
1703522,-0.042462,0.010479,-0.012252,0.049719,0.033033,-0.012664,-0.045234,-0.021413,-0.002971,-0.019009
2034360,-0.025799,0.035614,-0.021480,-0.005922,-0.010112,-0.026073,-0.039966,-0.038286,-0.011845,-0.009643


In [15]:
# Spearman rank correlation between the latent variables and the genomic PCs of
# the same individuals (rows aligned on z_all_df's index). Both inputs are 2-D,
# so the results are matrices over all columns of both tables (see SciPy docs).
genomic_pca_aligned = genomic_pca_df.loc[z_all_df.index]
spearman_coef, spearman_pvalue = spearmanr(z_all_df, genomic_pca_aligned)

In [16]:
# Keep only the latent-variable x genomic-PC block of the p-value matrix:
# the genomic PCs were passed last to `spearmanr`, so they occupy the trailing
# rows/columns. Their count is taken from the table instead of hard-coding 10.
n_pcs = genomic_pca_df.shape[1]
latent_vs_pc_pvalues = pd.DataFrame(
    spearman_pvalue[:-n_pcs, -n_pcs:],
    index=z_all_df.columns,
    columns=genomic_pca_df.columns,
)
# Distribution of log10(p-value) per genomic PC across all latent variables.
np.log10(latent_vs_pc_pvalues).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0
mean,-0.964088,-0.39068,-0.875388,-1.842624,-0.485954,-0.235492,-0.418303,-0.869153,-0.701453,-0.431343
std,1.379551,0.502829,0.724153,1.073164,0.301226,0.215412,0.285657,0.441398,0.472094,0.53203
min,-8.726355,-4.20097,-3.708875,-5.396731,-1.739564,-1.792114,-1.97204,-2.073406,-2.747475,-2.766425
25%,-1.181492,-0.452812,-1.233469,-2.557068,-0.627332,-0.309444,-0.564718,-1.209094,-0.957526,-0.584666
50%,-0.408849,-0.223124,-0.682486,-1.823582,-0.455533,-0.170018,-0.369654,-0.98602,-0.677228,-0.218208
75%,-0.127852,-0.107934,-0.328792,-1.016941,-0.280242,-0.086728,-0.210622,-0.469428,-0.343121,-0.08146
max,-0.001764,-0.000703,-0.000512,-0.001768,-0.000203,-0.001357,-0.002743,-0.002909,-0.001238,-3.8e-05


In [17]:
# FIXME(review): scratch call — `OLS` is constructed without its required `endog`
# argument, so this cell raises TypeError (see the recorded output). Supply the
# response/design data for the intended regression or remove the cell.
sm.regression.linear_model.OLS()

TypeError: __init__() missing 1 required positional argument: 'endog'

### Correlation genomic PCs vs. traditional cardiac indices

In [None]:
# Time frame 001 corresponds to end-diastole.
timeframe = "1".zfill(3)
datafolder = "data/cardio/cardiac_indices"

# One CSV per group folder G1..G4, stacked row-wise and indexed by case_id.
index_files = [f"{datafolder}/G{i}/LVRV_time{timeframe}.csv" for i in range(1, 5)]
df = pd.concat([pd.read_csv(csv_path, index_col="case_id") for csv_path in index_files])

# Case ids are turned into strings (the sphericity table's ids are converted the
# same way before the two tables are merged on their indices).
df.index = df.index.astype(str)

df.head()

In [None]:
# Sphericity table, indexed by subject id stored as a string.
sphericity_raw = pd.read_csv("data/cardio/sphericity.csv")
sph_df = sphericity_raw.set_index("id")
sph_df.index = sph_df.index.astype(str)

In [None]:
# Inner join of the cardiac indices with the sphericity table on their (string) indices.
cardiac_indices_df = pd.merge(
    df,
    sph_df,
    how="inner",
    left_index=True,
    right_index=True,
)