In [3]:
from isotools import Transcriptome
from isotools import __version__ as isotools_version
from isotools._utils import _filter_event

import os
from pathlib import Path
import logging
from collections import Counter
from urllib.request import urlretrieve
import itertools
from tqdm.notebook import tqdm_notebook as tqdm

import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pysam

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
logger=logging.getLogger('isotools')
logger.info(f'This is isootools version {isotools_version}')

INFO:This is isootools version 0.2.11rc1


In [4]:
def eval_trid(df):
    df.priA_priB_trID=[eval(e) for e in df.priA_priB_trID]
    df.priA_altB_trID=[eval(e) for e in df.priA_altB_trID]
    df.altA_priB_trID=[eval(e) for e in df.altA_priB_trID]
    df.altA_altB_trID=[eval(e) for e in df.altA_altB_trID]
    pass

In [7]:
tissue_names=['left ventricle myocardium superior', 'Right ventricle myocardium superior',
              'mucosa of descending colon', 'heart left ventricle',
              'posterior vena cava', 'left cardiac atrium',
              'aorta', 'ovary', 'adrenal gland',
              'mesenteric fat pad', 'upper lobe of right lung',
              'heart right ventricle', 'left ventricle myocardium inferior',
              'left colon', 'right cardiac atrium',
              'lower lobe of right lung', 'lower lobe of left lung',
              'kidney', 'psoas muscle',
              'Right ventricle myocardium inferior', 'cardiac septum']

In [8]:
tbl_path="./tables/tissue/"
files=[x.name for x in Path(tbl_path).iterdir()]

tissue_dict={}

for tissue in tissue_names:
    
    tissue_key=tissue.replace(' ','_')
    fname=tissue_key+'.csv'
    
    logger.info(f"reading {fname}")
    tissue_dict[tissue]=pd.read_csv(tbl_path+fname)
    eval_trid(tissue_dict[tissue])

INFO:reading left_ventricle_myocardium_superior.csv
INFO:reading Right_ventricle_myocardium_superior.csv
INFO:reading mucosa_of_descending_colon.csv
INFO:reading heart_left_ventricle.csv
INFO:reading posterior_vena_cava.csv
INFO:reading left_cardiac_atrium.csv
INFO:reading aorta.csv
INFO:reading ovary.csv
INFO:reading adrenal_gland.csv
INFO:reading mesenteric_fat_pad.csv
INFO:reading upper_lobe_of_right_lung.csv
INFO:reading heart_right_ventricle.csv
INFO:reading left_ventricle_myocardium_inferior.csv
INFO:reading left_colon.csv
INFO:reading right_cardiac_atrium.csv
INFO:reading lower_lobe_of_right_lung.csv
INFO:reading lower_lobe_of_left_lung.csv
INFO:reading kidney.csv
INFO:reading psoas_muscle.csv
INFO:reading Right_ventricle_myocardium_inferior.csv
INFO:reading cardiac_septum.csv


## Make event IDs

In [9]:
def make_event_ids(coor_tbl,progress_bar=True):
    
    event_ids=[]
    
    if progress_bar==True:
        for i,r in tqdm(coor_tbl.iterrows(),total=coor_tbl.shape[0]):
            event_ids.append(':'.join([r.ase1_type,r.ase2_type,str(r.ase1_start),
                                       str(r.ase1_end),str(r.ase2_start),str(r.ase2_end)]))
    else:
        for i,r in coor_tbl.iterrows():
            event_ids.append(':'.join([r.ase1_type,r.ase2_type,str(r.ase1_start),
                                       str(r.ase1_end),str(r.ase2_start),str(r.ase2_end)]))
        
    return event_ids

In [10]:
def make_group_ids(tabl_dict,progress_bar=True):
    group_ids={}
    
    if progress_bar==True:
        for k in tqdm(tabl_dict.keys()):
            group_ids[k]=make_event_ids(tabl_dict[k],progress_bar=False)
    else:
        for k in tabl_dict.keys():
            group_ids[k]=make_event_ids(tabl_dict[k],progress_bar=False)
    return group_ids

In [11]:
tissue_ids=make_group_ids(tissue_dict)

  0%|          | 0/21 [00:00<?, ?it/s]

In [20]:
for k,tab in tissue_dict.items():
    tab["pID"]=tissue_ids[k] # assign pair id

## Create the DataFrame for the Clustering

In [31]:
all_ids=[]
for k,tab in tissue_dict.items():
    all_ids.extend(list(tab.pID))
all_ids=list(set(all_ids))
print(len(all_ids))

27878


In [59]:
def make_cluster_df():
    
    data={k:[] for k in tissue_dict.keys()}
    
    error_count=0
    
    for pid in tqdm(all_ids):
        
        for k,tab in tissue_dict.items():
            
            r=tab[tab.pID==pid]
            
            if r.shape[0]!=0:
                
                if r.shape[0]>1:
                    error_count+=1
                    print(f"errors encountered: {error_count}",end="\r")
                    continue
                    
                data[k].append(float(r.log2OR))
            
            else:
                
                data[k].append(0)
                
    df=pd.DataFrame(data,index=all_ids)
    return df

  0%|          | 0/27878 [00:00<?, ?it/s]

errors encountered: 59

KeyboardInterrupt: 

In [60]:
tabl_path="/project/hfa_work/ceraolo/theco/tables/clustering"

In [54]:
r.log2OR

338    1.000000
340    0.981853
Name: log2OR, dtype: float64