In [1]:
import logging
import os
import pandas as pd
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
import pathlib
from  isotools import Transcriptome
from isotools._utils import pairwise

import isotools
# Send log records of level INFO and above to the notebook output, then
# record which isotools version produced the results below.
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')
logger = logging.getLogger('isotools')
logger.info(f'This is isotools version {isotools.__version__}')


INFO:This is isotools version 0.3.2rc2


In [2]:
# Project layout: every input and output path is derived from the project root.
path = '/project/42/pacbio/golong'
project = 'golong_all'
date = '2022_12'
out_path = f'{path}/06-isotools/{project}/results_{date}'

# Reference annotation and genome sequence, stored in a sibling folder of the project root.
ref_fn = f'{path}/../references/gencode/gencode.v36.chr_patch_hapl_scaff.annotation_sorted'
genome_fn = f'{path}/../references/gencode/GRCh38.p13.genome.fa'

# Output folder for the sashimi plots; created (with parents) if it does not exist yet.
plot_path = f'{out_path}/06_sashimi_plots'
pathlib.Path(plot_path).mkdir(exist_ok=True, parents=True)

In [3]:
# Global matplotlib defaults applied to every figure created in this notebook.
plt.rcParams.update({
    'axes.unicode_minus': False,
    'font.size': 8,
    'font.family': 'sans-serif',  # alternative that was tried: "Times New Roman"
    'figure.dpi': 200,
    'figure.figsize': (8.3, 11.7),  # width, height in inches (roughly A4 portrait)
})


## Load isoseq data

In [4]:
# Load the pre-computed isotools transcriptome of this project/date.
# The file name is built from `project` (like all other paths in this notebook)
# instead of repeating the literal project name.
# NOTE(review): the file is a .pkl (presumably a pickle) - only load files from a trusted source.
isoseq=Transcriptome.load(f'{out_path}/01_transcriptome/{project}_{date}_isotools_sparse.pkl')
# Register a relaxed transcript-level filter: a transcript is tagged SUBSTANTIAL_1 when its
# summed coverage exceeds 1 % of the gene's total coverage (the original note gives 5 % as the default).
isoseq.add_filter( 'SUBSTANTIAL_1', 'g.coverage.sum() * .01 < g.coverage[:,trid].sum()',context='transcript')


INFO:loading transcriptome from /project/42/pacbio/golong/06-isotools/golong_all/results_2022_12/01_transcriptome/golong_all_2022_12_isotools_sparse.pkl


In [5]:
# Sample groups used throughout the notebook, and the colors they are drawn in.

# Patient groups: every isotools group whose name starts with CLL or MDS.
groups = {
    name: members
    for name, members in isoseq.groups().items()
    if name.startswith(('CLL', 'MDS'))
}

# Lookup tables: sample -> group name, and group name -> rank in a fixed display order.
group_dict = {sample: name for name, members in isoseq.groups().items() for sample in members}
group_num = {
    name: rank
    for rank, name in enumerate(
        ['K562_mut','K562_wt','Nalm6_mut', 'Nalm6_wt','CLL_mut','CLL_wt', 'MDS_mut', 'MDS_wt','B-cell']
    )
}

# Pool the K562 and Nalm6 samples per genotype into "CL" groups and add the B-cell group.
groups['CL_wt'] = [sample for cell_line in ['K562_wt', 'Nalm6_wt'] for sample in isoseq.groups()[cell_line]]
groups['CL_mut'] = [sample for cell_line in ['K562_mut', 'Nalm6_mut'] for sample in isoseq.groups()[cell_line]]
groups['B-cell'] = isoseq.groups()['B-cell']
#groups['GM12878_ENCODE']=isoseq.groups()['GM12878']

# One color per group; the pooled "all_*" groups and the reference track reuse existing colors.
group_colors = {
    'CLL_mut': '#B2182B',
    'CLL_wt': '#2166AC',
    'MDS_mut': '#EF8A62',
    'MDS_wt': '#67A9CF',
    'CL_mut': '#FDDBC7',
    'CL_wt': '#D1E5F0',
    'B-cell': '#543005',
    'other': '#018571',
}
group_colors.update({
    'all_mut': group_colors['MDS_mut'],
    'all_wt': group_colors['MDS_wt'],
    'reference': group_colors['other'],
})

# Color per sample: the color of the group containing it (the last matching group wins),
# with a fallback color for samples that belong to none of the groups above.
sample_colors = dict(
    (sample, group_colors[name])
    for sample in isoseq.samples
    for name, members in groups.items()
    if sample in members
)
sample_colors = {sample: sample_colors.get(sample, '#003C30') for sample in isoseq.samples}
sample_colors['reference'] = group_colors['other']

# Subsets of the groups, one per comparison.
groups_CL = {name: members for name, members in groups.items() if name.startswith(('CL_', 'GM1'))}
groups_CLL = {name: members for name, members in groups.items() if name in ['CLL_wt', 'CLL_mut', 'B-cell']}
groups_MDS = {name: members for name, members in groups.items() if name in ['MDS_wt', 'MDS_mut', 'B-cell']}

# Alternative palette in which every *_mut / *_wt group shares the color of all_mut / all_wt.
group_same_colors = {
    name: group_colors["all_" + name.split("_")[1]]
    for name in group_colors
    if 'wt' in name or 'mut' in name
}
group_same_colors['B-cell'] = group_colors['B-cell']


## Load the differential splicing results

In [6]:
# Read the differential splicing table of every comparison and collect the
# ids of all genes with at least one significant event (padj < 0.1).
version = 'less_strict'
res = {}
groups_test = {}
gois = set()

# Sample groups from isotools, extended by the pooled groups the comparisons refer to.
gr = isoseq.groups()
gr['all_mut'] = gr['CLL_mut'] + gr['MDS_mut'] + gr['K562_mut'] + gr['Nalm6_mut']
gr['all_wt'] = gr['CLL_wt'] + gr['MDS_wt'] + gr['K562_wt'] + gr['Nalm6_wt']
gr['CL_mut'] = gr['K562_mut'] + gr['Nalm6_mut']
gr['CL_wt'] = gr['K562_wt'] + gr['Nalm6_wt']
# gr.update({'pat_mut':gr['CLL_mut']+gr['MDS_mut'],'pat_wt':gr['CLL_wt']+gr['MDS_wt']})

for diff_cmp in ['all', 'CL', 'CLL', 'MDS']:
    # the two sample groups (mut, wt) compared in this test
    groups_test[diff_cmp] = {f'{diff_cmp}_{gt}': gr[f'{diff_cmp}_{gt}'] for gt in ['mut', 'wt']}
    table = pd.read_csv(f'{out_path}/04_differential_splicing/tables/{project}_{date}_diff_betabinomial_{version}_{diff_cmp}.csv')
    res[diff_cmp] = table
    sig = table.padj < .1
    n_genes = len(table.loc[sig, "gene"].unique())
    print(f'{sum(sig)} differential splice sites in {n_genes} genes for {" vs ".join(groups_test[diff_cmp])}')
    gois.update(table.loc[sig, 'gene_id'])  # this is not used
print(f'{len(gois)} genes with differential splicing')

782 differential splice sites in 531 genes for all_mut vs all_wt
83 differential splice sites in 63 genes for CL_mut vs CL_wt
286 differential splice sites in 205 genes for CLL_mut vs CLL_wt
218 differential splice sites in 176 genes for MDS_mut vs MDS_wt
640 genes with differential splicing


In [7]:
# Results of the "all" comparison, indexed (and sorted) by the columns identifying a splice event.
event_key = ['gene','gene_id','chrom','strand','start','end','splice_type']
all_tab = res['all'].set_index(event_key).sort_index()

# The "sig_union" table (presumably the union of significant events over the comparisons),
# ordered by the smallest adjusted p-value found in any of its padj columns.
sig_union = pd.read_csv(f'{out_path}/04_differential_splicing/tables/{project}_{date}_diff_betabinomial_{version}_sig_union.csv')
padj_columns = [col for col in sig_union.columns if 'padj' in col]
sig_union = (
    sig_union
    .assign(min_padj=sig_union[padj_columns].min(axis=1))
    .sort_values('min_padj')
    .reset_index(drop=True)
)

## Make the sashimi plots

In [8]:
# Samples drawn per sashimi panel: two hand-picked samples for each cell-line group,
# plus every CLL and MDS group from `groups`.
sashimi_groups={'K562_mut':['K03', 'K04'],'K562_wt':['K01', 'K02'],'Nalm_mut':['N03', 'N04'],'Nalm_wt':['N01', 'N02']}
sashimi_groups.update({k:v for k,v in groups.items() if 'CLL' in k or 'MDS' in k})

# TODO (left unfinished in the original note): make sure to include ...
# Alternative event selections, kept for reference (currently unused):
#goi=['SETD4', 'THOC1', 'MLH3', 'TTI1', 'ENOSF1']
#goi=['BRD9']
#goi=res['all'].head().gene
#regOI=dict(res['all'].loc[95]) #BRD9
#idxoi=res['all'].reset_index().loc[res['all'].gene.isin(goi)].groupby('gene').first()['index'].tolist()
#plot_folder='sashimi_splicesosome'
#spliceosome_events=pd.read_csv(f'{path}/06-isotools/{project}/20221105_MDS_CLL_sigOnly_Spliceosome.csv', sep=',')
#spliceosome_idx=spliceosome_events.set_index(['chrom', 'start','end','splice_type']).index
#idxoi=list(res['all'].reset_index().set_index(['chrom', 'start','end','splice_type']).loc[spliceosome_idx,'index'])

logscale=False
flank=500  # positions shown on either side of the event in every plot
plot_subfolder=f'{plot_path}/sig_union'
pathlib.Path(plot_subfolder).mkdir(parents=True, exist_ok=True)


def _parse_transcript_ids(value):
    """Return the transcript indices stored in a ``trA``/``trB`` cell as a list of int.

    The table is read from CSV, so a list arrives as text such as ``'[0, 38, 11]'``.
    Surrounding brackets and whitespace around the commas are ignored, and an empty
    list (``'[]'``) yields ``[]``. A missing value (None/NaN) also yields ``[]``;
    any other iterable is converted element by element.

    Raises:
        ValueError: if a token of the text cannot be converted to int.
        TypeError: if ``value`` is neither text, missing, nor iterable.
    """
    if isinstance(value, str):
        return [int(token) for token in value.strip().strip('[]').split(',') if token.strip()]
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return []
    return [int(trid) for trid in value]


def _junctions_in_range(exons, start, end):
    """Return the junctions (end of one exon, start of the next) between consecutive exons
    whose first coordinate is >= ``start`` and whose second coordinate is <= ``end``."""
    return [(e1[1], e2[0]) for e1, e2 in pairwise(exons) if e1[1] >= start and e2[0] <= end]


def _reference_transcripts(g, tr_lists, start, end):
    """Pick at most one reference transcript per transcript list, for the gene track.

    For each list, the first transcript carrying an 'FSM' annotation contributes the first
    entry of that annotation (presumably the index of the matching reference transcript -
    TODO confirm). If no transcript of the list has one, the first reference transcript
    containing all in-range junctions of the list's first transcript is used; note that this
    is trivially the first reference transcript when there is no junction in range. If no
    reference transcript qualifies, the list contributes nothing.
    """
    ref_tr=[]
    for tr_list in tr_lists:
        for trid in tr_list:
            anno=g.transcripts[trid]['annotation'][1]
            if 'FSM' in anno:
                ref_tr.append(anno['FSM'][0])
                break
        else:
            seek=_junctions_in_range(g.transcripts[tr_list[0]]['exons'], start, end)
            for ref_id, ref in enumerate(g.ref_transcripts):
                found=[(e1[1], e2[0]) for e1, e2 in pairwise(ref['exons']) if (e1[1], e2[0]) in seek]
                if len(found)==len(seek):
                    ref_tr.append(ref_id)
                    break
    return ref_tr


# One sashimi plot per event of `sig_union`. Rows that cannot be plotted are recorded in
# `issues` (row index -> reason) and skipped, so a single bad row never aborts the loop.
issues={}
for i, row in sig_union.iterrows():
    print(f"{i}: {row['gene']}")
    estart, eend=row['start'], row['end']
    try:
        trA=_parse_transcript_ids(row['trA'])
        trB=_parse_transcript_ids(row['trB'])
    except (TypeError, ValueError) as e:
        issues[i]=str(e)
        print(e)
        continue
    if not trA or not trB:
        issues[i]='no transcripts'
        print('no transcripts...')
        continue

    fig=None
    try:
        g=isoseq[row['gene_id']]
        # transcripts with the highest total coverage first
        trA.sort(key=lambda x:-g.coverage[:,x].sum())
        trB.sort(key=lambda x:-g.coverage[:,x].sum())
        substantial=g.filter_transcripts('SUBSTANTIAL_1')
        # junctions to highlight: in-range junctions of all transcripts passing the SUBSTANTIAL_1 filter
        joi=[junction
             for trid in trA+trB if trid in substantial
             for junction in _junctions_in_range(g.transcripts[trid]['exons'], estart, eend)]
        ref_tr=_reference_transcripts(g, (trA, trB), estart, eend)

        fig, axs=g.sashimi_figure(samples=sashimi_groups, x_range=[estart-flank,eend+flank], junctions_of_interest=joi, long_read_params={'log_y':logscale, 'text_width':2, 'text_height':2})
        # replace the content of the first axes by a gene track limited to the selected reference transcripts
        axs[0].cla()
        g.gene_track(select_transcripts=ref_tr, x_range=[estart-flank,eend+flank], ax=axs[0],label_fontsize=6)
        fig.tight_layout()
        fig.savefig(f"{plot_subfolder}/diff_union_{i:03d}_{row['gene']}_{row['splice_type']}_{'log' if logscale else 'lin'}_sashimi.pdf")
    except Exception as e:
        # best effort: remember the problem and continue with the next event
        issues[i]=str(e)
        print(e)
    finally:
        # release the figure of this iteration so that figures do not pile up in memory
        if fig is None:
            plt.close()
        else:
            plt.close(fig)
    


0: SEPTIN6
1: MAP3K7
2: PRPF38A
3: SERBP1
4: ENOX2
5: NFYA
6: RBM18
7: OXA1L
8: SLTM
9: ANKHD1
10: DLST
11: COG1
12: TMEM14C
13: DDX5
14: ZNF91
15: NET1
16: NONO
17: SLC3A2
18: CEP135
19: GCC2
20: ZNF410
21: ZNF91
22: ZNF91
23: FAR2
24: VEZT
25: USP15
26: ZBED5
27: RWDD4
28: SLC7A7
29: SLC7A7
30: UXS1
31: TRIM37
32: SNRPN
33: PDHX
34: CD2BP2
35: ZNF561
36: SNW1
37: ENOSF1
38: NDRG3
39: SLC7A7
40: TNPO3
41: ZNF91
42: NSMCE4A
43: TOR1AIP2
44: CCDC18-AS1
45: WASHC4
46: RNF2
47: TUT4
48: PIP5K1B
49: NDRG3
50: CNTRL
51: PPP2R5A
52: GTF2I
53: TBC1D15
54: UBC
55: UBC
56: ELP2
57: DYNLL1
58: SMNDC1
59: RUBCNL
60: ZFAND5
61: TMEM14C
62: NDRG3
63: CDK8
64: SMNDC1
65: CD2BP2
66: GSAP
67: WASHC5
68: SUN1
69: STAU2
70: ZBED5
71: ATAD1
72: GGNBP2
73: ZNF548
74: MYL4
75: HLTF
76: PGBD1
77: SLC36A4
78: TAF2
79: ZNF410
80: BUB1B
81: RDX
82: ITFG1
83: USP15
84: COASY
85: SNW1
86: TOR1AIP2
87: SUN1
88: GEN1
89: FCRL1
90: TPT1
91: YWHAB
92: DNAJC3
93: FAM122B
94: DDHD2
95: SEPTIN2
96: YWHAB
97: EI24
98: C

In [9]:
# Ad-hoc inspection: shape of the coverage matrix of `g`, the gene last assigned in the sashimi loop.
# The second axis is indexed by transcript (see `g.coverage[:,trid]`); the first is presumably samples.
# NOTE(review): relies on a variable leaked from the loop - consider removing from the final notebook.
g.coverage.shape

(58, 524)

In [10]:
# Ad-hoc inspection: the `sig_union` row last visited by the sashimi loop.
# NOTE(review): relies on a variable leaked from the loop - consider removing from the final notebook.
row

gene                                                        TBC1D15
gene_id                                          ENSG00000121749.15
chrom                                                         chr12
strand                                                            +
start                                                      71873003
end                                                        71884810
splice_type                                                      ES
novel                                                         False
trA               [54, 270, 361, 28, 207, 105, 201, 458, 365, 51...
trB               [0, 38, 11, 22, 45, 27, 6, 24, 244, 17, 158, 1...
all_padj                                                        NaN
all_pvalue                                                      NaN
all_mut_PSI                                                     NaN
all_mut_disp                                                    NaN
all_wt_PSI                                      

In [11]:
# Ad-hoc inspection: raw `trB` value of the row last visited by the sashimi loop;
# as read from the CSV it is a string holding a bracketed, comma-separated list of transcript indices.
row.trB

'[0, 38, 11, 22, 45, 27, 6, 24, 244, 17, 158, 112, 172, 166, 12, 354, 49, 122, 50, 256, 66, 40, 352, 32, 84, 155, 294, 326, 402, 20, 150, 355, 90, 97, 157, 10, 18, 39, 53, 76, 104, 184, 272, 275, 327, 31, 160, 206, 267, 280, 283, 341, 350, 378, 407, 445, 463, 62, 170, 189, 237, 246, 251, 253, 260, 269, 276, 278, 285, 288, 291, 305, 306, 313, 314, 323, 324, 339, 346, 349, 351, 362, 364, 387, 388, 397, 399, 404, 411, 412, 415, 424, 427, 447, 449, 450, 456, 468, 470, 473, 477, 479, 13, 61, 72, 130, 193, 197, 198, 226, 241, 242, 243, 245, 247, 249, 250, 254, 258, 261, 264, 271, 279, 281, 287, 289, 293, 295, 299, 301, 302, 303, 315, 317, 320, 330, 331, 333, 335, 336, 337, 340, 343, 345, 347, 348, 357, 358, 359, 360, 366, 369, 374, 376, 377, 379, 381, 382, 383, 384, 385, 386, 389, 391, 393, 395, 396, 400, 406, 409, 416, 417, 418, 423, 425, 430, 432, 434, 435, 436, 437, 438, 440, 441, 442, 446, 451, 452, 453, 457, 459, 460, 461, 464, 465, 466, 467, 469, 471, 474, 476, 478, 480, 481, 482, 483,