In [1]:
from eda_import import *

In [2]:
from utils.parse_gtf_utils import extract_info
from utils.calc_clv_sc import get_strand, get_num_sc

import utils.plot_bars as PB
import utils.plot_arcs as PA
import utils.plot_utrs as PU
import utils.plot_clvs as PC

In [3]:
%%time
df_clv = pd.read_csv('./results_data/all_cba.KLEAT.on-target-cleaned.filtered.clustered.csv.gz', 
                     compression='gzip',
                     usecols=['analysis_id', 'disease', 'sstype', 'mkid', 'gene_name',
                              'participant_id', 'mclv', 'strand'])

CPU times: user 16.6 s, sys: 266 ms, total: 16.9 s
Wall time: 16.9 s


In [4]:
%time df_expr = pd.read_csv('./results_data/RPKMS.csv.gz', compression='gzip')

CPU times: user 488 ms, sys: 22.8 ms, total: 510 ms
Wall time: 518 ms


In [5]:
# Load the clv/sc reference mapping (only the columns used below), drop exact
# duplicate rows and renumber the index from 0.
clv_sc_df = (
    pd.read_csv(
        './reference_data/annotated-clv-sc-mapping.csv.gz',
        compression='gzip',
        usecols=['seqname', 'gene_name', 'strand', 'source', 'aclv', 'sc'],
    )
    .drop_duplicates()
    .reset_index(drop=True)
)

In [6]:
clv_sc_df.head()

Unnamed: 0,gene_name,seqname,source,sc,strand,aclv
0,ABL1,9,protein_coding,133761070,+,133763062
1,AKT1,14,protein_coding,105236678,-,105235686
2,AKT1,14,protein_coding,105236678,-,105236492
3,AKT1,14,protein_coding,105236678,-,105236309
4,AKT1,14,protein_coding,105236678,-,105236677


In [7]:
clv_sc_df.query('gene_name == "CDKN2A"').sc.unique()

array([21968228, 21971002, 21974476, 21968724, 21969731, 21970890,
       21968208])

In [8]:
assert clv_sc_df.shape[0] == 467

In [9]:
clv_sc_df.head(2)

Unnamed: 0,gene_name,seqname,source,sc,strand,aclv
0,ABL1,9,protein_coding,133761070,+,133763062
1,AKT1,14,protein_coding,105236678,-,105235686


In [10]:
def calc_offset(grp_by_gene):
    """Return the stop-codon coordinate used as the coordinate origin of a gene.

    Parameters
    ----------
    grp_by_gene : pandas.DataFrame
        Rows belonging to a single gene; must provide the ``strand`` and
        ``sc`` columns. Rows of every ``source`` are considered (no filtering
        on ``source`` is applied). The frame is not modified.

    Returns
    -------
    The smallest ``sc`` value for a '+' strand gene, the largest ``sc`` value
    for a '-' strand gene.

    Raises
    ------
    AssertionError
        If the rows do not share exactly one strand value (this includes an
        empty group).
    ValueError
        If the strand is neither '+' nor '-'.
    """
    strands = grp_by_gene.strand.unique()
    assert strands.shape[0] == 1, \
        'expected exactly one strand per gene, got {0}'.format(strands.tolist())
    strand = strands[0]
    if strand == '+':
        return grp_by_gene.sc.values.min()
    if strand == '-':
        return grp_by_gene.sc.values.max()
    # A bare `raise` here would only produce "No active exception to reraise";
    # report the offending value instead.
    raise ValueError('unexpected strand {0!r}; expected "+" or "-"'.format(strand))
        
# Per-gene offset: the stop codon picked by calc_offset
# (smallest sc on the '+' strand, largest sc on the '-' strand).
offsets = (
    clv_sc_df[['gene_name', 'sc', 'aclv', 'strand', 'source']]
    .groupby('gene_name')
    .apply(calc_offset)
    .to_frame(name='offset')
    .reset_index()
)

clv_sc_df = clv_sc_df.merge(offsets, how='left', on='gene_name')

# *_t columns: coordinates expressed relative to the gene's offset
clv_sc_df['sc_t'] = clv_sc_df['sc'] - clv_sc_df['offset']
clv_sc_df['aclv_t'] = clv_sc_df['aclv'] - clv_sc_df['offset']
# signed (adiff) and absolute (alen) distance from sc to aclv
clv_sc_df['adiff'] = clv_sc_df['aclv'] - clv_sc_df['sc']
clv_sc_df['alen'] = clv_sc_df['adiff'].abs()

In [11]:
clv_sc_df.head(1)

Unnamed: 0,gene_name,seqname,source,sc,strand,aclv,offset,sc_t,aclv_t,adiff,alen
0,ABL1,9,protein_coding,133761070,+,133763062,133761070,0,1992,1992,1992


In [12]:
clv_sc_df.source.value_counts()

protein_coding             364
nonsense_mediated_decay    103
Name: source, dtype: int64

In [13]:
%time adf = pd.read_csv('./results/aca_pval_with_sc_info.csv')

CPU times: user 36.6 ms, sys: 2.69 ms, total: 39.3 ms
Wall time: 48 ms


Surprisingly, for the gene–disease pairs printed below (the pairs that are absent from `adf`), cleavage sites are detected only in tumour samples.

In [14]:
# NOTE: despite its name, `missing_gd_pair` lists the gene/disease pairs that
# ARE present in adf; the loop reports the combinations absent from it.
missing_gd_pair = adf[['gene_name', 'disease']].drop_duplicates().values.tolist()
# set of tuples gives constant-time membership tests inside the nested loop
present_pairs = {(gene, dise) for gene, dise in missing_gd_pair}
for i in adf.disease.unique():
    for j in adf.gene_name.unique():
        if (j, i) in present_pairs:
            continue
        print(j, i)
        # boolean mask instead of a string-formatted query, so names containing
        # quote characters cannot break the expression
        unique_sstype = df_clv[(df_clv.gene_name == j) & (df_clv.disease == i)].sstype.unique()
        # every absent pair must have cleavage sites from tumour samples only
        assert unique_sstype.tolist() == ['tumour']

NKX2-1 COAD
TERT KICH
TERT KIRP


In [15]:
assert adf.gene_name.unique().shape[0] == 114

In [16]:
adf.head(2)

Unnamed: 0,disease,gene_name,mkid,mclv,strand,aclv,N_on,T_on,N_off,T_off,fisher_exact_p,N_base,T_base,N_on_ratio,T_on_ratio,N2T_ratio_diff,diff_is_significant,N2T_ratio_change,N2T_ratio_change_sig,has_diff_aca,sc_list,src_list,num_sc
0,BLCA,ABL1,chr9|ABL1|+|133589972,133589972,+,133763062,1,23,18,382,1.0,19,405,0.052632,0.05679,0.004159,False,up,,False,133761070,protein_coding,1
1,BLCA,ABL1,chr9|ABL1|+|133763062,133763062,+,133763062,19,405,0,0,1.0,19,405,1.0,1.0,0.0,False,down,,False,133761070,protein_coding,1


In [17]:
_mkids = adf.query('has_diff_aca == True').query('gene_name == "FGF2"').mkid.unique()
_mkids

array(['chr4|FGF2|+|123813648', 'chr4|FGF2|+|123815953',
       'chr4|FGF2|+|123816621', 'chr4|FGF2|+|123817951',
       'chr4|FGF2|+|123818762', 'chr4|FGF2|+|123819379',
       'chr4|FGF2|+|123814916'], dtype=object)

In [18]:
# Per selected mkid: the largest N_on_ratio and T_on_ratio over all of its rows
# in adf, ordered by T_on_ratio (highest first).
(
    adf[adf.mkid.isin(_mkids.tolist())]
    .groupby('mkid')[['N_on_ratio', 'T_on_ratio']]
    .max()
    .sort_values('T_on_ratio', ascending=False)
)

Unnamed: 0_level_0,N_on_ratio,T_on_ratio
mkid,Unnamed: 1_level_1,Unnamed: 2_level_1
chr4|FGF2|+|123819379,0.947368,0.897222
chr4|FGF2|+|123816621,0.793103,0.774436
chr4|FGF2|+|123813648,0.210526,0.208861
chr4|FGF2|+|123815953,0.064516,0.058824
chr4|FGF2|+|123818762,0.055556,0.012048
chr4|FGF2|+|123817951,0.0,0.011628
chr4|FGF2|+|123814916,0.0,0.006944


In [19]:
adf.query('gene_name == "CDKN2A"').query('disease == "HNSC"').sort_values('mclv')

Unnamed: 0,disease,gene_name,mkid,mclv,strand,aclv,N_on,T_on,N_off,T_off,fisher_exact_p,N_base,T_base,N_on_ratio,T_on_ratio,N2T_ratio_diff,diff_is_significant,N2T_ratio_change,N2T_ratio_change_sig,has_diff_aca,sc_list,src_list,num_sc
2528,HNSC,CDKN2A,chr9|CDKN2A|-|21967755,21967755,-,21967752,18,323,22,183,0.02630013,40,506,0.45,0.63834,0.18834,False,up,,True,"21968228, 21971002",protein_coding,2
2529,HNSC,CDKN2A,chr9|CDKN2A|-|21967992,21967992,-,21967995,28,468,12,38,7.536333e-05,40,506,0.7,0.924901,0.224901,True,up,up,True,"21968208, 21968228, 21971002",protein_coding,3
2530,HNSC,CDKN2A,chr9|CDKN2A|-|21968200,21968200,-,21968180,3,235,37,271,4.484753e-07,40,506,0.075,0.464427,0.389427,True,up,up,True,"21968228, 21974476","nonsense_mediated_decay, protein_coding",2
2531,HNSC,CDKN2A,chr9|CDKN2A|-|21968733,21968733,-,21968723,9,111,31,395,1.0,40,506,0.225,0.219368,-0.005632,False,down,,True,21968724,protein_coding,1
2532,HNSC,CDKN2A,chr9|CDKN2A|-|21969564,21969564,-,21969568,8,127,32,379,0.5702024,40,506,0.2,0.250988,0.050988,False,up,,True,21969731,protein_coding,1
2533,HNSC,CDKN2A,chr9|CDKN2A|-|21970697,21970697,-,21970715,16,90,24,416,0.001534828,40,506,0.4,0.177866,-0.222134,True,down,down,True,21970890,protein_coding,1
2534,HNSC,CDKN2A,chr9|CDKN2A|-|21994884,21994884,-,21970715,0,1,40,505,1.0,40,506,0.0,0.001976,0.001976,False,up,,True,21970890,protein_coding,1


In [20]:
adf.gene_name.unique().shape

(114,)

In [21]:
cdf = adf.copy()

In [22]:
cdf[['gene_name', 'disease']].drop_duplicates().shape

(1593, 2)

In [23]:
assert cdf[cdf.num_sc.isnull()].shape[0] == 0

So it's mostly zeros, which is a justification for separating them

**Note**: none of the cases with N2T_ratio_diff < 0.02 (the arrow head length) is a significant change, so they are left as they are.

In [24]:
# Attach the offset belonging to each aclv, then express mclv relative to it.
ddf = (
    cdf
    .merge(clv_sc_df[['aclv', 'offset']].drop_duplicates(), on='aclv')
    .assign(mclv_t=lambda frame: frame['mclv'] - frame['offset'])
)
# the merge must neither drop nor duplicate rows of cdf
assert len(ddf) == len(cdf)

In [25]:
arc_df = clv_sc_df.copy()
arw = ddf.copy() # arw: arrow

In [26]:
# BS: bootstrap
# RGB triplets scaled from 0-255 to the 0-1 range
BS_BLUE = [channel / 255 for channel in (2, 117, 216)]
BS_RED  = [channel / 255 for channel in (217, 83, 79)]

def trend2bgcol(grp):
    """Map the single trend value shared by a group's rows to a background colour.

    ``grp`` must expose a ``trend`` column holding exactly one distinct value:
    0 gives 'white', -1 gives BS_BLUE and 1 gives BS_RED.
    """
    distinct_trends = grp.trend.unique()
    assert distinct_trends.shape[0] == 1
    palette = {0: 'white', -1: BS_BLUE, 1: BS_RED}
    return palette[distinct_trends[0]]

trends_df = pd.read_csv('./results/aca_trends.csv')
# (gene_name, disease) -> background colour chosen by trend2bgcol
bg_color_dd = (
    trends_df
    .groupby(['gene_name', 'disease'])
    .apply(trend2bgcol)
    .to_dict()
)

In [27]:
trends_df.query('trend != 0').gene_name.unique().shape

(17,)

In [28]:
trends_df.trend.value_counts()

 0    45
-1    16
 1    16
Name: trend, dtype: int64

In [29]:
# Cytoband table (tab-separated, gzipped); column names are lower-cased and
# their spaces replaced by underscores.
df_cytoband = pd.read_csv(
    './reference_data/GRCh37.p13.mart_export.cytoband_info.txt.gz',
    sep='\t',
    compression='gzip',
)
df_cytoband.columns = [name.lower().replace(' ', '_') for name in df_cytoband.columns]

In [30]:
# Genes that have at least one row in df_clv, sorted by name.
TARGET_GENES = np.sort(df_clv.gene_name.unique()).tolist()
df_cyb = df_cytoband[df_cytoband.gene_name.isin(TARGET_GENES)].copy()
assert df_cyb.gene_name.unique().shape[0] == 114
# Every gene must map to exactly one karyotype band. The count is checked for
# all genes (inspecting only the first distinct count would let a gene with
# several bands slip through); dropna=False counts a missing band as a value.
bands_per_gene = df_cyb.groupby('gene_name').karyotype_band.nunique(dropna=False)
assert (bands_per_gene == 1).all()

# gene_name -> karyotype_band
CB_DD = dict(df_cyb[['gene_name', 'karyotype_band']].drop_duplicates().values)

In [31]:
# (leftover interactive help lookup removed; run `help(gridspec.GridSpec)` when the docs are needed)

In [32]:
# Render all figure text through LaTeX, loading the sfmath package in the preamble.
mpl.rc('text.latex', preamble=r'\usepackage{sfmath}')
mpl.rc('text', usetex = True)
mpl.rc('font', size=12, family='sans-serif')

# One figure is produced per (gene, disease) pair, in sorted order.
gd_pairs = ddf[['gene_name', 'disease']].drop_duplicates().sort_values(['gene_name', 'disease']).values.tolist()

print(len(gd_pairs))

# gd_pairs = [['AMER1', 'LUAD']]
# gd_pairs = [['GNAS', 'BRCA']]
ymax = 1

# Inner grid geometry: the bar panel takes the first `split_col` columns; the
# remaining columns are split by rows into arc / utr / arrow panels.
num_rows = 20
num_cols = 12
split_row1 = 9
split_row2 = 14
split_col = 4
gki = 0

for k, (gene, dise) in enumerate(gd_pairs):
#     if (gene, dise) not in PA.MAIN_PLOT_GD_PAIRS:
#         continue
    
    print('{0}: {1}, {2}'.format(k, gene, dise), end=',')
    fig = plt.figure(figsize=(11, 4))
    # single-cell outer grid: each figure holds exactly one gene/disease pair
    outer_grid = gridspec.GridSpec(1, 1, wspace=0.0, hspace=0.0)
    
    inner_grid = gridspec.GridSpecFromSubplotSpec(
        num_rows, num_cols, wspace=2, hspace=0.5, subplot_spec=outer_grid[gki])

    ax_bar = plt.Subplot(fig, inner_grid[3:-3, :split_col])
    ax_arc = plt.Subplot(fig, inner_grid[:split_row1, split_col:])
    ax_utr = plt.Subplot(fig, inner_grid[split_row1:split_row2, split_col:])
    ax_arw = plt.Subplot(fig, inner_grid[split_row2:, split_col:])

    fig.add_subplot(ax_bar)
    fig.add_subplot(ax_arc)
    fig.add_subplot(ax_utr)
    fig.add_subplot(ax_arw)

    # rows of this gene/disease pair; the bar panel gets its own copy
    gd_arw_df = ddf.query('gene_name == "{0}"'.format(gene)).query('disease == "{0}"'.format(dise))
    gd_bar_df = gd_arw_df.copy()

    # left part
    gd_expr_df = PB.filter_expr_df(df_expr, df_clv, gene, dise)
    # NOTE(review): gki is never changed from 0, so this is True only when
    # gd_pairs holds a single pair -- confirm that is the intended behaviour.
    set_xlabel = gki == len(gd_pairs) - 1
    PB.plot_bar_and_expr(ax_bar, gd_bar_df, gd_expr_df, set_expr_xlabel=set_xlabel, bg_color_dd=bg_color_dd, do_plot_expr=True)
    ax_bar.set_ylim([0, 1.4])
    ax_bar.tick_params(axis='x', labelsize=7)
    
    # right part
    gd_arc_df = arc_df.query('gene_name == "{0}"'.format(gene))

    xlim = PA.calc_xlim(gd_arc_df, gd_arw_df)
    gd_arc_df = PA.process_arc_df(gd_arc_df, ymax)
    # xlim is needed for calculating arrow head width
    gd_arw_df = PA.process_arrow_df(gd_arw_df, xlim=xlim)

    # stacking order of the three right-hand panels: arrows above utrs above arcs
    ax_arc.set_zorder(1)
    ax_utr.set_zorder(5)
    ax_arw.set_zorder(9)
    
    ax_arc.xaxis.offsetText.set_visible(True)

    PA.plot_arcs_stop_codons_and_strand(ax_arc, gd_arc_df, xlim)
    PU.plot_utrs(ax_utr, gd_arc_df, xlim)
    PC.plot_clvs(ax_arw, gd_arw_df, xlim, supp=True)
    
    # order of magnitude of the largest absolute tick, shown in the x caption
    ticks = ax_arc.get_xticks()
    # magnitude = np.floor(np.log10(ticks[-1] - ticks[0]))
    magnitude = np.floor(np.log10(max(abs(ticks[-1]), abs(ticks[0]))))
    
    # first '|'-separated field of mkid, e.g. 'chr9'
    seqname = gd_bar_df.mkid.apply(lambda v: v.split('|')[0]).unique()[0]
    
#     latex version: seems buggy, crash in the middle of for loop for no apparent reason...
    txt = r'{0} ($\mathrm{{10^{1}}}$ bp)'.format('{0}{1}'.format(
        seqname.replace('chr', ''), CB_DD[gene]), int(magnitude))
    ax_arw.text(0.5, 0, txt, va='top', ha='center',transform=ax_arw.transAxes)

#     # no latex version; additional changes need made in plot_bars.py
#     ax_arw.text(0.5, 0, r'{0} (10^{1} bp)'.format(
#             '{0}{1}'.format(seqname.replace('chr', ''), CB_DD[gene]),
#             int(magnitude)), va='top', ha='center',transform=ax_arw.transAxes)

    output = './results/figs/all-apa-cases-figures/{0}_{1}.png'.format(gene, dise)
    # save this figure explicitly rather than whatever pyplot considers current
    fig.savefig(output, bbox_inches='tight', dpi=200)
    print(output)
    # release resources: 
    # https://stackoverflow.com/questions/7101404/how-can-i-release-memory-after-creating-matplotlib-figures
    # it turns out fig.clf() is crucial, without which the memory would overflow.
    fig.clf()
    plt.close(fig)

1593
0: ABL1, BLCA,./results/figs/diff-apa-all-114-genes/ABL1_BLCA.png
1: ABL1, BRCA,./results/figs/diff-apa-all-114-genes/ABL1_BRCA.png
2: ABL1, COAD,./results/figs/diff-apa-all-114-genes/ABL1_COAD.png
3: ABL1, HNSC,./results/figs/diff-apa-all-114-genes/ABL1_HNSC.png
4: ABL1, KICH,./results/figs/diff-apa-all-114-genes/ABL1_KICH.png
5: ABL1, KIRC,./results/figs/diff-apa-all-114-genes/ABL1_KIRC.png
6: ABL1, KIRP,./results/figs/diff-apa-all-114-genes/ABL1_KIRP.png
7: ABL1, LIHC,./results/figs/diff-apa-all-114-genes/ABL1_LIHC.png
8: ABL1, LUAD,./results/figs/diff-apa-all-114-genes/ABL1_LUAD.png
9: ABL1, LUSC,./results/figs/diff-apa-all-114-genes/ABL1_LUSC.png
10: ABL1, PRAD,./results/figs/diff-apa-all-114-genes/ABL1_PRAD.png
11: ABL1, STAD,./results/figs/diff-apa-all-114-genes/ABL1_STAD.png
12: ABL1, THCA,./results/figs/diff-apa-all-114-genes/ABL1_THCA.png
13: ABL1, UCEC,./results/figs/diff-apa-all-114-genes/ABL1_UCEC.png
14: AKT1, BLCA,./results/figs/diff-apa-all-114-genes/AKT1_BLCA.png
