In [None]:
import pandas as pd
import datetime
import numpy as np
import seaborn as sns
import matplotlib.ticker as tick
import matplotlib.pyplot as plt
import matplotlib.patches as pplt
import matplotlib.cm as cm
import matplotlib.lines as lines
from matplotlib import rcParams as rcp
import plotly.graph_objects as go

# Setting variables

In [None]:
# Genes to analyse; set gene_name2 to '' to run with a single gene.
# For interactive use, replace the literals with input('Gene name 1: ') and
# input('Gene name 2 (if wanted, else press enter): ').
gene_name1 = 'brca1'
gene_name2 = 'brca2'

# ISO date (YYYY-MM-DD) of this run as a string
date = datetime.date.today().isoformat()

# joint annotation file of each gene
df1 = pd.read_csv(r'joint_files_out/' + gene_name1.lower() + '_joint_out.csv')
dfs = [df1]
gene_names = [gene_name1]
if gene_name2 != '':
    df2 = pd.read_csv(r'joint_files_out/' + gene_name2.lower() + '_joint_out.csv')
    dfs += [df2]
    gene_names += [gene_name2]

# SIFT_VEP and PolyPhen_VEP hold text and a number in one cell, split at '(':
# give each part its own column (<tool>_str, <tool>_num) and parse the number as float.
for table in dfs:
    for tool in ('SIFT_VEP', 'PolyPhen_VEP'):
        label_col, score_col = tool + '_str', tool + '_num'
        table[[label_col, score_col]] = table[tool].str.split('(', expand = True)
        table[score_col] = table[score_col].str.strip(')').astype(float)
    # make consequence terms readable: underscores -> spaces, a space after each comma
    table['Consequence_VEP'] = (table['Consequence_VEP']
                                .str.replace('_', ' ')
                                .str.replace(',', ', '))

In [None]:
# coding sequences as df: only rows that carry a protein position (one table per gene)
coding_dfs = [df.loc[df['Protein_position'].notnull()].copy() for df in dfs]
# unique positions (+ introns)
all_pos = [list(df['POS'].unique()) for df in dfs]

# Sort every coding table in reading direction and remember, per gene, whether
# that direction is descending in genomic coordinates.
revs = []
for df, cod in zip(dfs, coding_dfs):
    strand = df.iloc[0]['Strand']
    if strand not in ('+', '-'):
        # Previously such a gene was skipped silently, which left `revs` shorter
        # than `dfs` and misaligned the zip() further down.
        raise ValueError("Unexpected Strand value %r; expected '+' or '-'" % (strand,))
    reverse = strand == '-'
    cod.sort_values(by = ['POS'], ascending = not reverse, ignore_index = True, inplace = True)
    revs.append(reverse)
    cod['Protein_position'] = cod['Protein_position'].astype(int)
    cod['POS'] = cod['POS'].astype(int)

# unique coding positions list and length
codpos = [list(cod['POS'].unique()) for cod in coding_dfs]
cod_len = [len(codpos[0])]
if gene_name2 != '':
    # NOTE(review): the second gene's length has always been reduced by one while
    # the first gene's is not -- kept as-is, please confirm this is intended.
    cod_len.append(len(codpos[1]) - 1)

for pos, cpos, r in zip(all_pos, codpos, revs):
    pos.sort(reverse = r)
    cpos.sort(reverse = r)

# aa length per gene (number of distinct protein positions)
aa_len = [len(list(cod['Protein_position'].unique())) for cod in coding_dfs]

# Per-gene aliases used by later cells. The *2 names exist only when a second
# gene was requested (aa2_len used to be computed unconditionally, which raised
# a NameError for single-gene runs).
coding_df1 = coding_dfs[0]
all_pos1 = all_pos[0]
codpos1 = codpos[0]
cod1_len = cod_len[0]
aa1_len = aa_len[0]
if gene_name2 != '':
    coding_df2 = coding_dfs[1]
    all_pos2 = all_pos[1]
    codpos2 = codpos[1]
    cod2_len = cod_len[1]
    aa2_len = aa_len[1]

In [None]:
# search for exon start/stop
# A segment begins at a coding position whose predecessor (in reading direction)
# is not a coding position and ends where the successor is missing. Segments are
# therefore runs of consecutive values in `codpos`.
tuple_exons = []
for df, cpos in zip(dfs, codpos):
    # +1: reading direction follows increasing POS, -1: decreasing POS
    step = {'+': 1, '-': -1}.get(df.iloc[0]['Strand'])
    exons_beg = []
    exons_end = []
    if step is not None:
        cpos_set = set(cpos)  # constant-time membership instead of scanning the list
        for el in cpos:
            # Two independent checks: a one-base segment is both a begin and an end.
            # The former if/elif recorded only its begin, shifting every later pair.
            if el - step not in cpos_set:
                exons_beg.append(el)
            if el + step not in cpos_set:
                exons_end.append(el)
    # any other strand value yields an empty list, as before
    tuple_exons.append(list(zip(exons_beg, exons_end)))

tuple_exons1 = tuple_exons[0]
if len(tuple_exons) > 1:
    tuple_exons2 = tuple_exons[1]

In [None]:
# exon numbers position
# Alternate the vertical offset of the exon labels: 1st, 3rd, ... exon at -0.4,
# 2nd, 4th, ... exon at +0.4.
exon_sets = [tuple_exons1]
if gene_name2 != '':
    exon_sets.append(tuple_exons2)

txt_pos = []
for exon_set in exon_sets:
    txt_pos.append([0.4 if exon_idx % 2 else -0.4 for exon_idx in range(len(exon_set))])

txt_pos1 = txt_pos[0]
if gene_name2 != '':
    txt_pos2 = txt_pos[1]

# Overview of gene

In [None]:
# figure for exons and introns
def overview_fig(gene):
    """Draw and save a schematic gene model (exons on an intron bar) for one gene.

    Parameters
    ----------
    gene : str
        Gene name; must be an entry of the notebook-level ``gene_names`` list.

    Notes
    -----
    Reads the notebook-level ``gene_names``, ``dfs``, ``tuple_exons``, ``txt_pos``,
    ``cod_len``, ``aa_len`` and ``date``. The figure is written to
    ``figures/<date>_<gene>_exins_ov`` as .pdf, .jpg and .svg and is left open.

    Raises
    ------
    ValueError
        If ``gene`` is not in ``gene_names``.
    IndexError
        If no exon segments were found for the gene.
    """
    ind = gene_names.index(str(gene))
    exons = tuple_exons[ind]
    first_row = dfs[ind].iloc[0]
    strand = first_row['Strand']
    chrom = str(first_row['CHROM'])
    gene_start = exons[0][0]    # first position in reading direction
    gene_end = exons[-1][1]     # last position in reading direction
    span = abs(gene_end - gene_start)

    fig, ax = plt.subplots(figsize=(20, 7))
    plt.ylim([-1, 1])
    plt.axis('off')
    fig.suptitle('Gene model of $%s$ gene in reading direction, CDS = %s kb, protein = %s aa'
                 %(gene_names[ind].upper(), str(cod_len[ind]/1000), str('{:,.0f}'.format(aa_len[ind]))),
                 fontsize = 25, fontweight = 'bold')
    # for the '-' strand gene_start > gene_end, so this inverts the x axis
    plt.xlim(gene_start, gene_end)

    # +1 / -1 along the genomic coordinate; any other strand value skips the drawing
    direction = {'+': 1, '-': -1}.get(strand)
    if direction is not None:
        # Intron bar across the whole gene. The width is the gene span; it used to be
        # an absolute genomic coordinate that only looked right due to axis clipping.
        ax.add_patch(pplt.Rectangle((min(gene_start, gene_end), -.1), span, .2,
                                    fc = 'dodgerblue', ec = 'dodgerblue', alpha = 1, label = 'Introns'))
        ax.arrow(gene_start, -0.7, direction * (span - 1000), 0,
                 head_width = 0.05, head_length = 1000, fc = 'k', ec = 'k')
        for l, tup in enumerate(exons):
            low = min(int(tup[0]), int(tup[1]))
            ax.add_patch(pplt.Rectangle((low, -.3), abs(int(tup[0]) - int(tup[1])), .6, fc = 'orange',
                                        ec = 'orange', alpha = 1))
            # labels start at 2 -- presumably the first exon carries no coding
            # positions; TODO confirm
            plt.text(int(low - direction * 300), txt_pos[ind][l], str(l + 2), size = 22)
        plt.text(gene_end - direction * 23000, -0.6, 'End: CHROM ' + chrom +
                 ', POS ' + str('{:,.0f}'.format(gene_end)), size = 25)
        plt.text((gene_start + direction * span / 2) - direction * 8000, -0.6,
                 'Gene Length: ' + str(span/1000) + ' kb',
                 size = 25)

    plt.text(gene_start, 0.73, 'Exons', c = 'orange', size = 25, fontweight = 'bold')
    plt.text(gene_start, 0.6, 'Introns', c = 'dodgerblue', size = 25, fontweight = 'bold')

    plt.text(gene_start, -0.9, ('Reading Direction, "' + str(strand) + '" Strand' +
                                ', Transcript ' + str(first_row['Ref_Trans_VEP'])), size = 25)

    plt.text(gene_start, -0.6, 'Start: CHROM ' + chrom + ', POS ' +
             str('{:,.0f}'.format(gene_start)), size = 25)

    fig.subplots_adjust(bottom = 0.05, top = 0.95, left = 0.01, right = 0.95)

    for ext in ('pdf', 'jpg', 'svg'):
        plt.savefig(r'figures/' + date + '_' + gene_names[ind].lower() + '_exins_ov.' + ext)

for gene in gene_names:
    overview_fig(gene)

# Overview of annotations

In [None]:
# for pie chart with origin of variants, plus info of exon/intron
# Outer ring: number of rows per source. Inner ring: each source split into rows
# with (E) and without (I) an EXON annotation.
sizes, labels, cols = [], [], []
sizes_sg, labels_sg, cols_sg = [], [], []

for df in dfs:
    has_gnomad = df['AC_gnomAD'].notnull()
    has_clinvar = df['ClinVar_gnomAD'].notnull()
    in_exon = df['EXON'].notnull()
    # one boolean mask per wedge of the outer ring, in plotting order
    source_masks = [has_gnomad & has_clinvar,
                    has_gnomad & ~has_clinvar,
                    df['Cancer_Type_cBP'].notnull(),
                    df['Trans_Version_Flossies'].notnull(),
                    df['NCT_SAMPLE_NAME'].notnull()]

    sizes.append([len(df[source]) for source in source_masks])
    labels.append(['gnomAD (+ ClinVar)',
                   'gnomAD (w/o ClinVar)',
                   'cBioPortal',
                   'FLOSSIES',
                   'MASTER/NCT project'])
    cols.append(['red', 'orange', 'green', 'dodgerblue', 'magenta'])

    # exon count directly followed by the non-exon count of the same source
    sizes_sg.append([len(df[source & region])
                     for source in source_masks
                     for region in (in_exon, ~in_exon)])
    labels_sg.append(['E', 'I'] * 5)
    cols_sg.append(['lightcoral', 'lightsalmon', 'gold', 'bisque', 'yellowgreen', 'palegreen',
                    'deepskyblue', 'lightskyblue', 'violet', 'lightpink'])

In [None]:
def _draw_origin_donut(ax, title, outer_sizes, outer_labels, outer_colors, inner_sizes, inner_colors):
    """Draw the two-ring "origin of variants" donut on ``ax``.

    The outer ring shows one wedge per source, the inner ring the exon/intron split
    of each source. Returns the tuple from the inner ``ax.pie`` call.
    """
    ax.set_title(title, fontsize = 18, fontweight = 'bold', loc = 'left')
    ax.pie(outer_sizes, labels = outer_labels, autopct = '%1.1f%%', startangle = 180, pctdistance = 0.85,
           colors = outer_colors, counterclock = False, radius = 1, wedgeprops = dict(width = 0.3),
           textprops = {'fontsize':15})
    inner = ax.pie(inner_sizes, autopct = '%1.1f%%', startangle = 180, pctdistance = 0.8,
                   colors = inner_colors, textprops = {'fontsize':12, 'fontweight':'bold'},
                   counterclock = False, radius = 0.7, wedgeprops = dict(width = 0.3))
    # inner wedges alternate exon, intron: exon percentages white, intron ones blue
    for i, pct_text in enumerate(inner[2]):
        pct_text.set_color('mediumblue' if (i + 1) % 2 == 0 else 'white')
    return inner


two_genes = gene_name2 != ''

# plt.figure (not plt.subplots): plt.subplots would also create a full-figure Axes
# that stays behind the panels added below, and would bind a (Figure, Axes) tuple.
fig = plt.figure(figsize=(10, 12))

# pie plot in coding sequence + 75 bp padding for gene 1, exon vs. intron
ax1 = plt.subplot(2 if two_genes else 1, 1, 1)
piech1 = _draw_origin_donut(ax1, 'A: Origin of variants in $%s$' %gene_name1.upper(),
                            sizes[0], labels[0], cols[0], sizes_sg[0], cols_sg[0])
ax1.legend(handles = [pplt.Patch(label = 'Exons', fill = False, edgecolor = 'white'),
                      pplt.Patch(label = 'Introns', fill = False, edgecolor = 'mediumblue')],
           bbox_to_anchor = (0.7, 0.88), loc = 'lower left', ncol = 2,
           labelcolor = ['darkgray', 'mediumblue'], prop = {'weight':'bold', 'size':15})

out_stem = r'figures/' + date + '_' + gene_name1.lower()
if two_genes:
    # pie plot in coding sequence + 75 bp padding for gene 2, exon vs. intron
    ax2 = plt.subplot(2,1,2)
    piech2 = _draw_origin_donut(ax2, 'B: Origin of variants in $%s$' %gene_name2.upper(),
                                sizes[1], labels[1], cols[1], sizes_sg[1], cols_sg[1])
    out_stem += '_' + gene_name2.lower()

plt.tight_layout()
fig.subplots_adjust(bottom = 0.03, top = 0.95, left = 0.01, right = 0.97)

# single-gene runs have only ever been exported as pdf/jpg; kept that way
for ext in (('pdf', 'jpg', 'svg') if two_genes else ('pdf', 'jpg')):
    plt.savefig(out_stem + '_orig_vars.' + ext)

# Overview of ClinVar/VEP impact

In [None]:
#synthetic variants (expected) vs. variants in databases (observed)
def _consequence_counts(consequences):
    """Return the value counts of ``consequences`` as a one-column DataFrame.

    The column is always named 'Consequence_VEP' and the index is unnamed, so the
    result looks the same whether ``value_counts`` names its output after the input
    (pandas < 2.0) or 'count' (pandas >= 2.0).
    """
    return (consequences.value_counts()
            .rename('Consequence_VEP')
            .rename_axis(None)
            .to_frame())


all_vars = []
syn_vars_in = []
syn_vars_ex = []
db_vars_in = []
db_vars_ex = []
# value counts as df for all groups
for df in dfs:
    in_intron = df['INTRON'].notnull()
    in_exon = df['EXON'].notnull()
    is_synthetic = df['Trans_Version_Syn'].notnull()
    # Observed in at least one database. Fixes over the previous version:
    #  * every mask comes from the current gene's `df` (three used `df1`),
    #  * the OR group is evaluated before the region filter (`&` binds tighter
    #    than `|`, so the region used to restrict the NCT term only),
    #  * the exon group filters on EXON (it used INTRON).
    in_database = (df['Trans_Version_Flossies'].notnull() |
                   df['Cancer_Type_cBP'].notnull() |
                   df['AF_gnomAD'].notnull() |
                   df['NCT_SAMPLE_NAME'].notnull())

    all_vars.append(_consequence_counts(df['Consequence_VEP']))
    syn_vars_in.append(_consequence_counts(df['Consequence_VEP'][is_synthetic & in_intron]))
    syn_vars_ex.append(_consequence_counts(df['Consequence_VEP'][is_synthetic & in_exon]))
    db_vars_in.append(_consequence_counts(df['Consequence_VEP'][in_database & in_intron]))
    db_vars_ex.append(_consequence_counts(df['Consequence_VEP'][in_database & in_exon]))

# index as column, append right suffixes
# NOTE: el[1] is the second gene -- this comparison needs two genes to be loaded.
jnt_vars = []
sufx = ['_all', '_syn_in', '_syn_ex', '_db_in', '_db_ex']
for i,el in enumerate([all_vars,syn_vars_in,syn_vars_ex,db_vars_in,db_vars_ex]):
    for data in el:
        data['index_1'] = data.index
    jnt_vars.append(el[0].merge(el[1], on = 'index_1', suffixes = [(sufx[i]+'_1'), (sufx[i]+'_2')],
                                how = 'outer'))
# merge all together: one row per consequence term, one count column per group and gene
merged_vars = jnt_vars[0]
for group_frame in jnt_vars[1:]:
    merged_vars = merged_vars.merge(group_frame, on = 'index_1', how = 'outer')

# bar colours: intron-type consequences blue, everything else orange
intron_terms = ['intron variant', 'splice region variant, intron variant', 'splice donor variant',
                'splice acceptor variant']
inex_cols = ['dodgerblue' if x in intron_terms else 'orange' for x in merged_vars['index_1']]

In [None]:
# variant consequences: relative values
# totals per gene (intron + exon counts) of the synthetic and the database groups
ct_syn_all1 = sum(merged_vars[c].sum() for c in ('Consequence_VEP_syn_in_1', 'Consequence_VEP_syn_ex_1'))
ct_syn_all2 = sum(merged_vars[c].sum() for c in ('Consequence_VEP_syn_in_2', 'Consequence_VEP_syn_ex_2'))
ct_db_all1 = sum(merged_vars[c].sum() for c in ('Consequence_VEP_db_in_1', 'Consequence_VEP_db_ex_1'))
ct_db_all2 = sum(merged_vars[c].sum() for c in ('Consequence_VEP_db_in_2', 'Consequence_VEP_db_ex_2'))
# (optional database-to-synthetic ratios: ct_db_all1/ct_syn_all1, ct_db_all2/ct_syn_all2)

# in percent per each group: (count column, suffix of the new column, group total)
rel_spec = [('Consequence_VEP_syn_in_1', '_relsyn', ct_syn_all1),
            ('Consequence_VEP_db_in_1', '_reldb', ct_db_all1),
            ('Consequence_VEP_syn_in_2', '_relsyn', ct_syn_all2),
            ('Consequence_VEP_db_in_2', '_reldb', ct_db_all2),
            ('Consequence_VEP_syn_ex_1', '_relsyn', ct_syn_all1),
            ('Consequence_VEP_db_ex_1', '_reldb', ct_db_all1),
            ('Consequence_VEP_syn_ex_2', '_relsyn', ct_syn_all2),
            ('Consequence_VEP_db_ex_2', '_reldb', ct_db_all2)]
for count_col, new_suffix, group_total in rel_spec:
    merged_vars[count_col + new_suffix] = merged_vars[count_col]/group_total*100

In [None]:
def _clinvar_counts(df, region_col = None):
    """Count the ClinVar classes of ``df`` and return them as a one-column DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Joint table of one gene with a 'ClinVar_gnomAD' column.
    region_col : str, optional
        If given (the cells below pass 'EXON' or 'INTRON'), only rows where this
        column is not null are counted.

    The column is always named 'ClinVar_gnomAD' and the index is unnamed, so the
    result does not depend on how the installed pandas names ``value_counts`` output.
    """
    keep = df['ClinVar_gnomAD'].notnull()
    if region_col is not None:
        keep = keep & df[region_col].notnull()
    return (df['ClinVar_gnomAD'][keep].value_counts()
            .rename('ClinVar_gnomAD')
            .rename_axis(None)
            .to_frame())


# for pathogenicity by ClinVar, all, exons, introns
path1_val = df1['ClinVar_gnomAD'][df1['ClinVar_gnomAD'].notnull()].value_counts()
path1_val_df = _clinvar_counts(df1)
path1_val_ex1_df = _clinvar_counts(df1, 'EXON')
path1_val_in1_df = _clinvar_counts(df1, 'INTRON')
# for impact by VEP
imp1 = df1['Consequence_VEP'][df1['Consequence_VEP'].notnull()].value_counts()

# for gene 2
if gene_name2 != '':
    path2_val = df2['ClinVar_gnomAD'][df2['ClinVar_gnomAD'].notnull()].value_counts()
    path2_val_df = _clinvar_counts(df2)
    path2_val_ex2_df = _clinvar_counts(df2, 'EXON')
    path2_val_in2_df = _clinvar_counts(df2, 'INTRON')
    imp2 = df2['Consequence_VEP'][df2['Consequence_VEP'].notnull()].value_counts()

In [None]:
# pathogenicity, ClinVar order
# NOTE: uses the gene-2 frames and cod2_len unconditionally -- needs two genes.
for el in [path1_val_df, path1_val_ex1_df, path1_val_in1_df, path2_val_df, path2_val_ex2_df,
           path2_val_in2_df]:
    el['index_1'] = el.index

# Outer merges: a class that occurs only in exons or only in introns must stay in
# the table. The default inner merge used before dropped such classes completely.
all_pat1_df = path1_val_df.merge(path1_val_ex1_df, on = 'index_1', how = 'outer',
                                 suffixes = ['_all1', '_ex1']
                  ).merge(path1_val_in1_df, on = 'index_1', how = 'outer')
all_pat1_df = all_pat1_df.rename(columns = {'ClinVar_gnomAD':'ClinVar_gnomAD_in1'})
all_pat2_df = path2_val_df.merge(path2_val_ex2_df, on = 'index_1', how = 'outer',
                                 suffixes = ['_all2', '_ex2']
                  ).merge(path2_val_in2_df, on = 'index_1', how = 'outer')
all_pat2_df = all_pat2_df.rename(columns = {'ClinVar_gnomAD':'ClinVar_gnomAD_in2'})
all_pat_df = all_pat1_df.merge(all_pat2_df, on = 'index_1', how = 'outer')

# A class missing from a group means zero variants there. Filling with 0 keeps the
# stacked bars drawable (a NaN `bottom` would hide the bar stacked on top of it).
count_cols = ['ClinVar_gnomAD_all1', 'ClinVar_gnomAD_ex1', 'ClinVar_gnomAD_in1',
              'ClinVar_gnomAD_all2', 'ClinVar_gnomAD_ex2', 'ClinVar_gnomAD_in2']
all_pat_df[count_cols] = all_pat_df[count_cols].fillna(0)

# --> with these stacked bar plot
# classes not listed here become NaN in 'index_1' and sort to the end
clinvar_order = pd.CategoricalDtype(['Pathogenic', 'Pathogenic/Likely pathogenic',
                                     'Likely pathogenic',
                                     'Conflicting interpretations of pathogenicity',
                                     'Likely benign', 'Benign/Likely benign', 'Benign',
                                     'Uncertain significance', 'not provided'], ordered = True)
all_pat_df['index_1'] = all_pat_df['index_1'].astype(clinvar_order)
all_pat_df = all_pat_df.sort_values('index_1')

# relative to 1 kb
for el in ['ClinVar_gnomAD_all1', 'ClinVar_gnomAD_ex1', 'ClinVar_gnomAD_in1']:
    all_pat_df[el+'_rel'] = all_pat_df[el]/(cod1_len/1000)

for el in ['ClinVar_gnomAD_all2', 'ClinVar_gnomAD_ex2', 'ClinVar_gnomAD_in2']:
    all_pat_df[el+'_rel'] = all_pat_df[el]/(cod2_len/1000)

In [None]:
def _draw_impact_panel(ax, gene_no, gene_label, title):
    """Grouped bars of expected (plain) vs. observed (hatched) consequence shares.

    ``gene_no`` (1 or 2) selects the ``merged_vars`` columns, ``gene_label`` is the
    legend label of the bars. The y axis is logarithmic.
    """
    positions = np.arange(len(merged_vars['index_1']))
    width = 0.35
    ax.set_xticks(positions)
    ax.set_title(title, fontsize = 14, fontweight = 'bold', loc = 'left')
    # (column template, x offset, hatch); intron and exon bars of a group share a slot
    bar_spec = [('Consequence_VEP_syn_in_%d_relsyn', -width/2, None),
                ('Consequence_VEP_syn_ex_%d_relsyn', -width/2, None),
                ('Consequence_VEP_db_in_%d_reldb', width/2, 'x'),
                ('Consequence_VEP_db_ex_%d_reldb', width/2, 'x')]
    for template, offset, hatch in bar_spec:
        ax.bar(positions + offset, merged_vars[template % gene_no], color = inex_cols, width = width,
               label = gene_label, hatch = hatch, edgecolor = 'black')
    ax.set_ylabel('Amount in % (log scale)', fontsize = 13)
    ax.set_yscale('log')


def _draw_pathogenicity_panel(ax, gene_no, title):
    """Stacked bars (exons below, introns on top) of ClinVar classes per 1 kb CDS.

    ``gene_no`` (1 or 2) selects the ``all_pat_df`` columns.
    """
    positions = np.arange(len(all_pat_df['index_1']))
    ax.set_xticks(positions)
    ax.set_xticklabels(all_pat_df['index_1'], rotation = 45, ha = 'right')
    ax.set_title(title, fontsize = 14, fontweight = 'bold', loc = 'left')
    exon_rel = all_pat_df['ClinVar_gnomAD_ex%d_rel' % gene_no]
    ax.bar(positions, exon_rel, label = 'exons', color = 'orange', edgecolor = 'black')
    ax.bar(positions, all_pat_df['ClinVar_gnomAD_in%d_rel' % gene_no],
           label = 'introns', bottom = exon_rel,
           color = 'dodgerblue', edgecolor = 'black')
    ax.set_ylabel('Variant count (rel. to 1 kb CDS)', fontsize = 13)


# One figure holding the 2x2 grid. The former plt.figure() + plt.subplots(figsize=...)
# pair left an empty figure open and put an extra full-size Axes behind the grid.
fig = plt.figure(figsize=(12, 12))
ax1 = plt.subplot(2,2,1)
ax2 = plt.subplot(2,2,3, sharey = ax1, sharex = ax1)
ax3 = plt.subplot(2,2,2)
ax4 = plt.subplot(2,2,4, sharey = ax3, sharex = ax3)

or_leg = pplt.Patch(color = 'orange', label = 'in exons')
blue_leg = pplt.Patch(color = 'dodgerblue', label = 'in introns')
g1_leg = pplt.Patch(color = 'black', label = 'all possible variants', fill = False)
g2_leg = pplt.Patch(color = 'black', hatch = 'x', label = 'observed variants', fill = False)

# impact of variants gene 1 (expected vs. observed); x labels are shown on panel C only
_draw_impact_panel(ax1, 1, '$%s$' %gene_name1.upper(), 'A: Impact $%s$' %gene_name1.upper())
ax1.get_xaxis().set_visible(False)

# impact of variants gene 2 (expected vs. observed)
_draw_impact_panel(ax2, 2, '$%s$' %gene_name2.upper(), 'C: Impact $%s$' %gene_name2.upper())
ax2.set_xticklabels(merged_vars['index_1'], rotation = 45, ha = 'right')

# pathogenicity gene 1; x labels are shown on panel D only
_draw_pathogenicity_panel(ax3, 1, ('B: Pathogenicity $%s$' %gene_name1.upper()))
ax3.get_xaxis().set_visible(False)

# pathogenicity gene 2
_draw_pathogenicity_panel(ax4, 2, ('D: Pathogenicity $%s$' %gene_name2.upper()))

ax1.legend(handles = [g1_leg, g2_leg, blue_leg, or_leg], bbox_to_anchor = (-0.2, 1.08), loc = 3, ncol = 4)

plt.tight_layout()
fig.subplots_adjust(bottom = 0.25, top = 0.92, left = 0.08, right = 0.97)

for ext in ('pdf', 'jpg', 'svg'):
    plt.savefig(r'figures/' + date + '_' + gene_name1.lower() + '_' + gene_name2.lower() +
                '_vars_cons_impact.' + ext)


# Consequences of pathogenic variants

In [None]:
def _value_counts_frame(values, column):
    """Return ``values.value_counts()`` as a DataFrame with one column named ``column``.

    The index is left unnamed, so the result has the same layout whether pandas
    names ``value_counts`` output after the input (< 2.0) or 'count' (>= 2.0).
    """
    return values.value_counts().rename(column).rename_axis(None).to_frame()


# consequences of pathogenic variants
cons_pat1_df = _value_counts_frame(df1['Consequence_VEP'][df1['ClinVar_gnomAD'] == 'Pathogenic'],
                                   'Consequence_VEP')
# clinvar annotations of most common consequence
clinv_miss1_df = _value_counts_frame(df1['ClinVar_gnomAD'][df1['Consequence_VEP'] == 'missense variant'],
                                     'ClinVar_gnomAD')

if gene_name2 != '':
    cons_pat2_df = _value_counts_frame(df2['Consequence_VEP'][df2['ClinVar_gnomAD'] == 'Pathogenic'],
                                       'Consequence_VEP')
    clinv_miss2_df = _value_counts_frame(df2['ClinVar_gnomAD'][df2['Consequence_VEP'] == 'missense variant'],
                                         'ClinVar_gnomAD')
# index, merge
# NOTE: the gene-2 frames are required from here on -- the comparison needs two genes.
for el in [cons_pat1_df, cons_pat2_df, clinv_miss1_df, clinv_miss2_df]:
    el['index_1'] = el.index
cons_patall_df = cons_pat1_df.merge(cons_pat2_df, on = 'index_1', how = 'outer',
                                    suffixes = ('_1', '_2'))

clinv_missall_df = clinv_miss1_df.merge(clinv_miss2_df, on = 'index_1', how = 'outer',
                                        suffixes = ('_1', '_2'))
# order the ClinVar classes as defined by `clinvar_order` (set up in an earlier cell)
clinv_missall_df['index_1'] = clinv_missall_df['index_1'].astype(clinvar_order)
clinv_missall_df = clinv_missall_df.sort_values('index_1')

In [None]:
# variant consequences: relative values
# total number of pathogenic variants per gene
ct_p_all1, ct_p_all2 = (cons_patall_df[count_col].sum()
                        for count_col in ('Consequence_VEP_1', 'Consequence_VEP_2'))

# per kb: counts divided by the gene's coding length in kb
for count_col, cds_len in (('Consequence_VEP_1', cod1_len), ('Consequence_VEP_2', cod2_len)):
    cons_patall_df[count_col + '_rel'] = cons_patall_df[count_col]/(cds_len/1000)

In [None]:
# for splice variants

# copy of the consequence table with missing counts set to 0
cons_patall_df_onan = cons_patall_df.fillna(0)

# rows whose consequence term mentions 'splice'
is_splice = cons_patall_df_onan['index_1'].str.contains('splice', regex = False)

# sum of relative splice values
spl_cons_pat1 = cons_patall_df_onan.loc[is_splice, 'Consequence_VEP_1_rel'].sum()
spl_cons_pat2 = cons_patall_df_onan.loc[is_splice, 'Consequence_VEP_2_rel'].sum()
# sum of whole splice values
spl_cons_pat1_wv = cons_patall_df_onan.loc[is_splice, 'Consequence_VEP_1'].sum()
spl_cons_pat2_wv = cons_patall_df_onan.loc[is_splice, 'Consequence_VEP_2'].sum()

# Add the pooled row. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
all_splice_row = pd.DataFrame([{'index_1':'all splice variants',
                                'Consequence_VEP_1':spl_cons_pat1_wv,
                                'Consequence_VEP_2':spl_cons_pat2_wv,
                                'Consequence_VEP_1_rel':spl_cons_pat1,
                                'Consequence_VEP_2_rel':spl_cons_pat2}])
cons_patall_df_onan = pd.concat([cons_patall_df_onan, all_splice_row], ignore_index = True)

In [None]:
# consequences of pathogenic variants in gene 1,2
# Unpack plt.subplots so `fig` is the Figure (it used to be a (Figure, Axes) tuple)
# and reuse its Axes instead of requesting it again via plt.subplot(1,1,1).
fig, ax4 = plt.subplots(figsize=(11, 6))
ar4 = np.arange(len(cons_patall_df['index_1']))
wid4 = 0.35
ax4.set_xticks(ar4)
ax4.set_xticklabels(cons_patall_df['index_1'], rotation = 45, ha = 'right', fontsize = 13)

ax4.set_title('Consequences of pathogenic variants in $%s$ and $%s$ (relative to 1 kb CDS)'
              %(gene_name1.upper(),gene_name2.upper()), fontsize = 15, fontweight = 'bold')

# one pair of bars per consequence: gene 1 left (white), gene 2 right (gray)
ax4.bar(ar4-wid4/2, cons_patall_df['Consequence_VEP_1_rel'], width = wid4, color = 'white',
        edgecolor = 'black', label = '$%s$' %gene_name1.upper())
ax4.bar(ar4+wid4/2, cons_patall_df['Consequence_VEP_2_rel'], width = wid4, color = 'gray',
        edgecolor = 'black', label = '$%s$' %gene_name2.upper())
ax4.set_ylabel('Variant count (log scale)', fontsize = 13)
ax4.set_yscale('log')
ax4.legend(ncol = 2, loc = 'upper center')
plt.tight_layout();

for ext in ('pdf', 'jpg'):
    plt.savefig(r'figures/' + date + '_' + gene_name1.lower() + '_' + gene_name2.lower() +
                '_cons_pat.' + ext)

In [None]:
# for splice variants, together

# keep every non-splice consequence plus the pooled 'all splice variants' row
not_spl_index = [el for el in cons_patall_df_onan.index_1 if not 'splice' in el]
not_spl_index.append('all splice variants')
# select the rows once instead of re-evaluating the filter for every plotting call
cons_pat_plot_df = cons_patall_df_onan[cons_patall_df_onan['index_1'].isin(not_spl_index)]

# consequences of pathogenic variants in gene 1,2
# Unpack plt.subplots so `fig` is the Figure (it used to be a (Figure, Axes) tuple)
# and reuse its Axes instead of requesting it again via plt.subplot(1,1,1).
fig, ax4 = plt.subplots(figsize=(11, 6))
ar4 = np.arange(len(cons_pat_plot_df['index_1']))
wid4 = 0.35
ax4.set_xticks(ar4)
ax4.set_xticklabels(cons_pat_plot_df['index_1'], rotation = 45, ha = 'right', fontsize = 13)

ax4.set_title('Consequences of pathogenic variants in $%s$ and $%s$ (relative to 1 kb CDS)'
              %(gene_name1.upper(),gene_name2.upper()), fontsize = 15, fontweight = 'bold')

# one pair of bars per consequence: gene 1 left (white), gene 2 right (gray)
ax4.bar(ar4-wid4/2, cons_pat_plot_df['Consequence_VEP_1_rel'],
        width = wid4, color = 'white', edgecolor = 'black', label = '$%s$' %gene_name1.upper())
ax4.bar(ar4+wid4/2, cons_pat_plot_df['Consequence_VEP_2_rel'],
        width = wid4, color = 'gray', edgecolor = 'black', label = '$%s$' %gene_name2.upper())
ax4.set_ylabel('Variant count (log scale)', fontsize = 13)
ax4.set_yscale('log')
ax4.legend(ncol = 2, loc = 'upper center')
plt.tight_layout();

for ext in ('pdf', 'jpg'):
    plt.savefig(r'figures/' + date + '_' + gene_name1.lower() + '_' + gene_name2.lower() +
                '_cons_pat_spl.' + ext)

## Location of variants characterized as pathogenic

In [None]:
fig = plt.subplots(figsize=(12, 8))
if gene_name2 != '':
    ax1 = plt.subplot(2,1,1)
else:
    ax1 = plt.subplot(1,1,1)
# location of pathogenic variants with frameshift/stop gained gene 1
plt.suptitle('Location of as pathogenic characterized variants', fontsize=15, fontweight = 'bold', y = 0.95)
# Panel A: pathogenic variants of gene 1 along the chromosome, one scatter
# series (own marker / transparency) per VEP consequence.
ax1.set_title('A: $%s$' %gene_name1.upper(), fontsize=12, 
              fontweight = 'bold', y=1.2, loc = 'left')
is_pathogenic1 = df1['ClinVar_gnomAD'] == 'Pathogenic'
# (consequence, marker, alpha) - drawn in this order, so the default colour
# cycle assigns the same colours as four separate scatter calls would
for csq, csq_marker, csq_alpha in [('frameshift variant', 'o', 0.5),
                                   ('stop gained', 'v', 0.5),
                                   ('splice acceptor variant', 'x', 0.8),
                                   ('splice donor variant', 'd', 0.8)]:
    csq_mask = is_pathogenic1 & (df1['Consequence_VEP'] == csq)
    ax1.scatter(x = df1.loc[csq_mask, 'POS'], 
                y = df1.loc[csq_mask, 'CADD_PHRED_VEP'], 
                alpha = csq_alpha, label = csq, marker = csq_marker)
# legend sits above the panel, anchored in axes coordinates
plt.legend(bbox_to_anchor = (0.07, 1.4), loc = 'lower left', ncol = 5, fontsize = 12)

ax1.set_xlabel('Nucleic position on chromosome %s at $%s$ region' %(str(df1.iloc[0]['CHROM']),gene_name1.upper()), fontsize = 12)
ax1.set_ylabel('CADD phred score (VEP)', fontsize = 12)
# plain style first, then thousands separators on the genomic coordinate axis
ax1.ticklabel_format(style = 'plain')
ax1.get_xaxis().set_major_formatter(tick.StrMethodFormatter('{x:,.0f}'))

# Panel B (only when a second gene is loaded), exon overlay on every panel,
# then export of the figure.
if gene_name2 != '':
    # consequences of pathogenic variants gene 2
    ax2 = plt.subplot(2,1,2, sharey = ax1)
    ax2.set_title('B: $%s$' %gene_name2.upper(), fontsize=12, 
                  fontweight = 'bold', y=1.2, loc = 'left')
    is_pathogenic2 = df2['ClinVar_gnomAD'] == 'Pathogenic'
    # (consequence, marker, alpha), same order and styles as panel A
    for csq, csq_marker, csq_alpha in [('frameshift variant', 'o', 0.5),
                                       ('stop gained', 'v', 0.5),
                                       ('splice acceptor variant', 'x', 0.8),
                                       ('splice donor variant', 'd', 0.8)]:
        csq_mask = is_pathogenic2 & (df2['Consequence_VEP'] == csq)
        ax2.scatter(x = df2.loc[csq_mask, 'POS'], 
                    y = df2.loc[csq_mask, 'CADD_PHRED_VEP'], 
                    alpha = csq_alpha, label = csq, marker = csq_marker)
    ax2.set_xlabel('Nucleic position on chromosome %s at $%s$ region' %(str(df2.iloc[0]['CHROM']),gene_name2.upper()), fontsize = 12)
    ax2.set_ylabel('CADD phred score (VEP)', fontsize = 12)
    ax2.ticklabel_format(style = 'plain')
    ax2.get_xaxis().set_major_formatter(tick.StrMethodFormatter('{x:,.0f}'))
    # NOTE(review): handles/labels/patch are built but not used anywhere in
    # this cell - presumably meant for an 'Exons' legend entry; confirm.
    handles, labels = plt.gca().get_legend_handles_labels()
    patch = pplt.Patch(color='grey', label='Exons')
    handles.extend([patch])

# Exon overlay. Previously this only ran when a second gene was set, so a
# single-gene figure had no exon shading at all.
# y-limits are read once, before any patch is added; with sharey they are the
# same for both panels.
(y_min1, y_max1) = ax1.get_ylim()
exon_panels = [(ax1, tuple_exons1, df1.iloc[0]['Strand'])]
if gene_name2 != '':
    exon_panels.append((ax2, tuple_exons2, df2.iloc[0]['Strand']))
for panel_ax, panel_exons, panel_strand in exon_panels:
    if panel_strand not in ('+', '-'):
        continue
    for exon_idx, tup in enumerate(panel_exons):
        exon_a, exon_b = int(tup[0]), int(tup[1])
        # the rectangle is anchored at tup[0] on '+' and at tup[1] on '-'
        rect_x = exon_a if panel_strand == '+' else exon_b
        panel_ax.add_patch(pplt.Rectangle((rect_x, -.3), abs(exon_a - exon_b), 55, 
                                          fc = 'black', ec = 'black', alpha = 0.1))
        # labels count from 2 and alternate between two heights; offset and
        # font size are now the same for both strands and both panels
        label_lift = 1 if exon_idx % 2 == 0 else 5
        panel_ax.text(exon_a - 500, y_max1 + label_lift, str(exon_idx + 2), size = 12)

plt.tight_layout();
if gene_name2 != '':
    loc_pat_stem = r'figures/' + date + '_' + gene_name1.lower() + '_' + gene_name2.lower() + '_loc_pat'
    loc_pat_exts = ('.pdf', '.jpg', '.svg')
else:
    loc_pat_stem = r'figures/' + date + '_' + gene_name1.lower() + '_loc_pat'
    loc_pat_exts = ('.pdf', '.jpg')
for loc_pat_ext in loc_pat_exts:
    plt.savefig(loc_pat_stem + loc_pat_ext)

# Deleteriousness of variants

## gnomAD allele frequency vs. CADD phred score

In [None]:
# Ratio of gnomAD allele frequency to CADD phred score for every variant that
# has both values, plus a lower / upper percentile of that ratio per gene.
df1_af_cadd1 = df1[(df1['AF_gnomAD'].notnull())&(df1['CADD_PHRED_VEP'].notnull())].copy()
df1_af_cadd1['AF_to_CADD'] = df1_af_cadd1['AF_gnomAD']/df1_af_cadd1['CADD_PHRED_VEP']
p1_l = np.percentile(df1_af_cadd1['AF_to_CADD'], 10)
p1_h = np.percentile(df1_af_cadd1['AF_to_CADD'], 90)
if gene_name2 != '':
    df2_af_cadd2 = df2[(df2['AF_gnomAD'].notnull())&(df2['CADD_PHRED_VEP'].notnull())].copy()
    df2_af_cadd2['AF_to_CADD'] = df2_af_cadd2['AF_gnomAD']/df2_af_cadd2['CADD_PHRED_VEP']
    # fixed: the lower bound for gene 2 was taken from gene 1's ratios
    # (df1_af_cadd1), while the upper bound already used gene 2
    # NOTE(review): gene 1 uses the 10th/90th percentile, gene 2 the 5th/95th -
    # confirm that this difference is intended.
    p2_l = np.percentile(df2_af_cadd2['AF_to_CADD'], 5)
    p2_h = np.percentile(df2_af_cadd2['AF_to_CADD'], 95)

In [None]:
# Per gene (same order as dfs): rows that carry annotations from several
# sources at once. Each list gets one filtered frame per entry of dfs.
df_inall, df_clin_nct_cbp, df_nct_cbp = [], [], []
for df in dfs:
    # build the masks from the least to the most restrictive combination
    has_nct_cbp = df['NCT_SAMPLE_NAME'].notnull() & df['Annotation_cBP'].notnull()
    has_clin_nct_cbp = df['ClinVar_gnomAD'].notnull() & has_nct_cbp
    has_all = has_clin_nct_cbp & df['Trans_Version_Flossies'].notnull()
    df_inall.append(df[has_all])                    # author's note: 1:21, 2:17
    df_clin_nct_cbp.append(df[has_clin_nct_cbp])    # author's note: 1:31, 2:30
    df_nct_cbp.append(df[has_nct_cbp])              # author's note: 1:38, 2:35


In [None]:
# Variants with a ClinVar class, with that column cast to clinvar_order and
# the rows sorted by it (afc1 = gene 1, afc2 = gene 2 when present).
afc1 = (
    df1[df1['ClinVar_gnomAD'].notnull()]
    .assign(ClinVar_gnomAD = lambda frame: frame['ClinVar_gnomAD'].astype(clinvar_order))
    .sort_values('ClinVar_gnomAD')
)

if gene_name2 != '':
    afc2 = (
        df2[df2['ClinVar_gnomAD'].notnull()]
        .assign(ClinVar_gnomAD = lambda frame: frame['ClinVar_gnomAD'].astype(clinvar_order))
        .sort_values('ClinVar_gnomAD')
    )

In [None]:
# parameters for scatter plots
# one row per ClinVar class: (class label, colour, marker, z-order); the four
# parallel lists used by the plotting cells are the columns of this table
_clinvar_styles = [
    ('Pathogenic', 'firebrick', 'o', 7),
    ('Pathogenic/Likely pathogenic', 'red', 'v', 6),
    ('Likely pathogenic', 'orange', 'x', 5),
    ('Conflicting interpretations of pathogenicity', 'yellow', 'd', 4),
    ('Likely benign', 'greenyellow', '<', 3),
    ('Benign/Likely benign', 'limegreen', 's', 2),
    ('Benign', 'green', 'D', 1),
]
cataf, colaf, markaf, zord = (list(column) for column in zip(*_clinvar_styles))

In [None]:
# allele frequency (gnomAD) and cadd score of variants, styled by ClinVar class
# plt.figure instead of plt.subplots: the latter also creates a full-size Axes,
# and recent matplotlib no longer auto-removes it when plt.subplot() puts
# panels on top (auto-removal deprecated in 3.6).
fig = plt.figure(figsize = (12,5))
plt.suptitle('Allele Frequency (gnomAD) to CADD phred score (VEP)', fontsize = 15, fontweight = 'bold', 
             x = 0.4)

# afc2 only exists when a second gene was set, so panel B is optional
af_panels = [('A', gene_name1, afc1)]
if gene_name2 != '':
    af_panels.append(('B', gene_name2, afc2))

for panel_no, (panel_letter, panel_gene, panel_df) in enumerate(af_panels, start = 1):
    if panel_no == 1:
        ax1 = panel_ax = plt.subplot(1,2,1)
    else:
        ax2 = panel_ax = plt.subplot(1,2,2, sharex = ax1, sharey = ax1)
    panel_ax.set_title(label = '%s: $%s$' %(panel_letter, panel_gene.upper()), fontsize = 12, 
                       fontweight = 'bold', loc = 'left')
    # one series per ClinVar class; zord puts the pathogenic classes on top
    for cat,col,mark,z in zip(cataf, colaf, markaf, zord):
        in_cat = panel_df['ClinVar_gnomAD'] == cat
        panel_ax.scatter(x = panel_df.loc[in_cat, 'CADD_PHRED_VEP'], 
                         y = panel_df.loc[in_cat, 'AF_gnomAD'], 
                         c = col, label = cat, marker = mark, alpha = 0.6, zorder = z)
    panel_ax.set_yscale('log')
    panel_ax.set_ylabel('Allele Frequency (gnomAD, log scale)', fontsize = 12)
    panel_ax.set_xlabel('CADD phred score (VEP)', fontsize = 12)
    panel_ax.tick_params(labelsize = 12)
    if panel_no > 1:
        # y axis is shared with panel A, so the duplicate axis is hidden
        panel_ax.get_yaxis().set_visible(False)
# legend belongs to the last panel drawn and is placed to its right
panel_ax.legend(bbox_to_anchor = (1.01, 1.02), loc = 'upper left')
plt.tight_layout();

if len(gene_names) > 1:
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_' + gene_names[1] + '_cadd_af.pdf')
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_' + gene_names[1] + '_cadd_af.jpg')
else:
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_cadd_af.pdf')
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_cadd_af.jpg')

In [None]:
# allele frequency of Flossies and cadd score of variants, styled by ClinVar class
# plt.figure instead of plt.subplots: the latter also creates a full-size Axes,
# and recent matplotlib no longer auto-removes it when plt.subplot() puts
# panels on top (auto-removal deprecated in 3.6).
fig = plt.figure(figsize = (12,5))
plt.suptitle('Allele Frequency (FLOSSIES) to CADD phred score (VEP)', fontsize = 15, fontweight = 'bold', 
             x = 0.4)

# afc2 only exists when a second gene was set, so panel B is optional
flos_panels = [('A', gene_name1, afc1)]
if gene_name2 != '':
    flos_panels.append(('B', gene_name2, afc2))

for panel_no, (panel_letter, panel_gene, panel_df) in enumerate(flos_panels, start = 1):
    if panel_no == 1:
        ax1 = panel_ax = plt.subplot(1,2,1)
    else:
        ax2 = panel_ax = plt.subplot(1,2,2, sharex = ax1, sharey = ax1)
    panel_ax.set_title(label = '%s: $%s$' %(panel_letter, panel_gene.upper()), fontsize = 12, 
                       fontweight = 'bold', loc = 'left')
    # one series per ClinVar class; zord puts the pathogenic classes on top
    for cat,col,mark,z in zip(cataf, colaf, markaf, zord):
        in_cat = panel_df['ClinVar_gnomAD'] == cat
        panel_ax.scatter(x = panel_df.loc[in_cat, 'CADD_PHRED_VEP'], 
                         y = panel_df.loc[in_cat, 'Overall_Frequency_Flossies'], 
                         c = col, label = cat, marker = mark, alpha = 0.6, zorder = z)
    panel_ax.set_yscale('log')
    panel_ax.set_ylabel('Allele Frequency (FLOSSIES, log scale)', fontsize = 12)
    panel_ax.set_xlabel('CADD phred score (VEP)', fontsize = 12)
    panel_ax.tick_params(labelsize = 12)
    if panel_no > 1:
        # y axis is shared with panel A, so the duplicate axis is hidden
        panel_ax.get_yaxis().set_visible(False)
# legend belongs to the last panel drawn and is placed to its right
panel_ax.legend(bbox_to_anchor = (1.01, 1.02), loc = 'upper left')
plt.tight_layout()

if len(gene_names) > 1:
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_' + gene_names[1] + '_cadd_flos.pdf')
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_' + gene_names[1] + '_cadd_flos.jpg')
else:
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_cadd_flos.pdf')
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_cadd_flos.jpg')

### Special variants, BRCA1

In [None]:
# CADD:AF striking - gene-1 variants with a ClinVar class whose gnomAD allele
# frequency is above 0.001 while the CADD phred score is above 20
df1.loc[
    df1['ClinVar_gnomAD'].notnull() & (df1['AF_gnomAD'] > 0.001) & (df1['CADD_PHRED_VEP'] > 20),
    ['Identifier', 'EXON', 'Consequence_VEP', 'ClinVar_gnomAD', 'AF_gnomAD', 
     'Overall_Frequency_Flossies', 'CADD_PHRED_VEP', 'PolyPhen_VEP_num', 'SIFT_VEP_num'],
]

In [None]:
# CADD:AF striking + 1 - same view with a lower allele-frequency threshold
# (> 0.0001) and a higher CADD phred threshold (> 30)
df1.loc[
    df1['ClinVar_gnomAD'].notnull() & (df1['AF_gnomAD'] > 0.0001) & (df1['CADD_PHRED_VEP'] > 30),
    ['Identifier', 'EXON', 'Consequence_VEP', 'ClinVar_gnomAD', 'AF_gnomAD', 
     'Overall_Frequency_Flossies', 'CADD_PHRED_VEP', 'PolyPhen_VEP_num', 'SIFT_VEP_num'],
]

In [None]:
# high AF - gene-1 variants with a ClinVar class, gnomAD allele frequency
# above 0.05 and CADD phred score above 11
df1.loc[
    df1['ClinVar_gnomAD'].notnull() & (df1['AF_gnomAD'] > 0.05) & (df1['CADD_PHRED_VEP'] > 11),
    ['Identifier', 'EXON', 'Consequence_VEP', 'ClinVar_gnomAD', 'AF_gnomAD', 
     'Overall_Frequency_Flossies', 'CADD_PHRED_VEP', 'PolyPhen_VEP_num', 'SIFT_VEP_num'],
]

In [None]:
# pathogenic FLOSSIES - gene-1 variants classified 'Pathogenic' that also have
# an overall FLOSSIES frequency
df1.loc[
    df1['Overall_Frequency_Flossies'].notnull() & (df1['ClinVar_gnomAD'] == 'Pathogenic'),
    ['Identifier', 'EXON', 'INTRON', 'Consequence_VEP', 'ClinVar_gnomAD', 'CADD_PHRED_VEP', 
     'AF_gnomAD', 'Overall_Frequency_Flossies', 'PolyPhen_VEP_num', 'SIFT_VEP_num', 
     'European_(n=7325)_Flossies', 'African_(n=2559)_Flossies'],
]

In [None]:
# benign high CADD - gene-1 variants classified 'Benign' with a CADD phred
# score above 30 that also have an overall FLOSSIES frequency
df1.loc[
    (df1['ClinVar_gnomAD'] == 'Benign') 
    & (df1['CADD_PHRED_VEP'] > 30) 
    & df1['Overall_Frequency_Flossies'].notnull(),
    ['Identifier', 'EXON', 'INTRON', 'Consequence_VEP', 'ClinVar_gnomAD', 'CADD_PHRED_VEP', 
     'AF_gnomAD', 'Overall_Frequency_Flossies', 'PolyPhen_VEP_num', 'SIFT_VEP_num', 
     'European_(n=7325)_Flossies', 'African_(n=2559)_Flossies'],
]

### Special variants, BRCA2

In [None]:
# CADD:AF striking - gene-2 variants with a ClinVar class whose gnomAD allele
# frequency is above 0.001 while the CADD phred score is above 20
df2.loc[
    df2['ClinVar_gnomAD'].notnull() & (df2['AF_gnomAD'] > 0.001) & (df2['CADD_PHRED_VEP'] > 20),
    ['Identifier', 'EXON', 'Consequence_VEP', 'ClinVar_gnomAD', 'AF_gnomAD', 
     'Overall_Frequency_Flossies', 'CADD_PHRED_VEP', 'PolyPhen_VEP_num', 'SIFT_VEP_num'],
]

In [None]:
# high AF - gene-2 variants with a ClinVar class, gnomAD allele frequency
# above 0.05 and CADD phred score above 11
df2.loc[
    df2['ClinVar_gnomAD'].notnull() & (df2['AF_gnomAD'] > 0.05) & (df2['CADD_PHRED_VEP'] > 11),
    ['Identifier', 'EXON', 'Consequence_VEP', 'ClinVar_gnomAD', 'AF_gnomAD', 
     'Overall_Frequency_Flossies', 'CADD_PHRED_VEP', 'PolyPhen_VEP_num', 'SIFT_VEP_num'],
]

In [None]:
# pathogenic FLOSSIES - gene-2 variants classified 'Pathogenic' that also have
# an overall FLOSSIES frequency
df2.loc[
    df2['Overall_Frequency_Flossies'].notnull() & (df2['ClinVar_gnomAD'] == 'Pathogenic'),
    ['Identifier', 'EXON', 'INTRON', 'Consequence_VEP', 'ClinVar_gnomAD', 'CADD_PHRED_VEP', 
     'AF_gnomAD', 'Overall_Frequency_Flossies', 'PolyPhen_VEP_num', 'SIFT_VEP_num', 
     'European_(n=7325)_Flossies', 'African_(n=2559)_Flossies'],
]

In [None]:
# benign high CADD - gene-2 variants classified 'Benign' with a CADD phred
# score above 30 that also have an overall FLOSSIES frequency
df2.loc[
    (df2['ClinVar_gnomAD'] == 'Benign') 
    & (df2['CADD_PHRED_VEP'] > 30) 
    & df2['Overall_Frequency_Flossies'].notnull(),
    ['Identifier', 'EXON', 'INTRON', 'Consequence_VEP', 'ClinVar_gnomAD', 'CADD_PHRED_VEP', 
     'AF_gnomAD', 'Overall_Frequency_Flossies', 'PolyPhen_VEP_num', 'SIFT_VEP_num', 
     'European_(n=7325)_Flossies', 'African_(n=2559)_Flossies'],
]

## Division in groups for prediction of deleteriousness

In [None]:
# SIFT_VEP vs. PolyPhen_VEP for all variants (2-D kernel density per gene)
# plt.figure instead of plt.subplots: the latter also creates a full-size Axes,
# and recent matplotlib no longer auto-removes it when plt.subplot() puts
# panels on top (auto-removal deprecated in 3.6).
fig = plt.figure(figsize = (10,10))
plt.suptitle('SIFT score (VEP) to PolyPhen score (VEP)', fontsize = 15, fontweight = 'bold')

# df2 only exists when a second gene was set, so panel B is optional
dens_panels = [('A', gene_name1, df1)]
if gene_name2 != '':
    dens_panels.append(('B', gene_name2, df2))

for panel_no, (panel_letter, panel_gene, panel_df) in enumerate(dens_panels, start = 1):
    panel_ax = plt.subplot(2,1,panel_no)
    panel_ax.set_title('%s: $%s$' %(panel_letter, panel_gene.upper()), 
                       fontsize = 12, fontweight = 'bold', loc = 'left')
    # target axes passed explicitly rather than relying on the current axes
    sns.kdeplot(data = panel_df, x = 'PolyPhen_VEP_num', y = 'SIFT_VEP_num', fill = True, thresh = 0, 
                levels = 20, cmap = 'Blues', cbar = True, ax = panel_ax)
    panel_ax.set_ylabel('SIFT score (VEP)', fontsize = 12)
    panel_ax.set_xlabel('PolyPhen score (VEP)', fontsize = 12)
    # keep the ax1 / ax2 names the original cell defined
    if panel_no == 1:
        ax1 = panel_ax
    else:
        ax2 = panel_ax
plt.tight_layout()

if len(gene_names) > 1:
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_' + gene_names[1] + '_sift_pp_dens.pdf')
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_' + gene_names[1] + '_sift_pp_dens.jpg')
else:
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_sift_pp_dens.pdf')
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_sift_pp_dens.jpg')

In [None]:
# SIFT_VEP vs. PolyPhen_VEP for variants in gnomAD (rows with an AF_gnomAD value)
# plt.figure instead of plt.subplots: the latter also creates a full-size Axes,
# and recent matplotlib no longer auto-removes it when plt.subplot() puts
# panels on top (auto-removal deprecated in 3.6).
fig = plt.figure(figsize = (10,10))
plt.suptitle('SIFT score (VEP) to PolyPhen score (VEP) of gnomAD variants', fontsize = 15, fontweight = 'bold')

# df2 only exists when a second gene was set, so panel B is optional
dens_ga_panels = [('A', gene_name1, df1)]
if gene_name2 != '':
    dens_ga_panels.append(('B', gene_name2, df2))

for panel_no, (panel_letter, panel_gene, panel_df) in enumerate(dens_ga_panels, start = 1):
    panel_ax = plt.subplot(2,1,panel_no)
    panel_ax.set_title('%s: $%s$' %(panel_letter, panel_gene.upper()), 
                       fontsize = 12, fontweight = 'bold', loc = 'left')
    # target axes passed explicitly rather than relying on the current axes
    sns.kdeplot(data = panel_df[panel_df['AF_gnomAD'].notnull()], x = 'PolyPhen_VEP_num', y = 'SIFT_VEP_num', 
                fill = True, thresh = 0, levels = 20, cmap = 'Blues', cbar = True, ax = panel_ax)
    panel_ax.set_ylabel('SIFT score (VEP)', fontsize = 12)
    panel_ax.set_xlabel('PolyPhen score (VEP)', fontsize = 12)
    # keep the ax1 / ax2 names the original cell defined
    if panel_no == 1:
        ax1 = panel_ax
    else:
        ax2 = panel_ax
plt.tight_layout()

if len(gene_names) > 1:
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_' + gene_names[1] + '_sift_pp_dens_ga.pdf')
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_' + gene_names[1] + '_sift_pp_dens_ga.jpg')
else:
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_sift_pp_dens_ga.pdf')
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_sift_pp_dens_ga.jpg')

In [None]:
# sift and polyphen score of variants, styled by ClinVar class
# plt.figure instead of plt.subplots: the latter also creates a full-size Axes,
# and recent matplotlib no longer auto-removes it when plt.subplot() puts
# panels on top (auto-removal deprecated in 3.6).
fig = plt.figure(figsize = (12,5))
plt.suptitle('SIFT score (VEP) to PolyPhen score (VEP) with pathogenicity characterization (Clinvar)', fontsize = 15, fontweight = 'bold', 
             x = 0.5)

# afc2 only exists when a second gene was set, so panel B is optional
sift_pp_panels = [('A', gene_name1, afc1)]
if gene_name2 != '':
    sift_pp_panels.append(('B', gene_name2, afc2))

for panel_no, (panel_letter, panel_gene, panel_df) in enumerate(sift_pp_panels, start = 1):
    if panel_no == 1:
        ax1 = panel_ax = plt.subplot(1,2,1)
    else:
        ax2 = panel_ax = plt.subplot(1,2,2, sharex = ax1, sharey = ax1)
    panel_ax.set_title(label = '%s: $%s$' %(panel_letter, panel_gene.upper()), fontsize = 12, 
                       fontweight = 'bold', loc = 'left')
    # one series per ClinVar class; zord puts the pathogenic classes on top
    for cat,col,mark,z in zip(cataf, colaf, markaf, zord):
        in_cat = panel_df['ClinVar_gnomAD'] == cat
        panel_ax.scatter(x = panel_df.loc[in_cat, 'PolyPhen_VEP_num'], 
                         y = panel_df.loc[in_cat, 'SIFT_VEP_num'], 
                         c = col, label = cat, marker = mark, alpha = 0.6, zorder = z)
    panel_ax.set_ylabel('SIFT score (VEP)', fontsize = 12)
    panel_ax.set_xlabel('PolyPhen score (VEP)', fontsize = 12)
    panel_ax.tick_params(labelsize = 12)
    if panel_no > 1:
        # y axis is shared with panel A, so the duplicate axis is hidden
        panel_ax.get_yaxis().set_visible(False)
# legend belongs to the last panel drawn and is placed to its right
panel_ax.legend(bbox_to_anchor = (1.01, 1.02), loc = 'upper left');
plt.tight_layout()

if len(gene_names) > 1:
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_' + gene_names[1] + '_sift_pp_pat.pdf')
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_' + gene_names[1] + '_sift_pp_pat.jpg')
else:
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_sift_pp_pat.pdf')
    plt.savefig(r'figures/' + date + '_' + gene_names[0] + '_sift_pp_pat.jpg')

## Stop gain

In [None]:
# stop gains relative to length of CDS: number of 'stop gained' rows divided
# by cod<N>_len*3. The value is a fraction (it is not multiplied by 100).
# NOTE(review): cod1_len / cod2_len are defined outside this cell; presumably
# the number of codons - confirm.
stop_per_CDS1, stop_per_CDS2 = [
    len(gene_df[gene_df['Consequence_VEP'] == 'stop gained'])/(cod_len*3)
    for gene_df, cod_len in ((df1, cod1_len), (df2, cod2_len))
]

# per exon
def stop_per_exon(coding_df):
    """Count 'stop gained' rows per exon of a coding-variant frame.

    Parameters
    ----------
    coding_df : DataFrame with the columns 'EXON', 'POS' and 'Consequence_VEP'.

    Returns
    -------
    list of tuples, one per distinct 'EXON' value in order of first
    appearance: (exon, number of 'stop gained' rows, 3 * number of distinct
    'POS' values in the exon, the second value as a percentage of the third).
    """
    is_stop = coding_df['Consequence_VEP'] == 'stop gained'
    per_exon = []
    for exon in coding_df['EXON'].unique():
        in_exon = coding_df['EXON'] == exon
        n_stops = len(coding_df[in_exon & is_stop])
        # three times the number of distinct positions of this exon
        n_sites = len(coding_df[in_exon]['POS'].unique())*3
        per_exon.append((exon, n_stops, n_sites, n_stops/n_sites*100))
    return per_exon
# per-exon stop-gain table for each gene (one row per exon)
stop_per_ex1, stop_per_ex2 = [
    pd.DataFrame(stop_per_exon(coding), columns = ['EXON','Stops','Length','Percentage_stop'])
    for coding in (coding_df1, coding_df2)
]

In [None]:
# observed stop gains: 'stop gained' rows that also carry a ClinVar class,
# divided by cod<N>_len*3. The value is a fraction (not multiplied by 100).
stop_per_CDS1_o, stop_per_CDS2_o = [
    len(gene_df[(gene_df['Consequence_VEP'] == 'stop gained')&
                (gene_df['ClinVar_gnomAD'].notnull())])/(cod_len*3)
    for gene_df, cod_len in ((df1, cod1_len), (df2, cod2_len))
]

# per exon
def stop_per_exon_o(coding_df):
    """Count 'stop gained' rows that carry a ClinVar class, per exon.

    Parameters
    ----------
    coding_df : DataFrame with the columns 'EXON', 'POS', 'Consequence_VEP'
        and 'ClinVar_gnomAD'.

    Returns
    -------
    list of tuples, one per distinct 'EXON' value in order of first
    appearance: (exon, number of 'stop gained' rows with a non-null
    'ClinVar_gnomAD', number of distinct 'POS' values in the exon, the count
    as a percentage of 3 * the number of distinct positions).
    """
    is_observed_stop = ((coding_df['Consequence_VEP'] == 'stop gained')&
                        (coding_df['ClinVar_gnomAD'].notnull()))
    per_exon = []
    for exon in coding_df['EXON'].unique():
        in_exon = coding_df['EXON'] == exon
        n_stops = len(coding_df[in_exon & is_observed_stop])
        n_positions = len(coding_df[in_exon]['POS'].unique())
        # NOTE(review): the third element is the plain position count here,
        # while stop_per_exon reports it multiplied by 3 - confirm which one
        # the 'Length' column is meant to hold.
        per_exon.append((exon, n_stops, n_positions, n_stops/(n_positions*3)*100))
    return per_exon
# per-exon table of ClinVar-annotated stop gains for each gene
stop_per_ex1_o, stop_per_ex2_o = [
    pd.DataFrame(stop_per_exon_o(coding), columns=['EXON','Stops','Length','Percentage_stop'])
    for coding in (coding_df1, coding_df2)
]

In [None]:
# stops per exon in percent: all stop gained rows (left axis) and those with a
# ClinVar class (right twin axis), one panel per gene

# plt.figure instead of plt.subplots: the latter also creates a full-size Axes,
# and recent matplotlib no longer auto-removes it when plt.subplot() puts
# panels on top (auto-removal deprecated in 3.6).
fig = plt.figure(figsize = (12,10))

plt.suptitle('All possible and all observed stop gained variants per exon', fontweight='bold', fontsize=15)

stop_panels = [('A', gene_name1, stop_per_ex1, stop_per_ex1_o),
               ('B', gene_name2, stop_per_ex2, stop_per_ex2_o)]
for panel_no, (panel_letter, panel_gene, stops_all, stops_obs) in enumerate(stop_panels, start = 1):
    if panel_no == 1:
        ax1 = panel_ax = plt.subplot(2,1,1)
    else:
        # left-hand y axis is shared between the two panels
        ax2 = panel_ax = plt.subplot(2,1,2, sharey = ax1)
    panel_ax.scatter(stops_all['EXON'], stops_all['Percentage_stop'], c='dodgerblue', 
                     label='all possible stop gained variants')
    panel_ax.set_title('%s: $%s$' %(panel_letter, panel_gene.upper()), loc='left', fontweight='bold', fontsize=12)
    panel_ax.set_ylabel('All possible stop gained variants (in %)', fontsize=12, color='dodgerblue')
    panel_ax.set_xlabel('Exon numbers', fontsize=12)
    panel_ax.tick_params(axis='y', colors='dodgerblue')

    # second y axis on the same x axis for the observed subset
    panel_twin = panel_ax.twinx()
    panel_twin.scatter(stops_obs['EXON'], stops_obs['Percentage_stop'], c='orange', 
                       label='all observed stop gained variants', marker='x')
    panel_twin.set_ylabel('Observed stop gained variants (in %)', fontsize=12, color='darkorange')
    panel_twin.tick_params(axis='y', colors='darkorange')

    if panel_no == 1:
        twin1 = panel_twin
        # legends are only drawn above panel A; each axes is addressed
        # explicitly rather than through the pyplot current axes
        panel_ax.legend(bbox_to_anchor = (0.35, 1.05), loc = 'lower center', borderaxespad = 0)
        panel_twin.legend(bbox_to_anchor = (0.65, 1.05), loc = 'lower center', borderaxespad = 0)
    else:
        twin2 = panel_twin

plt.tight_layout();

plt.savefig(r'figures/' + date + '_' + gene_name2.lower() + '_' + 
            gene_name1.lower() + '_perc_stop_gains.pdf')
plt.savefig(r'figures/' + date + '_' + gene_name2.lower() + '_' + 
            gene_name1.lower() + '_perc_stop_gains.jpg')

In [None]:
# distinct ClinVar classes found among the 'stop gained' rows of each gene,
# in order of first appearance
d1stl = list(df1.loc[(df1['Consequence_VEP'] == 'stop gained')&
                     (df1['ClinVar_gnomAD'].notnull()), 'ClinVar_gnomAD'].unique())
d2stl = list(df2.loc[(df2['Consequence_VEP'] == 'stop gained')&
                     (df2['ClinVar_gnomAD'].notnull()), 'ClinVar_gnomAD'].unique())
# colour / z-order / marker, one row per series; the stop-gain plot pairs
# these with d2stl by position.
# NOTE(review): the pairing therefore depends on the row order of df2, and
# only the first four classes receive a style - confirm this is intended.
d2st_col, d2st_z, d2st_m = (list(values) for values in zip(
    ('firebrick', 6, 'v'),
    ('grey', 3, 'x'),
    ('green', 4, 'd'),
    ('yellow', 5, '<'),
))

In [None]:
# for stop gain --> where are variants with gained stop codons, how annotated in ClinVar, which in cBioPortal
# plt.figure instead of plt.subplots: the latter also creates a full-size Axes,
# and recent matplotlib no longer auto-removes it when plt.subplot() puts
# panels on top (auto-removal deprecated in 3.6).
fig = plt.figure(figsize = (11,8))
plt.suptitle('Stop gained variants with pathogenicity characterization (Clinvar)', fontweight = 'bold', fontsize = 15, 
             y = 0.98, x = 0.5)

# panel A: gene 1, rows without a ClinVar class ('Synthetic') and pathogenic rows
ax1 = plt.subplot(2,1,1)
ax1.set_title('A: $%s$' %gene_name1.upper(), fontsize=13, fontweight = 'bold', loc = 'left', y = 1.12)
is_stop1 = df1['Consequence_VEP'] == 'stop gained'
synthetic1 = is_stop1 & df1['ClinVar_gnomAD'].isnull()
pathogenic1 = is_stop1 & (df1['ClinVar_gnomAD'] == 'Pathogenic')
ax1.scatter(x = df1.loc[synthetic1, 'POS'], y = df1.loc[synthetic1, 'CADD_PHRED_VEP'], 
            c = 'dodgerblue', alpha = 0.5, zorder = 2, marker = 'o', label = 'Synthetic')
ax1.scatter(x = df1.loc[pathogenic1, 'POS'], y = df1.loc[pathogenic1, 'CADD_PHRED_VEP'], 
            c = 'firebrick', alpha = 1, zorder = 3, marker = 'v', label = 'Pathogenic')

ax1.set_xlabel('Nucleic position on chromosome %s at $%s$ region' %(df1.iloc[0]['CHROM'],gene_name1.upper()), fontsize = 12)
ax1.set_ylabel('CADD phred score (VEP)', fontsize = 12)
ax1.ticklabel_format(style = 'plain')
ax1.get_xaxis().set_major_formatter(tick.StrMethodFormatter('{x:,.0f}'));

# panel B: gene 2, 'Synthetic' rows plus one series per ClinVar class in d2stl
ax2 = plt.subplot(2,1,2)
is_stop2 = df2['Consequence_VEP'] == 'stop gained'
synthetic2 = is_stop2 & df2['ClinVar_gnomAD'].isnull()
ax2.scatter(x = df2.loc[synthetic2, 'POS'], y = df2.loc[synthetic2, 'CADD_PHRED_VEP'], 
            c = 'dodgerblue', alpha = 0.5, zorder = 2, marker = 'o', label = 'Synthetic')
for cat,col,m,z in zip(d2stl, d2st_col, d2st_m, d2st_z):
    in_cat2 = is_stop2 & (df2['ClinVar_gnomAD'] == cat)
    ax2.scatter(x = df2.loc[in_cat2, 'POS'], y = df2.loc[in_cat2, 'CADD_PHRED_VEP'], 
                c = col, alpha = 1, zorder = z, marker = m, label = cat)

ax2.set_title('B: $%s$' %gene_name2.upper(), loc = 'left', fontsize=13, fontweight = 'bold', y = 1.12)
ax2.set_xlabel('Nucleic position on chromosome %s at $%s$ region' %(df2.iloc[0]['CHROM'],gene_name2.upper()), fontsize = 12)
ax2.set_ylabel('CADD phred score (VEP)', fontsize = 12)
ax2.ticklabel_format(style = 'plain')
ax2.get_xaxis().set_major_formatter(tick.StrMethodFormatter('{x:,.0f}'))
ax2.legend(bbox_to_anchor = (0, -0.2), loc = 'upper left', borderaxespad = 0, ncol = 5)

# Exon overlay. The panels do not share their y axis, so each one places its
# exon labels relative to its own upper y-limit (previously panel A used the
# limit of panel B). Limits are read before any patch is added.
(y_min1, y_max1) = ax1.get_ylim()
(y_min2, y_max2) = ax2.get_ylim()
for panel_ax, panel_exons, panel_strand, panel_top in (
        (ax1, tuple_exons1, df1.iloc[0]['Strand'], y_max1),
        (ax2, tuple_exons2, df2.iloc[0]['Strand'], y_max2)):
    if panel_strand not in ('+', '-'):
        continue
    for exon_idx, tup in enumerate(panel_exons):
        exon_a, exon_b = int(tup[0]), int(tup[1])
        # the rectangle is anchored at tup[0] on '+' and at tup[1] on '-'
        rect_x = exon_a if panel_strand == '+' else exon_b
        panel_ax.add_patch(pplt.Rectangle((rect_x, -.3), abs(exon_a - exon_b), 55, 
                                          fc = 'black', ec = 'black', alpha = 0.1, zorder = 1))
        # labels count from 2 and alternate between two heights
        label_lift = 1 if exon_idx % 2 == 0 else 3
        panel_ax.text(exon_a - 500, panel_top + label_lift, str(exon_idx + 2), size = 12)

plt.tight_layout()
fig.subplots_adjust(bottom = 0.12, top = 0.9, left = 0.05, right = 0.95);
plt.savefig(r'figures/' + date + '_' + gene_name2.lower() + '_' + 
            gene_name1.lower() + '_stop_gains.pdf')
plt.savefig(r'figures/' + date + '_' + gene_name2.lower() + '_' + 
            gene_name1.lower() + '_stop_gains.jpg')
plt.savefig(r'figures/' + date + '_' + gene_name2.lower() + '_' + 
            gene_name1.lower() + '_stop_gains.svg')