In [None]:
import matplotlib.pyplot as plt
from matplotlib import cm
import pandas as pd
import numpy as np
import matplotlib.ticker as tick
import plotly.graph_objects as go

In [None]:
# Load the annotated variant table and normalise its column types in one step.
# NOTE(review): `right_type` and `cols_cpra` are defined in a later cell, so that
# cell has to be executed before this one on a fresh kernel.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
all_vars = right_type(pd.read_pickle('all_annotated_variants'))
# Unique variants: keep the first row for each combination of the `cols_cpra` columns.
wo_dup = all_vars.drop_duplicates(subset=cols_cpra)

In [None]:
def search_col(df, inp):
    """Return the column labels of ``df`` that contain ``inp``, ignoring case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are searched.
    inp : str
        Substring to look for.

    Returns
    -------
    list
        Matching labels in column order. Labels are returned unchanged
        (neither lower-cased nor converted to strings).
    """
    needle = inp.lower()  # lower-case the search term once, not once per column
    # str() lets non-string labels (e.g. integer column names) be searched
    # instead of raising AttributeError on `.lower()`.
    return [col for col in df.columns if needle in str(col).lower()]

def make_float(df):
    """Cast the splice-prediction score columns of ``df`` to float.

    The SpliceAI delta-score columns (``SpliceAI_pred_DS_AG/AL/DG/DL``) are
    converted first, followed by ``ada_score``, ``rf_score`` and the three
    ``MaxEntScan_*`` columns.

    The frame is modified in place and the same object is returned.
    """
    score_cols = ['SpliceAI_pred_DS_' + suffix for suffix in ('AG', 'AL', 'DG', 'DL')]
    score_cols += ['ada_score', 'rf_score', 'MaxEntScan_diff', 'MaxEntScan_alt', 'MaxEntScan_ref']
    for col in score_cols:
        df[col] = df[col].astype(float)
    return df

def right_type(df):
    """Return a copy of ``df`` with normalised dtypes and category labels.

    On the copy:
    * ``#CHROM``, ``REF`` and ``ALT`` are cast to ``str`` and ``POS`` to ``int``;
    * the short ``PosExon_type`` donor labels are expanded to their ``...Site`` form;
    * ``ClinVar_Pathogenicity`` spellings of "uncertain significance" are unified
      and ``'not_provided'`` is replaced by NaN;
    * the splice score columns are cast to float via ``make_float``.

    The input frame itself is left untouched.
    """
    donor_site_names = {'outsideDonor': 'outsideDonorSite',
                        'insideDonor': 'insideDonorSite'}
    clinvar_names = {'Uncertain significance': 'Uncertain Significance',
                     'uncertain_significance': 'Uncertain Significance',
                     'not_provided': np.nan}
    text_cols = ['#CHROM', 'REF', 'ALT']

    typed = df.copy()
    typed[text_cols] = typed[text_cols].astype(str)
    typed['POS'] = typed['POS'].astype(int)
    typed['PosExon_type'] = typed['PosExon_type'].replace(donor_site_names)
    typed['ClinVar_Pathogenicity'] = typed['ClinVar_Pathogenicity'].replace(clinvar_names)
    # make_float converts in place and hands back the same frame.
    return make_float(typed)

# Column labels used as the duplicate-detection subset when `wo_dup` is built.
# NOTE(review): the data-loading cell earlier in the notebook already references
# `cols_cpra` (and `right_type`), so this cell must be run first on a fresh kernel.
cols_cpra = ['#CHROM', 'POS', 'REF', 'ALT']

#### Splice prediction scores

In [None]:
# Count the unique variants that do (`*_s`) / do not (`*_ns`) carry each score.
has_mes = wo_dup['MaxEntScan_diff'].notnull()
# Ada/RF counts as "scored" only when both scores are present.
has_ada_rf = wo_dup['ada_score'].notnull() & wo_dup['rf_score'].notnull()
# SpliceAI availability is judged from the acceptor-gain column alone.
has_spliceai = wo_dup['SpliceAI_pred_DS_AG'].notnull()

mes_s, mes_ns = int(has_mes.sum()), int((~has_mes).sum())
ar_s, ar_ns = int(has_ada_rf.sum()), int((~has_ada_rf).sum())
spl_s, spl_ns = int(has_spliceai.sum()), int((~has_spliceai).sum())

In [None]:
# One pie per splice predictor showing the share of unique variants with a score.
# The three panels are created in a single plt.subplots() call: layering
# plt.subplot() panels on top of a separate full-figure Axes would leave that empty
# Axes visible behind the pies on Matplotlib versions that no longer auto-remove
# overlapping Axes.
fig, axes = plt.subplots(1, 3, figsize=(14, 5))

panels = zip(axes,
             [mes_s, ar_s, spl_s],
             [mes_ns, ar_ns, spl_ns],
             ['A: MaxEntScan', 'B: Ada-/Rf-score', 'C: SpliceAI'])
for ax, n_scored, n_unscored, panel_title in panels:
    ax.pie([n_scored, n_unscored], labels=['score', 'no score'], counterclock=False, startangle=90,
           explode=(0.1, 0), autopct='%1.1f%%', shadow=True, colors=['cornflowerblue', 'orange'],
           textprops={'size':'16'})
    ax.set_title(panel_title, fontdict={'fontsize':'20', 'fontweight':'bold'}, loc='left')

fig.tight_layout()
fig.savefig('perc_splice_scores.png')

#### Gene priorities for unique variants

In [None]:
# Fraction of variants per gene category; the four values sum to 1.
# NOTE(review): these fractions are hard-coded and their derivation is not visible
# here -- confirm against the source data before reusing the figure.
gene_prios_n = [0.079155032171907, 0.24244263688236, 0.101736068957145, 0.576666261988588]
gene_prios_c = ['Hot', 'Warm without GUS', 'Warm GUS', 'Cold']
theme = plt.get_cmap('RdYlBu')

fig, ax1 = plt.subplots(figsize=(12,5))

# Sample the colormap at evenly spaced positions, one colour per category.
ax1.set_prop_cycle('color', [theme(1.*i/len(gene_prios_n)) for i in range(len(gene_prios_n))])
ax1.pie(gene_prios_n, counterclock=False, startangle=90, explode=[0.05]*len(gene_prios_n), shadow=True)

lg = ax1.legend(labels=['%s genes: %1.1f%%' %(c, n*100) for c,n in zip(gene_prios_c, gene_prios_n)],
                bbox_to_anchor=(1,0.7), prop={'size':18})
# Raw string: the mathtext needs literal backslashes, and `\ ` is not a valid
# Python escape sequence in a normal string literal.
lg.set_title(r'$\bf{Gene\ category}$', prop={'size':18})

fig.tight_layout()
# Squeeze the pie into the left half so the legend fits beside it.
fig.subplots_adjust(bottom = 0.02, top = 0.98, left = 0.01, right = 0.5)

fig.savefig('perc_gene_prio.png')

#### ClinVar assessments

In [None]:
# Tally the ClinVar classifications of the unique variants, ignoring missing values.
wo_dup['ClinVar_Pathogenicity'].dropna().value_counts()

In [None]:
# ClinVar classification counts, ordered from benign to pathogenic.
# NOTE(review): the counts are hard-coded -- presumably copied from the
# value_counts() output above; confirm they still match the data.
clinvar_labels = ['Benign', 'Likely Benign', 'Uncertain Significance', 'Likely Pathogenic', 'Pathogenic']
clinvar_counts = [727, 876, 1230, 19, 82]
# Derive the denominator from the counts so the percentages cannot drift out of sync.
clinvar_total = sum(clinvar_counts)

fig, ax1 = plt.subplots(figsize=(12,7))
ax1.pie(clinvar_counts, counterclock=False, startangle=90, explode=[0.05]*len(clinvar_counts),
        colors=['green', 'yellow', 'grey', 'orange', 'red'])
ax1.legend(['%s: %1.1f%%' %(cat, num/clinvar_total*100) for cat, num in
            zip(clinvar_labels, clinvar_counts)], bbox_to_anchor=(1, 0.7), fontsize=16, title_fontsize=16,
           title=r'$\bf{ClinVar\ characterization}$')
# Leave the right-hand 40% of the figure free for the legend.
fig.subplots_adjust(bottom = 0, top = 1, left = 0.01, right = 0.6)
fig.savefig('clinvar_perc.png')

#### Connections between evaluation groups

In [None]:
# Sankey diagram linking ClinVar -> QCI -> RNA classifications.
# Nodes 0-4 are the ClinVar classes, 5-9 the QCI classes, 10-14 the RNA classes.
node_labels = [
    'ClinVar Pathogenic', 'ClinVar Likely Pathogenic', 'ClinVar VUS', 'ClinVar Likely Benign', 'ClinVar Benign',
    'QCI Pathogenic', 'QCI Likely Pathogenic', 'QCI VUS', 'QCI Likely Benign', 'QCI Benign',
    'RNA Pathogenic', 'RNA Likely Pathogenic', 'RNA VUS', 'RNA Likely Benign', 'RNA Benign',
]
# The same five-colour scale is repeated for each of the three node groups.
node_colors = ['red', 'orange', 'grey', 'lime', 'green'] * 3

# Each link is (source node index, target node index, value); indices refer to node_labels.
links = [
    # ClinVar -> QCI
    (0, 5, 8), (1, 5, 3), (1, 6, 1), (2, 5, 3), (2, 6, 12),
    (2, 7, 8), (2, 8, 1), (2, 9, 1), (3, 7, 2), (3, 8, 0),
    # QCI -> RNA
    (5, 10, 14), (6, 10, 3), (6, 11, 8), (6, 12, 2),
    (7, 11, 2), (7, 12, 8), (8, 12, 1), (9, 12, 1),
]
link_sources, link_targets, link_values = (list(column) for column in zip(*links))

node_spec = dict(
    pad = 15,
    thickness = 20,
    line = dict(color = "black", width = 0.5),
    label = node_labels,
    color = node_colors,
)
link_spec = dict(source = link_sources, target = link_targets, value = link_values)

fig = go.Figure(data=[go.Sankey(node = node_spec, link = link_spec)])

#fig.update_layout(title_text="Sankey Diagram ClinVar to RNA-seq", font_size=10)
fig.show()
fig.write_html('sankey_clinvar_qci_rna.html')