In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#### Functions

In [None]:
def search_col(df, inp):
    """Return the column names of ``df`` that contain ``inp``, ignoring case.

    Parameters
    ----------
    df : object exposing ``.columns`` (an iterable of strings)
    inp : str
        Substring to look for; both sides are lower-cased before comparing.

    Returns
    -------
    list
        Matching column names in their original order and spelling.
    """
    def contains_query(column_name):
        return inp.lower() in column_name.lower()

    return list(filter(contains_query, df.columns))

In [None]:
def make_float(df):
    """Cast the splice-prediction score columns of ``df`` to float.

    The columns are converted one after another and written back into ``df``
    itself, so the caller's frame is modified; the same object is returned.
    A missing column raises ``KeyError`` at the point it is reached.

    Columns converted: ``SpliceAI_pred_DS_{AG,AL,DG,DL}``, ``ada_score``,
    ``rf_score``, ``MaxEntScan_diff``, ``MaxEntScan_alt``, ``MaxEntScan_ref``.
    """
    splice_ai_cols = ['SpliceAI_pred_DS_' + suffix for suffix in ('AG', 'AL', 'DG', 'DL')]
    other_score_cols = ['ada_score', 'rf_score', 'MaxEntScan_diff', 'MaxEntScan_alt', 'MaxEntScan_ref']
    for column in splice_ai_cols + other_score_cols:
        df[column] = df[column].astype(float)
    return df

In [None]:
# Column layouts shared by the helpers that build one-row-per-variant tables.
# columns_df_prior: columns read from the input table; its last entry is 'PID'.
columns_df_prior = ['#CHROM', 'POS', 'REF', 'ALT', 'MaxEntScan_alt', 'MaxEntScan_diff', 'MaxEntScan_ref',
                    'CADD_PHRED', 'SpliceAI_pred_DP_AG', 'SpliceAI_pred_DP_AL', 'SpliceAI_pred_DP_DG',
                    'SpliceAI_pred_DP_DL', 'SpliceAI_pred_DS_AG', 'SpliceAI_pred_DS_AL', 'SpliceAI_pred_DS_DG',
                    'SpliceAI_pred_DS_DL', 'SpliceAI_pred_SYMBOL', 'ada_score', 'rf_score',
                    'PosExonRefSeqAccession', 'PosExon_type', 'ClinVar_Pathogenicity', 'ClinVar_Traits', 'PID']
# columns_df_xlsx: same columns with 'PID' replaced by 'PIDs' (list) and 'ct_PIDs' (its length).
columns_df_xlsx = columns_df_prior[:-1]
columns_df_xlsx.extend(['PIDs', 'ct_PIDs'])
# cols_cpra: the four columns that identify a variant.
cols_cpra = ['#CHROM', 'POS', 'REF', 'ALT']

def xlsx_df(prio_df):
    """Collapse a table with one row per (variant, PID) into one row per variant.

    Rows are grouped by ``#CHROM``/``POS``/``REF``/``ALT``. For each group the
    annotation values are taken from the group's first row, the ``PID`` values
    are collected into a list (``PIDs``) and their number is stored in
    ``ct_PIDs``.

    Parameters
    ----------
    prio_df : pandas.DataFrame
        Must contain every column named in ``columns_df_prior``.

    Returns
    -------
    pandas.DataFrame
        One row per variant with the columns of ``columns_df_xlsx``.
    """
    lists = []
    for _, df in prio_df.groupby(by=['#CHROM', 'POS', 'REF', 'ALT']):
        first_row = df.iloc[0]
        list_df = [first_row[i] for i in columns_df_prior[:-1]]
        pids = list(df['PID'])
        # columns_df_xlsx ends with both 'PIDs' and 'ct_PIDs'; each row must supply
        # both values, otherwise the DataFrame constructor rejects the column list.
        list_df.extend([pids, len(pids)])
        lists.append(list_df)
    return pd.DataFrame(lists, columns=columns_df_xlsx)

def xlsx_vars(prio_vars_df, name):
    """Write the selected variants, one row per variant, to an Excel file.

    ``prio_vars_df`` is inner-merged (on the columns it shares) with the
    ``columns_df_prior`` columns of the global ``all_variants`` table. Each
    variant (``#CHROM``/``POS``/``REF``/``ALT``) is then collapsed to a single
    row: annotations come from the first row of the group and all ``PID``
    values are gathered into the ``PIDs`` list.

    Rows are ordered by descending number of PIDs, then by chromosome
    (1..22, X, Y), ``POS``, ``REF`` and ``ALT``. The helper count column is
    removed before writing.

    Parameters
    ----------
    prio_vars_df : pandas.DataFrame
        Variants to export.
    name : str
        File stem; the table is written to ``03_prioritized_xlsx/<name>.xlsx``.

    Returns
    -------
    None
    """
    annotation_cols = columns_df_prior[:-1]
    per_pid_rows = all_variants[columns_df_prior].merge(prio_vars_df, how='inner')

    records = []
    for _, variant_rows in per_pid_rows.groupby(by=['#CHROM', 'POS', 'REF', 'ALT']):
        first_row = variant_rows.iloc[0]
        pids = list(variant_rows['PID'])
        records.append([first_row[col] for col in annotation_cols] + [pids, len(pids)])

    table = pd.DataFrame(records, columns=columns_df_xlsx)
    # An ordered categorical makes chromosomes sort 1..22, X, Y instead of alphabetically.
    table['#CHROM'] = pd.Categorical(table['#CHROM'],
                                     categories=[str(n) for n in range(1, 23)] + ['X', 'Y'],
                                     ordered=True)
    table = table.sort_values(by=['ct_PIDs', '#CHROM', 'POS', 'REF', 'ALT'],
                              ascending=[False, True, True, True, True],
                              ignore_index=True)
    table = table.drop(columns='ct_PIDs')
    table.to_excel('03_prioritized_xlsx/%s.xlsx' % name, index=False)

In [None]:
# overview of variant counts for specific prioritizations
def splai_o09(df):
    """Return a copy of the rows where any SpliceAI delta score exceeds 0.9.

    The four columns checked are ``SpliceAI_pred_DS_AG``, ``_AL``, ``_DG`` and
    ``_DL``; the comparison is strict (``> 0.9``).
    """
    any_above = df['SpliceAI_pred_DS_AG'] > 0.9
    for suffix in ('AL', 'DG', 'DL'):
        any_above = any_above | (df['SpliceAI_pred_DS_' + suffix] > 0.9)
    return df[any_above].copy()

def ada_rf_o06(df):
    """Return a copy of the rows where ``ada_score`` and ``rf_score`` are both > 0.6."""
    ada_high = df['ada_score'] > 0.6
    rf_high = df['rf_score'] > 0.6
    return df.loc[ada_high & rf_high].copy()

def MES_disr(df):
    """Return a copy of the rows with ``MaxEntScan_diff >= 0`` and ``MaxEntScan_alt < 6.2``.

    This is the selection the notebook labels "high disruption of native
    splice site".
    """
    diff_not_negative = df['MaxEntScan_diff'] >= 0
    alt_below_cutoff = df['MaxEntScan_alt'] < 6.2
    return df.loc[diff_not_negative & alt_below_cutoff].copy()

def MES_new(df):
    """Return a copy of the rows with ``MaxEntScan_diff < 0`` and ``MaxEntScan_alt > 8.5``.

    This is the selection the notebook labels "high possibility of creating
    new splice site".
    """
    diff_negative = df['MaxEntScan_diff'] < 0
    alt_above_cutoff = df['MaxEntScan_alt'] > 8.5
    return df.loc[diff_negative & alt_above_cutoff].copy()

def pos_rel_ei(df):
    """Return a copy of the rows that lie close to a splice site.

    A row is kept when either
    * ``PosExonRefSeqAccession < 3`` and ``PosExon_type`` is
      ``outsideAcceptorSite`` or ``outsideDonorSite``, or
    * ``PosExonRefSeqAccession < 2`` and ``PosExon_type`` is
      ``insideAcceptorSite`` or ``insideDonorSite``.
    """
    distance = df['PosExonRefSeqAccession']
    site_type = df['PosExon_type']
    outside_hit = (distance < 3) & site_type.isin(['outsideAcceptorSite', 'outsideDonorSite'])
    inside_hit = (distance < 2) & site_type.isin(['insideAcceptorSite', 'insideDonorSite'])
    return df[outside_hit | inside_hit].copy()

def gene_prio(df):
    """Split ``df`` into four frames by ``Splice Project Gene Priority``.

    The returned list holds copies, in this order:

    1. 'Hot (ACMG / MASTER)' rows whose ``HUGO_Symbol`` is not the empty string,
    2. 'Warm (MASTER-ACMG)' rows whose ``HUGO_Symbol`` is not in the hard-coded
       ``gus`` list,
    3. 'Warm (MASTER-ACMG)' rows whose ``HUGO_Symbol`` is not in ``not_gus``
       (for warm rows this leaves exactly the genes that are in ``gus``),
    4. 'Cold (387-ACMG-MASTER)' rows whose ``HUGO_Symbol`` is not the empty string.
    """
    gus = ['AKT2', 'AKT3', 'ATR', 'ATRX', 'CDKN1A', 'CDKN2B', 'CDKN2C', 'CDKN2D', 'CEBPA', 'CEP57', 'DAXX', 'EGFR',
           'EGLN1', 'EGLN2', 'EPAS1', 'EPHB2', 'ABRAXAS1', 'FANCM', 'GPRC5A', 'HNF1A', 'HORMAD1', 'HORMAD2', 'JAK2',
           'KIF1B', 'MAP3K1', 'MAP3K6', 'MDH2', 'MLH3', 'MMS19', 'MN1', 'MRE11', 'PALLD', 'PIK3C2G', 'PIK3R2',
           'PMS1', 'PRF1', 'PTCH2', 'RAD50', 'RAD51', 'RECQL', 'RINT1']
    priority = df['Splice Project Gene Priority']
    symbol = df['HUGO_Symbol']

    # Symmetric difference between `gus` and the gene symbols seen on warm rows.
    warm_symbols = symbol[priority == 'Warm (MASTER-ACMG)'].unique()
    not_gus = set(gus).symmetric_difference(warm_symbols)

    # (priority label, symbols to exclude) for each returned frame, in output order.
    selections = [
        ('Hot (ACMG / MASTER)', ['']),
        ('Warm (MASTER-ACMG)', gus),
        ('Warm (MASTER-ACMG)', not_gus),
        ('Cold (387-ACMG-MASTER)', ['']),
    ]
    return [df[(priority == label) & (~symbol.isin(excluded))].copy()
            for label, excluded in selections]

#### DataFrame with all annotations

In [None]:
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
all_variants = pd.read_pickle('all_annotated_variants')

# Ordered categorical so chromosomes sort 1..22, X, Y; labels outside this list become NaN.
all_variants['#CHROM'] = pd.Categorical(
    all_variants['#CHROM'],
    categories=[str(n) for n in range(1, 23)] + ['X', 'Y'],
    ordered=True,
)
all_variants = all_variants.sort_values(by=['#CHROM', 'POS', 'REF', 'ALT'], ignore_index=True)

# Normalise the two donor labels that lack the 'Site' suffix so all four site types share one spelling.
donor_label_fix = {'insideDonor': 'insideDonorSite', 'outsideDonor': 'outsideDonorSite'}
all_variants['PosExon_type'] = all_variants['PosExon_type'].replace(donor_label_fix)

# Cast the prediction-score columns to float (make_float works on the frame it is given).
all_variants = make_float(all_variants)

# unique variants: first row for each #CHROM/POS/REF/ALT combination
vars_grpd = all_variants.drop_duplicates(subset=cols_cpra)

#### SpliceAI > 0.9

In [None]:
# Unique variants that carry a SpliceAI annotation, with the score columns cast to float.
vars_grpd_splai = make_float(vars_grpd[vars_grpd['SpliceAI_pred_DP_AG'].notnull()].copy())

# Keep the variant key (cols_cpra) of every row where any SpliceAI delta score is > 0.9.
vars_grpd_splai = (
    splai_o09(vars_grpd_splai)[cols_cpra]
    .drop_duplicates(cols_cpra)
    .reset_index(drop=True)
)
# -> 116 variants

#### MaxEntScan high disruption/creation of splice site

In [None]:
# Unique variants with a MaxEntScan annotation; the three MaxEntScan columns are cast to float.
vars_grpd_MaxEntScan = vars_grpd[columns_df_prior][vars_grpd['MaxEntScan_alt'].notnull()].copy()
vars_grpd_MaxEntScan[['MaxEntScan_alt', 'MaxEntScan_diff', 'MaxEntScan_ref']] = vars_grpd_MaxEntScan[[
    'MaxEntScan_alt', 'MaxEntScan_diff', 'MaxEntScan_ref']].astype(float)

# disr_ss = disrupts native splice site, diff >= 0
# Tiers by MaxEntScan_alt: high < 6.2, moderate 6.2..8.5 (inclusive), low > 8.5.
disr_ss_vars = vars_grpd_MaxEntScan[(vars_grpd_MaxEntScan['MaxEntScan_diff']>=0)].copy()
disr_mod_vars = disr_ss_vars.loc[(disr_ss_vars['MaxEntScan_alt']>=6.2)&
                                 (disr_ss_vars['MaxEntScan_alt']<=8.5), cols_cpra]
disr_low_vars = disr_ss_vars.loc[disr_ss_vars['MaxEntScan_alt']>8.5, cols_cpra]

# -> MES high disr -> 417 variants -> too many
# The high tier is therefore also restricted to PosExonRefSeqAccession < 3. The position
# mask is evaluated on disr_ss_vars (which still has that column) before projecting to
# cols_cpra; filtering the projected frame would raise KeyError.
disr_high_vars = disr_ss_vars.loc[(disr_ss_vars['MaxEntScan_alt']<6.2)&
                                  (disr_ss_vars['PosExonRefSeqAccession']<3), cols_cpra]
# -> 159 variants

# new_ss = creates new splice site, diff < 0
# Tiers by MaxEntScan_alt: high > 8.5, moderate 6.2..8.5 (inclusive), low < 6.2.
new_ss_vars = vars_grpd_MaxEntScan[(vars_grpd_MaxEntScan['MaxEntScan_diff']<0)].copy()
new_mod_vars = new_ss_vars.loc[(new_ss_vars['MaxEntScan_alt']>=6.2)&
                               (new_ss_vars['MaxEntScan_alt']<=8.5), cols_cpra]
new_low_vars = new_ss_vars.loc[new_ss_vars['MaxEntScan_alt']<6.2, cols_cpra]

# -> MES high new -> 472 variants -> too many
# The high tier is therefore also restricted to PosExonRefSeqAccession > 15, again
# evaluated on the frame that still contains the position column.
new_high_vars = new_ss_vars.loc[(new_ss_vars['MaxEntScan_alt']>8.5)&
                                (new_ss_vars['PosExonRefSeqAccession']>15), cols_cpra]
# -> 84 variants

#### dbscSNV (ada score, rf score) > 0.6

In [None]:
# Unique variants that have both dbscSNV scores; the two score columns are cast to float.
vars_grpd_dbscSNV = vars_grpd.loc[
    vars_grpd['ada_score'].notnull() & vars_grpd['rf_score'].notnull(),
    columns_df_prior,
].copy()
vars_grpd_dbscSNV = vars_grpd_dbscSNV.astype({'ada_score': float, 'rf_score': float})

# Variant keys where ada_score and rf_score are both > 0.6.
dbscSNV_high_vars = ada_rf_o06(vars_grpd_dbscSNV)[cols_cpra].reset_index(drop=True)
# NOTE(review): disabled export below refers to `dbscSNV_high`, which this cell does not define.
#dbscSNV_high_xlsx = xlsx_df(dbscSNV_high)

# -> 278 variants

#### Variant position rel. to splice site intronic +/- 1,2

In [None]:
# Variant keys of unique variants with PosExonRefSeqAccession < 3 whose PosExon_type is an
# 'outside' acceptor or donor site. The donor label must be 'outsideDonorSite': the loading
# cell renames 'outsideDonor' to 'outsideDonorSite', so matching the old spelling would
# silently drop every donor-side variant.
vars_grpd_pos_rel_df = vars_grpd.loc[
    vars_grpd['PosExon_type'].isin(['outsideAcceptorSite', 'outsideDonorSite'])
    & (vars_grpd['PosExonRefSeqAccession'] < 3),
    cols_cpra,
].reset_index(drop=True)
# NOTE(review): disabled export below refers to `pos_rel_df`, which this cell does not define.
#pos_rel_df_xlsx = xlsx_df(pos_rel_df)

# -> 56 variants (count recorded with the old 'outsideDonor' label; re-check after re-running)

#### ClinVar assessment for positive/negative controls

In [None]:
# Variant keys grouped by ClinVar_Pathogenicity label.
# unknown: any of the listed uncertain / not-provided spellings, or no label at all
clinvar_unknown_vars = vars_grpd.loc[
    vars_grpd['ClinVar_Pathogenicity'].isin(['Uncertain Significance', 'uncertain_significance',
                                             'Uncertain significance', 'not_provided'])
    | vars_grpd['ClinVar_Pathogenicity'].isnull(),
    cols_cpra,
]
# NOTE(review): benign/pathogenic match one exact spelling each - confirm no other
# spellings (e.g. different capitalisation) occur in the data.
clinvar_benign_vars = vars_grpd.loc[
    vars_grpd['ClinVar_Pathogenicity'].isin(['Benign', 'Likely Benign']),
    cols_cpra,
]
clinvar_patho_vars = vars_grpd.loc[
    vars_grpd['ClinVar_Pathogenicity'].isin(['Pathogenic', 'Likely Pathogenic']),
    cols_cpra,
]

# pathogenic: 101 variants -> more filters needed
# benign: 1,603 variants -> more filters needed

In [None]:
# for positive controls: pathogenic, spliceAI>0.9
# clinvar_patho_vars only holds the variant key (cols_cpra), so the annotation columns are
# joined back from vars_grpd first; reading score columns from it directly raises KeyError.
path_splAI = vars_grpd.merge(clinvar_patho_vars[cols_cpra], on=cols_cpra, how='inner')
path_splAI = path_splAI[path_splAI['SpliceAI_pred_DP_AG'].notnull()].copy()
path_splAI = make_float(path_splAI)
path_splAI = splai_o09(path_splAI).reset_index(drop=True)

# -> 30 variants

# for negative controls: benign, spliceAI == 0, rf/ada score < 0.001
# Same join for the benign set. make_float already casts ada_score and rf_score,
# so no separate cast is needed afterwards.
ben_splAI = vars_grpd.merge(clinvar_benign_vars[cols_cpra], on=cols_cpra, how='inner')
ben_splAI = ben_splAI[(ben_splAI['SpliceAI_pred_DP_AG'].notnull())&
                      (ben_splAI['ada_score'].notnull())].copy()
ben_splAI = make_float(ben_splAI)
ben_splAI = ben_splAI[(ben_splAI['SpliceAI_pred_DS_AG']==0)&(ben_splAI['SpliceAI_pred_DS_AL']==0)&
                      (ben_splAI['SpliceAI_pred_DS_DG']==0)&(ben_splAI['SpliceAI_pred_DS_DL']==0)&
                      (ben_splAI['MaxEntScan_diff'].notnull())&(ben_splAI['ada_score']<0.001)&
                      (ben_splAI['rf_score']<0.001)].reset_index(drop=True)

# -> 46 variants

#### Redundant variants

In [None]:
# Variant keys that occur in more than 5 rows of all_variants.
red_vars = all_variants[['#CHROM', 'POS', 'REF', 'ALT']].value_counts()
red_vars = red_vars[red_vars>5]

# Rows with a SpliceAI delta score present, restricted to columns_df_prior, scores cast to float.
red_vars_df = all_variants.copy()
red_vars_df_splAI = red_vars_df.loc[red_vars_df['SpliceAI_pred_DS_AG'].notnull(), columns_df_prior].copy()
red_vars_df_splAI = make_float(red_vars_df_splAI)

# Rows where at least one SpliceAI delta score is > 0; identical for every variant, so built once.
any_ds_positive = ((red_vars_df_splAI['SpliceAI_pred_DS_AG']>0)|(red_vars_df_splAI['SpliceAI_pred_DS_AL']>0)|
                   (red_vars_df_splAI['SpliceAI_pred_DS_DG']>0)|(red_vars_df_splAI['SpliceAI_pred_DS_DL']>0))

# One record per redundant variant that has such rows: annotations from the first
# matching row followed by the list of all matching PIDs.
red_vars_splAI = []
for c, p, r, a in red_vars.index:
    same_variant = ((red_vars_df_splAI['#CHROM']==c)&(red_vars_df_splAI['POS']==p)&
                    (red_vars_df_splAI['REF']==r)&(red_vars_df_splAI['ALT']==a))
    hits = red_vars_df_splAI[same_variant & any_ds_positive].reset_index(drop=True)
    if not hits.empty:
        first_hit = hits.iloc[0]
        red_vars_splAI.append([first_hit[col] for col in columns_df_prior[:-1]] + [list(hits['PID'])])

# Spread each PID list over integer-named columns (0, 1, 2, ...) and drop the list column.
red_vars_splAI_xlsx = pd.DataFrame(red_vars_splAI, columns=columns_df_xlsx[:-1])
pid_columns = pd.DataFrame(red_vars_splAI_xlsx['PIDs'].to_list())
red_vars_splAI_xlsx = red_vars_splAI_xlsx.join(pid_columns).drop(columns='PIDs')

# -> 206 variants

#### Hot/Cold splice project gene priority

In [None]:
# Variant keys of the unique variants, split by 'Splice Project Gene Priority' label.
hot_gene_vars = vars_grpd.loc[
    vars_grpd['Splice Project Gene Priority'] == 'Hot (ACMG / MASTER)', cols_cpra
].reset_index(drop=True)
warm_gene_vars = vars_grpd.loc[
    vars_grpd['Splice Project Gene Priority'] == 'Warm (MASTER-ACMG)', cols_cpra
].reset_index(drop=True)
cold_gene_vars = vars_grpd.loc[
    vars_grpd['Splice Project Gene Priority'] == 'Cold (387-ACMG-MASTER)', cols_cpra
].reset_index(drop=True)
# quick size check (disabled):
#len(vars_grpd), len(hot_gene_vars), len(warm_gene_vars), len(cold_gene_vars)