To prioritize splice-relevant variants

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [7]:
# function to search for specific name in column
def search_col(inp):
    return [i for i in naga_all.columns if inp in i.lower()]

In [23]:
# function to make spliceAI prediction scores into floats
def make_float(df):
    for i in ['AG', 'AL', 'DG', 'DL']:
        df[('SpliceAI_pred_DS_'+i)]=df[('SpliceAI_pred_DS_'+i)].astype(float)
    for i in ['ada_score', 'rf_score', 'MaxEntScan_diff', 'MaxEntScan_alt', 'MaxEntScan_ref']:
        df[i] = df[i].astype(float)
    return df

In [24]:
# function to make excel tables with one variant per row
columns_df_prior = ['#CHROM', 'POS', 'REF', 'ALT', 'MaxEntScan_alt', 'MaxEntScan_diff', 'MaxEntScan_ref', 
                    'CADD_PHRED', 'SpliceAI_pred_DP_AG', 'SpliceAI_pred_DP_AL', 'SpliceAI_pred_DP_DG', 
                    'SpliceAI_pred_DP_DL', 'SpliceAI_pred_DS_AG', 'SpliceAI_pred_DS_AL', 'SpliceAI_pred_DS_DG', 
                    'SpliceAI_pred_DS_DL', 'SpliceAI_pred_SYMBOL', 'ada_score', 'rf_score',
                    'PosExonRefSeqAccession', 'PosExon_type', 'ClinVar_Pathogenicity', 'ClinVar_Traits', 'PID']
columns_df_xlsx = columns_df_prior[:-1]
columns_df_xlsx.extend(['PIDs', 'ct_PIDs'])
cols_cpra = ['#CHROM', 'POS', 'REF', 'ALT']

def xlsx_df(prio_df):
    lists = []
    for p,df in prio_df.groupby(by=['#CHROM', 'POS', 'REF', 'ALT']):
        df = df.reset_index(drop=True)
        list_df = [df.iloc[0][i] for i in columns_df_prior[:-1]]
        list_df.append(list(df['PID']))
        lists.append(list_df)
    return pd.DataFrame(lists, columns=columns_df_xlsx)

def xlsx_vars(prio_vars_df, name):
    merged = pd.merge(naga_all[columns_df_prior], prio_vars_df, how='inner')
    lists = []
    for p,df in merged.groupby(by=['#CHROM', 'POS', 'REF', 'ALT']):
        df = df.reset_index(drop=True)
        list_df = [df.iloc[0][i] for i in columns_df_prior[:-1]]
        list_df.extend([list(df['PID']), len(df['PID'])])
        lists.append(list_df)
    new_df = pd.DataFrame(lists, columns=columns_df_xlsx)
    new_df['#CHROM'] = pd.Categorical(new_df['#CHROM'],
                                      categories=['1','2','3','4','5','6','7','8','9','10','11','12', '13', '14', 
                                                  '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y'],
                                      ordered=True)
    new_df.sort_values(by=['ct_PIDs', '#CHROM', 'POS', 'REF', 'ALT'], inplace=True, ignore_index=True, 
                       ascending=[False, True, True, True, True])
    new_df.drop(axis='columns', columns='ct_PIDs', inplace=True)
    new_df.to_excel('03_prioritized_xlsx/%s.xlsx'%name, index=False)

In [54]:
# overview on variants counts for specific prioritizations
# any spliceAI score > 0.9
def splai_o09(df):
    return df[((df['SpliceAI_pred_DS_AG']>0.9)|(df['SpliceAI_pred_DS_AL']>0.9)|
               (df['SpliceAI_pred_DS_DG']>0.9)|(df['SpliceAI_pred_DS_DL']>0.9))].copy()

# ada/rf score > 0.6
def ada_rf_o06(df):
    return df[(df['ada_score']>0.6)&
              (df['rf_score']>0.6)].copy()

# MaxEntScan high disruption of native splice site
def MES_disr(df):
    return df[(df['MaxEntScan_diff']>=0)&
              (df['MaxEntScan_alt']<6.2)].copy()

# MaxEntScan high possibility of creating new splice site
def MES_new(df):
    return df[(df['MaxEntScan_diff']<0)&
              (df['MaxEntScan_alt']>8.5)].copy()

# position relative to exon/intron boundary less than 3 and in intron
def pos_rel_ei(df):
    return df[((df['PosExonRefSeqAccession']<3)&
               (df['PosExon_type'].isin(['outsideAcceptorSite', 'outsideDonorSite'])))|
              ((df['PosExonRefSeqAccession']<2)&
               (df['PosExon_type'].isin(['insideAcceptorSite', 'insideDonorSite'])))].copy()

# variant categorization per hot/warm/cold genes
def gene_prio(df):
    gus = ['AKT2', 'AKT3', 'ATR', 'ATRX', 'CDKN1A', 'CDKN2B', 'CDKN2C', 'CDKN2D', 'CEBPA', 'CEP57', 'DAXX', 'EGFR', 
           'EGLN1', 'EGLN2', 'EPAS1', 'EPHB2', 'ABRAXAS1', 'FANCM', 'GPRC5A', 'HNF1A', 'HORMAD1', 'HORMAD2', 'JAK2',
           'KIF1B', 'MAP3K1', 'MAP3K6', 'MDH2', 'MLH3', 'MMS19', 'MN1', 'MRE11', 'PALLD', 'PIK3C2G', 'PIK3R2', 
           'PMS1', 'PRF1', 'PTCH2', 'RAD50', 'RAD51', 'RECQL', 'RINT1']
    not_gus = (set(gus) ^ set(df['HUGO_Symbol'][df['Splice Project Gene Priority']=='Warm (MASTER-ACMG)'].unique()))
    df_lst = []
    for i,lst in zip(['Hot (ACMG / MASTER)', 'Warm (MASTER-ACMG)', 'Warm (MASTER-ACMG)', 'Cold (387-ACMG-MASTER)'], 
                     [[''], gus, not_gus, ['']]):
        df_lst.append(df[(df['Splice Project Gene Priority']==i)&
                         (~df['HUGO_Symbol'].isin(lst))].copy())
    return df_lst

def var_scores(df):
    table = pd.DataFrame(columns=['category', 'variant_count', 'row_count'])
    for data,n in zip([splai_o09(df), ada_rf_o06(df), MES_disr(df), MES_new(df), pos_rel_ei(df)]+gene_prio(df),
                      ['splai_o0.9', 'ada_rf_o0.6', 'MES_high_disr', 'MES_high_new', 'pos_rel_e_i', 
                       'gene_prio_hot', 'gene_prio_warm_wo_gus', 'gene_prio_gus', 'gene_prio_cold']):
        table = table.append({'category':n, 'variant_count':len(data.drop_duplicates(subset=cols_cpra)), 
                              'row_count':len(data)}, ignore_index=True)
    return table

In [26]:
naga_all=pd.read_pickle('00_dataframes/naga_vep_floss_hboc_splai')
naga_all['#CHROM'] = pd.Categorical(naga_all['#CHROM'],
                                    categories=['1','2','3','4','5','6','7','8','9','10','11','12', '13', '14', 
                                                '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y'],
                                    ordered=True)
naga_all = naga_all.sort_values(by=['#CHROM', 'POS', 'REF', 'ALT'], ignore_index=True)
naga_all['PosExon_type'] = naga_all['PosExon_type'].replace({'insideDonor':'insideDonorSite', 
                                                             'outsideDonor':'outsideDonorSite'})
naga_all = make_float(naga_all)

### Overview count of variants

In [60]:
ov_all = var_scores(naga_all)
#ov_all.to_excel('03_prioritized_xlsx/overview_var_counts_all.xlsx', index=False)

# For grouped variants

## Variants grouped

In [10]:
vars_grpd = naga_all.drop_duplicates(subset=cols_cpra)
#len(naga_all), len(vars_grpd)

## SpliceAI grouped

In [11]:
vars_grpd_splai = vars_grpd[vars_grpd['SpliceAI_pred_DP_AG'].notnull()].copy()
vars_grpd_splai = make_float(vars_grpd_splai)

vars_grpd_splai = vars_grpd_splai[cols_cpra][((vars_grpd_splai['SpliceAI_pred_DS_AG']>0.9)|
                                              (vars_grpd_splai['SpliceAI_pred_DS_AL']>0.9)|
                                              (vars_grpd_splai['SpliceAI_pred_DS_DG']>0.9)|
                                              (vars_grpd_splai['SpliceAI_pred_DS_DL']>0.9))
                                    ].drop_duplicates()
vars_grpd_splai.reset_index(drop=True, inplace=True)

In [15]:
len(naga_all[((naga_all['SpliceAI_pred_DS_AG']>0.9)|(naga_all['SpliceAI_pred_DS_AL']>0.9)|
              (naga_all['SpliceAI_pred_DS_DG']>0.9)|(naga_all['SpliceAI_pred_DS_DL']>0.9))].drop_duplicates())

219

## MaxEntScan grouped

In [None]:
vars_grpd_MaxEntScan = vars_grpd[columns_df_prior][vars_grpd['MaxEntScan_alt'].notnull()].copy()
vars_grpd_MaxEntScan[['MaxEntScan_alt', 'MaxEntScan_diff', 'MaxEntScan_ref']] = vars_grpd_MaxEntScan[[
    'MaxEntScan_alt', 'MaxEntScan_diff', 'MaxEntScan_ref']].astype(float)

# disr_ss = disrupts native splice site, diff > 0
disr_ss_vars = vars_grpd_MaxEntScan[(vars_grpd_MaxEntScan['MaxEntScan_diff']>=0)].copy()
disr_high_vars = disr_ss_vars[cols_cpra][disr_ss_vars['MaxEntScan_alt']<6.2]
disr_mod_vars =disr_ss_vars[cols_cpra][(disr_ss_vars['MaxEntScan_alt']>=6.2)&
                                     (disr_ss_vars['MaxEntScan_alt']<=8.5)]
disr_low_vars = disr_ss_vars[cols_cpra][disr_ss_vars['MaxEntScan_alt']>8.5]

#disr_high_xlsx, disr_mod_xlsx, disr_low_xlsx = [xlsx_df(i) for i in [disr_high, disr_mod, disr_low]]

# new_ss = creates new splice site, diff < 0
new_ss_vars = vars_grpd_MaxEntScan[(vars_grpd_MaxEntScan['MaxEntScan_diff']<0)].copy()
new_high_vars = new_ss_vars[cols_cpra][new_ss_vars['MaxEntScan_alt']>8.5]
new_mod_vars = new_ss_vars[cols_cpra][(new_ss_vars['MaxEntScan_alt']>=6.2)&
                                      (new_ss_vars['MaxEntScan_alt']<=8.5)]
new_low_vars = new_ss_vars[cols_cpra][new_ss_vars['MaxEntScan_alt']<6.2]

# reset index for all dfs
for df in [disr_high_vars, disr_low_vars, disr_mod_vars, new_high_vars, new_low_vars, new_mod_vars]:
    df.reset_index(drop=True, inplace=True)

#new_high_xlsx, new_mod_xlsx, new_low_xlsx = [xlsx_df(i) for i in [new_high, new_mod, new_low]]

## dbscSNV (ada score, rf score) grouped

In [None]:
vars_grpd_dbscSNV = vars_grpd[columns_df_prior][vars_grpd['ada_score'].notnull()&
                                                vars_grpd['rf_score'].notnull()].copy()
vars_grpd_dbscSNV[['ada_score', 'rf_score']] = vars_grpd_dbscSNV[['ada_score', 'rf_score']].astype(float)
dbscSNV_high_vars = vars_grpd_dbscSNV[cols_cpra][(vars_grpd_dbscSNV['ada_score']>0.6)&
                                                 (vars_grpd_dbscSNV['rf_score']>0.6)].copy()
dbscSNV_high_vars.reset_index(drop=True, inplace=True)
#dbscSNV_high_xlsx = xlsx_df(dbscSNV_high)

## Variant position rel. to splice site grouped

In [None]:
vars_grpd_pos_rel_df = vars_grpd[cols_cpra][(vars_grpd['PosExon_type'].isin(['outsideAcceptorSite', 
                                                        'outsideDonor']))&(vars_grpd['PosExonRefSeqAccession']<3)]
vars_grpd_pos_rel_df.reset_index(drop=True, inplace=True)
#pos_rel_df_xlsx = xlsx_df(pos_rel_df)
#pos_rel_df_xlsx.to_excel('03_prioritized_xlsx/var_pos_ss.xlsx', index=False, sheet_name='Pos<3,oA,oD')

## ClinVar characterization grouped

In [None]:
clinvar_unknown_vars = vars_grpd[cols_cpra][(vars_grpd['ClinVar_Pathogenicity']=='Uncertain Significance')|
                                            (vars_grpd['ClinVar_Pathogenicity']=='uncertain_significance')|
                                            (vars_grpd['ClinVar_Pathogenicity']=='Uncertain significance')|
                                            (vars_grpd['ClinVar_Pathogenicity']=='not_provided')|
                                            (vars_grpd['ClinVar_Pathogenicity'].isnull())]
clinvar_benign_vars = vars_grpd[cols_cpra][(vars_grpd['ClinVar_Pathogenicity']=='Benign')|
                                           (vars_grpd['ClinVar_Pathogenicity']=='Likely Benign')]
clinvar_patho_vars = vars_grpd[cols_cpra][(vars_grpd['ClinVar_Pathogenicity']=='Pathogenic')|
                                          (vars_grpd['ClinVar_Pathogenicity']=='Likely Pathogenic')]
for df in [clinvar_unknown_vars, clinvar_benign_vars, clinvar_patho_vars]:
    df.reset_index(drop=True, inplace=True)

In [None]:
controls_cols = ['#CHROM', 'POS', 'REF', 'ALT', 'MaxEntScan_alt', 'MaxEntScan_diff', 'MaxEntScan_ref', 
                 'CADD_PHRED', 'SpliceAI_pred_DP_AG', 'SpliceAI_pred_DP_AL', 'SpliceAI_pred_DP_DG', 
                 'SpliceAI_pred_DP_DL', 'SpliceAI_pred_DS_AG', 'SpliceAI_pred_DS_AL', 'SpliceAI_pred_DS_DG', 
                 'SpliceAI_pred_DS_DL', 'SpliceAI_pred_SYMBOL', 'ada_score', 'rf_score',
                 'PosExonRefSeqAccession', 'PosExon_type', 'ClinVar_Pathogenicity', 'ClinVar_Traits', 
                 'VEP_Most_Severe_Consequence', 'PID']
controls_xlsx_cols = controls_cols[:-1]
controls_xlsx_cols.extend(['PIDs', 'ct_PIDs'])

clinvar_benign_vars2 = vars_grpd[cols_cpra][(vars_grpd['ClinVar_Pathogenicity']=='Benign')|
                                           (vars_grpd['ClinVar_Pathogenicity']=='Likely Benign')]
clinvar_patho_vars2 = vars_grpd[cols_cpra][(vars_grpd['ClinVar_Pathogenicity']=='Pathogenic')|
                                          (vars_grpd['ClinVar_Pathogenicity']=='Likely Pathogenic')]

In [None]:
# with VEP most severe consequence for pathogenic variants as controls
merged2 = pd.merge(naga_all[controls_cols], clinvar_patho_vars2, how='inner')
lists2 = []
for p,df2 in merged2.groupby(by=['#CHROM', 'POS', 'REF', 'ALT']):
    df2 = df2.reset_index(drop=True)
    list_df2 = [df2.iloc[0][i] for i in controls_cols[:-1]]
    list_df2.extend([list(df2['PID']), len(df2['PID'])])
    lists2.append(list_df2)
new_df2 = pd.DataFrame(lists2, columns=controls_xlsx_cols)
new_df2['#CHROM'] = pd.Categorical(new_df2['#CHROM'],
                                    categories=['1','2','3','4','5','6','7','8','9','10','11','12', '13', '14', 
                                                '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y'],
                                    ordered=True)
new_df2.sort_values(by=['ct_PIDs', '#CHROM', 'POS', 'REF', 'ALT'], inplace=True, ignore_index=True, 
                    ascending=[False, True, True, True, True])
new_df2.drop(axis='columns', columns='ct_PIDs', inplace=True)


In [None]:
# RNA expression data for specific variants
pd.merge(merged2[['#CHROM', 'POS', 'REF', 'ALT', 'PID']], naga_all, how='left')[['#CHROM', 'POS', 'REF', 'ALT', 
         'PID', 'Control_VAF', 'Tumor_VAF', 'exome_control_AF', 'local_control_AF', 'RNA_VAF', 'AF_C', 'AF_T', 
         'AF_RNA', 'AF_RNA_noSkipReads', 'Tumor_dpALT', 'Control_dpALT', 'Alt_T', 'Alt_C', 'Cov_T', 'Cov_C']][
    (pd.merge(merged2[['#CHROM', 'POS', 'REF', 'ALT', 'PID']], naga_all, how='left')['AF_C'].notnull())&
    (pd.merge(merged2[['#CHROM', 'POS', 'REF', 'ALT', 'PID']], naga_all, how='left')['VEP_Most_Severe_Consequence']
     =='splice_acceptor_variant')]

In [None]:
#new_df2.to_excel('03_prioritized_xlsx/clinvar_patho_vep.xlsx', index=False)

## Arnes variant table with positive controls by Gepado

In [None]:
arne_pos = pd.read_excel('03_prioritized_xlsx/arne_pos_crtls.xlsx')
arne_pos[['#CHROM', 'REF', 'ALT']] = arne_pos[['#CHROM', 'REF', 'ALT']].astype(str)
arne_pos['POS'] = arne_pos['POS'].astype(int)
arne_pos = arne_pos.rename(columns={'PID':'PID_trans'})
arne_pos_vars = arne_pos[cols_cpra].drop_duplicates().reset_index(drop=True)

In [None]:
# variants of Arnes table (28) not all in Nagas table (14)
arne_naga = pd.merge((arne_pos[['#CHROM', 'POS', 'REF', 'ALT']]).drop_duplicates(ignore_index=True), vars_grpd, 
                     how='inner')
xlsx_vars(arne_naga, 'arne_naga_vars')

In [None]:
#arne_naga[~arne_naga['HGVSg'].isin(pd.merge(ol_splai_mes_dbscSNV, naga_all, how='inner')['HGVSg'])]

In [None]:
#ol_splai_mes_dbscSNV_naga['ClinVar_Pathogenicity'].value_counts()

## Hot/Cold splice project gene priority grouped

In [None]:
hot_gene_vars = vars_grpd[cols_cpra][vars_grpd['Splice Project Gene Priority']==
                                     'Hot (ACMG / MASTER)'].reset_index(drop=True)
warm_gene_vars = vars_grpd[cols_cpra][vars_grpd['Splice Project Gene Priority']==
                                      'Warm (MASTER-ACMG)'].reset_index(drop=True)
cold_gene_vars = vars_grpd[cols_cpra][vars_grpd['Splice Project Gene Priority']==
                                      'Cold (387-ACMG-MASTER)'].reset_index(drop=True)
#len(vars_grpd), len(hot_gene_vars), len(warm_gene_vars), len(cold_gene_vars)

## gnomAD AF grouped

In [None]:
# previously filtered by Naga -> only variants with AF < 0.005
ga_af_vars = vars_grpd[vars_grpd['max_gnomAD_AF'].notnull()].copy()
ga_af_vars['max_gnomAD_AF'] = ga_af_vars['max_gnomAD_AF'].astype(float)

## Variants without splice prediction scores

In [None]:
no_splAI = vars_grpd[vars_grpd['SpliceAI_pred_DP_AG'].isnull()]
no_MES = vars_grpd[vars_grpd['MaxEntScan_alt'].isnull()]
no_dbsc = vars_grpd[vars_grpd['rf_score'].isnull()|vars_grpd['ada_score'].isnull()]

len(vars_grpd), len(no_dbsc), len(no_MES), len(no_splAI)

In [None]:
#no_splAI[(no_splAI['ALT'].str.len()==1)&(no_splAI['REF'].str.len()==1)]

## Overlap of criteria grouped

In [None]:
#len(vars_grpd_splai), len(disr_high_vars), len(dbscSNV_high_vars), len(vars_grpd_pos_rel_df), len(clinvar_patho_vars)

In [None]:
ol_splai_mes = pd.merge(vars_grpd_splai, disr_high_vars, how='inner')
ol_splai_mes_dbscSNV = pd.merge(ol_splai_mes, dbscSNV_high_vars, how='inner')
ol_splai_mes_dbscSNV_hot = pd.merge(ol_splai_mes_dbscSNV, hot_gene_vars, how='inner')

## Evaluated variants

In [None]:
new_splAI_high = pd.merge(vars_grpd_splai, vars_grpd, how='left'
                         )[~pd.merge(vars_grpd_splai, vars_grpd, how='left')['HGVSg'].isin(
    pd.merge(ol_splai_mes_dbscSNV, vars_grpd, how='left')['HGVSg'])]

In [None]:
new_MES_high_disr = pd.merge(disr_high_vars, vars_grpd, how='left'
                         )[~pd.merge(disr_high_vars, vars_grpd, how='left')['HGVSg'].isin(
    pd.merge(ol_splai_mes_dbscSNV, vars_grpd, how='left')['HGVSg'])]

In [None]:
new_MES_high_new = pd.merge(new_high_vars, vars_grpd, how='left'
                         )[~pd.merge(new_high_vars, vars_grpd, how='left')['HGVSg'].isin(
    pd.merge(ol_splai_mes_dbscSNV, vars_grpd, how='left')['HGVSg'])]

In [None]:
new_dbsc_high = pd.merge(dbscSNV_high_vars, vars_grpd, how='left'
                         )[~pd.merge(dbscSNV_high_vars, vars_grpd, how='left')['HGVSg'].isin(
    pd.merge(ol_splai_mes_dbscSNV, vars_grpd, how='left')['HGVSg'])]

In [None]:
new_pos_rel = pd.merge(vars_grpd_pos_rel_df, vars_grpd, how='left'
                         )[~pd.merge(vars_grpd_pos_rel_df, vars_grpd, how='left')['HGVSg'].isin(
    pd.merge(ol_splai_mes_dbscSNV, vars_grpd, how='left')['HGVSg'])]

## Make .xlsx with selected columns, PIDs for grouped variants

In [None]:
# dfs with all important information to .xlsx
#for df, n in [(vars_grpd_splai, 'spliceAI_high'), (disr_high_vars, 'MaxEntScan_high_disr'), 
              #(new_high_vars, 'MaxEntScan_high_new_ss'), (dbscSNV_high_vars, 'dbscSNV_high'), 
              #(vars_grpd_pos_rel_df, 'pos_rel_IntExBound_under3'), (clinvar_benign_vars, 'clinvar_benign'), 
              #(clinvar_patho_vars, 'clinvar_patho'), (ol_splai_mes_dbscSNV_naga, 'merge_splAI_MES_dbsc')]:
    #xlsx_vars(df, n)

In [None]:
#for df, n in [(new_splAI_high, 'new_splAI_high'), (new_MES_high_disr, 'new_MES_high_disr'), 
              #(new_MES_high_new, 'new_MES_high_new'), (new_dbsc_high, 'new_dbsc_high'), 
              #(new_pos_rel, 'new_pos_rel')]:
    #xlsx_vars(df, n)

In [None]:
# merged df to .xlsx
#xlsx_vars(ol_splai_mes_dbscSNV_hot, 'merge_splAI_MES_dbsc_hot')

# For original df, all rows

## SpliceAI

In [None]:
spl_ai = naga_all[naga_all['SpliceAI_pred_DP_AG'].notnull()].copy()
spl_ai = make_float(spl_ai)

# how many rows in each cutoff
splAI_ct = [(round(i,1), len(spl_ai[(spl_ai['SpliceAI_pred_DS_AG']>i)|(spl_ai['SpliceAI_pred_DS_AL']>i)|
                            (spl_ai['SpliceAI_pred_DS_DG']>i)|(spl_ai['SpliceAI_pred_DS_DL']>i)]))
            for i in np.arange(0,1,0.1)]

In [None]:
# high spliceAI scores (>0.9)
spl_ai_df = spl_ai[columns_df_prior][((spl_ai['SpliceAI_pred_DS_AG']>0.9)|(spl_ai['SpliceAI_pred_DS_AL']>0.9)|
                                      (spl_ai['SpliceAI_pred_DS_DG']>0.9)|(spl_ai['SpliceAI_pred_DS_DL']>0.9))
                                    ].drop_duplicates()
spl_ai_df_xlsx = xlsx_df(spl_ai_df)
spl_ai_df_xlsx.to_excel('03_prioritized_xlsx/splAI_vars.xlsx', index=False, sheet_name='SpliceAI>0.9')

In [None]:
spl_ai_gain = spl_ai[columns_df_prior][(spl_ai['PosExonRefSeqAccession']>2)&
                                       ((spl_ai['SpliceAI_pred_DS_AG']>0.9)|(spl_ai['SpliceAI_pred_DS_AL']>0.9)|
                                        (spl_ai['SpliceAI_pred_DS_DG']>0.9)|(spl_ai['SpliceAI_pred_DS_DL']>0.9))
                                      ].copy()
spl_ai_gain_xlsx = xlsx_df(spl_ai_gain)
spl_ai_gain_xlsx.to_excel('03_prioritized_xlsx/splAI_gain.xlsx', index=False, sheet_name='SpliceAI>0.9,pos>2')

## MaxEntScan

In [None]:
# differentiation between: (1) variant disrupts native splice site and (2) variant creates new splice site
MaxEntScan = naga_all[columns_df_prior][naga_all['MaxEntScan_alt'].notnull()].copy()
MaxEntScan[['MaxEntScan_alt', 'MaxEntScan_diff', 'MaxEntScan_ref']] = MaxEntScan[['MaxEntScan_alt', 
                                                                                  'MaxEntScan_diff', 
                                                                                  'MaxEntScan_ref']].astype(float)
# disr_ss = disrupts native splice site, diff > 0
disr_ss = MaxEntScan[(MaxEntScan['MaxEntScan_diff']>=0)]
disr_high = disr_ss[disr_ss['MaxEntScan_alt']<6.2]
disr_mod = disr_ss[(disr_ss['MaxEntScan_alt']>=6.2)&(disr_ss['MaxEntScan_alt']<=8.5)]
disr_low = disr_ss[disr_ss['MaxEntScan_alt']>8.5]

disr_high_xlsx, disr_mod_xlsx, disr_low_xlsx = [xlsx_df(i) for i in [disr_high, disr_mod, disr_low]]

# new_ss = creates new splice site, diff < 0
new_ss = MaxEntScan[(MaxEntScan['MaxEntScan_diff']<0)]
new_high = new_ss[new_ss['MaxEntScan_alt']>8.5]
new_mod = new_ss[(new_ss['MaxEntScan_alt']>=6.2)&(new_ss['MaxEntScan_alt']<=8.5)]
new_low = new_ss[new_ss['MaxEntScan_alt']<6.2]

new_high_xlsx, new_mod_xlsx, new_low_xlsx = [xlsx_df(i) for i in [new_high, new_mod, new_low]]

lens_MES = [(n,len(d)) for d,n in zip([disr_ss, disr_high, disr_mod, disr_low, new_ss, new_high, new_mod, new_low],
                                      ['disr_ss', 'disr_high', 'disr_mod', 'disr_low', 
                                       'new_ss', 'new_high', 'new_mod', 'new_low'])]

In [None]:
writer = pd.ExcelWriter('03_prioritized_xlsx/MaxEntScan_vars.xlsx')
disr_high_xlsx[disr_high_xlsx['PosExonRefSeqAccession']<3].to_excel(writer, index=False, 
                                                                    sheet_name='disr_high_pos<3')
new_high_xlsx[new_high_xlsx['PosExonRefSeqAccession']>15].to_excel(writer, index=False, 
                                                                   sheet_name='new_high_pos>15')
writer.save()

## dbscSNV (ada score, rf score)

In [None]:
dbscSNV = naga_all[naga_all['ada_score'].notnull()&naga_all['rf_score'].notnull()].copy()
dbscSNV[['ada_score', 'rf_score']] = dbscSNV[['ada_score', 'rf_score']].astype(float)
dbscSNV_high = dbscSNV[(dbscSNV['ada_score']>0.6)&(dbscSNV['rf_score']>0.6)].copy()
dbscSNV_high_xlsx = xlsx_df(dbscSNV_high)

## Variant position rel. to splice site

In [None]:
pos_rel_df = naga_all[columns_df_prior][(naga_all['PosExon_type'].isin(['outsideAcceptorSite', 'outsideDonor']))&
                                        (naga_all['PosExonRefSeqAccession']<3)].drop_duplicates()
pos_rel_df_xlsx = xlsx_df(pos_rel_df)
pos_rel_df_xlsx.to_excel('03_prioritized_xlsx/var_pos_ss.xlsx', index=False, sheet_name='Pos<3,oA,oD')

## Hot/Cold splice project gene priority

In [None]:
# gene priorities
hot_gene, warm_gene, cold_gene = [naga_all[naga_all['Splice Project Gene Priority']==i].copy() for i in 
                                  ['Hot (ACMG / MASTER)', 'Warm (MASTER-ACMG)', 'Cold (387-ACMG-MASTER)']]
hot_gene_xlsx = xlsx_df(hot_gene)

In [None]:
# overlap hot splice gene priority, high spliceAI score > 0.9
hot_splAI_xlsx = pd.merge(spl_ai_df_xlsx, hot_gene_xlsx[cols_cpra], how='inner')
hot_splAI_xlsx.to_excel('03_prioritized_xlsx/hot_gene_splAI.xlsx', sheet_name='hotGenePriority,splAI>0.9', 
                        index=False)

## ClinVar characterization

In [None]:
# ClinVar groups
# clinvar_unknown: 12,520 rows (6,533 variants)
# clinvar_benign: 5,475 rows (1,603 variants)
# clinvar_patho: 173 rows (101 variants)
clinvar_unknown = naga_all[(naga_all['ClinVar_Pathogenicity']=='Uncertain Significance')|
                           (naga_all['ClinVar_Pathogenicity']=='uncertain_significance')|
                           (naga_all['ClinVar_Pathogenicity']=='Uncertain significance')|
                           (naga_all['ClinVar_Pathogenicity']=='not_provided')|
                           (naga_all['ClinVar_Pathogenicity'].isnull())]
clinvar_benign = naga_all[(naga_all['ClinVar_Pathogenicity']=='Benign')|
                          (naga_all['ClinVar_Pathogenicity']=='Likely Benign')]
clinvar_patho = naga_all[(naga_all['ClinVar_Pathogenicity']=='Pathogenic')|
                         (naga_all['ClinVar_Pathogenicity']=='Likely Pathogenic')]

In [None]:
# for positive control: pathogenic, spliceAI>0.9 --> 63 rows, 30 variants
path_splAI = clinvar_patho[clinvar_patho['SpliceAI_pred_DP_AG'].notnull()].copy()
path_splAI = make_float(path_splAI)
path_splAI = path_splAI[(path_splAI['SpliceAI_pred_DS_AG']>0.9)|(path_splAI['SpliceAI_pred_DS_AL']>0.9)|
                        (path_splAI['SpliceAI_pred_DS_DG']>0.9)|(path_splAI['SpliceAI_pred_DS_DL']>0.9)
                       ].reset_index(drop=True)
path_splAI_xlsx = xlsx_df(path_splAI)

# for negative control: benign, spliceAI == 0, rf/ada score < 0.001 --> 164 rows, 46 variants
ben_splAI = clinvar_benign[(clinvar_benign['SpliceAI_pred_DP_AG'].notnull())&
                           (clinvar_benign['ada_score'].notnull())].copy()
ben_splAI = make_float(ben_splAI)
ben_splAI[['ada_score', 'rf_score']] = ben_splAI[['ada_score', 'rf_score']].astype(float)
ben_splAI = ben_splAI[(ben_splAI['SpliceAI_pred_DS_AG']==0)&(ben_splAI['SpliceAI_pred_DS_AL']==0)&
                      (ben_splAI['SpliceAI_pred_DS_DG']==0)&(ben_splAI['SpliceAI_pred_DS_DL']==0)&
                      (ben_splAI['MaxEntScan_diff'].notnull())&(ben_splAI['ada_score']<0.001)&
                      (ben_splAI['rf_score']<0.001)].reset_index(drop=True)
ben_splAI_xlsx = xlsx_df(ben_splAI)

# in excel file
writer = pd.ExcelWriter('03_prioritized_xlsx/control_vars.xlsx')
path_splAI_xlsx.to_excel(writer, index=False, sheet_name='pos_crtl_path_high_splAI')
ben_splAI_xlsx.to_excel(writer, index=False, sheet_name='neg_crtl_benign_low_scores')
writer.save()

## Allele Frequency gnomAD

In [None]:
# AF by gnomAD ('max_gnomAD_AF') --> 15,174 rows (6,331 variants), all variants < 0.005 AF because of cutoff Naga
gnomAD_af = naga_all[naga_all['max_gnomAD_AF'].notnull()].copy()
gnomAD_af['max_gnomAD_AF'] = gnomAD_af['max_gnomAD_AF'].astype(float)

## Redundant variants

In [None]:
red_vars = naga_all[['#CHROM', 'POS', 'REF', 'ALT']].value_counts()
red_vars = red_vars[red_vars>5]

In [None]:
red_vars_df = naga_all.copy()
red_vars_df_splAI = red_vars_df[columns_df_prior][(red_vars_df['SpliceAI_pred_DS_AG'].notnull())].copy()
red_vars_df_splAI = make_float(red_vars_df_splAI)
red_vars_splAI = []
for (c,p,r,a), ct in zip(red_vars.index, red_vars):
    df = red_vars_df_splAI[
        ((red_vars_df_splAI['#CHROM']==c)&(red_vars_df_splAI['POS']==p)&
         (red_vars_df_splAI['REF']==r)&(red_vars_df_splAI['ALT']==a))&
        ((red_vars_df_splAI['SpliceAI_pred_DS_AG']>0)|(red_vars_df_splAI['SpliceAI_pred_DS_AL']>0)|
         (red_vars_df_splAI['SpliceAI_pred_DS_DG']>0)|(red_vars_df_splAI['SpliceAI_pred_DS_DL']>0))
    ].reset_index(drop=True)
    if df.empty == False:
        list_df = [df.iloc[0][i] for i in columns_df_prior[:-1]]
        list_df.append(list(df['PID']))
        red_vars_splAI.append(list_df)

In [None]:
red_vars_splAI_xlsx = pd.DataFrame(red_vars_splAI, columns=columns_df_xlsx)
red_vars_splAI_xlsx = red_vars_splAI_xlsx.join(pd.DataFrame(red_vars_splAI_xlsx['PIDs'].to_list())
                                              ).drop(columns='PIDs')
red_vars_splAI_xlsx.to_excel('03_prioritized_xlsx/redundant_vars_new.xlsx', sheet_name='redundant_vars_o6', 
                             index=False)

## One patient, many variants

In [None]:
# circos plot would be possible
by_patient = naga_all['PID'].value_counts()
by_patient_20x = by_patient[by_patient>19]
#len(by_patient), len(by_patient_20x)

## Overlap between prioritized variant categorizations

In [None]:
# overlap between spliceAI, position relative to ss --> 117 rows
# overlap between spliceAI, position relative to ss, MaxEntScan disr_high --> 115 rows
# overlap between spliceAI, position relative to ss, MaxEntScan disr_high, dbscSNV_high --> 114 rows, 52 variants
splAI_rel = pd.merge(spl_ai_df[['#CHROM', 'POS', 'REF', 'ALT', 'PID']], 
                        pos_rel_df[['#CHROM', 'POS', 'REF', 'ALT', 'PID']], how='inner')
splAI_rel_MES = pd.merge(splAI_rel, 
                         disr_high[['#CHROM', 'POS', 'REF', 'ALT', 'PID']], how='inner')
splAI_rel_MES_dbsc = pd.merge(splAI_rel_MES, 
                              dbscSNV_high[['#CHROM', 'POS', 'REF', 'ALT', 'PID']], how = 'inner')
naga_all_merge = pd.merge(splAI_rel_MES_dbsc, naga_all, on=['#CHROM', 'POS', 'REF', 'ALT', 'PID'], how='inner')

In [None]:
# with _xlsx dataframes, variant-specific
splAI_rel2 = pd.merge(spl_ai_df_xlsx[['#CHROM', 'POS', 'REF', 'ALT']], 
                        pos_rel_df_xlsx[['#CHROM', 'POS', 'REF', 'ALT']], how='inner')
splAI_rel_MES2 = pd.merge(splAI_rel2, 
                         disr_high_xlsx[['#CHROM', 'POS', 'REF', 'ALT']], how='inner')
splAI_rel_MES_dbsc2 = pd.merge(splAI_rel_MES2, 
                              dbscSNV_high_xlsx[['#CHROM', 'POS', 'REF', 'ALT']], how = 'inner')
splAI_rel_MES_dbsc_red2 = pd.merge(splAI_rel_MES_dbsc2, 
                                   red_vars_splAI_xlsx[['#CHROM', 'POS', 'REF', 'ALT']], how = 'inner')
#naga_all_merge2 = pd.merge(splAI_rel_MES_dbsc, naga_all, on=['#CHROM', 'POS', 'REF', 'ALT', 'PID'], how='inner')

In [None]:
merge2 = pd.merge(splAI_rel_MES_dbsc_red2, red_vars_splAI_xlsx, on=['#CHROM', 'POS', 'REF', 'ALT'], how='left')
merge_wona = merge2.copy()
merge_wona.dropna(axis=1, how='all', inplace=True)
merge_wona

In [None]:
naga_all_merge[columns_df_prior].to_excel('03_prioritized_xlsx/prioriz_naga_vars.xlsx', index=False, 
                                          sheet_name='SplAI,Pos<3,MES,dbscSNV')

In [None]:
# overlap high in spliceAI and create new splice site and position rel. to ss > 15 --> 
splAI_rel_MES_newSS = new_high[(new_high['SpliceAI_pred_DS_DG'].notnull())].copy()
splAI_rel_MES_newSS = make_float(splAI_rel_MES_newSS)
splAI_rel_MES_newSS =splAI_rel_MES_newSS[
                               ((splAI_rel_MES_newSS['SpliceAI_pred_DS_AG']>0.2)|
                                (splAI_rel_MES_newSS['SpliceAI_pred_DS_AL']>0.2)|
                                (splAI_rel_MES_newSS['SpliceAI_pred_DS_DG']>0.2)|
                                (splAI_rel_MES_newSS['SpliceAI_pred_DS_DL']>0.2))]