In [1]:
import pandas as pd
import pysam as ps
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the pickled variant table and order it by chromosome and position.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
naga_vars_path = '../03_andreas_HBOC_vars/00_SplicingNonCodingVariants/naga_vars_RNA'
naga_vars = (
    pd.read_pickle(naga_vars_path)
    .sort_values(by=['#CHROM', 'POS'], ignore_index=True)
)

### VEP TXT

In [None]:
# Tab-separated VEP output for the variants in naga_vars
vep = pd.read_csv('02_vep/vep_naga_vars.txt', sep='\t')

In [None]:
# Some variants could not be looked up in VEP by their HGVSg string.
# Export them (one row per CHROM/POS/REF/ALT) as a headerless, VCF-like
# table so they can be submitted to VEP in VCF format instead.
missing_from_vep = ~naga_vars['HGVSg'].isin(vep['#Uploaded_variation'])
not_in_vep = (
    naga_vars[missing_from_vep]
    .drop_duplicates(subset=['#CHROM', 'POS', 'REF', 'ALT'], ignore_index=True)
    # the HGVSg string doubles as the variant ID in the exported file
    .assign(ID=lambda frame: frame['HGVSg'])
)
vcf_like_columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT']
not_in_vep[vcf_like_columns].to_csv('02_vep/vep_ins.txt', sep='\t', header=False, index=False)

In [None]:
# Tab-separated VEP output for the variants that were re-submitted in VCF format
vep_ins = pd.read_csv('02_vep/vep_ins_naga_vars.txt', sep='\t')

In [None]:
# Combine both VEP runs into one table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# NOTE: this rebinds `vep`, so re-running the cell without reloading `vep`
# stacks `vep_ins` onto it a second time.
vep = pd.concat([vep, vep_ins], ignore_index=True)

# Transcript accession without its version suffix (text before the first '.')
vep['transcr'] = vep['Feature'].str.split('.', expand=True)[0]

# Keep only annotations for transcripts used in naga_vars, de-duplicate,
# renumber the rows and turn the '-' placeholder into NaN.
vep_ref = (
    vep[vep['transcr'].isin(naga_vars['RefSeq accession'])]
    .drop_duplicates()
    .reset_index(drop=True)
    .replace('-', np.nan)
)

In [None]:
# One RefSeq accession used in naga_vars (NM_020732, gene ARID1B) has no
# counterpart in vep_ref, so 25 unique variants are missing.
# --> the accession has been replaced by NM_001374820 in VEP.
wrong_refseq = set(vep_ref['transcr']) ^ set(naga_vars['RefSeq accession'])

# NOTE(review): takes the first element of a set - this assumes exactly one
# accession differs between the two tables.
unmatched_accession = list(wrong_refseq)[0]
wrong_refseq_vars = naga_vars[naga_vars['RefSeq accession'] == unmatched_accession]

# VEP rows for those variants, restricted to the same gene and to the
# replacement transcript.
is_affected_variant = vep['#Uploaded_variation'].isin(wrong_refseq_vars['HGVSg'])
affected_gene = wrong_refseq_vars['HUGO_Symbol'].unique()[0]
vep_wrong_refseq = vep[
    is_affected_variant
    & (vep['SYMBOL'] == affected_gene)
    & (vep['transcr'] == 'NM_001374820')
]

# Relabel the rows with Naga's accession so they match naga_vars
new_refs = vep_wrong_refseq.copy()
new_refs['transcr'] = 'NM_020732'

In [None]:
# Add the relabelled ARID1B rows and rename the join columns to the names
# used in naga_vars.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# NOTE: run once per fresh `vep_ref` - after the rename a second run would
# concatenate frames whose key columns no longer share names.
# NOTE: rows coming from `new_refs` still carry the '-' placeholder; it is
# converted to NaN later, after the FLOSSIES merge.
vep_ref = (
    pd.concat([vep_ref, new_refs], ignore_index=True)
    .rename(columns={'#Uploaded_variation': 'HGVSg', 'transcr': 'RefSeq accession'})
)

In [None]:
# Attach the VEP annotations to the Naga variants (with RNA info).
# Left join: every row of naga_vars is kept, annotated where VEP has a match.
merge_keys = ['HGVSg', 'RefSeq accession']
vep_annotation_columns = [
    'IMPACT', 'EXON', 'INTRON', 'cDNA_position', 'CDS_position',
    'Protein_position', 'Amino_acids', 'Codons', 'DISTANCE', 'STRAND',
    'SIFT', 'PolyPhen', 'HGVS_OFFSET',
    'SpliceAI_pred_DP_AG', 'SpliceAI_pred_DP_AL', 'SpliceAI_pred_DP_DG', 'SpliceAI_pred_DP_DL',
    'SpliceAI_pred_DS_AG', 'SpliceAI_pred_DS_AL', 'SpliceAI_pred_DS_DG', 'SpliceAI_pred_DS_DL',
    'SpliceAI_pred_SYMBOL', 'CADD_PHRED', 'CADD_RAW',
    'MaxEntScan_alt', 'MaxEntScan_diff', 'MaxEntScan_ref', 'ada_score', 'rf_score',
]
naga_vep = (
    naga_vars
    .merge(vep_ref[vep_annotation_columns + merge_keys], how='left', on=merge_keys)
    .drop_duplicates()
)

### FLOSSIES

In [None]:
# FLOSSIES table (stored earlier as a pickle)
# Give both frames identical key-column names and dtypes so they can be merged.
variant_key_dtypes = {'POS': int, '#CHROM': str, 'REF': str, 'ALT': str}
floss = (
    pd.read_pickle('00_dataframes/flossies_df')
    .rename(columns={'CHROM': '#CHROM'})
    .astype(variant_key_dtypes)
)
naga_vep = naga_vep.astype(variant_key_dtypes)

In [None]:
# Left-join the FLOSSIES columns onto the annotated variants, then convert
# the '-' and '.' placeholders to NaN.
variant_keys = ['#CHROM', 'POS', 'REF', 'ALT']
flossies_columns = [
    'Splice_Change_Flossies',
    'European_(n=7325)_Flossies', 'African_(n=2559)_Flossies',
    'Overall_Frequency_Flossies',
    'European (n=3646)', 'African (n=1283)',
]
naga_vep_floss = (
    naga_vep
    .merge(floss[variant_keys + flossies_columns], how='left', on=variant_keys)
    .replace('-', np.nan)
    .replace('.', np.nan)
)

### Andreas' HBOC variants

In [14]:
# Load the HBOC variant sheet and cast the variant key columns to the dtypes
# used for merging (integer position, string chromosome/alleles).
hboc_xlsx = '../03_andreas_HBOC_vars/00_SplicingNonCodingVariants/2021-02-17-AR-HBOC-Variants_for_SAV_project.xlsx'
hboc = pd.read_excel(hboc_xlsx).astype(
    {'POS': int, '#CHROM': str, 'REF': str, 'ALT': str}
)

Find duplicated variants (rows that share the same #CHROM, POS, REF and ALT)

In [15]:
# Work on a copy; reset_index(drop=False) keeps the original row labels
# in a regular column named 'index'.
hboc2 = hboc.copy().reset_index(drop=False)

# Group rows by variant and collect the keys of groups with more than one row
dupl2 = hboc2.groupby(['#CHROM', 'POS', 'REF', 'ALT'])
dupl_cpra2 = [variant for variant, rows in dupl2 if len(rows) > 1]
# inspect one group with: dupl2.get_group(dupl_cpra2[0])

In [16]:
# For every duplicated variant that has exactly one row with
# ART == 'keine Angabe', mark that row for removal.
for_drop = []
for variant in dupl_cpra2:
    rows = dupl2.get_group(variant)
    unspecified = rows[rows['ART'] == 'keine Angabe']
    if len(unspecified) == 1:
        # the 'index' column holds the row label saved by reset_index
        for_drop.append(unspecified.iloc[0]['index'])
hboc2.drop(index=for_drop, inplace=True)
# sanity check: len(hboc2), len(hboc), len(for_drop)

In [17]:
# Re-group after the first clean-up to list the variants that are still duplicated
dupl3 = hboc2.groupby(['#CHROM', 'POS', 'REF', 'ALT'])
dupl_cpra3 = [variant for variant, rows in dupl3 if len(rows) > 1]
# manual inspection helpers:
#   len(dupl_cpra3)
#   dupl3.get_group(dupl_cpra3[37])
#   set(hboc.loc[30]) ^ set(hboc.loc[634])

In [18]:
# Second, manual clean-up pass: drop a hand-picked list of row labels from hboc2.
# The three index pairs noted below are NOT in the drop list, so both rows of
# each pair stay in hboc2.
# NOTE(review): presumably these pairs are duplicates whose classification
# differs (2 vs 3) and were kept on purpose - confirm.
# index 293/294 (gr1) classes 3/2
# index 515/516 (gr6) classes 2/3
# index 3366/3367 (gr20) classes 2/3
# Row labels to remove; hard-coded, so they are only valid for this exact
# input file and row order.
for_drop2 = [176, 348, 466, 468, 492, 623, 4001, 1190, 1240, 1248, 1302, 1329, 1372, 1373, 1371, 1444, 1462, 
             1465, 1537, 3419, 3276, 801, 818, 851, 1045, 1061, 1120, 1125, 1135, 1651, 2778, 2905, 1936, 2044, 
             30, 3035]
# In-place drop by index label: raises KeyError if a label is no longer
# present (e.g. when this cell is run a second time).
hboc2.drop(index=for_drop2, inplace=True)
#len(hboc2), len(hboc), len(for_drop2)

In [19]:
# Map the original HBOC column names to English names with an `_hboc` suffix
hboc_column_names = {
    'ART': 'kind_hboc',
    'KLASSIFIKATION': 'classification_hboc',
    'ERFASSUNG': 'aquisition_hboc',
    'TASKFORCE_REVIEWED': 'taskforce_review_hboc',
    'DATUM_TASKFORCE_REVIEW': 'date_taskforce_hboc',
    'BEMERKUNG_TASKFORCE': 'note_hboc',
    'SONSTIGE_BEMERKUNGEN': 'add_note_hboc',
    'SPLICE_PREDICTION_ALAMUT': 'splice_pred_alamut_hboc',
    'PREDICTION_UMD_PREDICTOR': 'prediction_hboc',
    'KOMMENTAR': 'comment_hboc',
    'LITERATUR_ERGEBNIS': 'literature_hboc',
    'EVIDENZLEVEL_LITERATUR': 'evidence_level_lit_hboc',
    'HANDLUNGSEMPFEHLUNG': 'recomm_action_hboc',
    'ID': 'ID_hboc',
}
# Columns to keep, in output order: variant key first, then the HBOC annotations
hboc_output_columns = [
    '#CHROM', 'POS', 'REF', 'ALT',
    'ID_hboc', 'kind_hboc', 'classification_hboc', 'aquisition_hboc',
    'taskforce_review_hboc', 'date_taskforce_hboc', 'note_hboc', 'add_note_hboc',
    'splice_pred_alamut_hboc', 'prediction_hboc', 'literature_hboc',
    'evidence_level_lit_hboc', 'comment_hboc', 'recomm_action_hboc',
]
hboc2 = hboc2.rename(columns=hboc_column_names)[hboc_output_columns]

In [21]:
# Persist the cleaned HBOC table for reuse
pd.to_pickle(hboc2, '00_dataframes/hboc_vars')

In [None]:
# Left-join the HBOC annotations onto the variant table by CHROM/POS/REF/ALT
naga_vep_floss_hboc = naga_vep_floss.merge(
    hboc2,
    how='left',
    on=['#CHROM', 'POS', 'REF', 'ALT'],
)

### New SpliceAI scores (for variants without a SpliceAI score)

In [None]:
# SpliceAI scores stored earlier as a pickle
splai_path = '00_dataframes/new_splAI_scores'
splai = pd.read_pickle(splai_path)

In [None]:
# Split the table on whether VEP supplied a SpliceAI prediction
# (judged by SpliceAI_pred_DP_AG). Rows without one get the scores from
# `splai`, stored under the same SpliceAI_pred_* column names.
variant_keys = ['#CHROM', 'POS', 'REF', 'ALT']
splai_fields = ['DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL', 'SYMBOL']
prefixed_names = {field: 'SpliceAI_pred_' + field for field in splai_fields}

lacks_splai = naga_vep_floss_hboc['SpliceAI_pred_DP_AG'].isnull()

new_splai = (
    naga_vep_floss_hboc[lacks_splai]
    .merge(splai[variant_keys + splai_fields], on=variant_keys, how='left')
    # remove the VEP SpliceAI columns, then give the new scores those names
    .drop(columns=list(prefixed_names.values()))
    .rename(columns=prefixed_names)
)
with_splai = naga_vep_floss_hboc[~lacks_splai].copy()

In [None]:
# Recombine the rows that already had SpliceAI scores with the newly scored ones.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
naga_vep_floss_hboc_splai = pd.concat([with_splai, new_splai])

# Ordered categorical so chromosomes sort 1..22, X, Y instead of alphabetically.
# NOTE: any '#CHROM' value not in this list becomes NaN in the categorical.
chrom_order = [str(number) for number in range(1, 23)] + ['X', 'Y']
naga_vep_floss_hboc_splai['#CHROM'] = pd.Categorical(
    naga_vep_floss_hboc_splai['#CHROM'],
    categories=chrom_order,
    ordered=True,
)

# Sort by genomic coordinate and renumber the rows from 0
naga_vep_floss_hboc_splai = naga_vep_floss_hboc_splai.sort_values(
    by=['#CHROM', 'POS', 'REF', 'ALT'],
    ignore_index=True,
    ascending=True,
)

### All annotations together

In [None]:
# Row counts after each annotation step, in pipeline order
tuple(
    len(frame)
    for frame in (naga_vars, naga_vep, naga_vep_floss, naga_vep_floss_hboc, naga_vep_floss_hboc_splai)
)

In [None]:
# Persist the fully annotated variant table
pd.to_pickle(naga_vep_floss_hboc_splai, '00_dataframes/naga_vep_floss_hboc_splai')