In [1]:
import pandas as pd
import numpy as np

### Functions

In [2]:
def search_col(df, inp):
    """Return the column labels of ``df`` that contain ``inp``, ignoring case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are searched.
    inp : str
        Substring to look for. Matching is case-insensitive on both sides, so
        ``'SpliceAI'`` and ``'spliceai'`` give the same result.

    Returns
    -------
    list
        The matching column labels, unchanged and in column order. Empty if
        nothing matches.
    """
    needle = str(inp).lower()
    # str(col) keeps the search from failing on non-string labels (e.g. integers).
    return [col for col in df.columns if needle in str(col).lower()]
# Columns that together identify one variant (chromosome, position, reference allele,
# alternative allele). Used as the merge / de-duplication key throughout this notebook.
cols_cpra = ['#CHROM', 'POS', 'REF', 'ALT']

In [3]:
# exclude one pandas dataframe (df2) from another dataframe (df1)
# pd.concat([df1, df2, df2]).drop_duplicates(keep=False)

### Variant input, cleaning known variants from NCT/Gepado/MASTER

In [4]:
# Load the three input tables from paths relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# from a trusted source.
all_vars = pd.read_pickle('00_dataframes/all_annotated_vars')
igv_vars = pd.read_pickle('00_dataframes/igv_vars')
# Spreadsheet export of the already known NCT/Gepado/MASTER variants.
known_vars = pd.read_excel('01_xlsx_files/2021-08-12_NCT_gepado_MASTER_variants.xlsx')

In [5]:
# Clean the known-variant table and align its column names with the other variant frames.
# NOTE: this cell overwrites known_vars and renames 'Chr', so it can only be run once
# per load of the spreadsheet.

# Cells holding nothing but a double line break are treated as missing.
known_vars = known_vars.replace('\n\n', np.nan)

# Map zero-padded chromosome labels ('01' .. '09') to their unpadded form ('1' .. '9'),
# then switch to the key column names used by the other frames.
leading_zero_to_plain = {f'0{digit}': str(digit) for digit in range(1, 10)}
known_vars = (
    known_vars
    .assign(Chr=known_vars['Chr'].replace(leading_zero_to_plain))
    .rename(columns={
        'Chr': '#CHROM',
        'Position': 'POS',
        'Reference Nucleotide': 'REF',
        'Variant Nucleotide': 'ALT',
    })
)

# Patient identifier = the part of 'Patient Name' before the first comma.
known_vars['PID_trans'] = known_vars['Patient Name'].str.split(',', expand=True)[0]

# Keep only rows whose position is a purely numeric string. The explicit `== True`
# turns the NaN that .str.isnumeric() returns for non-string cells into False.
has_numeric_pos = known_vars['POS'].str.isnumeric() == True
known_vars = known_vars[has_numeric_pos].copy()

In [6]:
# see, if entries with different names exist
#known_vars[['try1', 'try2']] = known_vars['Patient Name'].str.split(', ', expand = True)
#known_vars[known_vars['try1']!=known_vars['try2']]
#known_vars = known_vars.drop(columns=['try1', 'try2'])
# --> 15 entries with different name after ',' -> but all not relevant

In [6]:
# Give the key columns the same dtypes in all three frames (strings for chromosome and
# alleles, integers for the position) so that merges on them compare like with like.
string_key_cols = ['#CHROM', 'REF', 'ALT']
for frame in (all_vars, igv_vars, known_vars):
    for col in string_key_cols:
        frame[col] = frame[col].astype(str)
    frame['POS'] = frame['POS'].astype(int)

In [8]:
#known_vars.to_pickle('00_dataframes/nct_gepado_master_vars')

In [100]:
# List the columns of all_vars whose lower-cased name contains 'spliceai'.
search_col(all_vars, 'spliceai')

['SpliceAI_pred_DP_AG',
 'SpliceAI_pred_DP_AL',
 'SpliceAI_pred_DP_DG',
 'SpliceAI_pred_DP_DL',
 'SpliceAI_pred_DS_AG',
 'SpliceAI_pred_DS_AL',
 'SpliceAI_pred_DS_DG',
 'SpliceAI_pred_DS_DL',
 'SpliceAI_pred_SYMBOL']

### Merge dataframes

In [7]:
# Unique variant keys of each source.
igv_cpra_unique = igv_vars[cols_cpra].drop_duplicates()
known_cpra_unique = known_vars[cols_cpra].drop_duplicates()

# merge_vars: variants that are described in known_vars AND were reviewed in IGV.
merge_vars = igv_cpra_unique.merge(known_cpra_unique, how='inner').drop_duplicates()

# not_known_vars: variants reviewed in IGV that are absent from known_vars.
# Anti-join trick: the shared variants are appended twice, so keep=False removes
# every key that also occurs in merge_vars.
not_known_vars = pd.concat([igv_cpra_unique, merge_vars, merge_vars]).drop_duplicates(keep=False)

In [10]:
# merge IGV variants (476 variants) and NCT known variants (4414 variants) --> 108 variants
#len(pd.merge(igv_vars[cols_cpra].drop_duplicates(), known_vars[cols_cpra].drop_duplicates(), how='inner'))

# merge all Naga variants (8237 variants) and NCT known variants (4414 variants) --> 788 variants
#len(pd.merge(all_vars[cols_cpra].drop_duplicates(), known_vars[cols_cpra].drop_duplicates(), how='inner'))

In [8]:
# merge_df: IGV-reviewed rows whose variant AND patient (PID_trans) also occur in
# known_vars, extended by the known_vars columns. One row is kept per distinct
# combination of the igv_vars columns.
merge_df = pd.merge(merge_vars, igv_vars, how='inner')
merge_df = pd.merge(merge_df, known_vars, on=cols_cpra+['PID_trans'], how='inner').drop_duplicates(
    subset=igv_vars.columns)

# not_known_df: all IGV-reviewed rows (including patient) that are NOT part of merge_df.
# Anti-join via concat: merge_df is appended twice so every row it shares with igv_vars
# is removed by keep=False. igv_vars is de-duplicated first, because keep=False would
# otherwise also discard rows that are merely repeated inside igv_vars itself.
# (An earlier variant-level assignment to not_known_df was overwritten immediately
# and has been removed.)
not_known_df = pd.concat([igv_vars.drop_duplicates(), merge_df, merge_df]).drop_duplicates(
    subset=igv_vars.columns, keep=False)

In [9]:
# Look up the columns of merge_df whose lower-cased name contains 'short'.
search_col(merge_df, 'short')

['shortcut']

In [88]:
# control length of dfs
#len(merge_vars), len(not_known_vars), len(igv_vars[cols_cpra].drop_duplicates())
#len(merge_df), len(not_known_df), len(igv_vars), len(pd.concat([merge_df, not_known_df]))

In [10]:
# Print which of these hand-picked variants occur in merge_df.
# The identifiers appear to follow the pattern CHROM_POS_REF_ALT.
variants_of_interest = [
    '8_30916058_A_G', '16_2137924_TCCCTGCAGTGCAGGAAAGGTAGGGCCGGGTGGGG_T', '22_24175755_G_A',
    '1_45797835_T_G', '2_47612302_C_G', '2_48033789_C_T', '3_14190057_C_T', '3_52443725_C_G',
    '10_104389820_C_A', '11_108224490_T_G', '12_133254296_C_A', '13_48881547_G_A',
    '13_48953712_CTTTTT_C', '14_105243112_G_T', '16_3650977_T_TA', '17_7579699_C_T',
    '17_29482996_TTCAGCTTCCAATA_T',
]
# Build the lookup once instead of recomputing .unique() on every iteration.
merge_df_shortcuts = set(merge_df['shortcut'].unique())
for shortcut in variants_of_interest:
    if shortcut in merge_df_shortcuts:
        print(shortcut)

8_30916058_A_G
2_48033789_C_T
12_133254296_C_A
16_3650977_T_TA
17_7579699_C_T


In [99]:
# Variants (by shortcut) that occur in both not_known_df and merge_df: presumably
# the variant is recorded in known_vars for some patients but not for others.
set(not_known_df['shortcut'].unique()).intersection(merge_df['shortcut'].unique())

{'11_108117691_G_A',
 '16_2112580_C_T',
 '16_2121617_T_C',
 '1_243663089_C_T',
 '1_45797228_C_T',
 '21_36231773_C_T',
 '2_128017002_A_C',
 '3_37035130_C_G',
 '8_30916058_A_G'}

In [None]:
# Patient-level overlap of IGV-reviewed variants and known variants, carrying the
# classification from known_vars along.
igv_variant_patient = igv_vars[cols_cpra + ['shortcut', 'PID_trans']].drop_duplicates()
known_variant_class = known_vars[cols_cpra + ['Variant Classification Total', 'PID_trans']].drop_duplicates()
merged_known_igv = igv_variant_patient.merge(
    known_variant_class, on=cols_cpra + ['PID_trans'], how='inner'
).drop_duplicates()

# Overlap rows whose classification is one of the listed labels or missing
# (np.nan in the list makes isin() match missing values as well).
uncertain_labels = ['Unklare Signifikanz', 'Artefakt, fraglich', np.nan, 'Artefakt, gesichert']
uncertain_merged = merged_known_igv[
    merged_known_igv['Variant Classification Total'].isin(uncertain_labels)
]

In [None]:
# Attach the IGV review columns to the uncertain variants. The merge is computed once
# and reused for both the frame and the filter mask.
# NOTE(review): igv_vars is reduced to one row per variant before a merge that uses all
# shared columns as keys; if those include PID_trans, a variant may only match for the
# patient of its first IGV row - confirm this is intended.
uncertain_with_igv = pd.merge(uncertain_merged, igv_vars.drop_duplicates(subset=cols_cpra), how='left')

# Keep only the variants for which the IGV review recorded a splice change.
uncertain_splice_yes = uncertain_with_igv[uncertain_with_igv['splice_change'] == 'yes']

# Add the remaining known_vars columns for the same variant, patient and classification.
uncertain_splice_yes = pd.merge(uncertain_splice_yes, known_vars,
                                on=cols_cpra+['PID_trans', 'Variant Classification Total'], how='left')

In [None]:
# not_known: variant/patient pairs reviewed in IGV that have no counterpart in known_vars.
# Anti-join via concat: the known pairs are appended twice so keep=False removes them.
# The IGV pairs are de-duplicated first; otherwise keep=False would also discard pairs
# that simply occur more than once in igv_vars.
igv_variant_patient_pairs = igv_vars[cols_cpra+['PID_trans']].drop_duplicates()
known_variant_patient_pairs = merged_known_igv[cols_cpra+['PID_trans']]
not_known = pd.concat([igv_variant_patient_pairs, known_variant_patient_pairs,
                       known_variant_patient_pairs]).drop_duplicates(keep=False)

In [None]:
# Number of distinct variant keys in: not-known pairs, known/IGV overlap, all IGV rows.
tuple(len(frame.groupby(cols_cpra)) for frame in (not_known, merged_known_igv, igv_vars))

In [None]:
# Show the key, classification and review columns of the uncertain splice-changing variants.
uncertain_report_cols = [
    '#CHROM', 'POS', 'REF', 'ALT',
    'PID_trans',
    'Variant Classification Total',
    'INFO_IGV',
    'Tumorboard Counseling Information',
    'Included in Report',
    'Splice Project Gene Priority',
]
uncertain_splice_yes[uncertain_report_cols]

In [None]:
# Overlap rows without any classification, joined with every IGV row of the same variant
# (the join uses the variant key only).
lacks_classification = uncertain_merged['Variant Classification Total'].isna()
no_classif = uncertain_merged[lacks_classification].merge(igv_vars, on=cols_cpra)

In [None]:
# Unique variant keys of the unclassified rows that were marked as splice-changing.
is_splice_change = no_classif['splice_change'] == 'yes'
spl_no_classif = no_classif.loc[is_splice_change, cols_cpra].drop_duplicates(subset=cols_cpra)

In [None]:
# Add every known_vars row that belongs to these variants (left join on the variant key).
spl_no_classif_info = spl_no_classif.merge(known_vars, how='left', on=cols_cpra)


In [None]:
# Display the unclassified splice-changing variants together with their known_vars columns.
spl_no_classif_info

In [None]:
# Add the IGV review columns (one row per variant) to the unclassified splice-changing
# variants. The right-hand frame has to be passed inside the pd.merge(...) call; the
# earlier form closed the call too early and did not parse.
igv_review_per_variant = igv_vars[['#CHROM', 'POS', 'REF', 'ALT', 'consensus_sequence_IGV_region_left',
                                   'consensus_sequence_IGV_sequence_left', 'consensus_sequence_IGV_region_right',
                                   'consensus_sequence_IGV_sequence_right', 'INFO_IGV', 'too_low_coverage',
                                   'splice_change']].drop_duplicates(subset=cols_cpra)
spl_no_classif_nct = pd.merge(spl_no_classif_info, igv_review_per_variant,
                              on=cols_cpra, how='left')


In [None]:
# Display spl_no_classif_info.
# NOTE(review): the preceding cell assigns spl_no_classif_nct and leaves
# spl_no_classif_info unchanged - possibly spl_no_classif_nct was meant here; confirm.
spl_no_classif_info

In [None]:
# Number of rows in the unclassified splice-changing variant table.
# `no_classif_info` is not defined anywhere in this notebook (NameError on a fresh
# kernel); `spl_no_classif_info` is the closest existing frame.
# NOTE(review): confirm that this is the frame that was meant.
len(spl_no_classif_info)

In [None]:
# Number of distinct uncertain variants with a recorded splice change.
# The merge is computed once and reused for the filter mask.
uncertain_igv_rows = pd.merge(uncertain_merged, igv_vars, on=cols_cpra)
len(uncertain_igv_rows[uncertain_igv_rows['splice_change'] == 'yes'].groupby(cols_cpra))