In [1]:
import pandas as pd
import pysam as ps
import numpy as np

In [2]:
# Full annotated variant table (pickled DataFrame).
# NOTE(review): read_pickle executes arbitrary code on load - only use with trusted files.
all_vars = pd.read_pickle('00_dataframes/naga_vep_floss_hboc')

# Tab-separated variant list; the file is read without a header row, so the
# column names are supplied here.
vars_splai = pd.read_csv(
    '03_prioritized_xlsx/wo_splai_score.txt',
    names=['#CHROM', 'POS', 'ID', 'REF', 'ALT'],
    sep='\t',
)

In [30]:
# Column layouts used to build Excel tables with one variant per row.
# `columns_df_prior` is the per-(variant, patient) layout; its last entry is 'PID'.
columns_df_prior = ['#CHROM', 'POS', 'REF', 'ALT', 'MaxEntScan_alt', 'MaxEntScan_diff', 'MaxEntScan_ref',
                    'CADD_PHRED', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL', 'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL',
                    'SYMBOL', 'ada_score', 'rf_score', 'PosExonRefSeqAccession', 'PosExon_type',
                    'ClinVar_Pathogenicity', 'ClinVar_Traits', 'PID']
# Per-variant layout: 'PID' is replaced by the list of PIDs and their count.
columns_df_xlsx = columns_df_prior[:-1] + ['PIDs', 'ct_PIDs']
cols_cpra = ['#CHROM', 'POS', 'REF', 'ALT']


def xlsx_df(prio_df):
    """Collapse a per-(variant, PID) frame into one row per variant.

    Rows are grouped by '#CHROM', 'POS', 'REF' and 'ALT'. The annotation
    values are taken from the first row of each group; the group's 'PID'
    values are collected into 'PIDs' and counted in 'ct_PIDs'.

    Parameters
    ----------
    prio_df : pandas.DataFrame
        Must contain every column listed in `columns_df_prior`.

    Returns
    -------
    pandas.DataFrame
        One row per variant with the columns of `columns_df_xlsx`.
    """
    rows = []
    for _, group in prio_df.groupby(by=list(cols_cpra)):
        first = group.iloc[0]
        row = [first[col] for col in columns_df_prior[:-1]]
        pids = list(group['PID'])
        # Both 'PIDs' and 'ct_PIDs' must be filled: `columns_df_xlsx` names
        # both, and a row that is one value short makes the DataFrame
        # constructor raise a ValueError.
        row.extend([pids, len(pids)])
        rows.append(row)
    return pd.DataFrame(rows, columns=columns_df_xlsx)

def xlsx_vars(prio_vars_df, name):
    """Write one row per variant to '03_prioritized_xlsx/<name>.xlsx'.

    Rows are grouped by '#CHROM', 'POS', 'REF' and 'ALT'. Annotation values
    come from the first row of each group and the group's 'PID' values are
    collected into the 'PIDs' column. The table is sorted by the number of
    PIDs (descending), then by chromosome, position, REF and ALT; the helper
    count column 'ct_PIDs' is dropped before writing.

    Parameters
    ----------
    prio_vars_df : pandas.DataFrame
        Must contain every column listed in `columns_df_prior`.
    name : str
        File name (without extension) of the Excel file to write.

    Returns
    -------
    None
    """
    chrom_order = [str(n) for n in range(1, 23)] + ['X', 'Y']

    rows = []
    for _, group in prio_vars_df.groupby(by=['#CHROM', 'POS', 'REF', 'ALT']):
        first = group.iloc[0]
        row = [first[col] for col in columns_df_prior[:-1]]
        pids = list(group['PID'])
        row.extend([pids, len(pids)])
        rows.append(row)
    new_df = pd.DataFrame(rows, columns=columns_df_xlsx)

    # Values missing from the category list turn into NaN in a Categorical,
    # which would blank the chromosome in the exported sheet. Compare as
    # strings and append any unexpected contig after the canonical ones.
    chroms = new_df['#CHROM'].astype(str)
    extra_chroms = sorted(set(chroms) - set(chrom_order))
    new_df['#CHROM'] = pd.Categorical(chroms, categories=chrom_order + extra_chroms, ordered=True)

    new_df = (
        new_df
        .sort_values(by=['ct_PIDs', '#CHROM', 'POS', 'REF', 'ALT'],
                     ascending=[False, True, True, True, True],
                     ignore_index=True)
        .drop(columns='ct_PIDs')
    )
    new_df.to_excel('03_prioritized_xlsx/%s.xlsx' % name, index=False)

In [4]:
# Build a minimal VCF header: one contig line per chromosome present in
# `vars_splai` plus the reference genome path.
vcf_header = ps.VariantHeader()
for contig in vars_splai['#CHROM'].unique():
    # Cast to str so the header IDs match the `str(...)` used for rec.chrom below.
    vcf_header.add_meta('contig', items=[('ID', str(contig))])
vcf_header.add_meta('reference',
                    value='/mnt/g27prist/CMTD/Stephan/bcbio_installation/genomes/Hsapiens/GRCh37/seq/GRCh37.fa')

# Write one record per row of `vars_splai`; the context manager closes the
# file even if a record cannot be written.
with ps.VariantFile('03_prioritized_xlsx/for_splai.vcf', 'w', header=vcf_header) as vcf_out:
    for _, el in vars_splai.iterrows():
        rec = vcf_out.new_record()
        rec.chrom = str(el['#CHROM'])
        rec.start = int(el['POS']) - 1  # pysam coordinates are 0-based
        rec.stop = int(el['POS'])
        rec.ref = el['REF']
        # `alts` expects a sequence of alleles. Assigning a bare string makes
        # pysam iterate over its characters, so 'AT' is written as 'A,T'.
        # Splitting on ',' keeps a genuinely multi-allelic ALT intact.
        rec.alts = tuple(str(el['ALT']).split(','))
        vcf_out.write(rec)

In [5]:
# ALT sequences were written comma-separated --> copy the file, removing the
# commas from the variant records and leaving header lines ('#...') untouched.
# NOTE(review): the input is 'for_splai_new.txt', not the 'for_splai.vcf'
# produced by the cell above - presumably renamed by hand; confirm.
with open('03_prioritized_xlsx/for_splai_new.txt') as src, \
        open('03_prioritized_xlsx/for_splai_new.vcf', 'w') as dst:
    for line in src:
        if line.startswith('#'):
            dst.write(line)
        else:
            # Without this branch the records are never written and every
            # header line ends up in the output twice.
            dst.write(line.replace(',', ''))

In [2]:
# ran: OMP_NUM_THREADS=1 spliceai -I for_splai_new.vcf -O 2021-08-16_splai_rest.vcf -R /mnt/g27prist/CMTD/Stephan/bcbio_installation/genomes/Hsapiens/GRCh37/seq/GRCh37.fa -A grch37
# duration: ca. 15 minutes
splai_vcf_path = '03_prioritized_xlsx/2021-08-16_splai_rest.vcf'

# Count the leading '##' meta lines instead of hard-coding how many to skip,
# so the '#CHROM ...' line is used as the table header whatever the header length.
n_meta_lines = 0
with open(splai_vcf_path) as vcf_in:
    for vcf_line in vcf_in:
        if not vcf_line.startswith('##'):
            break
        n_meta_lines += 1
splAI_out = pd.read_csv(splai_vcf_path, sep='\t', skiprows=n_meta_lines)

# Split the '|'-separated INFO field once and spread it over named columns.
# reindex() guarantees all ten positions exist (filled with NaN) even if no
# record carries a complete annotation.
# NOTE(review): if INFO starts with a 'SpliceAI=' key, that prefix stays in
# the ALLELE column - confirm whether this is intended.
splai_fields = ['ALLELE', 'SYMBOL', 'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL']
info_parts = splAI_out['INFO'].str.split('|', expand=True).reindex(columns=range(len(splai_fields)))
for field_pos, field_name in enumerate(splai_fields):
    splAI_out[field_name] = info_parts[field_pos]
splAI_out = splAI_out.drop(columns='INFO')
splAI_out = splAI_out.replace({'.': np.nan, 'None': np.nan})

# make spliceAI prediction scores (DS_*) into floats; DP_* columns are left as read
for score_kind in ['AG', 'AL', 'DG', 'DL']:
    splAI_out['DS_' + score_kind] = splAI_out['DS_' + score_kind].astype(float)

In [5]:
#splAI_out[splAI_out['DS_AG'].notnull()].to_pickle('00_dataframes/new_splAI_scores')

In [7]:
# (all records, records without a DS_AG score, records with any delta score > 0.9)
(
    len(splAI_out),
    len(splAI_out[splAI_out['DS_AG'].isnull()]),
    len(splAI_out[splAI_out[['DS_AG', 'DS_AL', 'DS_DG', 'DS_DL']].gt(0.9).any(axis=1)]),
)

(260, 68, 10)

In [8]:
# Keep records where at least one of the four delta scores is >= 0.2.
splai_high = splAI_out[splAI_out[['DS_AG', 'DS_AL', 'DS_DG', 'DS_DL']].ge(0.2).any(axis=1)]

In [9]:
#for i in np.arange(0,1,0.1):
#    print(i, len(splAI_out[(splAI_out['DS_AG']>i)|
#                           (splAI_out['DS_AL']>i)|
#                           (splAI_out['DS_DG']>i)|
#                           (splAI_out['DS_DL']>i)]))

In [10]:
# Left-join the high-scoring SpliceAI records with the annotations (and PIDs)
# from `all_vars` on chromosome/position/ref/alt, then drop exact duplicate rows.
for_igv_splai_xlsx = (
    splai_high.loc[:, ['#CHROM', 'POS', 'REF', 'ALT', 'ALLELE', 'SYMBOL',
                       'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL',
                       'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL']]
    .merge(
        all_vars.loc[:, ['#CHROM', 'POS', 'REF', 'ALT',
                         'MaxEntScan_alt', 'MaxEntScan_diff', 'MaxEntScan_ref',
                         'CADD_PHRED', 'ada_score', 'rf_score',
                         'PosExonRefSeqAccession', 'PosExon_type',
                         'ClinVar_Pathogenicity', 'ClinVar_Traits', 'PID']],
        how='left',
        on=['#CHROM', 'POS', 'REF', 'ALT'],
    )
    .drop_duplicates()
)

In [11]:
# Add empty (NaN) placeholder columns to the table.
for igv_col in ('consensus_sequence_IGV_region_left',
                'consensus_sequence_IGV_sequence_left',
                'consensus_sequence_IGV_region_right',
                'consensus_sequence_IGV_sequence_right',
                'INFO_IGV', 'too_low_coverage', 'splice_change'):
    for_igv_splai_xlsx[igv_col] = np.nan

# Variant identifier of the form '<CHROM>_<POS>_<REF>_<ALT>'.
for_igv_splai_xlsx['shortcut'] = for_igv_splai_xlsx['#CHROM'].str.cat(
    [for_igv_splai_xlsx['POS'].astype(str),
     for_igv_splai_xlsx['REF'],
     for_igv_splai_xlsx['ALT']],
    sep='_',
)

In [12]:
# Group the table per variant and collect the group keys
# (one (#CHROM, POS, REF, ALT) tuple per variant) in iteration order.
splai_grpd = for_igv_splai_xlsx.groupby(by=['#CHROM', 'POS', 'REF', 'ALT'])
lst_splai_grpd = []
for variant_key, _variant_rows in splai_grpd:
    lst_splai_grpd.append(variant_key)

In [13]:
# One row per variant: keep the first row for each #CHROM/POS/REF/ALT combination.
igv_for_excel = for_igv_splai_xlsx.drop_duplicates(
    subset=['#CHROM', 'POS', 'REF', 'ALT'],
    keep='first',
)
#xlsx_vars(igv_for_excel, 'new_splai_some')

### Variants still without SpliceAI score

In [18]:
# Records for which no DS_DG score is available.
igv_wo_splai = splAI_out.loc[splAI_out['DS_DG'].isna()]

In [19]:
# Left-join the records without a SpliceAI score with the annotations (and
# PIDs) from `all_vars` on chromosome/position/ref/alt, then drop exact duplicate rows.
igv_wo_splai_xlsx = (
    igv_wo_splai.loc[:, ['#CHROM', 'POS', 'REF', 'ALT', 'ALLELE', 'SYMBOL',
                         'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL',
                         'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL']]
    .merge(
        all_vars.loc[:, ['#CHROM', 'POS', 'REF', 'ALT',
                         'MaxEntScan_alt', 'MaxEntScan_diff', 'MaxEntScan_ref',
                         'CADD_PHRED', 'ada_score', 'rf_score',
                         'PosExonRefSeqAccession', 'PosExon_type',
                         'ClinVar_Pathogenicity', 'ClinVar_Traits', 'PID']],
        how='left',
        on=['#CHROM', 'POS', 'REF', 'ALT'],
    )
    .drop_duplicates()
)

In [31]:
# Export the variants that still lack a SpliceAI score to 'no_splai_score.xlsx'.
xlsx_vars(prio_vars_df=igv_wo_splai_xlsx, name='no_splai_score')