In [None]:
import pandas as pd
import pysam as ps
import numpy as np

In [None]:
# Inputs: the previously filtered variants that have no SpliceAI score
# (tab-separated, column names supplied here) and the pickled table of
# all variants.
vars_splai = pd.read_csv(
    'variants_wo_splai_score.txt',
    sep='\t',
    names=['#CHROM', 'POS', 'ID', 'REF', 'ALT'],
)
all_vars = pd.read_pickle('DataFrame_all_variants')

In [None]:
# Create a VCF file of the variants without a SpliceAI score.
# The header declares one contig per chromosome present in the table plus
# the name of the reference FASTA.
vcf_header = ps.VariantHeader()
for chrom in vars_splai['#CHROM'].unique():
    # str() keeps the contig IDs consistent with rec.chrom below, which is
    # stringified as well (pandas may have parsed chromosome names as ints).
    vcf_header.add_meta('contig', items=[('ID', str(chrom))])
vcf_header.add_meta('reference', value='reference_GRCh37.fa')

# The context manager closes the output file even if writing a record fails.
with ps.VariantFile('variants_wo_splai.vcf', 'w', header=vcf_header) as vcf_out:
    variant_columns = zip(
        vars_splai['#CHROM'],
        vars_splai['POS'],
        vars_splai['REF'],
        vars_splai['ALT'],
    )
    for chrom, pos, ref, alt in variant_columns:
        rec = vcf_out.new_record()
        rec.chrom = str(chrom)
        # pysam coordinates are 0-based, the POS column is 1-based.
        rec.start = int(pos) - 1
        rec.stop = int(pos)
        rec.ref = str(ref)
        # alts must be a sequence of alleles. Assigning a bare string makes
        # pysam iterate over its characters, so a multi-base ALT such as
        # 'AT' would be written as the two alleles 'A,T'.
        rec.alts = (str(alt),)
        vcf_out.write(rec)

In [None]:
# ALT sequences were comma-separated -> delete the commas.
# Header lines (starting with '#') are copied through once, unchanged; every
# other line is a variant record and is written with all commas removed.
# The input is opened first so that a missing input file does not truncate
# the output file.
# NOTE(review): this reads 'new_variants_wo_splai.txt' and overwrites
# 'variants_wo_splai.vcf' -- confirm these are the intended input/output files.
with open('new_variants_wo_splai.txt') as src, \
        open('variants_wo_splai.vcf', 'w') as dst:
    for line in src:
        if not line.startswith('#'):
            line = line.replace(',', '')
        dst.write(line)

In [None]:
# Parse the SpliceAI output VCF into a DataFrame and pickle it.
# Python SpliceAI run:
# ran: OMP_NUM_THREADS=1 spliceai -I new_variants_wo_splai.vcf -O new_splai_score.vcf -R reference_GRCh37.fa -A grch37
# duration: ca. 15 minutes

# Count the leading '##' meta lines instead of hard-coding their number:
# it depends on how many contig lines the VCF header contains. The line
# after them ('#CHROM ...') becomes the DataFrame header.
splai_vcf_path = 'new_splai_score.vcf'
n_meta_lines = 0
with open(splai_vcf_path) as splai_vcf:
    for line in splai_vcf:
        if not line.startswith('##'):
            break
        n_meta_lines += 1
splAI_out = pd.read_csv(splai_vcf_path, sep='\t', skiprows=n_meta_lines)

# convert SpliceAI output into DataFrame with columns
info_fields = ['ALLELE', 'SYMBOL', 'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL',
               'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL']
# Split INFO once (not once per field). reindex guarantees that all expected
# positions exist, so rows/files with fewer '|' fields yield NaN instead of
# raising a KeyError.
info_split = (
    splAI_out.INFO.str.split('|', expand=True)
    .reindex(columns=range(len(info_fields)))
)
for position, field in enumerate(info_fields):
    splAI_out[field] = info_split[position]
splAI_out = splAI_out.drop(columns='INFO')
# '.' and 'None' are treated as missing values.
splAI_out = splAI_out.replace({'.': np.nan, 'None': np.nan})

# make spliceAI prediction scores into floats
for suffix in ['AG', 'AL', 'DG', 'DL']:
    score_column = 'DS_' + suffix
    splAI_out[score_column] = splAI_out[score_column].astype(float)

splAI_out.to_pickle('DataFrame_new_SpliceAI')