# Strelka to Sc-TUSV-ext

In [None]:
import pandas as pd
import numpy as np
import os
import gzip
from io import StringIO

def convert_chromosome(chrom):
    """Map a chromosome label to an integer.

    ``'chrX'`` is mapped to 23.  For any other label the first three
    characters are dropped (presumably the ``'chr'`` prefix) and the
    remainder is parsed with ``int``, so ``'chr7'`` becomes ``7``.  A label
    whose remainder is not an integer (e.g. ``'chrY'``) raises ``ValueError``.
    """
    if chrom != 'chrX':
        suffix = chrom[3:]
        return int(suffix)
    return 23

def read_vcf_without_headers(input_file_path):
    """Load the records of a gzip-compressed VCF into a DataFrame.

    Lines starting with ``##`` are dropped; the remaining text (the
    ``#CHROM`` column-header line plus the records) is parsed as
    tab-separated values.  Rows whose ``#CHROM`` equals ``'chrY'`` are
    removed, and two columns are appended:

    * ``chr``  -- ``convert_chromosome`` applied to ``#CHROM``;
    * ``data`` -- a copy of the last column of the parsed file
      (presumably a per-sample column -- confirm against the input VCF).

    The frame is shown with ``display`` (the IPython/Jupyter built-in)
    before it is returned, so this function is meant to run in a notebook.

    Args:
        input_file_path: Path to the gzip-compressed VCF file.

    Returns:
        The filtered DataFrame with the extra ``chr`` and ``data`` columns.
    """
    with gzip.open(input_file_path, 'rt') as file:
        # Keep the '#CHROM' line: it supplies the column names for read_csv.
        filtered_content = ''.join(
            line for line in file if not line.startswith('##')
        )

    df_snv = pd.read_csv(StringIO(filtered_content), sep='\t')
    # Explicit copy: the columns added below must be written to an
    # independent frame, not to a slice of the parsed one (this is the
    # pattern pandas reports with SettingWithCopyWarning).
    df_snv = df_snv[df_snv['#CHROM'] != 'chrY'].copy()
    df_snv['chr'] = df_snv['#CHROM'].apply(convert_chromosome)
    # 'chr' has just been appended, so index -2 is the last column that
    # came from the file itself.
    df_snv['data'] = df_snv[df_snv.columns[-2]]
    display(df_snv)
    return df_snv

def strelka_to_sctusv(df_snv, output_file_path):
    """Write the single-nucleotide variants of *df_snv* to a VCF file.

    A fixed VCF 4.2 header is written first.  Then every row whose ``REF``
    and ``ALT`` are both exactly one character long is emitted as one
    record; all other rows are skipped.  Each record has the form::

        chr  POS  snv<N>  REF  ALT  .  PASS  .  GT:CNADJ  <tumor>  0|0:0

    where ``<N>`` counts the emitted records from 0 and ``<tumor>`` is the
    first four characters of the row's ``data`` value followed by ``1``
    (presumably a genotype such as ``0/1:`` plus an adjacency copy number
    of 1 -- confirm against the input VCF).  ``"done"`` is printed at the
    end.

    Args:
        df_snv: DataFrame with the columns ``chr``, ``POS``, ``REF``,
            ``ALT`` and ``data``.
        output_file_path: Path of the VCF file to create or overwrite.
    """
    header_lines = (
        '##fileformat=VCFv4.2',
        '##filedate=20211011',
        '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">',
        '##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">',
        '##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">',
        '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
        '##FORMAT=<ID=GT,Number=1,Type=Integer,Description="Genotype">',
        '##FORMAT=<ID=CN,Number=2,Type=Integer,Description="Copy number genotype for imprecise events">',
        '##FORMAT=<ID=CNADJ,Number=.,Type=Integer,Description="Copy number of adjacency">',
        '##FORMAT=<ID=BDP,Number=1,Type=Integer,Description="Depth of split reads">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">',
        '##ALT=<ID=DEL,Description="Deletion">',
        '##ALT=<ID=DUP,Description="Duplication">',
        '##ALT=<ID=INS,Description="Insertion of novel sequence">',
        '##ALT=<ID=CNV,Description="Copy number variable region">',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tTUMOR\tNORMAL',
    )

    # Iterate only over the columns that are used; this avoids building a
    # Series per row as iterrows() does.
    rows = zip(
        df_snv['chr'], df_snv['POS'], df_snv['REF'], df_snv['ALT'],
        df_snv['data'],
    )

    # The context manager closes the file even if a row raises midway.
    with open(output_file_path, 'w') as file_vcf:
        file_vcf.write('\n'.join(header_lines) + '\n')

        # Writing the CNVs (disabled; kept for reference)
        #count=0
        #for index, row in df_dip_cnvs.iterrows():
        #    line = str(row['chr'])+'\t'+str(row['start'])+'\t'+'cnv'+str(count)+'\t'+'.'+'\t'+'<CNV>'+'\t'+'.\tPASS'+'\t'\
        #                    +'END='+str(row['end'])+';IMPRECISE'+'\t'+'GT:CN\t'+str('0|1')+':'+str(row['cn_a'])+','+str(row['cn_b'])+'\t'+'0|0:1,1\n'
        #    file_vcf.write(line)
        #    count+=1

        # Writing the SNVs
        count = 0
        for chrom, pos, ref, alt, data in rows:
            # Only single-base substitutions are kept; indels and
            # multi-character alleles are skipped.
            if len(str(ref)) != 1 or len(str(alt)) != 1:
                continue
            fields = (
                str(chrom),
                str(pos),
                'snv' + str(count),
                str(ref),
                str(alt),
                '.',
                'PASS',
                '.',
                'GT:CNADJ',
                str(data)[:4] + '1',   # TUMOR sample field
                '0|0:0',               # NORMAL sample field
            )
            file_vcf.write('\t'.join(fields) + '\n')
            count += 1
    print("done")
    
# Driver: read the Strelka VCF from the working directory and write the
# converted SNV records next to it.
input_file_path, output_file_path = "strelka.vcf.gz", "sample.vcf"
df_snv = read_vcf_without_headers(input_file_path)
strelka_to_sctusv(df_snv, output_file_path)

