### Converting SV VCF files produced by Manta and annotated by VEP into simple CSV files for input to SigProfiler

In [2]:
import pandas as pd 
import numpy as np 
import sys 
import re 
import os 

In [21]:
# Manta encodes the SV type as the first ':'-separated token of the ID column.
_SVCLASS_BY_MANTA_PREFIX = {
    'MantaDEL': 'deletion',
    'MantaINS': 'insertion',
    'MantaDUP': 'tandem-duplication',
    'MantaINV': 'inversion',
    'MantaBND': 'translocation',
}

# INFO keys are anchored to the start of the field or a preceding ';' so that
# e.g. 'END=' is not matched inside 'CIEND=-50,50'.
_SVLEN_RE = re.compile(r'(?:^|;)SVLEN=(-?\d+)')
_END_RE = re.compile(r'(?:^|;)END=(-?\d+)')
_MATEID_RE = re.compile(r'(?:^|;)MATEID=([^;]+)')


def process_vcf_line(line):
    """Parse one tab-separated VCF data line into a flat SV record.

    Args:
        line: A single VCF record line (not a '#' header line).

    Returns:
        A list ``[chrom, start, svclass, svlen, svend, inv_trn_id,
        mate_inv_trn_id]`` where:

        * ``chrom`` is column 1 with a leading ``chr`` removed,
        * ``start`` is column 2, kept as a string,
        * ``svclass`` is derived from the ID prefix (column 3), or
          ``'Unknown'`` when the prefix is not a recognised Manta type,
        * ``svlen`` is the first INFO ``SVLEN`` value (string) for deletions,
          insertions and tandem duplications, else ``None``,
        * ``svend`` is the INFO ``END`` value (string) for inversions, else
          ``None``,
        * ``inv_trn_id`` / ``mate_inv_trn_id`` are the record ID and the INFO
          ``MATEID`` for translocation (BND) records, else ``None``.

    Raises:
        IndexError: If the line has too few tab-separated columns for the
            fields that need to be read.
    """
    parts = line.strip().split('\t')

    # Strip only a leading 'chr'; contig names may contain it elsewhere.
    chrom = re.sub(r'^chr', '', parts[0])
    start = parts[1]

    record_id = parts[2]
    svclass = _SVCLASS_BY_MANTA_PREFIX.get(record_id.split(':')[0], 'Unknown')

    svlen = svend = inv_trn_id = mate_inv_trn_id = None

    if svclass in ('deletion', 'insertion', 'tandem-duplication'):
        svlen_match = _SVLEN_RE.search(parts[7])
        if svlen_match:
            svlen = svlen_match.group(1)
    elif svclass == 'inversion':
        svend_match = _END_RE.search(parts[7])
        if svend_match:
            svend = svend_match.group(1)
    elif svclass == 'translocation':
        # The record's own ID is what its mate refers to through MATEID.
        inv_trn_id = record_id
        mate_match = _MATEID_RE.search(parts[7])
        if mate_match:
            mate_inv_trn_id = mate_match.group(1)

    return [chrom, start, svclass, svlen, svend, inv_trn_id, mate_inv_trn_id]

In [24]:
def process_vcf_file(vcf_filename):
    """Read a VCF file and return its records as a DataFrame.

    Header lines (anything starting with '#') and blank lines are skipped;
    every remaining line is parsed with ``process_vcf_line``.

    Args:
        vcf_filename: Path to an uncompressed, text VCF file.

    Returns:
        A ``pandas.DataFrame`` with one row per record and the columns
        ``CHROM, START, SVCLASS, SVLEN, SVEND, INV-TRN_ID, MATE-INV-TRN_ID``.
    """
    columns = ['CHROM', 'START', 'SVCLASS', 'SVLEN', 'SVEND', 'INV-TRN_ID', 'MATE-INV-TRN_ID']

    records = []
    with open(vcf_filename, 'r') as file:
        for line in file:
            # '#' covers both the '##' meta lines and the '#CHROM' column
            # header; blank lines (e.g. a trailing newline) carry no record.
            if line.startswith('#') or not line.strip():
                continue
            records.append(process_vcf_line(line))

    return pd.DataFrame(records, columns=columns)

#     # # Post-processing for inversion-translocation
#     # for index, row in df.iterrows():
#     #     if row['SVCLASS'] == 'inversion-translocation':
#     #         mate_row = df[df['INV-TRN_ID'] == row['MATE-INV-TRN_ID']]
#     #         if not mate_row.empty:
#     #             mate_row = mate_row.iloc[0]
#     #             if row['CHROM'] == mate_row['CHROM']:
#     #                 df.at[index, 'SVCLASS'] = 'inversion'
#     #             else:
#     #                 df.at[index, 'SVCLASS'] = 'translocation'

#     # return df

In [25]:
# Parse the example Manta/VEP VCF into a DataFrame with process_vcf_file.
# NOTE: the path below is hard-coded to a local file; point it at your own VCF.
df = process_vcf_file('/Users/beverlyfu/Desktop/Thesis/Pediatric-Cancer/vcf_files/kidsfirst-nbl-sample.vcf')

In [26]:
def _int_or_zero(value):
    """Return ``int(value)``, or 0 when the value is missing (None, NaN or '')."""
    if value is None or value == '' or pd.isna(value):
        return 0
    return int(value)


def transform_and_export_df(df, vcf_filename):
    """Convert parsed SV records to a breakpoint-pair table and write it as CSV.

    The output has the columns ``chrom1, start1, end1, chrom2, start2, end2,
    svclass``. ``start1`` is the record's START and ``end1`` is ``start1 + 1``.
    The second breakpoint depends on the SV class:

    * deletion / insertion / tandem-duplication: same chromosome,
      ``start2 = start1 + |SVLEN|`` (a missing SVLEN counts as 0);
    * inversion: same chromosome, ``start2 = SVEND`` (missing counts as 0);
    * translocation: chromosome and START of the record whose ``INV-TRN_ID``
      equals this record's ``MATE-INV-TRN_ID``; left empty if no mate exists;
    * anything else: ``chrom2``, ``start2`` and ``end2`` are left empty.

    ``end2`` is always ``start2 + 1`` when ``start2`` is set.

    Args:
        df: DataFrame with the columns ``CHROM, START, SVCLASS, SVLEN, SVEND,
            INV-TRN_ID, MATE-INV-TRN_ID``. It is not modified, so the function
            can be called repeatedly on the same frame.
        vcf_filename: Path of the source VCF. The CSV is written next to it
            with the ``.vcf`` extension replaced by ``.csv``; if the path does
            not end in ``.vcf``, ``.csv`` is appended so the input file is
            never overwritten.

    Returns:
        None. The result is written to disk only.
    """
    length_classes = {'tandem-duplication', 'deletion', 'insertion'}

    chrom1 = df['CHROM'].tolist()
    start1 = df['START'].astype(int).tolist()
    svclasses = df['SVCLASS'].tolist()

    # Index breakend records by ID once instead of scanning the frame for every
    # translocation. The first occurrence of an ID wins, as in a top-down scan.
    mates = {}
    for sv_id, chrom, pos in zip(df['INV-TRN_ID'], chrom1, start1):
        if sv_id is None or pd.isna(sv_id):
            continue
        mates.setdefault(sv_id, (chrom, pos))

    chrom2, start2, end2 = [], [], []
    rows = zip(chrom1, start1, svclasses, df['SVLEN'], df['SVEND'], df['MATE-INV-TRN_ID'])
    for chrom, pos, svclass, svlen, svend, mate_id in rows:
        partner_chrom = partner_pos = None
        if svclass in length_classes:
            partner_chrom = chrom
            partner_pos = pos + abs(_int_or_zero(svlen))
        elif svclass == 'inversion':
            partner_chrom = chrom
            partner_pos = _int_or_zero(svend)
        elif svclass == 'translocation':
            partner_chrom, partner_pos = mates.get(mate_id, (None, None))

        chrom2.append(partner_chrom)
        start2.append(partner_pos)
        end2.append(None if partner_pos is None else partner_pos + 1)

    # dtype=object keeps integer positions as ints next to missing values, so
    # the CSV shows '150' rather than '150.0'.
    final_df = pd.DataFrame({
        'chrom1': chrom1,
        'start1': start1,
        'end1': [pos + 1 for pos in start1],
        'chrom2': pd.Series(chrom2, dtype=object),
        'start2': pd.Series(start2, dtype=object),
        'end2': pd.Series(end2, dtype=object),
        'svclass': svclasses,
    })

    # Swap only a trailing '.vcf'; str.replace would also rewrite '.vcf'
    # elsewhere in the path and return the input path when it is absent.
    root, ext = os.path.splitext(vcf_filename)
    csv_filename = root + '.csv' if ext.lower() == '.vcf' else vcf_filename + '.csv'
    final_df.to_csv(csv_filename, index=False)

In [27]:
# Example usage: convert the DataFrame returned by process_vcf_file above and
# write the result as CSV. The second argument is the source VCF path, from
# which the output CSV filename is derived.
transform_and_export_df(df, '/Users/beverlyfu/Desktop/Thesis/Pediatric-Cancer/vcf_files/kidsfirst-nbl-sample.vcf')