In [31]:
import pandas as pd
import numpy as np
import os
import gzip
from io import StringIO
import pandas as pd
import re



# Define global variables
cnv_count = 0
sv_count = 0
snv_count = 0


def increment_cnv():
    global cnv_count
    cnv_count += 1

def increment_sv():
    global sv_count
    sv_count += 1

def increment_snv():
    global snv_count
    snv_count += 1


# Take in a list of sample identifiers as a .txt within that variant folder, with an option to auto-generate based on what's in the first folder


def convert_chromosome(chrom):
    if chrom == 'chrX':
        return 23
    elif chrom == "chrY":
        return 24
    else:
        return int(chrom[3:])


def read_sample_list(path):
    """
    Returns a list of samples to compile across Sarek output that has different Strelka, Mutect, Manta and CNVKit output in separate folders.

    Parameters
    ----------
    path: a path to a .txt file with the appropriate common folder headers. 
    """
    pass

# Function to init the vcf file with the sample name. 

def init_vcf(output_folder, sample):

    """
    Initializes the vcf output in the appropriate folder,

    Parameters:
    -------------

    output_folder: folder that will have the per-sample.vcf files compiled from various callers, this will be the input for TUSV-ext.
    sample: sample name, which will become the filename of the vcf file.
    
    """
    output_file_path = os.path.join(output_folder, sample + '.vcf')
    
    file_vcf = open(output_file_path,'w')
    file_vcf.write('##fileformat=VCFv4.2\n\
##filedate=20211011\n\
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">\n\
##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">\n\
##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">\n\
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">\n\
##FORMAT=<ID=GT,Number=1,Type=Integer,Description="Genotype">\n\
##FORMAT=<ID=CN,Number=2,Type=Integer,Description="Copy number genotype for imprecise events">\n\
##FORMAT=<ID=CNADJ,Number=.,Type=Integer,Description="Copy number of adjacency">\n\
##FORMAT=<ID=BDP,Number=1,Type=Integer,Description="Depth of split reads">\n\
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">\n\
##ALT=<ID=DEL,Description="Deletion">\n\
##ALT=<ID=DUP,Description="Duplication">\n\
##ALT=<ID=INS,Description="Insertion of novel sequence">\n\
##ALT=<ID=CNV,Description="Copy number variable region">\n\
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	TUMOR	NORMAL\n')

    return output_file_path

# Function to take in CNA from CNVkit

def write_df_to_vcf(df, vcf_file):
    """
    Writes a pandas DataFrame to a VCF file with header already in it.
    
    Args:
    - df (pd.DataFrame): DataFrame containing the variant data.
    - vcf_file (str): Path to the output VCF file.
    """
    
    # Write the DataFrame to the VCF file
    df.to_csv(vcf_file, sep='\t', mode='a', index=False, header=False)

def load_cnvkit_cns(file_path):
    """
    Reads a CNVKit .cns file and loads it into a pandas DataFrame.
    
    Args:
    - file_path (str): Path to the CNVKit .cns file.
    
    Returns:
    - pd.DataFrame: DataFrame containing the CNVKit .cns data.
    """
    # Read the .cns file into a DataFrame
    df = pd.read_csv(file_path, sep='\t')
    
    # Ensure that the columns we need are present
    required_columns = ['chromosome', 'start', 'end', 'log2', 'depth']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found in the .cns file.")
    
    # Return the DataFrame
    return df

def cnvkit_to_df(cnvkit_df):
    """
    Converts a CNVKit DataFrame into a VCF-like DataFrame.
    
    Args:
    - cnvkit_df (pd.DataFrame): DataFrame containing CNVKit data.
    
    Returns:
    - pd.DataFrame: DataFrame formatted for VCF output.
    """
    vcf_records = []

    
    
    for idx, row in cnvkit_df.iterrows():
        chrom = row['chromosome']
        pos = row['start']
        id = f'cnv{cnv_count}'
        increment_cnv()
        ref = '.'
        alt = '<CNV>'
        qual = '.'
        filter = 'PASS'
        end = row['end']
        info = f'END={end};IMPRECISE'
        format_field = 'GT:CN'
        tumor_cn = f'1|1:{row["log2"]},{row["depth"]}'
        normal_cn = '0|0:1,1'
        record = {
            '#CHROM': chrom,
            'POS': pos,
            'ID': id,
            'REF': ref,
            'ALT': alt,
            'QUAL': qual,
            'FILTER': filter,
            'INFO': info,
            'FORMAT': format_field,
            'TUMOR': tumor_cn,
            'NORMAL': normal_cn
        }
        vcf_records.append(record)
    
    vcf_df = pd.DataFrame(vcf_records)
    
    return vcf_df


# Function to take in SNVs from Strelka

def read_strelka(input_file_path):
    """
    Function that reads in the file path to the Strelka SNV output, processes it and returns it as a df_SNV.

    Parameters:
    -----------
    input_file_path = input file for the strelka_snv
    
    """
    filtered_lines = []
    with gzip.open(input_file_path, 'rt') as file:
        for line in file:
            if not line.startswith('##'):
                filtered_lines.append(line)

    filtered_content = ''.join(filtered_lines)
    df_snv = pd.read_csv(StringIO(filtered_content), sep='\t')
    df_snv['chr'] = df_snv['#CHROM'].apply(convert_chromosome)
    df_snv['data'] = df_snv[list(df_snv.columns)[-2]]

    return df_snv

def convert_strelka_to_tusv_ext_df(strelka_df, option):

    """
    Converts default read-in SV to a TUSV-ext compatible format.

    Parameters
    -----------

    strelka_df = Dataframe from read_strelka function
    option = either "SNV" or "INDEL" for ID labelling.
    """
    
    # Initialize an empty list to store the rows of the new DataFrame
    tusv_ext_rows = []

    print(strelka_df.shape)
    strelka_df = strelka_df[strelka_df["FILTER"] == "PASS"]
    print(strelka_df.shape)
    strelka_df.reset_index(inplace=True)

    for index, row in strelka_df.iterrows():
        chrom = row['#CHROM']
        pos = row['POS']
        ref = row['REF']
        alt = row['ALT']
        normal_data = row['NORMAL']
        tumor_data = row['TUMOR']
        
        # Example ID and other static fields for simplicity

        if option == "SNV":
            variant_id = f"snv{snv_count}"
            increment_snv()
        elif option == "INDEL":
            variant_id = f"sv{sv_count}"
            increment_sv()
        
        qual = "."
        filter_val = row["FILTER"]
        info = "."
        format_val = "GT:CNADJ"
        
        # Extract GT and CNADJ values
        normal_gt = "0|0"
        tumor_gt = "0|1"
        normal_cnadj = "0"
        tumor_cnadj = "0"
        
        normal = f"{normal_gt}:{normal_cnadj}"
        tumor = f"{tumor_gt}:{tumor_cnadj}"
        
        tusv_ext_rows.append([
            chrom, pos, variant_id, ref, alt, qual, filter_val, info, format_val, tumor, normal
        ])
    
    # Create a new DataFrame with the appropriate columns
    tusv_ext_df = pd.DataFrame(tusv_ext_rows, columns=[
        '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'TUMOR', 'NORMAL'
    ])
    
    return tusv_ext_df



# Function to take in large SVs from Manta

def read_manta_sv(input_file_path):


    filtered_lines = []
    with gzip.open(input_file_path, 'rt') as file:
        for line in file:
            if not line.startswith('##'):
                filtered_lines.append(line)

    filtered_content = ''.join(filtered_lines)
    df_manta_sv = pd.read_csv(StringIO(filtered_content), sep='\t')
    
    return df_manta_sv


def rewrite_manta_results(df):
    new_rows = []

    print(df.shape)
    df = df[df["FILTER"] == "PASS"]
    print(df.shape)
    df.reset_index(inplace=True)

    # Identify TUMOR and NORMAL columns
    tumor_col = next(col for col in df.columns if col.split('-')[-1].startswith('T'))
    normal_col = next(col for col in df.columns if col.split('-')[-1].startswith('N'))
    
    id_counter = 0

    for _, row in df.iterrows():
        id_counter += 1
        sv_id = f"sv{sv_count}"
        increment_sv()
        
        info_dict = dict(item.split('=') for item in row['INFO'].split(';') if '=' in item)
        mate_id = info_dict.get('MATEID', '').split(':')[0]
        sv_type = info_dict.get('SVTYPE', '')
        sv_len = info_dict.get('SVLEN', '')
        end = info_dict.get('END', '')
        
        alt = row['ALT']
        if sv_type == 'BND':
            match = re.search(r'\[(.*?)\[|\](.*?)\]', alt)
            if match:
                chr_pos = match.group(1) or match.group(2)
                chr, pos = chr_pos.split(':')
                alt = f"[{chr}:{pos}[" if '[' in row['ALT'] else f"]{chr}:{pos}]"
        
        # Extract PR and SR from FORMAT field
        format_fields = row['FORMAT'].split(':')
        tumor_values = row[tumor_col].split(':')
        normal_values = row[normal_col].split(':')
        
        format_dict = dict(zip(format_fields, tumor_values))
        pr_tumor = format_dict.get('PR', '0,0')
        sr_tumor = format_dict.get('SR', '0,0')
        
        format_dict = dict(zip(format_fields, normal_values))
        pr_normal = format_dict.get('PR', '0,0')
        sr_normal = format_dict.get('SR', '0,0')
        
        new_row = {
            '#CHROM': row['#CHROM'],
            'POS': row['POS'],
            'ID': sv_id,
            'REF': row['REF'],
            'ALT': alt,
            'QUAL': row['QUAL'],
            'FILTER': row['FILTER'],
            'INFO': f"MATEID={mate_id};SVTYPE={sv_type};SVLEN={sv_len};END={end}",
            'FORMAT': 'GT:PR:SR',
            'TUMOR': f"{tumor_values[0]}:{pr_tumor}:{sr_tumor}",
            'NORMAL': f"{normal_values[0]}:{pr_normal}:{sr_normal}"
        }
        
        new_rows.append(new_row)
    
    new_df = pd.DataFrame(new_rows, columns=['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'TUMOR', 'NORMAL'])
    return new_df


def main():

    
    # Define global variables
    cnv_count = 0
    sv_count = 0
    snv_count = 0
    variant_calling_folder = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/TUSV-ext/variant_calling'
    output_folder = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/TUSV-ext/compiled_input/'
    sample_name = 'AUR-AER5-TTP2_vs_AUR-AER5-NT2'
    strelka_snv_path = os.path.join(variant_calling_folder,'strelka',sample_name,sample_name + '.strelka.somatic_snvs.vcf.gz')
    strelka_indel_path = os.path.join(variant_calling_folder,'strelka',sample_name,sample_name + '.strelka.somatic_indels.vcf.gz')
    manta_sv_path = os.path.join(variant_calling_folder,'manta',sample_name,sample_name + '.manta.somatic_sv.vcf.gz')
    cnvkit_cns_path = os.path.join(variant_calling_folder,'cnvkit',sample_name,sample_name.split('_vs_')[0] + '.cns')



    output_file_path = init_vcf(output_folder, sample_name)
    
    df_snv = read_strelka(strelka_snv_path)
    tusv_ext_snv = convert_strelka_to_tusv_ext_df(df_snv, "SNV")
    write_df_to_vcf(tusv_ext_snv, output_file_path)
    
    # Commenting out for now as it seems base TUSV-ext can't handle Strelka indels.
    # df_indel = read_strelka(strelka_indel_path)
    # tusv_ext_indel = convert_strelka_to_tusv_ext_df(df_indel, "INDEL")
    # write_df_to_vcf(tusv_ext_indel, output_file_path)


    df_manta_sv = read_manta_sv(manta_sv_path)
    df_processed_manta = rewrite_manta_results(df_manta_sv)
    write_df_to_vcf(df_processed_manta, output_file_path)
    

    cnvkit_df = load_cnvkit_cns(cnvkit_cns_path)
    vcf_df = cnvkit_to_df(cnvkit_df)
    write_df_to_vcf(vcf_df, output_file_path)

    
    
   
    
if __name__ == "__main__":
    main()





(145818, 13)
(1737, 13)
(381, 11)
(78, 11)


In [10]:
# ChatGPT modified version 2.0

import pandas as pd
import numpy as np
import os
import gzip
from io import StringIO
import re

# Define global variables
cnv_count = 0
sv_count = 0
snv_count = 0

def increment_cnv():
    global cnv_count
    cnv_count += 1

def increment_sv():
    global sv_count
    sv_count += 1

def increment_snv():
    global snv_count
    snv_count += 1

def convert_chromosome(chrom):
    if chrom == 'chrX':
        return 23
    elif chrom == "chrY":
        return 24
    else:
        return int(chrom[3:])

def init_vcf(output_folder, sample):
    output_file_path = os.path.join(output_folder, sample + '.vcf')
    with open(output_file_path, 'w') as file_vcf:
        file_vcf.write('##fileformat=VCFv4.2\n')
        file_vcf.write('##filedate=20211011\n')
        file_vcf.write('##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">\n')
        file_vcf.write('##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">\n')
        file_vcf.write('##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">\n')
        file_vcf.write('##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">\n')
        file_vcf.write('##FORMAT=<ID=GT,Number=1,Type=Integer,Description="Genotype">\n')
        file_vcf.write('##FORMAT=<ID=CN,Number=2,Type=Integer,Description="Copy number genotype for imprecise events">\n')
        file_vcf.write('##FORMAT=<ID=CNADJ,Number=.,Type=Integer,Description="Copy number of adjacency">\n')
        file_vcf.write('##FORMAT=<ID=BDP,Number=1,Type=Integer,Description="Depth of split reads">\n')
        file_vcf.write('##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">\n')
        file_vcf.write('##ALT=<ID=DEL,Description="Deletion">\n')
        file_vcf.write('##ALT=<ID=DUP,Description="Duplication">\n')
        file_vcf.write('##ALT=<ID=INS,Description="Insertion of novel sequence">\n')
        file_vcf.write('##ALT=<ID=CNV,Description="Copy number variable region">\n')
        file_vcf.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tTUMOR\tNORMAL\n')
    return output_file_path

def write_df_to_vcf(df, vcf_file):
    df.to_csv(vcf_file, sep='\t', mode='a', index=False, header=False)

def load_cnvkit_cns(file_path):
    df = pd.read_csv(file_path, sep='\t')
    required_columns = ['chromosome', 'start', 'end', 'log2', 'depth']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found in the .cns file.")
    return df

def cnvkit_to_df(cnvkit_df):
    vcf_records = []
    for idx, row in cnvkit_df.iterrows():
        chrom = row['chromosome']
        pos = row['start']
        id = f'cnv{cnv_count:04d}'
        increment_cnv()
        ref = '.'
        alt = '<CNV>'
        qual = '.'
        filter = 'PASS'
        end = row['end']
        info = f'END={end};IMPRECISE'
        format_field = 'GT:CN'
        tumor_cn = f'1|1:{row["log2"]},{row["depth"]}'
        normal_cn = '0|0:1,1'
        record = {
            '#CHROM': chrom,
            'POS': pos,
            'ID': id,
            'REF': ref,
            'ALT': alt,
            'QUAL': qual,
            'FILTER': filter,
            'INFO': info,
            'FORMAT': format_field,
            'TUMOR': tumor_cn,
            'NORMAL': normal_cn
        }
        vcf_records.append(record)
    vcf_df = pd.DataFrame(vcf_records)
    return vcf_df

def read_strelka(input_file_path):
    filtered_lines = []
    with gzip.open(input_file_path, 'rt') as file:
        for line in file:
            if not line.startswith('##'):
                filtered_lines.append(line)
    filtered_content = ''.join(filtered_lines)
    df_snv = pd.read_csv(StringIO(filtered_content), sep='\t')
    df_snv['chr'] = df_snv['#CHROM'].apply(convert_chromosome)
    return df_snv

def convert_strelka_to_tusv_ext_df(strelka_df, option):
    tusv_ext_rows = []
    strelka_df = strelka_df[strelka_df["FILTER"] == "PASS"]
    strelka_df.reset_index(inplace=True)
    for index, row in strelka_df.iterrows():
        chrom = row['#CHROM']
        pos = row['POS']
        ref = row['REF']
        alt = row['ALT']
        if option == "SNV":
            variant_id = f"snv{snv_count:04d}"
            increment_snv()
            format_field = 'GT:CNADJ'
            tumor_data = f'0|1:{row["TUMOR"]}'
            normal_data = f'0|0:{row["NORMAL"]}'
        elif option == "INDEL":
            variant_id = f"sv{sv_count:04d}"
            increment_sv()
            format_field = 'GT:CNADJ:BDP:DP'
            tumor_data = f'1|0:{row["TUMOR"]}:0:0'
            normal_data = f'0|0:{row["NORMAL"]}:0:0'
        qual = "."
        filter_val = row["FILTER"]
        info = "."
        tusv_ext_rows.append([
            chrom, pos, variant_id, ref, alt, qual, filter_val, info, format_field, tumor_data, normal_data
        ])
    tusv_ext_df = pd.DataFrame(tusv_ext_rows, columns=[
        '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'TUMOR', 'NORMAL'
    ])
    return tusv_ext_df

def read_manta_sv(input_file_path):
    filtered_lines = []
    with gzip.open(input_file_path, 'rt') as file:
        for line in file:
            if not line.startswith('##'):
                filtered_lines.append(line)
    filtered_content = ''.join(filtered_lines)
    df_manta_sv = pd.read_csv(StringIO(filtered_content), sep='\t')
    return df_manta_sv


# This function needs a rewrite. ChatGPT can't handle it

def rewrite_manta_results(df):
    # Filter for BND types and paired breakpoints
    bnd_df = df[df['INFO'].str.contains("SVTYPE=BND")]

    # Split the INFO column into a dictionary for easy access
    bnd_df['INFO_DICT'] = bnd_df['INFO'].apply(lambda x: dict(item.split('=') for item in x.split(';') if '=' in item))

    # Filter out unpaired BNDs
    mate_ids = bnd_df['INFO_DICT'].apply(lambda x: x.get('MATEID', None))
    paired_bnd_df = bnd_df[bnd_df['ID'].isin(mate_ids.values)]

    # Prepare new VCF columns
    vcf_columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'TUMOR', 'NORMAL']
    vcf_df = pd.DataFrame(columns=vcf_columns)

    # Create a dictionary to store the paired rows
    paired_rows = {}
    for index, row in paired_bnd_df.iterrows():
        mate_id = row['INFO_DICT']['MATEID']
        paired_rows[row['ID']] = row
        if mate_id in paired_rows:
            mate_row = paired_rows[mate_id]

            # Create the ALT field
            alt = f"[{row['#CHROM']}:{mate_row['POS']}[" if row['ALT'].startswith('[') else f"]{row['#CHROM']}:{mate_row['POS']}]"

            # Create the INFO field
            info = f"MATEID={mate_id};SVTYPE=BND"

            # Create the FORMAT fields
            format_field = "GT:CNADJ:BDP:DP"
            tumor_data = "1|0:0.54:1:1"
            normal_data = "0|0:0:0:1"

            # Append the rows to the new DataFrame
            vcf_df = vcf_df.append({
                '#CHROM': row['#CHROM'], 'POS': row['POS'], 'ID': row['ID'], 'REF': row['REF'],
                'ALT': alt, 'QUAL': row['QUAL'], 'FILTER': row['FILTER'], 'INFO': info,
                'FORMAT': format_field, 'TUMOR': tumor_data, 'NORMAL': normal_data
            }, ignore_index=True)
            
            # Append the mate row
            alt_mate = f"[{mate_row['#CHROM']}:{row['POS']}[" if mate_row['ALT'].startswith('[') else f"]{mate_row['#CHROM']}:{row['POS']}]"
            info_mate = f"MATEID={row['ID']};SVTYPE=BND"
            vcf_df = vcf_df.append({
                '#CHROM': mate_row['#CHROM'], 'POS': mate_row['POS'], 'ID': mate_row['ID'], 'REF': mate_row['REF'],
                'ALT': alt_mate, 'QUAL': mate_row['QUAL'], 'FILTER': mate_row['FILTER'], 'INFO': info_mate,
                'FORMAT': format_field, 'TUMOR': tumor_data, 'NORMAL': normal_data
            }, ignore_index=True)

    return vcf_df
    
def main():
    global cnv_count, sv_count, snv_count
    cnv_count = 0
    sv_count = 0
    snv_count = 0
    variant_calling_folder = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/TUSV-ext/variant_calling'
    output_folder = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/TUSV-ext/compiled_input/'
    sample_name = 'AUR-AER5-TTM4_vs_AUR-AER5-NT2'
    strelka_snv_path = os.path.join(variant_calling_folder, 'strelka', sample_name, sample_name + '.strelka.somatic_snvs.vcf.gz')
    strelka_indel_path = os.path.join(variant_calling_folder, 'strelka', sample_name, sample_name + '.strelka.somatic_indels.vcf.gz')
    manta_sv_path = os.path.join(variant_calling_folder, 'manta', sample_name, sample_name + '.manta.somatic_sv.vcf.gz')
    cnvkit_cns_path = os.path.join(variant_calling_folder, 'cnvkit', sample_name, sample_name.split('_vs_')[0] + '.cns')

    output_file_path = init_vcf(output_folder, sample_name)
    df_snv = read_strelka(strelka_snv_path)
    tusv_ext_snv = convert_strelka_to_tusv_ext_df(df_snv, "SNV")
    write_df_to_vcf(tusv_ext_snv, output_file_path)

    df_manta_sv = read_manta_sv(manta_sv_path)
    df_processed_manta = rewrite_manta_results(df_manta_sv)
    write_df_to_vcf(df_processed_manta, output_file_path)

    cnvkit_df = load_cnvkit_cns(cnvkit_cns_path)
    vcf_df = cnvkit_to_df(cnvkit_df)
    write_df_to_vcf(vcf_df, output_file_path)

if __name__ == "__main__":
    main()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
import vcfpy

vcf_reader =  vcfpy.Reader.from_path('/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/TUSV-ext/compiled_input/AUR-AER5-TTM4_vs_AUR-AER5-NT2.vcf')

for record in vcf_reader:
    pass

/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/envs/tusv_ext/lib/python3.6/site-packages/vcfpy/parser.py:253: CannotConvertValue: 0|1 cannot be converted to Integer, keeping as string.
  CannotConvertValue,
/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/envs/tusv_ext/lib/python3.6/site-packages/vcfpy/parser.py:253: CannotConvertValue: 0|0 cannot be converted to Integer, keeping as string.
  CannotConvertValue,
/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/envs/tusv_ext/lib/python3.6/site-packages/vcfpy/header.py:415: FieldInfoNotFound: INFO SVLEN not found using Integer/1 instead
  FieldInfoNotFound,
/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/envs/tusv_ext/lib/python3.6/site-packages/vcfpy/parser.py:253: CannotConvertValue:  cannot be converted to Integer, keeping as string.
  CannotConvertValue,
/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/envs/tusv_ext/lib/python3.6/site-packages/vcfpy/parser.py:253: CannotConvertValue: 214,3 cannot be converted to Integer, keep

ValueError: invalid literal for int() with base 10: '214,3'

import vcf

vcf_reader = vcf.Reader(open('/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/TUSV-ext/compiled_input/AUR-AER5-TTM4_vs_AUR-AER5-NT2.vcf', 'r'))

for record in vcf_reader:
    print(record)

In [None]:
# need to add an option so SNVs are given SNV IDs but strelka indels are given SV ids

In [17]:
# Test box for reading in Strelka indel files to test

test_strelka_indel_path = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/wholebody_phylo/AER5_results/variant_calling/strelka/AUR-AER5-TTM4_vs_AUR-AER5-NT2/AUR-AER5-TTM4_vs_AUR-AER5-NT2.strelka.somatic_snvs.vcf.gz'

df_indel = read_strelka(test_strelka_indel_path)


print(df_indel.head())
tusv_ext_indel = convert_strelka_to_tusv_ext_df(df_indel, "INDEL")
print(tusv_ext_indel.head())


  #CHROM    POS ID REF ALT QUAL  FILTER  \
0   chr1  10120  .   T   C    .  LowEVS   
1   chr1  10257  .   A   C    .  LowEVS   
2   chr1  10330  .   C   A    .  LowEVS   
3   chr1  14063  .   G   A    .  LowEVS   
4   chr1  14653  .   C   T    .  LowEVS   

                                                INFO  \
0  DP=1080;MQ=12.08;MQ0=874;NT=ref;QSS=8;QSS_NT=8...   
1  DP=190;MQ=16.38;MQ0=92;NT=ref;QSS=3;QSS_NT=3;R...   
2  DP=212;MQ=21.15;MQ0=89;NT=ref;QSS=1;QSS_NT=1;R...   
3  DP=37;MQ=18.01;MQ0=25;NT=ref;QSS=1;QSS_NT=1;Re...   
4  DP=45;MQ=29.11;MQ0=14;NT=ref;QSS=4;QSS_NT=4;Re...   

                         FORMAT                         NORMAL  \
0  DP:FDP:SDP:SUBDP:AU:CU:GU:TU  48:18:1:0:2,19:0,2:0,2:28,112   
1  DP:FDP:SDP:SUBDP:AU:CU:GU:TU      12:7:1:0:5,41:0,5:0,0:0,0   
2  DP:FDP:SDP:SUBDP:AU:CU:GU:TU     26:19:6:0:0,5:7,23:0,1:0,1   
3  DP:FDP:SDP:SUBDP:AU:CU:GU:TU       3:0:0:0:0,0:0,0:3,14:0,0   
4  DP:FDP:SDP:SUBDP:AU:CU:GU:TU     14:0:0:0:0,0:12,18:0,0:2,3   

       

In [18]:
# Test box for reading in Strelka snv files to test.
import sys

test_strelka_snv_path = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/wholebody_phylo/AER5_results/variant_calling/strelka/AUR-AER5-TTM4_vs_AUR-AER5-NT2/AUR-AER5-TTM4_vs_AUR-AER5-NT2.strelka.somatic_indels.vcf.gz'

df_snv = read_strelka(test_strelka_snv_path)

tusv_ext_snv = convert_strelka_to_tusv_ext_df(df_snv, "SNV")
print(tusv_ext_snv.head())


(14497, 13)
(5411, 13)
  #CHROM      POS       ID REF               ALT QUAL FILTER INFO    FORMAT  \
0   chr1  1056668  snv0000   C               CGT    .   PASS    .  GT:CNADJ   
1   chr1  1423775  snv0001   C                CT    .   PASS    .  GT:CNADJ   
2   chr1  1509914  snv0002  TA                 T    .   PASS    .  GT:CNADJ   
3   chr1  2900016  snv0003   C                CG    .   PASS    .  GT:CNADJ   
4   chr1  3329041  snv0004   G  GCCTTCTCCCTGGGCC    .   PASS    .  GT:CNADJ   

   TUMOR NORMAL  
0  0|1:0  0|0:0  
1  0|1:0  0|0:0  
2  0|1:0  0|0:0  
3  0|1:0  0|0:0  
4  0|1:0  0|0:0  


In [27]:
# Test box for reading in CNVKit .cns files to test.

test_cnvkit_cns_file = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/wholebody_phylo/AER5_results/variant_calling/cnvkit/AUR-AER5-TTM4_vs_AUR-AER5-NT2/AUR-AER5-TTM4.cns'

cnvkit_df = load_cnvkit_cns(test_cnvkit_cns_file)
print(cnvkit_df.head())

vcf_df = cnvkit_to_df(cnvkit_df)
print(vcf_df.head())



  chromosome    start       end gene      log2     depth  probes     weight  \
0       chr1    14034    351968    -  0.098452  14.55930      58    52.1960   
1       chr1   351968    459979    - -0.413539   7.87716      27    19.1118   
2       chr1   459979   1264807    - -0.022918  13.51170     180   170.4730   
3       chr1  1264807   1333091    -  0.322362  17.72340      17    16.8359   
4       chr1  1333091  16558479    - -0.011359  13.51330    3752  3682.8400   

      ci_lo     ci_hi  
0  0.080594  0.119033  
1 -0.444376 -0.386697  
2 -0.031888 -0.010873  
3  0.294485  0.344509  
4 -0.013140 -0.009817  
  #CHROM      POS     ID REF    ALT QUAL FILTER                    INFO  \
0   chr1    14034  cnv00   .  <CNV>    .   PASS    END=351968;IMPRECISE   
1   chr1   351968  cnv01   .  <CNV>    .   PASS    END=459979;IMPRECISE   
2   chr1   459979  cnv02   .  <CNV>    .   PASS   END=1264807;IMPRECISE   
3   chr1  1264807  cnv03   .  <CNV>    .   PASS   END=1333091;IMPRECISE   
4   ch

In [11]:
# Test box for reading in Manta .vcf files to determine format.

test_manta_file = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/wholebody_phylo/AER5_results/variant_calling/manta/AUR-AER5-TTM4_vs_AUR-AER5-NT2/AUR-AER5-TTM4_vs_AUR-AER5-NT2.manta.somatic_sv.vcf.gz'
test_organoid_wes_file = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/nfcore/achang4_bridgesPSC/sarek/Part2_FASTQ2SAREK/results/variant_calling/manta/123_organoid_vs_123_blood/123_organoid_vs_123_blood.manta.somatic_sv.vcf.gz'
test_manta_wes_file = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/TUSV-ext/variant_calling/manta/AUR-AER5-TTM4_vs_AUR-AER5-NT2/AUR-AER5-TTM4_vs_AUR-AER5-NT2.manta.somatic_sv.vcf.gz'

df_processed_manta = read_manta_sv(test_manta_wes_file)

print(df_processed_manta.head(10))

  #CHROM       POS                               ID REF                 ALT  \
0   chr1   3722167       MantaBND:11184:1:3:0:0:0:1   A     A]chr1:3722443]   
1   chr1   3722434       MantaBND:11184:1:3:0:0:0:0   A     A]chr1:3722176]   
2   chr1  10413332       MantaBND:73506:1:1:6:0:0:1   T    T]chr1:10413436]   
3   chr1  10413428       MantaBND:73506:1:1:6:0:0:0   T    T]chr1:10413340]   
4   chr1  15349115       MantaBND:87210:0:0:1:0:0:1   A    A]chr1:15349202]   
5   chr1  15349194       MantaBND:87210:0:0:1:0:0:0   G    G]chr1:15349123]   
6   chr1  16996026   MantaBND:1:97014:97014:1:0:0:0   G    [chr1:16996118[G   
7   chr1  16996113   MantaBND:1:97014:97014:1:0:0:1   C    [chr1:16996031[C   
8   chr1  20792077   MantaBND:1:31559:31577:0:0:0:1   G  GT]chr18:37273578]   
9   chr1  20867797  MantaDUP:TANDEM:12529:1:7:0:1:0   A        <DUP:TANDEM>   

  QUAL           FILTER                                               INFO  \
0    .  MinSomaticScore  SVTYPE=BND;MATEID=MantaBND:

In [None]:
# Proceed with WES