In [29]:
import pandas as pd
import numpy as np
import os
import gzip
from io import StringIO
import pandas as pd
import re



# TODO: It needs to take in Strelka SNVs, Strelka SVs and Manta SVs, CNVKit CNAs. 

folder_path = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/wholebody_phylo/AER5_results/variant_calling'

# Take in a list of sample identifiers as a .txt within that variant folder, with an option to auto-generate based on what's in the first folder


def convert_chromosome(chrom):
    if chrom == 'chrX':
        return 23
    elif chrom == "chrY":
        return 24
    else:
        return int(chrom[3:])


def read_sample_list(path):
    """
    Returns a list of samples to compile across Sarek output that has different Strelka, Mutect, Manta and CNVKit output in separate folders.

    Parameters
    ----------
    path: a path to a .txt file with the appropriate common folder headers. 
    """
    pass

# Function to init the vcf file with the sample name. 

def init_vcf(output_folder, sample):

    """
    Initializes the vcf output in the appropriate folder,

    Parameters:
    -------------

    output_folder: folder that will have the per-sample.vcf files compiled from various callers, this will be the input for TUSV-ext.
    sample: sample name, which will become the filename of the vcf file.
    
    """
    output_file_path = os.path.join(output_folder, sample + '.vcf')
    
    file_vcf = open(output_file_path,'w')
    file_vcf.write('##fileformat=VCFv4.2\n\
##filedate=20211011\n\
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">\n\
##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">\n\
##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">\n\
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">\n\
##FORMAT=<ID=GT,Number=1,Type=Integer,Description="Genotype">\n\
##FORMAT=<ID=CN,Number=2,Type=Integer,Description="Copy number genotype for imprecise events">\n\
##FORMAT=<ID=CNADJ,Number=.,Type=Integer,Description="Copy number of adjacency">\n\
##FORMAT=<ID=BDP,Number=1,Type=Integer,Description="Depth of split reads">\n\
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">\n\
##ALT=<ID=DEL,Description="Deletion">\n\
##ALT=<ID=DUP,Description="Duplication">\n\
##ALT=<ID=INS,Description="Insertion of novel sequence">\n\
##ALT=<ID=CNV,Description="Copy number variable region">\n\
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	TUMOR	NORMAL\n')

# Function to take in CNA from CNVkit

def write_df_to_vcf(df, vcf_file):
    """
    Writes a pandas DataFrame to a VCF file.
    
    Args:
    - df (pd.DataFrame): DataFrame containing the variant data.
    - vcf_file (str): Path to the output VCF file.
    """

    # Write the VCF header to the file
    with open(vcf_file, 'w') as f:
        for line in vcf_header:
            f.write(line + '\n')
    
    # Write the DataFrame to the VCF file
    df.to_csv(vcf_file, sep='\t', mode='a', index=False, header=False)

def load_cnvkit_cns(file_path):
    """
    Reads a CNVKit .cns file and loads it into a pandas DataFrame.
    
    Args:
    - file_path (str): Path to the CNVKit .cns file.
    
    Returns:
    - pd.DataFrame: DataFrame containing the CNVKit .cns data.
    """
    # Read the .cns file into a DataFrame
    df = pd.read_csv(file_path, sep='\t')
    
    # Ensure that the columns we need are present
    required_columns = ['chromosome', 'start', 'end', 'log2', 'depth']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found in the .cns file.")
    
    # Return the DataFrame
    return df

def cnvkit_to_df(cnvkit_df):
    """
    Converts a CNVKit DataFrame into a VCF-like DataFrame.
    
    Args:
    - cnvkit_df (pd.DataFrame): DataFrame containing CNVKit data.
    
    Returns:
    - pd.DataFrame: DataFrame formatted for VCF output.
    """
    vcf_records = []
    for idx, row in cnvkit_df.iterrows():
        chrom = row['chromosome']
        pos = row['start']
        id = f'cnv{str(idx).zfill(2)}'
        ref = '.'
        alt = '<CNV>'
        qual = '.'
        filter = 'PASS'
        end = row['end']
        info = f'END={end};IMPRECISE'
        format_field = 'GT:CN'
        tumor_cn = f'1|1:{row["log2"]},{row["depth"]}'
        normal_cn = '0|0:1,1'
        record = {
            '#CHROM': chrom,
            'POS': pos,
            'ID': id,
            'REF': ref,
            'ALT': alt,
            'QUAL': qual,
            'FILTER': filter,
            'INFO': info,
            'FORMAT': format_field,
            'TUMOR': tumor_cn,
            'NORMAL': normal_cn
        }
        vcf_records.append(record)
    
    vcf_df = pd.DataFrame(vcf_records)
    
    return vcf_df


# Function to take in SNVs from Strelka

def read_strelka(input_file_path):
    """
    Function that reads in the file path to the Strelka SNV output, processes it and returns it as a df_SNV.

    Parameters:
    -----------
    input_file_path = input file for the strelka_snv
    
    """
    filtered_lines = []
    with gzip.open(input_file_path, 'rt') as file:
        for line in file:
            if not line.startswith('##'):
                filtered_lines.append(line)

    filtered_content = ''.join(filtered_lines)
    df_snv = pd.read_csv(StringIO(filtered_content), sep='\t')
    df_snv['chr'] = df_snv['#CHROM'].apply(convert_chromosome)
    df_snv['data'] = df_snv[list(df_snv.columns)[-2]]

    return df_snv

def convert_strelka_to_tusv_ext_df(strelka_df):
    # Initialize an empty list to store the rows of the new DataFrame
    tusv_ext_rows = []

    for index, row in strelka_df.iterrows():
        chrom = row['#CHROM']
        pos = row['POS']
        ref = row['REF']
        alt = row['ALT']
        normal_data = row['NORMAL']
        tumor_data = row['TUMOR']
        
        # Example ID and other static fields for simplicity
        variant_id = f"snv{index:04d}"
        qual = "."
        filter_val = "PASS"
        info = "."
        format_val = "GT:CNADJ"
        
        # Extract GT and CNADJ values
        normal_gt = "0|0"
        tumor_gt = "0|1"
        normal_cnadj = "0"
        tumor_cnadj = "0"
        
        normal = f"{normal_gt}:{normal_cnadj}"
        tumor = f"{tumor_gt}:{tumor_cnadj}"
        
        tusv_ext_rows.append([
            chrom, pos, variant_id, ref, alt, qual, filter_val, info, format_val, tumor, normal
        ])
    
    # Create a new DataFrame with the appropriate columns
    tusv_ext_df = pd.DataFrame(tusv_ext_rows, columns=[
        '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'TUMOR', 'NORMAL'
    ])
    
    return tusv_ext_df



# Function to take in large SVs from Manta

def read_manta_sv(input_file_path):


    filtered_lines = []
    with gzip.open(input_file_path, 'rt') as file:
        for line in file:
            if not line.startswith('##'):
                filtered_lines.append(line)

    filtered_content = ''.join(filtered_lines)
    df_manta_sv = pd.read_csv(StringIO(filtered_content), sep='\t')

    
    df_processed_manta = rewrite_manta_results(df_manta_sv)
    
    return df_processed_manta


def rewrite_manta_results(df):
    new_rows = []

    print(df.shape)
    df = df[df["FILTER"] == "PASS"]
    print(df.shape)
    
    # Identify TUMOR and NORMAL columns
    tumor_col = next(col for col in df.columns if col.split('-')[-1].startswith('T'))
    normal_col = next(col for col in df.columns if col.split('-')[-1].startswith('N'))
    
    id_counter = 0

    for _, row in df.iterrows():
        id_counter += 1
        sv_id = f"sv{id_counter:02d}"
        
        info_dict = dict(item.split('=') for item in row['INFO'].split(';') if '=' in item)
        mate_id = info_dict.get('MATEID', '').split(':')[0]
        sv_type = info_dict.get('SVTYPE', '')
        sv_len = info_dict.get('SVLEN', '')
        end = info_dict.get('END', '')
        
        alt = row['ALT']
        if sv_type == 'BND':
            match = re.search(r'\[(.*?)\[|\](.*?)\]', alt)
            if match:
                chr_pos = match.group(1) or match.group(2)
                chr, pos = chr_pos.split(':')
                alt = f"[{chr}:{pos}[" if '[' in row['ALT'] else f"]{chr}:{pos}]"
        
        # Extract PR and SR from FORMAT field
        format_fields = row['FORMAT'].split(':')
        tumor_values = row[tumor_col].split(':')
        normal_values = row[normal_col].split(':')
        
        format_dict = dict(zip(format_fields, tumor_values))
        pr_tumor = format_dict.get('PR', '0,0')
        sr_tumor = format_dict.get('SR', '0,0')
        
        format_dict = dict(zip(format_fields, normal_values))
        pr_normal = format_dict.get('PR', '0,0')
        sr_normal = format_dict.get('SR', '0,0')
        
        new_row = {
            '#CHROM': row['#CHROM'],
            'POS': row['POS'],
            'ID': sv_id,
            'REF': row['REF'],
            'ALT': alt,
            'QUAL': row['QUAL'],
            'FILTER': row['FILTER'],
            'INFO': f"MATEID={mate_id};SVTYPE={sv_type};SVLEN={sv_len};END={end}",
            'FORMAT': 'GT:PR:SR',
            'TUMOR': f"{tumor_values[0]}:{pr_tumor}:{sr_tumor}",
            'NORMAL': f"{normal_values[0]}:{pr_normal}:{sr_normal}"
        }
        
        new_rows.append(new_row)
    
    new_df = pd.DataFrame(new_rows, columns=['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'TUMOR', 'NORMAL'])
    return new_df

In [22]:
# Test box for reading in Strelka indel files to test

test_strelka_indel_path = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/wholebody_phylo/AER5_results/variant_calling/strelka/AUR-AER5-TTM4_vs_AUR-AER5-NT2/AUR-AER5-TTM4_vs_AUR-AER5-NT2.strelka.somatic_snvs.vcf.gz'

df_indel = read_strelka(test_strelka_snv_path)


print(df_indel.head())



  #CHROM    POS ID REF ALT QUAL  FILTER  \
0   chr1  10120  .   T   C    .  LowEVS   
1   chr1  10257  .   A   C    .  LowEVS   
2   chr1  10330  .   C   A    .  LowEVS   
3   chr1  14063  .   G   A    .  LowEVS   
4   chr1  14653  .   C   T    .  LowEVS   

                                                INFO  \
0  DP=1080;MQ=12.08;MQ0=874;NT=ref;QSS=8;QSS_NT=8...   
1  DP=190;MQ=16.38;MQ0=92;NT=ref;QSS=3;QSS_NT=3;R...   
2  DP=212;MQ=21.15;MQ0=89;NT=ref;QSS=1;QSS_NT=1;R...   
3  DP=37;MQ=18.01;MQ0=25;NT=ref;QSS=1;QSS_NT=1;Re...   
4  DP=45;MQ=29.11;MQ0=14;NT=ref;QSS=4;QSS_NT=4;Re...   

                         FORMAT                         NORMAL  \
0  DP:FDP:SDP:SUBDP:AU:CU:GU:TU  48:18:1:0:2,19:0,2:0,2:28,112   
1  DP:FDP:SDP:SUBDP:AU:CU:GU:TU      12:7:1:0:5,41:0,5:0,0:0,0   
2  DP:FDP:SDP:SUBDP:AU:CU:GU:TU     26:19:6:0:0,5:7,23:0,1:0,1   
3  DP:FDP:SDP:SUBDP:AU:CU:GU:TU       3:0:0:0:0,0:0,0:3,14:0,0   
4  DP:FDP:SDP:SUBDP:AU:CU:GU:TU     14:0:0:0:0,0:12,18:0,0:2,3   

       

In [19]:
# Test box for reading in Strelka snv files to test.


test_strelka_snv_path = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/wholebody_phylo/AER5_results/variant_calling/strelka/AUR-AER5-TTM4_vs_AUR-AER5-NT2/AUR-AER5-TTM4_vs_AUR-AER5-NT2.strelka.somatic_indels.vcf.gz'

df_snv = read_strelka_snv(test_strelka_snv_path)

tusv_ext_snv = convert_strelka_to_tusv_ext_df(df_snv)
print(tusv_ext_snv.head())


  #CHROM    POS       ID REF ALT QUAL FILTER INFO    FORMAT  TUMOR NORMAL
0   chr1  10120  snv0000   T   C    .   PASS    .  GT:CNADJ  0|1:0  0|0:0
1   chr1  10257  snv0001   A   C    .   PASS    .  GT:CNADJ  0|1:0  0|0:0
2   chr1  10330  snv0002   C   A    .   PASS    .  GT:CNADJ  0|1:0  0|0:0
3   chr1  14063  snv0003   G   A    .   PASS    .  GT:CNADJ  0|1:0  0|0:0
4   chr1  14653  snv0004   C   T    .   PASS    .  GT:CNADJ  0|1:0  0|0:0


In [27]:
# Test box for reading in CNVKit .cns files to test.

test_cnvkit_cns_file = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/wholebody_phylo/AER5_results/variant_calling/cnvkit/AUR-AER5-TTM4_vs_AUR-AER5-NT2/AUR-AER5-TTM4.cns'

cnvkit_df = load_cnvkit_cns(test_cnvkit_cns_file)
print(cnvkit_df.head())

vcf_df = cnvkit_to_df(cnvkit_df)
print(vcf_df.head())



  chromosome    start       end gene      log2     depth  probes     weight  \
0       chr1    14034    351968    -  0.098452  14.55930      58    52.1960   
1       chr1   351968    459979    - -0.413539   7.87716      27    19.1118   
2       chr1   459979   1264807    - -0.022918  13.51170     180   170.4730   
3       chr1  1264807   1333091    -  0.322362  17.72340      17    16.8359   
4       chr1  1333091  16558479    - -0.011359  13.51330    3752  3682.8400   

      ci_lo     ci_hi  
0  0.080594  0.119033  
1 -0.444376 -0.386697  
2 -0.031888 -0.010873  
3  0.294485  0.344509  
4 -0.013140 -0.009817  
  #CHROM      POS     ID REF    ALT QUAL FILTER                    INFO  \
0   chr1    14034  cnv00   .  <CNV>    .   PASS    END=351968;IMPRECISE   
1   chr1   351968  cnv01   .  <CNV>    .   PASS    END=459979;IMPRECISE   
2   chr1   459979  cnv02   .  <CNV>    .   PASS   END=1264807;IMPRECISE   
3   chr1  1264807  cnv03   .  <CNV>    .   PASS   END=1333091;IMPRECISE   
4   ch

In [28]:
# Test box for reading in Manta .vcf files to determine format.

test_manta_file = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/wholebody_phylo/AER5_results/variant_calling/manta/AUR-AER5-TTM4_vs_AUR-AER5-NT2/AUR-AER5-TTM4_vs_AUR-AER5-NT2.manta.somatic_sv.vcf.gz'
test_organoid_wes_file = '/bgfs/alee/LO_LAB/Personal/Alexander_Chang/alc376/nfcore/achang4_bridgesPSC/sarek/Part2_FASTQ2SAREK/results/variant_calling/manta/123_organoid_vs_123_blood/123_organoid_vs_123_blood.manta.somatic_sv.vcf.gz'
df_processed_manta = read_manta_sv(test_organoid_wes_file)

print(df_processed_manta.head())

StopIteration: 