In [None]:
import pandas as pd
import genepy.mutations as mut
import numpy as np

In [None]:
from itertools import islice
import gzip

# NOTEKEEPING CELL
# The VCF header contains nested quotation marks that throw off bcftools; this cell writes
# a copy of the header with those quotes removed.
# The same fix has been incorporated into the upstream workflow, so this cell should not be
# needed again for this script.
fields = {}        # INFO field ID -> empty list
description = {}   # INFO / FORMAT field ID -> text following "Description=" (minus the last 2 chars)
c = 0              # number of "##" meta lines copied so far (the rewritten provean line is not counted)
headerrow = 0      # set to c + 1 once the "#CHROM" line is reached
with gzip.open("../temp/CDS-hWv3gY_fixedcolumn.vcf.gz") as vcf_in, open("../temp/header.hr", "w") as header_out:
    for line in vcf_in:
        if type(line) is not str:
            line = line.decode("utf-8")

        # the one header line with nested quotes: unquote, echo, and copy it
        if line.startswith("##INFO=<ID=OC_provean__prediction"):
            for quoted, bare in (('"D(amaging)"', 'D(amaging)'), ('"N(eutral)"', 'N(eutral)')):
                line = line.replace(quoted, bare)
            print(line)
            header_out.write(line)
            continue

        # column-header line: copy it and remember where it sits
        if line.startswith("#CHROM"):
            header_out.write(line)
            headerrow = c + 1
            continue

        # first line that is not part of the header: stop reading
        if not line.startswith('##'):
            break

        # ordinary "##" meta line: record FORMAT / INFO metadata, then copy it through
        prefix = line[:20]
        is_info = 'INFO' in prefix
        if is_info or 'FORMAT' in prefix:
            field_id = line.split('ID=')[1].split(',')[0]
            field_desc = line.split('Description=')[1][:-2]
            description[field_id] = field_desc
            if is_info:
                fields[field_id] = []
        header_out.write(line)
        c += 1

In [None]:
# Path to the normalized VCF (*_norm.vcf), in which multiallelic rows have been split.
# That file is not produced by this notebook; generate it first by running the following command:
#   bcftools norm -m-any CDS-hWv3gY_reheadered.vcf -o CDS-hWv3gY_norm.vcf
vcf_fn = "../temp/CDS-hWv3gY_norm.vcf"

In [None]:
# Expand funcotation cols
def expand_funcotation(df, desc):
    """Split the packed ``FUNCOTATION`` column into one column per funcotation field.

    Args:
        df: DataFrame with a string ``FUNCOTATION`` column whose values are
            ``|``-separated and wrapped in square brackets, e.g. ``[a|b|c]``.
        desc: mapping of field ID to header description. ``desc['FUNCOTATION']``
            supplies the new column names: everything after the first ``": "``
            (or ``|``), split on ``|``, with double quotes removed.

    Returns:
        A new DataFrame made of ``df`` without ``FUNCOTATION`` plus the expanded
        columns. Expanded columns that hold the same value in every row are left
        out, and their names are printed.
    """
    funco_fields = desc['FUNCOTATION'].replace(": ", "|").replace('"', "").split("|")[1:]

    # Strip the brackets as literal characters. "[" on its own is not a valid regular
    # expression, so regex=True raises on pandas versions that compile single-character
    # patterns; regex=False behaves the same on every version.
    split_values = (
        df["FUNCOTATION"]
        .str.replace("[", "", regex=False)
        .str.replace("]", "", regex=False)
        .str.split("|")
        .tolist()
    )
    func_df = pd.DataFrame(data=split_values, columns=funco_fields, index=df.index)

    # Decode the escaped characters inside the values. regex=True is required here so that
    # DataFrame.replace substitutes substrings instead of matching whole cells only.
    for token, char in (("_%20_", " "), ("_%3D_", "="), ("_%7C_", "|"), ("_%2C_", ",")):
        func_df = func_df.replace(token, char, regex=True)

    cols_to_drop = []
    for field in funco_fields:
        # drop columns that have the same value across all rows
        if len(func_df[field].unique()) == 1:
            print(field)
            cols_to_drop.append(field)
    return pd.concat([df.drop(columns="FUNCOTATION"), func_df.drop(columns=cols_to_drop)], axis=1)


# parse several columns and clean up the formatting
# TODO: more columns need to be cleaned up once we decide which cols to keep in the final MAF
def transform_oc_cols(df):
    """Decode and parse several ``OC_*`` annotation columns of ``df`` in place.

    Percent-encoded separators are decoded with literal (non-regex) replacements, and
    the list-like columns are converted to dicts. A cell that does not split into a
    list (for example a missing value) becomes an empty dict.

    Columns rewritten:
        OC_base__all_mappings: ``key:a:b;key2:c`` -> ``{key: "a,b", key2: "c"}``
        OC_chasmplus__all: ``[key,a,b],[key2,c]`` -> ``{key: "a,b", key2: "c"}``
        OC_funseq2__hot: ``%3B`` decoded to ``;`` (stays a string)
        OC_funseq2__all: same as OC_chasmplus__all, with double quotes removed
        OC_genehancer__target_genes: ``key:score,key2:score2`` -> ``{key: float(score), ...}``

    Args:
        df: DataFrame containing all five columns as string columns; modified in place.

    Returns:
        None.
    """
    def decode(col, replacements):
        # Apply the (old, new) pairs in order. regex=False is explicit so that "[" and "]"
        # are matched as plain characters regardless of the installed pandas default.
        series = df[col]
        for old, new in replacements:
            series = series.str.replace(old, new, regex=False)
        return series

    def to_dict(cell, sep, make_value):
        # Only a split result (a list) is parsed; None / NaN and anything else map to {}.
        if not isinstance(cell, list):
            return dict()
        return dict([(item.split(sep)[0], make_value(item.split(sep)[1:])) for item in cell])

    join_rest = ",".join
    bracket_list = [("]%2C[", ";"), ("]", ""), ("[", ""), ("%2C", ",")]

    mappings = decode('OC_base__all_mappings', [("%3A", ":"), ("%3B-", ";")]).str.split(";")
    df['OC_base__all_mappings'] = mappings.apply(lambda x: to_dict(x, ":", join_rest))

    chasmplus = decode('OC_chasmplus__all', bracket_list).str.split(';')
    df['OC_chasmplus__all'] = chasmplus.apply(lambda x: to_dict(x, ",", join_rest))

    df['OC_funseq2__hot'] = decode('OC_funseq2__hot', [("%3B", ";")])

    funseq2 = decode('OC_funseq2__all', bracket_list + [('"', "")]).str.split(';')
    df['OC_funseq2__all'] = funseq2.apply(lambda x: to_dict(x, ",", join_rest))

    genehancer = decode('OC_genehancer__target_genes', [("%3A", ":"), ("%2C", ";")]).str.split(';')
    # the first element after the key is the score
    df['OC_genehancer__target_genes'] = genehancer.apply(lambda x: to_dict(x, ":", lambda rest: float(rest[0])))

    
#
def vcf_to_maf(vcf_fn):
    """Load a VCF and return it as a dataframe with annotation columns expanded and cleaned.

    Steps: read the file with ``mut.vcf_to_df`` (requesting the extra ``PON`` column),
    expand ``FUNCOTATION`` into one column per field, collapse the duplicated ``DP``
    columns into a single one, then tidy the ``OC_*`` columns.

    Args:
        vcf_fn: path of the VCF file to read.

    Returns:
        The resulting DataFrame; the single remaining ``DP`` column is the last column.
    """
    # load the VCF as a dataframe together with the header descriptions
    parsed_vcf, header_desc = mut.vcf_to_df(vcf_fn, additional_cols=["PON"])

    # one column per funcotation field
    maf_df = expand_funcotation(parsed_vcf, header_desc)

    # mutect2 emits DP twice: once in INFO (the variant as a whole, over all samples)
    # and once in FORMAT (per individual sample).
    # https://github.com/broadinstitute/gatk/issues/6067
    # Only the second DP column is kept; the intent is to drop the INFO one
    # (NOTE(review): this relies on the column order returned by mut.vcf_to_df -- confirm).
    maf_df = (
        maf_df.assign(DP_keep=maf_df['DP'].iloc[:, 1])
        .drop(columns=['DP'])
        .rename(columns={'DP_keep': 'DP'})
    )

    # clean up formatting (modifies maf_df in place)
    transform_oc_cols(maf_df)

    return maf_df