In [None]:
import pandas as pd
import genepy.mutations as mut
import numpy as np

In [None]:
# Parse the column-fixed VCF with genepy, passing "PON" through additional_cols.
# vcf_to_df returns three values; the DataFrame is displayed as the cell output.
fixed_vcf_path = "../temp/CDS-hWv3gY_fixedcolumn.vcf.gz"
vcf_df, desc, header = mut.vcf_to_df(
    fixed_vcf_path,
    additional_cols=["PON"],
)
vcf_df

In [None]:
from itertools import islice
import gzip

# Scan the header of the gzipped VCF. Every header line is copied to
# ../temp/header.hr (the OC_provean__prediction INFO line has its nested
# double quotes removed first), and the Description of each FORMAT/INFO
# field is collected along the way. Scanning stops at the first data line.
fields = {}        # INFO field ID -> empty list
description = {}   # FORMAT/INFO field ID -> Description text as found in the header
c = 0              # count of "##" lines handled by the generic branch (the provean line is not counted)
headerrow = 0      # set to c + 1 when the "#CHROM" line is reached; stays 0 otherwise
with gzip.open("../temp/CDS-hWv3gY_fixedcolumn.vcf.gz") as vcf_in, \
        open("../temp/header.hr", "w", encoding="utf-8") as header_out:
    for raw_line in vcf_in:
        # gzip.open defaults to binary mode, so lines normally arrive as bytes
        line = raw_line if isinstance(raw_line, str) else raw_line.decode("utf-8")
        if line.startswith("##INFO=<ID=OC_provean__prediction"):
            # strip the inner quotes around D(amaging)/N(eutral) inside the description
            line = line.replace('"D(amaging)"', 'D(amaging)').replace('"N(eutral)"', 'N(eutral)')
            print(line)
            header_out.write(line)
            # NOTE(review): this line is neither counted in `c` nor recorded in
            # `fields`/`description` -- confirm that is intended.
        elif line.startswith('##'):
            meta_prefix = line[:20]
            is_info = 'INFO' in meta_prefix
            if is_info or 'FORMAT' in meta_prefix:
                field_id = line.split('ID=')[1].split(',')[0]
                # [:-2] removes the last two characters of the line (the closing ">" and the newline).
                # Stored under a name other than `desc` so the `desc` returned by
                # mut.vcf_to_df is not overwritten by this cell.
                field_description = line.split('Description=')[1][:-2]
                description[field_id] = field_description
                if is_info:
                    fields[field_id] = []
            header_out.write(line)
            c += 1
        elif line.startswith("#CHROM"):
            header_out.write(line)
            headerrow = c + 1
        else:
            # first non-header line: the header is over
            break

In [None]:
# Column labels for the headerless tab-separated read below (ten columns).
# NOTE(review): in a standard VCF the 7th and 8th columns are FILTER and INFO;
# the labels 'strand' and 'data' are kept exactly as they were -- confirm intent.
names = [
    'chr', 'pos', 'id', 'ref', 'alt', 'qual',
    'strand',
    'data', 'format', 'sample',
]

# `headerrow` comes from the header-scanning cell; three extra rows are skipped on top of it.
a = pd.read_csv(
    '../temp/CDS-hWv3gY_norm.vcf',
    sep='\t',
    header=None,
    names=names,
    skiprows=headerrow + 3,
    index_col=False,
)

In [None]:
# Display the frame read with pd.read_csv from the _norm.vcf file.
a

In [None]:
# Parse the _norm VCF with genepy, passing "PON" through additional_cols.
# This rebinds `desc` and `header`; the DataFrame is displayed as the cell output.
norm_vcf_path = "../temp/CDS-hWv3gY_norm.vcf"
vcf_df_split, desc, header = mut.vcf_to_df(
    norm_vcf_path,
    additional_cols=["PON"],
)
vcf_df_split

In [None]:
# Expand funcotation cols
def expand_funcotation(df, desc):
    """Split the packed ``FUNCOTATION`` column into one column per sub-field.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column ``FUNCOTATION`` whose values are
        ``|``-separated and may contain ``[`` / ``]`` characters.
    desc : mapping
        ``desc['FUNCOTATION']`` is the header description text. After turning
        every ``": "`` into ``|`` and removing double quotes, the tokens
        following the first ``|`` are used as the sub-field names.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``FUNCOTATION``, concatenated column-wise with the
        expanded sub-fields. Sub-fields holding one identical value in every
        row are left out, and their names are printed.
    """
    funco_fields = desc['FUNCOTATION'].replace(": ", "|").replace("\"", "").split("|")[1:]

    # Remove the brackets as literal characters. "[" on its own is not a valid
    # regular expression, so regex=True raises re.error on pandas >= 2.0.
    packed = (
        df["FUNCOTATION"]
        .str.replace("[", "", regex=False)
        .str.replace("]", "", regex=False)
    )
    func_df = pd.DataFrame(
        data=packed.str.split("|").tolist(),
        columns=funco_fields,
        index=df.index,
    )

    # Decode the escaped characters, in the same order as before.
    for escaped, plain in (("_%20_", " "), ("_%3D_", "="), ("_%7C_", "|"), ("_%2C_", ",")):
        func_df = func_df.replace(escaped, plain, regex=True)

    cols_to_drop = []
    for field in funco_fields:
        # drop columns that have the same value across all rows
        if func_df[field].nunique(dropna=False) == 1:
            print(field)
            cols_to_drop.append(field)
    return pd.concat([df.drop(columns="FUNCOTATION"), func_df.drop(columns=cols_to_drop)], axis=1)


In [None]:

# Expand the packed FUNCOTATION annotations into individual columns.
vcf_with_func = expand_funcotation(vcf_df_split, desc)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

# mutect2 generates 2 DP columns, one in INFO and the other in FORMAT.
# According to mutect2, INFO fields are for the variant as a whole (over all samples),
# while FORMAT fields are for individual samples.
# https://github.com/broadinstitute/gatk/issues/6067
# Only the second "DP" column is kept.
# NOTE(review): this assumes the INFO DP is the first of the two columns -- confirm.
dp_columns = vcf_with_func['DP']
if isinstance(dp_columns, pd.DataFrame):
    # A duplicated label selects a DataFrame: keep its second column, drop every
    # "DP" column, then re-attach the kept one (as the last column) under "DP".
    dp_to_keep = dp_columns.iloc[:, 1]
    vcf_with_func = vcf_with_func.drop(columns=['DP'])
    vcf_with_func['DP'] = dp_to_keep
# else: "DP" is already a single column, nothing to de-duplicate

vcf_with_func

In [None]:
def _oc_unescape(series, replacements):
    """Apply literal (non-regex) substring replacements to a string Series, in order."""
    for old, new in replacements:
        series = series.str.replace(old, new, regex=False)
    return series


def _oc_entries_to_dict(entries, sep, cast=None):
    """Build a dict from a list of ``key<sep>rest`` strings; non-lists (missing values) give ``{}``.

    Without ``cast`` the value is every token after the key joined with ",".
    With ``cast`` the value is ``cast`` applied to the first token after the key.
    """
    if not isinstance(entries, list):
        return {}
    mapping = {}
    for entry in entries:
        key, *rest = entry.split(sep)
        mapping[key] = cast(rest[0]) if cast is not None else ",".join(rest)
    return mapping


def transform_oc_cols(df):
    """Decode the percent-escaped ``OC_*`` columns of ``df`` in place.

    Columns rewritten:

    - ``OC_base__all_mappings``: ``%3A`` -> ``:``, ``%3B-`` -> ``;``, then each
      ``;``-separated entry becomes ``{first ':' token: remaining tokens joined by ','}``.
    - ``OC_chasmplus__all`` and ``OC_funseq2__all``: ``]%2C[`` -> ``;``, brackets
      removed, ``%2C`` -> ``,`` (and double quotes removed for funseq2), then each
      ``;``-separated entry becomes ``{first ',' token: remaining tokens joined by ','}``.
    - ``OC_funseq2__hot``: ``%3B`` -> ``;`` (stays a string).
    - ``OC_genehancer__target_genes``: ``%3A`` -> ``:``, ``%2C`` -> ``;``, then
      each ``;``-separated ``key:value`` entry becomes ``{key: float(value)}``.

    Missing values in the dict-valued columns become empty dicts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the five columns above as string columns. Modified in place.

    Returns
    -------
    None
    """
    # All replacements are literal: the bracket patterns must not depend on
    # whether Series.str.replace treats its pattern as a regular expression.
    bracket_list = [("]%2C[", ";"), ("]", ""), ("[", ""), ("%2C", ",")]

    mappings = _oc_unescape(df['OC_base__all_mappings'], [("%3A", ":"), ("%3B-", ";")]).str.split(";")
    df['OC_base__all_mappings'] = mappings.apply(lambda x: _oc_entries_to_dict(x, ":"))

    chasmplus = _oc_unescape(df['OC_chasmplus__all'], bracket_list).str.split(';')
    df['OC_chasmplus__all'] = chasmplus.apply(lambda x: _oc_entries_to_dict(x, ","))

    df['OC_funseq2__hot'] = _oc_unescape(df['OC_funseq2__hot'], [("%3B", ";")])

    funseq2 = _oc_unescape(df['OC_funseq2__all'], bracket_list + [("\"", "")]).str.split(';')
    df['OC_funseq2__all'] = funseq2.apply(lambda x: _oc_entries_to_dict(x, ","))

    genehancer = _oc_unescape(df['OC_genehancer__target_genes'], [("%3A", ":"), ("%2C", ";")]).str.split(';')
    df['OC_genehancer__target_genes'] = genehancer.apply(lambda x: _oc_entries_to_dict(x, ":", cast=float))

    

In [None]:
# transform_oc_cols assigns the decoded OC_* columns back onto vcf_with_func
# itself and has no return statement, so this cell displays nothing.
# NOTE(review): running this cell a second time operates on already-decoded
# columns -- rebuild vcf_with_func first if it needs to be re-run.
transform_oc_cols(vcf_with_func)

In [None]:
# Display vcf_with_func after the OC_* columns were decoded in place.
vcf_with_func

In [None]:
# Report (nothing is dropped here) every column that holds one identical value
# in all rows, together with that value. Columns whose first value is a dict
# are skipped: dicts are unhashable, so Series.unique() cannot process them.
if len(vcf_with_func) > 0:
    for col in vcf_with_func.columns:
        col_values = vcf_with_func[col]
        # positional access: does not depend on the index containing the label 0
        first_value = col_values.iloc[0]
        if not isinstance(first_value, dict) and len(col_values.unique()) == 1:
            print(col, ": ", first_value)