In [None]:
import pandas as pd
import genepy.mutations as mut
import numpy as np

from importlib import reload
%load_ext autoreload
%autoreload 2

In [None]:
# Re-import genepy.mutations and rebind `mut` to the module object returned by reload().
mut = reload(mut)

In [None]:
# Expand funcotation cols
def expand_funcotation(df, desc):
    """Split the packed ``FUNCOTATION`` column of ``df`` into one column per field.

    Args:
        df: DataFrame with a string column named ``FUNCOTATION`` whose values are
            ``|``-separated and wrapped in square brackets.
        desc: mapping whose ``'FUNCOTATION'`` entry is a description string.
            ``": "`` is turned into ``|``, double quotes are removed, the string
            is split on ``|`` and the first piece is discarded; the remaining
            pieces are used as the field names.

    Returns:
        ``df`` without ``FUNCOTATION``, concatenated column-wise with the expanded
        fields. Fields holding one single value across all rows are dropped and
        their names are printed.
    """
    funco_fields = desc['FUNCOTATION'].replace(": ", "|").replace("\"", "").split("|")[1:]

    # Strip the surrounding brackets as *literal* characters. "[" on its own is
    # not a valid regular expression, so it must not be passed with regex=True.
    # NOTE(review): a value holding several bracketed records ("[...],[...]")
    # splits into more pieces than there are field names, which makes the
    # DataFrame constructor below raise -- confirm how such rows should be handled.
    split_values = (
        df["FUNCOTATION"]
        .str.replace("[", "", regex=False)
        .str.replace("]", "", regex=False)
        .str.split("|")
        .tolist()
    )
    func_df = pd.DataFrame(data=split_values, columns=funco_fields, index=df.index)

    # Decode the escaped separators used inside individual field values.
    func_df = (
        func_df
        .replace("_%20_", " ", regex=True)
        .replace("_%3D_", "=", regex=True)
        .replace("_%7C_", "|", regex=True)
        .replace("_%2C_", ",", regex=True)
    )

    cols_to_drop = []
    for f in funco_fields:
        # drop columns that have the same value across all rows
        if len(func_df[f].unique()) == 1:
            print(f)
            cols_to_drop.append(f)
    return pd.concat([df.drop(columns="FUNCOTATION"), func_df.drop(columns=cols_to_drop)], axis=1)


# parse several columns and clean up the formatting
# TODO: more columns need to be cleaned up once we decide which cols to keep in the final MAF
def transform_oc_cols(df):
    """Decode and parse several ``OC_*`` columns of ``df`` in place.

    The following columns are overwritten:

    * ``OC_base__all_mappings``: ``%3A`` -> ``:``, ``%3B-`` -> ``;``, then each
      ``;``-separated item becomes ``{first ':' piece: remaining pieces joined by ','}``.
    * ``OC_chasmplus__all`` / ``OC_funseq2__all``: the bracketed, ``%2C``-separated
      records are flattened and each record becomes
      ``{first ',' piece: remaining pieces joined by ','}`` (double quotes are
      removed for ``OC_funseq2__all``).
    * ``OC_funseq2__hot``: ``%3B`` -> ``;``.
    * ``OC_genehancer__target_genes``: each ``key:value`` item becomes
      ``{key: float(value)}``.

    Rows whose value is not a list after splitting (missing values) get an
    empty dict.

    Args:
        df: DataFrame containing the five columns above; modified in place.

    Returns:
        None.
    """
    mappings = (
        df['OC_base__all_mappings']
        .str.replace("%3A", ":", regex=False)
        .str.replace("%3B-", ";", regex=False)
        .str.split(";")
    )
    df['OC_base__all_mappings'] = mappings.apply(lambda items: _oc_items_to_dict(items, ":"))

    chasm = _oc_flatten_records(df['OC_chasmplus__all']).str.split(';')
    df['OC_chasmplus__all'] = chasm.apply(lambda items: _oc_items_to_dict(items, ","))

    df['OC_funseq2__hot'] = df['OC_funseq2__hot'].str.replace("%3B", ";", regex=False)

    funseq = (
        _oc_flatten_records(df['OC_funseq2__all'])
        .str.replace("\"", "", regex=False)
        .str.split(';')
    )
    df['OC_funseq2__all'] = funseq.apply(lambda items: _oc_items_to_dict(items, ","))

    targets = (
        df['OC_genehancer__target_genes']
        .str.replace("%3A", ":", regex=False)
        .str.replace("%2C", ";", regex=False)
        .str.split(';')
    )
    df['OC_genehancer__target_genes'] = targets.apply(
        lambda items: _oc_items_to_dict(items, ":", convert=lambda rest: float(rest[0]))
    )


def _oc_flatten_records(col):
    """Turn ``[a%2Cb]%2C[c%2Cd]`` strings into ``a,b;c,d`` using literal replacements."""
    # regex=False: the brackets are literal characters, not regex syntax, and the
    # default of `regex` differs between pandas versions.
    return (
        col.str.replace("]%2C[", ";", regex=False)
        .str.replace("]", "", regex=False)
        .str.replace("[", "", regex=False)
        .str.replace("%2C", ",", regex=False)
    )


def _oc_items_to_dict(items, sep, convert=None):
    """Map each ``key<sep>rest...`` string of ``items`` to ``{key: value}``; non-lists give ``{}``."""
    # Missing values reach here as None or NaN depending on the pandas version;
    # neither is a list, so both yield an empty dict.
    if not isinstance(items, list):
        return {}
    parsed = {}
    for item in items:
        key, *rest = item.split(sep)
        parsed[key] = ",".join(rest) if convert is None else convert(rest)
    return parsed

    
#
def vcf_to_maf(vcf_fn, **kwargs):
    """Load a VCF via ``mut.vcf_to_df``, keep a single ``DP`` column and expand ``FUNCOTATION``.

    Args:
        vcf_fn: VCF file name, passed as the first argument to ``mut.vcf_to_df``.
        **kwargs: forwarded unchanged to ``mut.vcf_to_df`` (``additional_cols=["PON"]``
            is always set here).

    Returns:
        The DataFrame returned by ``expand_funcotation`` for the loaded VCF.
    """
    # read in vcf as a df
    vcf_df_split, desc = mut.vcf_to_df(vcf_fn, additional_cols=["PON"], **kwargs)

    # mutect2 generates 2 DP columns, one in INFO and the other in FORMAT
    # according to mutect2, INFO fields are for the variant as a whole (over all samples),
    # while FORMAT fields are for individual samples.
    # https://github.com/broadinstitute/gatk/issues/6067
    # here we are dropping the one in INFO
    # NOTE(review): this keeps the *second* DP column, i.e. it assumes the INFO
    # one comes first in the frame -- confirm against mut.vcf_to_df.
    dp = vcf_df_split['DP']
    if isinstance(dp, pd.DataFrame):
        # duplicated column label: selecting 'DP' yields a frame with both columns
        vcf_df_split['DP_keep'] = dp.iloc[:, 1]
        vcf_df_split = vcf_df_split.drop(columns=['DP']).rename(columns={'DP_keep': 'DP'})
    # a single DP column is left as it is

    # Expand the packed FUNCOTATION column; without this assignment the name
    # returned below is undefined and every call ends in a NameError.
    vcf_with_func = expand_funcotation(vcf_df_split, desc)

    # clean up formatting
    #transform_oc_cols(vcf_with_func)

    return vcf_with_func

In [None]:
vcf_df_split

In [None]:
vcf_df_split, desc = mut.vcf_to_df("../../merged.vcf.gz", additional_cols=["PON"], parse_filter=True)

In [None]:
i = 'chr1    14464   .       A       T       .       clustered_events;germline;map_qual;panel_of_normals     AS_SB_TABLE=1,0|3,0;ECNT=6;GERMQ=1;PON;ROQ=60;DP=4;AS_FilterStatus=map_qual;FUNCOTATION=[WASH7P|hg38|chr1|14464|14464|RNA||SNP|A|A|T|g.chr1:14464A>T|ENST00000488147.1|-|||c.e11-37T>A|||0.57356608478803|TTAAGAACACAGTGGCGCAGG||||||||||||||||||||||||||||||||||||||||||||||||||||||||HGNC:38034|WAS_%20_protein_%20_family_%20_homolog_%20_7_%20_pseudogene|Approved|pseudogene|pseudogene|||FAM39F|"family_%20_with_%20_sequence_%20_similarity_%20_39_%2C__%20_member_%20_F"|1p36.33|2010-03-23|||||653635||18159949|NR_024540|14|Wiskott-Aldrich_%20_Syndrome_%20_protein_%20_family|||653635||NR_024540||ENSG00000227232||||||||||||true|false|0.9042_%2C_0.09585|false|false|1|false|true|false|DDX11L1:100287102_%7C_WASH7P:653635|false|false|false|false|true|false|false|false|false|false|false|false|false|false|false|false|false|true|false|false|546169444|14464|false|false|0|true|0|false|0.83032301223241590_%2C_0.16967698776758409|false|false|false|SNV|true|0x050100040005150026000100|1|false|142|rs546169444|||MCF7|NA|Unknown];MBQ=30,30;MFRL=302,302;MMQ=31,24;MPOS=6;POPAF=0.69;TLOD=7.72   GT:AD:AF:DP:F1R2:F2R1:FAD:SB    0/1:1,3:0.666:4:0,0:1,3:1,3:1,0,3,0     ./.:.:.:.:.:.:.:.'

j = 'chr1    3650142 .       GC      AC,AA   .       germline;panel_of_normals       AS_SB_TABLE=0,0|0,21;ECNT=2;GERMQ=1;PON;ROQ=93;DP=25;AS_FilterStatus=SITE,SITE;FUNCOTATION=[WRAP73|hg38|chr1|3650142|3650142|FIVE_PRIME_FLANK||SNP|G|G|A|g.chr1:3650142G>A|ENST00000270708.12|-||||||0.5935162094763092|AGCGCCGCCGGCTTCCGCGCG|TP73_ENST00000346387.8_FIVE_PRIME_FLANK/TP73_ENST00000354437.8_FIVE_PRIME_FLANK/TP73_ENST00000357733.7_FIVE_PRIME_FLANK/TP73_ENST00000378295.9_FIVE_PRIME_FLANK/WRAP73_ENST00000378322.7_FIVE_PRIME_FLANK/TP73_ENST00000604074.5_FIVE_PRIME_FLANK||||||||||||||||||||||||||||||||||||||||||||90|biliary_tract(2)_%7C_breast(11)_%7C_central_nervous_system(44)_%7C_large_intestine(11)_%7C_pancreas(22)|||||||BC086311|NM_017818.3|NP_060288|HGNC:12759|WD_%20_repeat_%20_containing_%2C__%20_antisense_%20_to_%20_TP73|Approved|gene_%20_with_%20_protein_%20_product|protein-coding_%20_gene|WDR8|"WD_%20_repeat_%20_domain_%20_8"|||1p36.32|2016-10-05|2011-04-13|2011-04-13|AB034912_%2C__%20_EF494669||49856|ENSG00000116213|||362|WD_%20_repeat_%20_domain_%20_containing|CCDS48|OTTHUMG00000000612|49856|606040|NM_017818|Q9P2S5|ENSG00000116213|uc001ako.4|hg38|OREG1502683|Type_%3D_TRANSCRIPTION_%20_FACTOR_%20_BINDING_%20_SITE_%7C_Gene_Symbol_%3D_WRAP73_%7C_Gene_ID_%3D_ENSG00000116213_%7C_Gene_Source_%3D_ENSEMBL_%7C_Regulatory_Element_Symbol_%3D_EGR1_%7C_Regulatory_Element_ID_%3D_ENST00000239938_%7C_Regulatory_Element_Source_%3D_ENSEMBL_%7C_PMID_%3D_18971253_%7C_Dataset_%3D_PAZAR|WRP73_HUMAN||Q5T0D6_%7C_Q9BUH7_%7C_Q9NTK7_%7C_Q9NX56|Q9P2S5||centrosome_%20_(GO:0005813)_%7C_cytoplasm_%20_(GO:0005737)||true|false|0.2927_%2C_0.7073_%2C_.|false|false|1|false|true|true|WRAP73:49856|true|true|false|true|true|false|false|false|false|false|false|false|false|false|false|false|false|false|true|false|2251098|3650142|true|false|0|true|0|false|0.22000828236493374_%2C_0.74411474260958205_%2C_0.03587697502548419|false|false|false|SNV|true|0x05010002000517053e000100|1|false|100|rs2251098|||MCF7|NA|Un
known],[WRAP73|hg38|chr1|3650142|3650143|FIVE_PRIME_FLANK||DNP|GC|GC|AA|g.chr1:3650142_3650143GC>AA|ENST00000270708.12|-||||||0.5920398009950248|AGCGCCGCCGGCTTCCGCGCGG|TP73_ENST00000346387.8_FIVE_PRIME_FLANK/TP73_ENST00000354437.8_FIVE_PRIME_FLANK/TP73_ENST00000357733.7_FIVE_PRIME_FLANK/TP73_ENST00000378295.9_FIVE_PRIME_FLANK/WRAP73_ENST00000378322.7_FIVE_PRIME_FLANK/TP73_ENST00000604074.5_FIVE_PRIME_FLANK||||||||||||||||||||||||||||||||||||||||||||90|biliary_tract(2)_%7C_breast(11)_%7C_central_nervous_system(44)_%7C_large_intestine(11)_%7C_pancreas(22)|||||||BC086311|NM_017818.3|NP_060288|HGNC:12759|WD_%20_repeat_%20_containing_%2C__%20_antisense_%20_to_%20_TP73|Approved|gene_%20_with_%20_protein_%20_product|protein-coding_%20_gene|WDR8|"WD_%20_repeat_%20_domain_%20_8"|||1p36.32|2016-10-05|2011-04-13|2011-04-13|AB034912_%2C__%20_EF494669||49856|ENSG00000116213|||362|WD_%20_repeat_%20_domain_%20_containing|CCDS48|OTTHUMG00000000612|49856|606040|NM_017818|Q9P2S5|ENSG00000116213|uc001ako.4|hg38|OREG1502683|Type_%3D_TRANSCRIPTION_%20_FACTOR_%20_BINDING_%20_SITE_%7C_Gene_Symbol_%3D_WRAP73_%7C_Gene_ID_%3D_ENSG00000116213_%7C_Gene_Source_%3D_ENSEMBL_%7C_Regulatory_Element_Symbol_%3D_EGR1_%7C_Regulatory_Element_ID_%3D_ENST00000239938_%7C_Regulatory_Element_Source_%3D_ENSEMBL_%7C_PMID_%3D_18971253_%7C_Dataset_%3D_PAZAR|WRP73_HUMAN||Q5T0D6_%7C_Q9BUH7_%7C_Q9NTK7_%7C_Q9NX56|Q9P2S5||centrosome_%20_(GO:0005813)_%7C_cytoplasm_%20_(GO:0005737)||false|false||false|false||false|false|false||false|false|false|false|false|false|false|false|false|false|false|false|false|false|false|false|false|false|false|false|||false|false||false||false||false|false|false||false|||false|||||MHHCALL2_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE|SANGER|Unknown];MBQ=0,30,30;MFRL=0,279,241;MMQ=60,60,60;MPOS=15,19;POPAF=0.11,7.3;TLOD=73.67,4.2    GT:AD:AF:DP:F1R2:F2R1:FAD:SB    1|1:0,21,.:0.96,.:21:0,4,.:0,9,.:0,21,.:0,0,0,21        0/2:0,.,1:.,0.667:1:0,.,0:0,.,0:0,.,1:0,0,0,1'

# VCF header line: 9 fixed columns followed by one column per sample.
cols = "#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  MCF7    MHHCALL2_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE"
N_FIXED_VCF_COLS = 9
# Split on any run of whitespace so the pasted lines parse whether the
# separators survived as tabs or were expanded to spaces (no field in these
# records contains whitespace).
cols = cols[1:].split()
values = i.split()
# add annotation field in comment for allele_specific_expression:
# add columns named from input_name
# The per-sample columns are everything after the fixed VCF columns.
sample_values = values[N_FIXED_VCF_COLS:]
muts = np.array([sample.split(':') for sample in sample_values])
# Each row of muts.T holds one ':'-separated FORMAT entry across all samples.
# The loop variable is deliberately not called `mut`, which would rebind the
# imported genepy.mutations alias.
for field_values in muts.T:
    #if multi allelic site
    # do we have reads from multiple ?
        # yes, sum over things not coming from RNA
        # No, do we have AF > N ?
            # yes, keep it (and register that case in A, if not coming from RNA)
            # No register that case in B, if not coming from RNA and remove this multi allelic from the file

            # if more than X1% of multi allelic have characteristic A, raise an issue
            # if more than X2% of multi allelic have characteristic B, raise an issue
            # if more than X4% of multi allelic have characteristic B, send a warning
            # if more than X3% of sites are multi allelic, raise an issue
    #else:
        # sum/avg things not coming from RNA
        # if sum is below the filter, drop mutation entirely
    # if RNA sample:
        # if AF of RNA sample skewed compared to other samples: RNA_AF > 0.9
            #note allele_specific_expression in INFO field: ;allele_specific_expression=0 or 1 or 2 or 3...;
    pass  # TODO: implement the plan above; a loop body made only of comments is a syntax error
    