# Genetables

In [1]:
import pandas as pd
import os
import numpy as np
import warnings
from tqdm.notebook import tqdm

In [2]:
def get_cgc_synonyms(cgc_df):
    """
    Return the list of all CGC gene names: each gene symbol followed 
    by its synonyms.

    The "Synonyms" value of each row is split on commas; rows whose 
    "Synonyms" value is missing only contribute their "Gene Symbol".
    The input dataframe is not modified.
    """

    cgc_genes = []
    for symbol, synonyms in zip(cgc_df["Gene Symbol"], cgc_df["Synonyms"]):
        # Same "symbol,synonym1,synonym2" string as before, but built locally
        # instead of being written back into the caller's dataframe
        names = symbol if pd.isna(synonyms) else f'{symbol},{synonyms}'
        cgc_genes.extend(names.split(','))

    return cgc_genes


def load_3dclust(path_c_3dclust):
    """
    Load gene results from the 3d clustering method.

    The tab-separated file is not re-sorted: the position of each row 
    in the file is returned as "o3d_rank".
    """

    kept_cols = ["Gene", "Uniprot_ID", "Status", "pval", "qval"]
    new_names = {"index" : "o3d_rank", "Status" : "o3d_status",
                 "pval" : "o3d_pval", "qval" : "o3d_qval"}

    genes_df = pd.read_csv(path_c_3dclust, sep="\t")
    # reset_index() exposes the row position as the "index" column
    genes_df = genes_df[kept_cols].reset_index()

    return genes_df.rename(columns=new_names)


def load_cbase(path_c_cbase):
    """
    Load gene results from the cbase method.

    Genes are ranked by ascending "p_pos" value.
    """

    new_names = {"index": "cbase_rank", "gene" : "Gene",
                 "p_pos" : "cbase_pval", "q_pos" : "cbase_qval"}

    ranked_df = pd.read_csv(path_c_cbase, sep="\t").sort_values("p_pos")
    ranked_df = ranked_df[["gene", "p_pos", "q_pos"]]
    # Drop the file order, then expose the sorted position as a column
    ranked_df = ranked_df.reset_index(drop=True)
    ranked_df = ranked_df.reset_index()

    return ranked_df.rename(columns=new_names)


def load_dndscv(path_c_dndscv):
    """
    Load gene results from the dndscv method.

    Genes are ranked by ascending "pallsubs_cv" value.
    """

    new_names = {"index" : "dndscv_rank", "gene_name" : "Gene",
                 "pallsubs_cv" : "dndscv_pval", "qallsubs_cv" : "dndscv_qval"}

    ranked_df = pd.read_csv(path_c_dndscv, sep = "\t")
    ranked_df = ranked_df[["gene_name", "pallsubs_cv", "qallsubs_cv"]]
    ranked_df = ranked_df.sort_values("pallsubs_cv")
    # Drop the file order, then expose the sorted position as a column
    ranked_df = ranked_df.reset_index(drop=True).reset_index()

    return ranked_df.rename(columns=new_names)


def load_hotmaps(path_c_hotmaps):
    """
    Load gene results from the hotmaps method.

    Genes are ranked by ascending "Min p-value".
    """

    new_names = {"index" : "hotmaps_rank", "GENE" : "Gene",
                 "Min p-value" : "hotmaps_pval",
                 "q-value" : "hotmaps_qval"}

    ranked_df = pd.read_csv(path_c_hotmaps, sep = "\t").sort_values("Min p-value")
    ranked_df = ranked_df[["GENE", "Min p-value", "q-value"]]
    # Drop the file order, then expose the sorted position as a column
    ranked_df = ranked_df.reset_index(drop=True).reset_index()

    return ranked_df.rename(columns=new_names)


def load_mutpanning(path_c_mutpanning):
    """
    Load gene results from the mutpanning method. 
    The order of the rows in the file is used as the ranking (no sorting 
    by p-value is done here). Rows without a "Significance" value are 
    discarded, and when a gene appears multiple times only its first 
    remaining row is kept.
    """

    new_names = {"index" : "mutpan_rank", "Name" : "Gene",
                 "Significance" : "mutpan_pval",
                 "FDR" : "mutpan_qval"}

    raw_df = pd.read_csv(path_c_mutpanning, sep = "\t")
    has_pval = raw_df["Significance"].notna()
    # After reset_index(), the "index" column holds the row label from the file
    ranked_df = raw_df.loc[has_pval, ["Name", "Significance", "FDR"]].reset_index()

    # For each gene, keep the row that comes first in the file
    first_rows = ranked_df.groupby('Name')['index'].idxmin()
    ranked_df = ranked_df.loc[first_rows].sort_values("index")
    ranked_df = ranked_df.drop(columns=["index"])
    ranked_df = ranked_df.reset_index(drop=True).reset_index()

    return ranked_df.rename(columns=new_names)


def load_smreg(path_c_smreg):
    """
    Load gene results from smregions method. If the same gene
    appears multiple times, the row with minimum p-value is selected.

    Rows without a p-value are discarded. Genes are ranked by ascending 
    q-value, ties being broken by the p-value; genes without a q-value 
    are ranked last.
    """
    
    cohort_smreg_df = pd.read_csv(path_c_smreg, sep = "\t")
    cohort_smreg_df = cohort_smreg_df[cohort_smreg_df["P_VALUE"].notna()]
    cohort_smreg_df = cohort_smreg_df[["HUGO_SYMBOL", "P_VALUE", "Q_VALUE"]]
    
    # Keep the lowest p-val if there is a gene with multiple p-value.
    # The selection is done on P_VALUE (never NaN after the filter above):
    # Q_VALUE can be missing for every row of a gene, in which case idxmin
    # has no valid row label to return and the following .loc fails.
    idx = cohort_smreg_df.groupby('HUGO_SYMBOL')['P_VALUE'].idxmin()
    cohort_smreg_df = cohort_smreg_df.loc[idx].sort_values(["Q_VALUE", "P_VALUE"])
    cohort_smreg_df = cohort_smreg_df.reset_index(drop=True).reset_index()
    
    return cohort_smreg_df.rename(columns = {"index" : "smreg_rank",
                                             "HUGO_SYMBOL" : "Gene", 
                                             "P_VALUE" : "smreg_pval",
                                             "Q_VALUE" : "smreg_qval"})


def load_clustl(path_c_clustl):
    """ 
    Load gene results from OncodriveCLUSTL method.

    Genes are ranked by ascending "P_ANALYTICAL" value.
    """

    new_names = {"index" : "clustl_rank", "SYMBOL" : "Gene",
                 "P_ANALYTICAL" : "clustl_pval",
                 "Q_ANALYTICAL" : "clustl_qval"}

    ranked_df = pd.read_csv(path_c_clustl, sep = "\t")
    ranked_df = ranked_df[["SYMBOL", "P_ANALYTICAL", "Q_ANALYTICAL"]]
    ranked_df = ranked_df.sort_values("P_ANALYTICAL")
    # Drop the file order, then expose the sorted position as a column
    ranked_df = ranked_df.reset_index(drop=True).reset_index()

    return ranked_df.rename(columns=new_names)


def load_fml(path_c_fml):
    """
    Load gene results from OncodriveFML method.

    Genes are ranked by ascending "P_VALUE".
    """
    
    cohort_fml_df = pd.read_csv(path_c_fml, sep = "\t")
    cohort_fml_df = cohort_fml_df[["SYMBOL" , "P_VALUE" , "Q_VALUE"]]
    # Sort first and only then number the rows, so that "fml_rank" is the
    # position in the sorted table and not the row position in the file.
    # A stable sort keeps the file order among genes with equal p-value.
    cohort_fml_df = cohort_fml_df.sort_values("P_VALUE", kind="mergesort")
    cohort_fml_df = cohort_fml_df.reset_index(drop=True).reset_index()
    
    return cohort_fml_df.rename(columns={"index" : "fml_rank", "SYMBOL" : "Gene",
                                         "P_VALUE" : "fml_pval", "Q_VALUE" : "fml_qval"})


def get_all_methods_results(cohort,
                            input_datasets,
                            o3d_output,
                            intogen_output,
                            ctype=None,
                            ch_genes = None):
    """
    Load the result of each method and get the sorted gene ranks, the p-values and q-values.

    The 3D clustering result is always loaded; the result of every other 
    method is outer-merged on "Gene" when its output file exists, otherwise 
    a message is printed and the method is skipped. A "CGC" column, a "Fish" 
    column and, if `ch_genes` is provided, a "CH" column are then added.
    """

    # CGC genes (symbols and synonyms) and fishy genes of each cancer type
    cgc_genes = get_cgc_synonyms(pd.read_csv(f"{input_datasets}/CGC_list_v99.tsv", sep = "\t"))
    fishy_df = pd.read_csv(f"{input_datasets}/negative_gene_set.tsv", sep = '\t', header = None)
    fishy_df = fishy_df.rename(columns={0 : "Cancer_Type", 1 : "Fish"})

    # Name used in the "Cant load" message, path to the output, loader
    other_methods = [
        ("cbase", f"{intogen_output}/cbase/{cohort}.cbase.tsv.gz", load_cbase),
        ("dndscv", f"{intogen_output}/dndscv/{cohort}.dndscv.tsv.gz", load_dndscv),
        ("hotmaps", f"{intogen_output}/hotmaps/{cohort}.out.gz", load_hotmaps),
        ("mutpanning", f"{intogen_output}/mutpanning/Significance{cohort}.txt", load_mutpanning),
        ("oncrodriveclustl", f"{intogen_output}/oncodriveclustl/{cohort}.elements_results.txt", load_clustl),
        ("oncodrive fml", f"{intogen_output}/oncodrivefml/{cohort}-oncodrivefml.tsv.gz", load_fml),
        ("smregion", f"{intogen_output}/smregions/{cohort}.smregions.tsv.gz", load_smreg),
    ]

    # The 3D clustering result is the starting table
    cohort_df = load_3dclust(f"{o3d_output}/{cohort}/{cohort}.3d_clustering_genes.tsv")
    for name, path, loader in other_methods:
        if not os.path.exists(path):
            print(f"Cant load {name} in {path}")
            continue
        cohort_df = cohort_df.merge(loader(path), how="outer", on="Gene")

    # Add CGC
    in_cgc = cohort_df["Gene"].isin(cgc_genes)
    cohort_df.insert(4, "CGC", in_cgc.astype(int))

    # Add fishy (Gene in Fishy list and not in CGC)
    if not ctype:
        fishy = np.nan
    elif ctype in fishy_df["Cancer_Type"].values:
        fish_genes = fishy_df[fishy_df["Cancer_Type"] == ctype].Fish.values[0].split(",")
        fishy = (cohort_df["Gene"].isin(fish_genes) & ~in_cgc).astype(int)
    else:
        fishy = 0

    if ch_genes is not None:
        # Add CH
        cohort_df.insert(4, "CH", cohort_df["Gene"].isin(ch_genes).astype(int))
        # With CH genes the fishy genes are always the "AML" ones 
        # and the CGC genes are not excluded
        if ctype:
            fish_genes = fishy_df[fishy_df["Cancer_Type"] == "AML"].Fish.values[0].split(",")
            fishy = cohort_df["Gene"].isin(fish_genes).astype(int)

    cohort_df.insert(5, "Fish", fishy)
    # Move the o3d rank from the first column to the sixth one
    o3d_rank = cohort_df.pop("o3d_rank")
    cohort_df.insert(5, "o3d_rank", o3d_rank)

    return cohort_df


def get_genetable(cohort_df,
                  o3d_output,
                  intogen_output,
                  input_datasets,
                  filter_icgc=False,
                  ch_genes=None,
                  genetable_output=None, 
                  filename = None,
                  save = False):
    """
    Get the sorted rank of each method, p-value, q-value. 
    Also, annotate genes with CGC and fishy label.

    Each cohort in the "COHORT" column of `cohort_df` is processed with 
    get_all_methods_results() if both its MAF and mutational profile files 
    are found in `input_datasets`; cohorts starting with "ICGC" are skipped 
    when `filter_icgc` is True. A cohort that raises an error is reported 
    with a warning and skipped. The tables of all cohorts are concatenated, 
    with "Cancer" and "Cohort" columns identifying their origin.

    If `save`, `genetable_output` and `filename` are all set, the rows having 
    a gene name are written to "{genetable_output}/{filename}.genetable.tsv" 
    (the directory is created if needed). Nothing is written if no cohort 
    could be processed.

    Returns the concatenated dataframe (empty if no cohort was processed).
    """
    
    # Collect the per-cohort tables and concatenate them only once at the end
    cohort_tables = []

    for cohort in cohort_df["COHORT"]:
        print(cohort)

        if filter_icgc and cohort.startswith("ICGC"):
            continue
            
        maf = f"{input_datasets}/maf/{cohort}.in.maf"
        mut_profile = f"{input_datasets}/mut_profile/{cohort}.sig.json"
        ctype = cohort_df[cohort_df["COHORT"] == cohort].CANCER_TYPE.values[0]

        if not (os.path.isfile(maf) and os.path.isfile(mut_profile)):
            print(f"MAF or Mut rate not found for {cohort}")
            continue

        # Best effort: a failing cohort must not stop the others
        try:
            tmp_df = get_all_methods_results(cohort=cohort,
                                             input_datasets=input_datasets, 
                                             o3d_output=o3d_output,
                                             intogen_output=intogen_output,
                                             ctype=ctype, 
                                             ch_genes=ch_genes)
            tmp_df["Cancer"] = ctype
            tmp_df["Cohort"] = cohort
            cohort_tables.append(tmp_df)
        except Exception as e:
            warnings.warn(f"{cohort} could not be processed")
            warnings.warn(f"Error: {e}")

    genetable_df = pd.concat(cohort_tables) if cohort_tables else pd.DataFrame()
            
    # Save
    if save and genetable_output and filename:
        if genetable_df.empty:
            # There is no "Gene" column to filter on and nothing to write
            print("No cohort could be processed: genetable not saved")
        else:
            os.makedirs(genetable_output, exist_ok=True)
            output_path = f"{genetable_output}/{filename}.genetable.tsv"
            genetable_df.dropna(subset=["Gene"]).to_csv(output_path, sep = "\t", index = False, header = True)
            print(f"Genetable saved in: {output_path}")

    return genetable_df

## Cancer

In [3]:
import pandas as pd
# Load the table of the cancer cohorts (tab-separated) and display it
datasets = "/workspace/projects/clustering_3d/o3d_analysys/datasets/input/cancer_202404"
cohort_df = pd.read_csv(f"{datasets}/cohorts.tsv", sep="\t")
cohort_df

Unnamed: 0,COHORT,CANCER_TYPE,PLATFORM,MUTATIONS,SAMPLES
0,STJUDE_WGS_D_EPD_2018,EPM,WGS,10073,39
1,STJUDE_WGS_D_LGG_2018,LGGNOS,WGS,36490,38
2,STJUDE_WGS_D_CM_2018,MEL,WGS,101075,7
3,STJUDE_WGS_D_AML_2018,AML,WGS,30265,21
4,STJUDE_WGS_D_ALL_2018,ALL,WGS,169261,278
...,...,...,...,...,...
267,HARTWIG_WGS_RCC_2023,RCC,WGS,1251564,133
268,HARTWIG_WGS_CHOL_2023,CHOL,WGS,1378086,80
269,HARTWIG_WGS_SKCM_2023,SKCM,WGS,28349563,299
270,HARTWIG_WGS_UCEC_2023,UCEC,WGS,1171774,32


### Human MANE

In [4]:
# O3D run to build the genetable from
SUBDIR = "human_mane_raw"
RUN = "run_2024-04-24_05-19-48"

# Input datasets, O3D and intogen outputs, and directory where the genetable is saved
# (`datasets` ends with "/", so the paths built from it contain "//")
datasets = f"/workspace/projects/clustering_3d/o3d_analysys/datasets/"
input_path = f"{datasets}/input/cancer_202404"
cohort_df = pd.read_csv(f"{input_path}/cohorts.tsv", sep="\t")
o3d_output = f"{datasets}/output/cancer_202404/o3d_output/{SUBDIR}/{RUN}"
intogen_output = f"{datasets}/output/cancer_202404/intogen_output"
genetable_output = f"results/{SUBDIR}"

In [5]:
# Build the genetable of all cohorts except the ICGC ones and save it 
# as "{genetable_output}/{RUN}.genetable.tsv"
df = get_genetable(cohort_df=cohort_df,
                   o3d_output=o3d_output,
                   intogen_output=intogen_output,
                   input_datasets = input_path,
                   filter_icgc=True,
                   ch_genes=None,
                   genetable_output=genetable_output, 
                   filename = RUN,
                   save = True)
df

STJUDE_WGS_D_EPD_2018
STJUDE_WGS_D_LGG_2018
STJUDE_WGS_D_CM_2018
STJUDE_WGS_D_AML_2018
STJUDE_WGS_D_ALL_2018
STJUDE_WGS_D_ACC_2018
STJUDE_WGS_D_MB_2018
STJUDE_WGS_R_ALL_2018
STJUDE_WGS_R_NB_2018
STJUDE_WGS_M_OS_2018
STJUDE_WGS_D_OS_2018
STJUDE_WGS_D_NB_2018
STJUDE_WGS_D_EWS_2018
STJUDE_WGS_D_RB_2018
STJUDE_WGS_D_HGG_2018
STJUDE_WGS_D_RHBDS_2018
CBIOP_WXS_BLCA_UTUC_IGBMC_2021
CBIOP_WXS_ACY_2019
CBIOP_WXS_BCC_UNIGE_2016_UNTREAT
CBIOP_WXS_ANGS_TREATED_2020
CBIOP_WXS_BLCA_BGI
CBIOP_WXS_ACYC_MDA_2015
CBIOP_WGS_PRAD_EURUROL_2017
CBIOP_WXS_BLCA_DFARBER_MSKCC_2014
CBIOP_WXS_BCC_UNIGE_2016_TREAT
CBIOP_WGS_STAD_ONCOSG_2018
CBIOP_WXS_ESCA_EGC_TRAP_2020
CBIOP_WXS_ACYC_SANGER_2013
CBIOP_WXS_BRCA_MBCP_PRY_TREAT_2020
CBIOP_WXS_BRCA_MBCP_MET_NOTREAT_2020
CBIOP_WXS_BLCA_VALLEN_2018
CBIOP_WXS_CM_VALLEN_2018
CBIOP_WXS_ANGS_UNTREAT_2020
CBIOP_WXS_DLBC_BROAD_2012
CBIOP_WXS_BRCA_MBCP_PRY_NOTREAT_2020
CBIOP_WXS_HCC_AMC_PRV
CBIOP_WXS_CLL_BROAD_2013
CBIOP_WXS_EGC_TMUCIH_2015
CBIOP_WXS_CSCC_UCSF_2018
CBIOP_WXS_

  cohort_3dclust_df = pd.read_csv(path_c_3dclust, sep="\t")


TCGA_WXS_DLBCLNOS
TCGA_WXS_COADREAD
TCGA_WXS_ESCA
PCAWG_WGS_UTERUS_ADENOCA
TCGA_WXS_CHOL
TARGET_WXS_OS_2020
TCGA_WXS_LGGNOS
TCGA_WXS_CESC
TCGA_WXS_HNSC
TCGA_WXS_PAAD
TCGA_WXS_SOFT_TISSUE
TCGA_WXS_THYM
TCGA_WXS_CCRCC
TCGA_WXS_CHRCC
TCGA_WXS_LUSC
TCGA_WXS_PGNG
TCGA_WXS_OVT
TCGA_WXS_UCS
TCGA_WXS_PLMESO
TCGA_WXS_STAD
TCGA_WXS_UCEC
TCGA_WXS_GBM
TCGA_WXS_PRAD
PEDCBIOP_WXS_HGG_REL
TCGA_WXS_BRCA
TCGA_WXS_UM
PEDCBIOP_WXS_OS_PRY
TCGA_WXS_WDTC
TCGA_WXS_PRCC
TCGA_WXS_MGCT
HARTWIG_WGS_GIST_2023
HARTWIG_WGS_MT_2023
HARTWIG_WGS_LMS_2023
HARTWIG_WGS_GB_2023
HARTWIG_WGS_LUNG_2023
HARTWIG_WGS_CEAD_2023
HARTWIG_WGS_HCC_2023
HARTWIG_WGS_ESCC_2023
HARTWIG_WGS_PLMESO_2023
HARTWIG_WGS_OVT_2023
HARTWIG_WGS_ANSC_2023
HARTWIG_WGS_UTUC_2023
HARTWIG_WGS_WDTC_2023
HARTWIG_WGS_BLCA_2023
HARTWIG_WGS_STOMACH_2023
HARTWIG_WGS_SOFT_TISSUE_2023
HARTWIG_WGS_COAD_2023
HARTWIG_WGS_GBC_2023
HARTWIG_WGS_EGC_2023
HARTWIG_WGS_LIPO_2023
HARTWIG_WGS_HNSC_2023
HARTWIG_WGS_COADREAD_2023
HARTWIG_WGS_NETNOS_2023
HARTWIG_WGS_PANCREAS

Unnamed: 0,Gene,Uniprot_ID,o3d_status,CGC,Fish,o3d_rank,o3d_pval,o3d_qval,cbase_rank,cbase_pval,...,clustl_pval,clustl_qval,fml_rank,fml_pval,fml_qval,smreg_rank,smreg_pval,smreg_qval,Cancer,Cohort
0,BIRC6,Q9NR09,Processed,1,0,0.0,0.014,1.0,2.0,0.000032,...,0.078,0.078,131.0,0.44410,,,,,EPM,STJUDE_WGS_D_EPD_2018
1,CUBN,O60494,No_density,0,0,1.0,,,0.0,0.000019,...,,,41.0,0.15171,0.455130,,,,EPM,STJUDE_WGS_D_EPD_2018
2,DCAF8L1,A6NGE4,No_density,0,0,2.0,,,1.0,0.000019,...,,,16.0,0.06251,0.391725,,,,EPM,STJUDE_WGS_D_EPD_2018
3,EZHIP,A0A515VFR0,No_density,0,0,3.0,,,,,...,,,96.0,0.34277,0.616986,,,,EPM,STJUDE_WGS_D_EPD_2018
4,PIK3CA,P42336,No_density,1,0,4.0,,,3.0,0.000035,...,,,130.0,0.44246,0.632790,,,,EPM,STJUDE_WGS_D_EPD_2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21632,,,,0,0,,,,,,...,,,492.0,0.06870,,,,,SKIN,HARTWIG_WGS_SKIN_2023
21633,,,,0,0,,,,,,...,,,1932.0,0.26042,,,,,SKIN,HARTWIG_WGS_SKIN_2023
21634,,,,0,0,,,,,,...,,,4812.0,0.62313,0.985022,,,,SKIN,HARTWIG_WGS_SKIN_2023
21635,,,,0,0,,,,,,...,,,6858.0,0.88714,,,,,SKIN,HARTWIG_WGS_SKIN_2023


### Human 

In [19]:
# O3D run to build the genetable from
SUBDIR = "human_raw"
RUN = "run_2024-06-20_15-43-10"

# Input datasets, O3D and intogen outputs, and directory where the genetable is saved
# (`datasets` ends with "/", so the paths built from it contain "//")
datasets = f"/workspace/projects/clustering_3d/o3d_analysys/datasets/"
input_path = f"{datasets}/input/cancer_202404"
cohort_df = pd.read_csv(f"{input_path}/cohorts.tsv", sep="\t")
o3d_output = f"{datasets}/output/cancer_202404/o3d_output/{SUBDIR}/{RUN}"
intogen_output = f"{datasets}/output/cancer_202404/intogen_output"
genetable_output = f"results/{SUBDIR}"

In [20]:
# Build the genetable of all cohorts (ICGC ones included) and save it 
# as "{genetable_output}/{RUN}.genetable.tsv"
df = get_genetable(cohort_df=cohort_df,
                   o3d_output=o3d_output,
                   intogen_output=intogen_output,
                   input_datasets = input_path,
                   filter_icgc=False,
                   ch_genes=None,
                   genetable_output=genetable_output, 
                   filename = RUN,
                   save = True)
df

STJUDE_WGS_D_EPD_2018
STJUDE_WGS_D_LGG_2018
STJUDE_WGS_D_CM_2018
STJUDE_WGS_D_AML_2018
STJUDE_WGS_D_ALL_2018
STJUDE_WGS_D_ACC_2018
STJUDE_WGS_D_MB_2018
STJUDE_WGS_R_ALL_2018
STJUDE_WGS_R_NB_2018
STJUDE_WGS_M_OS_2018
STJUDE_WGS_D_OS_2018
STJUDE_WGS_D_NB_2018
STJUDE_WGS_D_EWS_2018
STJUDE_WGS_D_RB_2018
STJUDE_WGS_D_HGG_2018
STJUDE_WGS_D_RHBDS_2018
CBIOP_WXS_BLCA_UTUC_IGBMC_2021
CBIOP_WXS_ACY_2019
CBIOP_WXS_BCC_UNIGE_2016_UNTREAT
CBIOP_WXS_ANGS_TREATED_2020
CBIOP_WXS_BLCA_BGI
CBIOP_WXS_ACYC_MDA_2015
CBIOP_WGS_PRAD_EURUROL_2017
CBIOP_WXS_BLCA_DFARBER_MSKCC_2014
CBIOP_WXS_BCC_UNIGE_2016_TREAT
CBIOP_WGS_STAD_ONCOSG_2018
CBIOP_WXS_ESCA_EGC_TRAP_2020
CBIOP_WXS_ACYC_SANGER_2013
CBIOP_WXS_BRCA_MBCP_PRY_TREAT_2020
CBIOP_WXS_BRCA_MBCP_MET_NOTREAT_2020
CBIOP_WXS_BLCA_VALLEN_2018
CBIOP_WXS_CM_VALLEN_2018
CBIOP_WXS_ANGS_UNTREAT_2020
CBIOP_WXS_DLBC_BROAD_2012
CBIOP_WXS_BRCA_MBCP_PRY_NOTREAT_2020
CBIOP_WXS_HCC_AMC_PRV
CBIOP_WXS_CLL_BROAD_2013
CBIOP_WXS_EGC_TMUCIH_2015
CBIOP_WXS_CSCC_UCSF_2018
CBIOP_WXS_

Unnamed: 0,Gene,Uniprot_ID,o3d_status,CGC,Fish,o3d_rank,o3d_pval,o3d_qval,cbase_rank,cbase_pval,...,clustl_pval,clustl_qval,fml_rank,fml_pval,fml_qval,smreg_rank,smreg_pval,smreg_qval,Cancer,Cohort
0,BIRC6,Q9NR09,Processed,1,0,0.0,0.014,1.0,2.0,0.000032,...,0.078,0.078,131.0,0.44410,,,,,EPM,STJUDE_WGS_D_EPD_2018
1,CUBN,O60494,No_density,0,0,1.0,,,0.0,0.000019,...,,,41.0,0.15171,0.455130,,,,EPM,STJUDE_WGS_D_EPD_2018
2,DCAF8L1,A6NGE4,No_density,0,0,2.0,,,1.0,0.000019,...,,,16.0,0.06251,0.391725,,,,EPM,STJUDE_WGS_D_EPD_2018
3,EZHIP,Q86X51,No_density,0,0,3.0,,,,,...,,,96.0,0.34277,0.616986,,,,EPM,STJUDE_WGS_D_EPD_2018
4,PIK3CA,P42336,No_density,1,0,4.0,,,3.0,0.000035,...,,,130.0,0.44246,0.632790,,,,EPM,STJUDE_WGS_D_EPD_2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21671,,,,0,0,,,,,,...,,,492.0,0.06870,,,,,SKIN,HARTWIG_WGS_SKIN_2023
21672,,,,0,0,,,,,,...,,,1932.0,0.26042,,,,,SKIN,HARTWIG_WGS_SKIN_2023
21673,,,,0,0,,,,,,...,,,4812.0,0.62313,0.985022,,,,SKIN,HARTWIG_WGS_SKIN_2023
21674,,,,0,0,,,,,,...,,,6858.0,0.88714,,,,,SKIN,HARTWIG_WGS_SKIN_2023


### O3D in IntOGen

In [62]:
# Look at the seismic output of one cohort (tab-separated, gzip-compressed)
df = pd.read_csv("/workspace/datasets/intogen/runs/dev-o3d/intogen_analysis/steps/seismic/HARTWIG_WGS_ANSC_2020.seismic.tsv.gz", sep="\t")
df

Unnamed: 0,region,mutated_tumours,exp_mutated_tumours,p.cohort_skew,fdr.cohort_skew
0,CNGB1,3,0.337144,0.00283,0.082061
1,KMT2C,4,1.132522,0.007603,0.110242
2,MUC16,8,2.822735,0.023947,0.231483
3,FBXW7,3,0.167436,0.08964,0.649888
4,KMT2D,4,1.342295,0.137794,0.799203
5,DYNC2H1,3,1.011219,0.247624,0.968681
6,SPTBN5,3,0.975607,0.257977,0.968681
7,STOML3,3,0.074012,0.378995,0.968681
8,TMEM132C,3,0.306986,0.424387,0.968681
9,SCN7A,3,0.356159,0.449441,0.968681


In [15]:
def load_3dclust(path_c_3dclust):
    """
    Load gene results from the 3d clustering method.

    The file is read with the default (comma) separator and is not 
    re-sorted: the position of each row in the file is returned 
    as "o3d_rank".
    """

    kept_cols = ["Gene", "Uniprot_ID", "Status", "pval", "qval"]
    new_names = {"index" : "o3d_rank", "Status" : "o3d_status",
                 "pval" : "o3d_pval", "qval" : "o3d_qval"}

    genes_df = pd.read_csv(path_c_3dclust)
    # reset_index() exposes the row position as the "index" column
    genes_df = genes_df[kept_cols].reset_index()

    return genes_df.rename(columns=new_names)


def load_seismic(path_c_seismic):
    """
    Load gene results from the seismic method.

    Genes (the "region" column) are ranked by ascending 
    "p.cohort_skew" value.
    """

    new_names = {"index" : "seismic_rank", "region" : "Gene",
                 "p.cohort_skew" : "seismic_pval", "fdr.cohort_skew" : "seismic_qval"}

    ranked_df = pd.read_csv(path_c_seismic, sep = "\t")
    ranked_df = ranked_df[["region", "p.cohort_skew", "fdr.cohort_skew"]]
    ranked_df = ranked_df.sort_values("p.cohort_skew")
    # Drop the file order, then expose the sorted position as a column
    ranked_df = ranked_df.reset_index(drop=True).reset_index()

    return ranked_df.rename(columns=new_names)


def load_mutpanning(path_c_mutpanning):
    """
    Load gene results from the mutpanning method. 
    The order of the rows in the file is used as the ranking (no sorting 
    by p-value is done here). Rows without a "Significance" value are 
    discarded, and when a gene appears multiple times only its first 
    remaining row is kept.
    """

    new_names = {"index" : "mutpan_rank", "Name" : "Gene",
                 "Significance" : "mutpan_pval",
                 "FDR" : "mutpan_qval"}

    raw_df = pd.read_csv(path_c_mutpanning, sep = "\t")
    has_pval = raw_df["Significance"].notna()
    # After reset_index(), the "index" column holds the row label from the file
    ranked_df = raw_df.loc[has_pval, ["Name", "Significance", "FDR"]].reset_index()

    # For each gene, keep the row that comes first in the file
    first_rows = ranked_df.groupby('Name')['index'].idxmin()
    ranked_df = ranked_df.loc[first_rows].sort_values("index")
    ranked_df = ranked_df.drop(columns=["index"])
    ranked_df = ranked_df.reset_index(drop=True).reset_index()

    return ranked_df.rename(columns=new_names)


def get_all_methods_results(cohort,
                            datasets,
                            o3d_output,
                            hotmaps_output,
                            intogen_output,
                            ctype=None,
                            ch_genes = None):
    """
    Load the result of each method and get the sorted gene ranks, the p-values and q-values.

    The 3D clustering result is always loaded; the result of every other 
    method is outer-merged on "Gene" when its output file exists, otherwise 
    a message naming the method and the missing path is printed and the 
    method is skipped. A "CGC" column, a "Fish" column and, if `ch_genes` 
    is provided, a "CH" column are then added.
    """
    
    # Load CGC and fishy genes
    cgc_df = pd.read_csv(f"{datasets}/CGC_list_v97.tsv", sep = "\t")
    cgc_genes = cgc_df["Gene Symbol"].values
    fishy_df = pd.read_csv(f"{datasets}/negative_gene_set.tsv", sep = '\t', header = None)
    fishy_df = fishy_df.rename(columns={0 : "Cancer_Type", 1 : "Fish"})

    # Paths to methods output
    path_3dclust = f"{o3d_output}/{cohort}.o3d_genes.csv"
    path_cbase = f"{intogen_output}/cbase/{cohort}.cbase.tsv.gz"
    path_seismic = f"{intogen_output}/seismic/{cohort}.seismic.tsv.gz"
    path_dndscv = f"{intogen_output}/dndscv/{cohort}.dndscv.tsv.gz"
    path_hotmaps = f"{hotmaps_output}/{cohort}.out.gz"
    path_mutpan = f"{intogen_output}/mutpanning/out/SignificanceFiltered/Significance{cohort}.txt"
    path_clustl = f"{intogen_output}/oncodriveclustl/{cohort}.elements_results.txt"
    path_fml = f"{intogen_output}/oncodrivefml/out/{cohort}-oncodrivefml.tsv.gz"
    path_smreg = f"{intogen_output}/smregions/{cohort}.smregions.tsv.gz"

    # Name used in the "Cant load" message, path to the output, loader.
    # A single table keeps each message tied to the path that is tested
    # (a missing seismic file used to be reported as a missing cbase file).
    other_methods = [
        ("cbase", path_cbase, load_cbase),
        ("seismic", path_seismic, load_seismic),
        ("dndscv", path_dndscv, load_dndscv),
        ("hotmaps", path_hotmaps, load_hotmaps),
        ("mutpanning", path_mutpan, load_mutpanning),
        ("oncrodriveclustl", path_clustl, load_clustl),
        ("oncodrive fml", path_fml, load_fml),
        ("smregion", path_smreg, load_smreg),
    ]

    # Load all methods results
    
    cohort_df = load_3dclust(path_3dclust)

    for name, path, loader in other_methods:
        if os.path.exists(path):
            cohort_df = cohort_df.merge(loader(path), how="outer", on="Gene")
        else:
            print(f"Cant load {name} in {path}")
        
    # Add CGC
    cgc = cohort_df["Gene"].isin(cgc_genes).astype(int)
    cohort_df.insert(4, "CGC", cgc)
    
    # Add fishy (Gene in Fishy list and not in CGC)
    fishy = np.nan
    if ctype:
        if ctype in fishy_df["Cancer_Type"].values:
            fishy = fishy_df[fishy_df["Cancer_Type"] == ctype].Fish.values[0].split(",")
            fishy = pd.concat((cohort_df["Gene"].isin(fishy), 
                               ~cohort_df["Gene"].isin(cgc_genes)), axis=1).all(1).astype(int)
        else:
            fishy = 0
    
    if ch_genes is not None:
        # Add CH
        ch = cohort_df["Gene"].isin(ch_genes).astype(int)  
        cohort_df.insert(4, "CH", ch)
        # Add fishy: with CH genes the fishy genes are always the "AML" ones
        # and the CGC genes are not excluded
        if ctype:
            fishy = fishy_df[fishy_df["Cancer_Type"] == "AML"].Fish.values[0].split(",")
            fishy = cohort_df["Gene"].isin(fishy).astype(int)
    
    cohort_df.insert(5, "Fish", fishy)
    # Move the o3d rank from the first column to the sixth one
    cohort_df.insert(5, "o3d_rank", cohort_df.pop("o3d_rank"))
    
    return cohort_df


def get_genetable(cohort_df,
                  datasets,
                  o3d_output,
                  hotmaps_output,
                  intogen_output,
                  input_maf_profile,
                  filter_icgc=False,
                  ch_genes=None,
                  genetable_output=None, 
                  filename = None,
                  save = False):
    """
    Get the sorted rank of each method, p-value, q-value. 
    Also, annotate genes with CGC and fishy label.

    Each cohort in the "COHORT" column of `cohort_df` is processed with 
    get_all_methods_results() if both its MAF and mutation rate files are 
    found in `input_maf_profile`; cohorts starting with "ICGC" are skipped 
    when `filter_icgc` is True. A cohort that raises an error is reported 
    with a warning and skipped. The tables of all cohorts are concatenated, 
    with "Cancer" and "Cohort" columns identifying their origin.

    If `save`, `genetable_output` and `filename` are all set, the table is 
    written to "{genetable_output}/{filename}.genetable.tsv" (the directory 
    is created if needed).

    Returns the concatenated dataframe (empty if no cohort was processed).
    """
    
    # Collect the per-cohort tables and concatenate them only once at the end
    cohort_tables = []

    for cohort in tqdm(cohort_df["COHORT"]):

        if filter_icgc and cohort.startswith("ICGC"):
            continue
            
        maf = f"{input_maf_profile}/maf/{cohort}.in.maf"
        mut_profile = f"{input_maf_profile}/mut_profile/{cohort}.mutrate.json"
        ctype = cohort_df[cohort_df["COHORT"] == cohort].CANCER_TYPE.values[0]
        
        if not (os.path.isfile(maf) and os.path.isfile(mut_profile)):
            print(f"MAF or Mut rate not found for {cohort}")
            continue

        # Best effort: a failing cohort must not stop the others
        try:
            tmp_df = get_all_methods_results(cohort=cohort,
                                             datasets=datasets, 
                                             o3d_output=o3d_output,
                                             hotmaps_output=hotmaps_output,
                                             intogen_output=intogen_output,
                                             ctype=ctype, 
                                             ch_genes=ch_genes)
            tmp_df["Cancer"] = ctype
            tmp_df["Cohort"] = cohort
            cohort_tables.append(tmp_df)
        except Exception as e:
            warnings.warn(f"{cohort} could not be processed")
            warnings.warn(f"Error: {e}")

    genetable_df = pd.concat(cohort_tables) if cohort_tables else pd.DataFrame()
            
    # Save
    if save and genetable_output and filename:
        # Create the directory so the write cannot fail after all the cohorts have been processed
        os.makedirs(genetable_output, exist_ok=True)
        output_path = f"{genetable_output}/{filename}.genetable.tsv"
        genetable_df.to_csv(output_path, sep = "\t", index = False, header = True)
        print(f"Genetable saved in: {output_path}")

    return genetable_df

In [76]:
# Build the genetable from the intogen "dev-o3d" run, reading the hotmaps 
# results from a separate directory. The table is not saved by this call
# (save = False).
RUN = "run_233106.added_seismic_cbase"
intogen_output = "/workspace/datasets/intogen/runs/dev-o3d/intogen_analysis/steps"
hotmaps_output = f"{datasets}/output/cancer/intogen_output/hotmaps"
df = get_genetable(cohort_df=cohort_df,
                    datasets=datasets,
                    o3d_output="/workspace/datasets/intogen/runs/dev-o3d/intogen_analysis/steps/oncodrive3d",
                    hotmaps_output=hotmaps_output,
                    intogen_output=intogen_output,
                    input_maf_profile = input_path,
                    filter_icgc=False,
                    ch_genes=None,
                    genetable_output=genetable_output, 
                    filename = RUN,
                    save = False)

df

In [80]:
# Display the path the TCGA genetable is written to
f"{genetable_output}/{RUN}.tcga.genetable.tsv"

'/workspace/projects/clustering_3d/o3d_analysys/analysys/genetables/results/run_233106.added_seismic_cbase.tcga.genetable.tsv'

In [81]:
# Keep only the cohorts whose name starts with "TCGA" and save them
# (`df` is overwritten by the filtered table)
df = df[df["Cohort"].str.startswith("TCGA")]
df.to_csv(f"{genetable_output}/{RUN}.tcga.genetable.tsv", sep = "\t", index = False, header = True)

## CH

In [53]:
# O3D run to build the CH genetable from
RUN = "run_20230512_ch"

# Input datasets, O3D and intogen outputs, and directory of the genetables
datasets = "/workspace/projects/clustering_3d/o3d_analysys/datasets"
o3d_output = f"{datasets}/output/ch/o3d_output/{RUN}"
intogen_output = f"{datasets}/output/ch/intogen_output"
input_path = f"{datasets}/input/ch"
genetable_output = "/workspace/projects/clustering_3d/o3d_analysys/analysys/genetables/results"

In [82]:
# List the genetables already present in the results directory
!ls /workspace/projects/clustering_3d/o3d_analysys/analysys/genetables/results

run_20230914_5506.genetable.tsv
run_233106.added_seismic_cbase.tcga.genetable.tsv


In [54]:
# Single-row cohort table with the two columns read by get_genetable()
cohort_df = pd.DataFrame({"COHORT" : ["OTHER_WXS_CH_IMPACT_PANEL"], "CANCER_TYPE" : ["CH"]})
cohort_df

Unnamed: 0,COHORT,CANCER_TYPE
0,OTHER_WXS_CH_IMPACT_PANEL,CH


In [55]:
# Gene names of the CH genes table, used for the "CH" annotation
ch_genes = pd.read_csv(f"{datasets}/ch_genes.tsv", sep = '\t')
ch_genes = ch_genes["Gene"].values

In [56]:
# Build the CH genetable without saving it.
# NOTE(review): `hotmaps_output` is not passed here, while the get_genetable()
# defined in the "O3D in IntOGen" section requires it; presumably this cell
# was run with an earlier definition of the function (confirm before re-running).
df = get_genetable(cohort_df=cohort_df,
                    datasets=datasets,
                    o3d_output=o3d_output,
                    intogen_output=intogen_output,
                    input_maf_profile = input_path,
                    filter_icgc=False,
                    ch_genes=ch_genes,
                    genetable_output=genetable_output,
                    filename = None,
                    save = False)

df

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Gene,Uniprot_ID,o3d_status,CH,Fish,o3d_rank,CGC,o3d_pval,o3d_qval,cbase_rank,...,clustl_pval,clustl_qval,fml_rank,fml_pval,fml_qval,smreg_rank,smreg_pval,smreg_qval,Cancer,Cohort
0,DNMT3A,Q9Y6K1,Processed,1,0,0.0,1,0.0,0.0,2.0,...,1.110223e-19,3.264056e-18,0.0,0.000001,0.000028,0.0,6.513607e-94,3.302399e-91,CH,OTHER_WXS_CH_IMPACT_PANEL
1,SF3B1,O75533,Processed,1,0,1.0,1,0.0,0.0,19.0,...,1.110223e-19,3.264056e-18,18.0,0.000075,0.001642,,,,CH,OTHER_WXS_CH_IMPACT_PANEL
2,JAK2,O60674,Processed,1,0,2.0,1,0.0,0.0,6.0,...,1.110223e-19,3.264056e-18,9.0,0.000001,0.000028,2.0,5.310583e-42,8.974885e-40,CH,OTHER_WXS_CH_IMPACT_PANEL
3,GNAS,Q5JWF2,Processed,1,0,3.0,1,0.0,0.0,20.0,...,7.817272e-10,1.276821e-08,13.0,0.000001,0.000028,3.0,3.561379e-25,4.514048e-23,CH,OTHER_WXS_CH_IMPACT_PANEL
4,SRSF2,Q01130,Processed,1,0,4.0,1,0.0,0.0,13.0,...,1.110223e-19,3.264056e-18,10.0,0.000001,0.000028,,,,CH,OTHER_WXS_CH_IMPACT_PANEL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20760,TMEM247,,,0,0,,0,,,,...,,,,,,,,,CH,OTHER_WXS_CH_IMPACT_PANEL
20761,TTLL8,,,0,1,,0,,,,...,,,,,,,,,CH,OTHER_WXS_CH_IMPACT_PANEL
20762,TXNRD3,,,0,0,,0,,,,...,,,,,,,,,CH,OTHER_WXS_CH_IMPACT_PANEL
20763,USP17L23,,,0,0,,0,,,,...,,,,,,,,,CH,OTHER_WXS_CH_IMPACT_PANEL


In [183]:
# Number of rows of each gene in the sequence dataframe, in ascending order
test = pd.read_csv("/workspace/projects/clustering_3d/clustering_3d/datasets/seq_for_mut_prob.csv").groupby("Gene").size()
test.sort_values()

Gene
101F10.2         1
PIGZ SMP3        1
PIH1D1           1
PIH1D1 NOP17     1
PIH1D2           1
                ..
Tcr-alpha        5
KIR2DS5          5
HERVK_113        5
GNT1             9
UGT1            10
Length: 57968, dtype: int64