In [417]:
import sys
sys.path.append('/workspace/projects/clustering_3d/o3d_analysys/o3d_paper/')
from global_variables import *

In [418]:
import pandas as pd
from matplotlib import pyplot as plt
import os
import numpy as np

In [443]:
def capitalize_words(text):
    
    return ' '.join(word.capitalize() for word in text.split())


def is_in_sub(tissue, sub_dict):

    key_ix = np.array([key in tissue for key in sub_dict.keys()]).flatten()
    is_in = key_ix.any()

    if is_in:
        tissue = np.array(list(sub_dict.keys()))[key_ix]
        return is_in, tissue
    else:
        return is_in, np.nan


def get_parent_tissues_dict(lst_tissues, oncotree_df):

    tissue_parent_dict = {}
    upper_ttypes = ["SOLID", "BOWEL", "OTHER", "CANCER", "NON_SOLID", np.nan]
    for tissue in lst_tissues:
        parent = tissue
        while parent not in upper_ttypes:
            if parent not in upper_ttypes:
                if parent not in upper_ttypes:
                    tissue_parent_dict[tissue] = parent
                    parent = oncotree_df[oncotree_df["ID"] == parent].PARENT.values
                    if len(parent) == 0:
                        parent = "OTHER"
                    else:
                        parent = parent[0]

    return tissue_parent_dict

In [444]:
datasets = f"/workspace/projects/clustering_3d/o3d_analysys/datasets/"
input_path = f"{datasets}/input/cancer_202404"
cohort_df = pd.read_csv(f"{input_path}/cohorts.tsv", sep="\t")
cohort_df = cohort_df[~cohort_df.COHORT.str.startswith("ICGC")].reset_index(drop=True)
cohort_release_df = pd.read_csv(f"{input_path}/cohorts_release.tsv", sep="\t")

oncotree_df = pd.read_csv(f"{datasets}/input/cancer_202404/oncotree.tsv", sep="\t")
display(oncotree_df)

Unnamed: 0,ID,PARENT,NAMES
0,AA,SOFT_TISSUE,Aggressive Angiomyxoma
1,AASTR,DIFG,Anaplastic Astrocytoma
2,ABC,DLBCLNOS,Activated B-cell Type
3,ABL,AMLNOS,Acute Basophilic Leukemia
4,ACA,ADRENAL_GLAND,Adrenocortical Adenoma
...,...,...,...
883,WDLS,LIPO,Well-Differentiated Liposarcoma
884,WDTC,THYROID,Well-Differentiated Thyroid Cancer
885,WM,LPL,Waldenstrom Macroglobulinemia
886,WPSCC,PSCC,Warty Penile Squamous Cell Carcinoma


## Parse

In [445]:
# Soft tissue issue 
cohort_df.COHORT = cohort_df.COHORT.replace("TCGA_WXS_SOFT_TISSUE", "TCGA_WXS_SARC")
cohort_df.CANCER_TYPE = cohort_df.CANCER_TYPE.replace("SOFT_TISSUE", "SARC")
cohort_df.CANCER_TYPE = cohort_df.CANCER_TYPE.replace("SARCNOS", "SARC")
cohort_df.CANCER_TYPE.unique()

array(['BLCA', 'EGC', 'LMS', 'STAD', 'MEL', 'AML', 'HCC', 'UCEC', 'EPM',
       'COADREAD', 'GB', 'ANSC', 'RCC', 'OS', 'BCC', 'RMS', 'PRAD', 'ACC',
       'WDTC', 'ES', 'UTUC', 'GIST', 'LIPO', 'MBL', 'ANGS', 'GBC', 'ALL',
       'LUNG', 'BRCA', 'RBL', 'CLLSLL', 'BLADDER', 'SCLC', 'NSCLC',
       'STOMACH', 'SKCM', 'NETNOS', 'SARC', 'COAD', 'HNSC', 'SIC', 'CHOL',
       'ESCC', 'DLBCLNOS', 'GBM', 'HGGNOS', 'NBL', 'ESCA', 'CSCC', 'SKIN',
       'LUAD', 'CEAD', 'PLMESO', 'LGGNOS', 'PAAD', 'OVT', 'LUSC', 'SACA',
       'CESC', 'CCRCC', 'PANCREAS', 'ACYC', 'MT', 'PROSTATE', 'READ',
       'PCM', 'BL', 'VULVA', 'PAST', 'ATRT', 'PANET', 'NHL', 'CHRCC',
       'MDS', 'NPC', 'LIHB', 'WT', 'MGCT', 'UCS', 'UM', 'THYM', 'PRCC',
       'PGNG'], dtype=object)

In [446]:
# Get organ for our cohorts
cohorts_tissue_parent_dict = get_parent_tissues_dict(cohort_df.CANCER_TYPE.unique(), oncotree_df)
cohorts_tissue_parent_dict["SARC"] = "SOFT_TISSUE"
cohorts_tissue_parent_dict["SOFT_TISSUE"] = "SOFT_TISSUE"
cohort_df["GENERAL_CANCER_TYPE"] = cohort_df.CANCER_TYPE.apply(lambda x: cohorts_tissue_parent_dict[x])
cohort_df["GENERAL_CANCER_TYPE"] = cohort_df["GENERAL_CANCER_TYPE"].str.replace("SOFT_TISSUE", "SARC")
cohort_df

Unnamed: 0,COHORT,CANCER_TYPE,PLATFORM,MUTATIONS,SAMPLES,GENERAL_CANCER_TYPE
0,HARTWIG_WGS_BLCA_2023,BLCA,WGS,5017561,144,BLADDER
1,HARTWIG_WGS_EGC_2023,EGC,WGS,573540,21,STOMACH
2,HARTWIG_WGS_LMS_2023,LMS,WGS,339549,43,SARC
3,CBIOP_WGS_STAD_ONCOSG_2018,STAD,WGS,861309,147,STOMACH
4,CBIOP_WXS_CM_VALLEN_2018,MEL,WXS,132960,151,SKIN
...,...,...,...,...,...,...
210,TCGA_WXS_PRCC,PRCC,WXS,34566,276,KIDNEY
211,TCGA_WXS_PRAD,PRAD,WXS,34239,494,PROSTATE
212,TCGA_WXS_PLMESO,PLMESO,WXS,3980,82,PLEURA
213,TCGA_WXS_SARC,SARC,WXS,31484,234,SARC


In [448]:
# Merge full cancer name
cohort_release_df = cohort_release_df.rename(columns={"Cohort" : "COHORT", 
                                                      "Cancer type" : "CANCER_TYPE_FULL", 
                                                      "Age" : "AGE", 
                                                      "Type" : "TYPE"})
cohort_release_df.AGE = cohort_release_df.AGE.replace("AAdult", "Adult")
cohort_release_df.AGE = cohort_release_df.AGE.replace("PPediatric", "Pediatric")
cohort_release_df.TYPE = cohort_release_df.TYPE.replace("PPrimary", "Primary")
cohort_release_df.TYPE = cohort_release_df.TYPE.replace("RRelapse", "Relapse")
cohort_release_df.TYPE = cohort_release_df.TYPE.replace("MMetastasis", "Metastasis")
cohort_release_df = cohort_release_df.drop(columns=["Cancer drivers", "Samples", "Mutations"])

# Replace Soft Tissue with Sarcoma
cohort_release_df.COHORT = cohort_release_df.COHORT.replace("TCGA_WXS_SOFT_TISSUE", "TCGA_WXS_SARC")
cohort_release_df.CANCER_TYPE = cohort_release_df.CANCER_TYPE_FULL.replace("Soft Tissue", "Sarcoma")
cohort_release_df.CANCER_TYPE = cohort_release_df.CANCER_TYPE_FULL.replace("Sarcoma, NOS", "Sarcoma")

# Merge
cohort_df = cohort_df.merge(cohort_release_df, on="COHORT", how="left").drop(columns=["PLATFORM", "MUTATIONS"])
cohort_df["GENERAL_CANCER_TYPE"] = cohort_df["GENERAL_CANCER_TYPE"].str.replace("_", " ")
cohort_df

  cohort_release_df.CANCER_TYPE = cohort_release_df.CANCER_TYPE_FULL.replace("Soft Tissue", "Sarcoma")


Unnamed: 0,COHORT,CANCER_TYPE,SAMPLES,GENERAL_CANCER_TYPE,CANCER_TYPE_FULL,AGE,TYPE
0,HARTWIG_WGS_BLCA_2023,BLCA,144,BLADDER,Bladder Urothelial Carcinoma,Adult,Metastasis
1,HARTWIG_WGS_EGC_2023,EGC,21,STOMACH,Esophagogastric Adenocarcinoma,Adult,Metastasis
2,HARTWIG_WGS_LMS_2023,LMS,43,SARC,Leiomyosarcoma,Adult,Metastasis
3,CBIOP_WGS_STAD_ONCOSG_2018,STAD,147,STOMACH,Stomach Adenocarcinoma,Adult,Primary
4,CBIOP_WXS_CM_VALLEN_2018,MEL,151,SKIN,Melanoma,Adult,Primary
...,...,...,...,...,...,...,...
210,TCGA_WXS_PRCC,PRCC,276,KIDNEY,Papillary Renal Cell Carcinoma,Adult,Primary
211,TCGA_WXS_PRAD,PRAD,494,PROSTATE,Prostate Adenocarcinoma,Adult,Primary
212,TCGA_WXS_PLMESO,PLMESO,82,PLEURA,Pleural Mesothelioma,Adult,Primary
213,TCGA_WXS_SARC,SARC,234,SARC,Soft Tissue,Adult,Primary


## Plot

In [None]:
# Aggregate the samples for GENERAL_CANCER_TYPE and CANCER_TYPE_FULL
general_type_agg = df.groupby("GENERAL_CANCER_TYPE")["SAMPLES"].sum()
full_type_agg = df.groupby(["GENERAL_CANCER_TYPE", "CANCER_TYPE"])["SAMPLES"].sum()

# Colors for the pie chart
colors = plt.cm.tab20.colors



fig, ax = plt.subplots(figsize=(21, 21))
size = 0.3  # Thickness of each ring

# Plot the inner ring
inner_sizes = general_type_agg.values
inner_labels = general_type_agg.index
wedges_inner, _ = ax.pie(inner_sizes, radius=1, labels=None, colors=colors[:len(inner_sizes)],
                         wedgeprops=dict(width=size, edgecolor='w'))

# Add labels
for wedge, label in zip(wedges_inner, inner_labels):
    if df[df["GENERAL_CANCER_TYPE"] == label].SAMPLES.sum() < 30:
        continue
    if label in ["TESTIS", "PNS"]:
            pad = 1.05
    else:
        pad = 1
        
    theta = (wedge.theta1 + wedge.theta2) / 2  # Angle in degrees
    x = 0.85 * np.cos(np.radians(theta)) * pad  
    y = 0.85 * np.sin(np.radians(theta)) * pad
    ax.text(x, y, label, ha="center", va="center", fontsize=9)

# Plot the outer ring 
outer_sizes = full_type_agg.values
outer_labels = full_type_agg.index.get_level_values(1)
wedges_outer, _ = ax.pie(outer_sizes, radius=1 + size, labels=None, colors=colors[:len(outer_sizes)],
                         wedgeprops=dict(width=size, edgecolor='w'))

# Add labels 
pad_out_txt = 1.4 
for i, (wedge, label) in enumerate(zip(wedges_outer, outer_labels)):
    if df[df["CANCER_TYPE"] == label].SAMPLES.sum() < 40:
        continue
    if label in ["PANET", "BCC", "GIST", "UM", "NPC", "CESC", "PROSTATE"]:
        pad = 1.455
    elif label in ["RBL"]:
        pad = 1.505
    elif label in ["SIC", "RCC", "GB", "STOMACH", "ANGS"]: 
        pad = 1.405
    elif label in ["PGNG"]:
        pad = 1.385
    else:
        pad = pad_out_txt 
    theta = (wedge.theta1 + wedge.theta2) / 2  
    x_arrow = np.cos(np.radians(theta)) * pad 
    y_arrow = np.sin(np.radians(theta)) * pad
    x_start = np.cos(np.radians(theta)) * (1 + size) 
    y_start = np.sin(np.radians(theta)) * (1 + size)
    ax.annotate(label, xy=(x_start, y_start), xytext=(x_arrow, y_arrow),
                arrowprops=dict(arrowstyle="-", color="black", lw=0.1),
                ha="center", va="center", fontsize=9)

# Avoid clipping
ax.set_xlim(-2, 2)
ax.set_ylim(-2, 2)

plt.savefig(filename, dpi=300, bbox_inches='tight')
plt.show()