# Quantify how often clusters occur across distant parts of the primary sequence

In [25]:
import pandas as pd
import requests
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor, as_completed
import sys
import os
import re 

sys.path.append('../../')
from global_variables import SUBDIR, RUN, WORKSPACE

In [26]:
# Get genes detected by O3D
canonical_metadata = pd.read_table(f"{WORKSPACE}/nobackup/scratch/oncodrive3d/datasets_240506/seq_for_mut_prob.tsv")
canonical_metadata = canonical_metadata[["Gene", "Uniprot_ID"]].drop_duplicates().reset_index(drop=True)

genetable = pd.read_table(f"{WORKSPACE}/projects/clustering_3d/o3d_analysys/o3d_paper/notebooks/0.genetables/results/{SUBDIR}/{RUN}.genetable.tsv")
genetable = genetable[["Gene", "CGC", "Fish", "o3d_qval"]]
genetable = genetable[genetable["o3d_qval"] < 0.01].reset_index(drop=True)
genetable = genetable[["Gene", "CGC"]].drop_duplicates()
genetable = genetable.merge(canonical_metadata, on="Gene", how="left")
genetable

Unnamed: 0,Gene,CGC,Uniprot_ID
0,PIK3CA,1,P42336
1,RXRA,0,P19793
2,FGFR3,1,P22607
3,FBXW7,1,Q969H0
4,TP53,1,P04637
...,...,...,...
268,CNBD1,1,Q8NA66
269,CDH1,1,P12830
270,TRIM49C,0,P0CI26
271,SPTA1,0,P02549


In [27]:
# Re-display the gene table built in the previous cell (no new computation here)
genetable

Unnamed: 0,Gene,CGC,Uniprot_ID
0,PIK3CA,1,P42336
1,RXRA,0,P19793
2,FGFR3,1,P22607
3,FBXW7,1,Q969H0
4,TP53,1,P04637
...,...,...,...
268,CNBD1,1,Q8NA66
269,CDH1,1,P12830
270,TRIM49C,0,P0CI26
271,SPTA1,0,P02549


## Check at cluster level

In [28]:
def get_all_o3d_res_result(cohort_df, o3d_output_dir):
    """
    Concatenate the position-level O3D results
    (`<cohort>.3d_clustering_pos.csv`) of all cohorts into one table.

    Parameters
    ----------
    cohort_df : pd.DataFrame
        Table with a "COHORT" column listing the cohorts to load.
    o3d_output_dir : str
        Directory containing one sub-directory per cohort.

    Returns
    -------
    pd.DataFrame
        Rows with a non-missing p-value from every cohort file found,
        sorted by Cohort (descending), then Gene and Pos (ascending),
        with a fresh 0..n-1 index. P-values equal to 0 are replaced by 1e-05.

    Raises
    ------
    ValueError
        If none of the cohorts has a result file under `o3d_output_dir`.
    """

    cols = ["Cohort", "Gene", "Pos", "Mut_in_gene", "Score", "Score_obs_sim", "pval", "Rank", "Res_in_cl", 'C', 'C_ext']
    # Value substituted for p-values that are exactly 0
    pval_floor = 0.00001

    lst_df = []

    for cohort in cohort_df["COHORT"].values:
        # TCGA_WXS_SARC results are looked up under the name TCGA_WXS_SOFT_TISSUE
        cohort = "TCGA_WXS_SOFT_TISSUE" if cohort == "TCGA_WXS_SARC" else cohort
        path_csv = f"{o3d_output_dir}/{cohort}/{cohort}.3d_clustering_pos.csv"
        if not os.path.exists(path_csv):
            # Missing cohorts are reported and skipped, not treated as fatal
            print(f"Path doesn't exist: {path_csv}")
            continue

        df_o3d = pd.read_csv(path_csv, low_memory=False)
        df_o3d = df_o3d[cols].dropna(subset=["pval"]).reset_index(drop=True)
        df_o3d["pval"] = df_o3d["pval"].replace(0, pval_floor)
        lst_df.append(df_o3d)

    if not lst_df:
        # pd.concat([]) would raise an uninformative "No objects to concatenate"
        raise ValueError(f"No 3d_clustering_pos.csv file found for any cohort in: {o3d_output_dir}")

    df = pd.concat(lst_df)
    df = df.sort_values(["Cohort", "Gene", "Pos"], ascending=[False, True, True]).reset_index(drop=True)

    return df


def get_all_o3d_gene_result(cohort_df, o3d_output_dir):
    """
    Concatenate the gene-level O3D results
    (`<cohort>.3d_clustering_genes.csv`) of all cohorts into one table.

    Parameters
    ----------
    cohort_df : pd.DataFrame
        Table with a "COHORT" column listing the cohorts to load.
    o3d_output_dir : str
        Directory containing one sub-directory per cohort.

    Returns
    -------
    pd.DataFrame
        Rows with a non-missing p-value from every cohort file found,
        sorted by Cohort (descending) then pval (ascending), with a fresh
        0..n-1 index. P-values equal to 0 are replaced by 1e-05.

    Raises
    ------
    ValueError
        If none of the cohorts has a result file under `o3d_output_dir`.
    """

    cols = ["Gene", "Uniprot_ID", "Cohort", "pval", "qval", "C_pos", "Score_obs_sim_top_vol", "Mut_in_top_vol", "Mut_in_gene"]
    # Value substituted for p-values that are exactly 0
    pval_floor = 0.00001

    lst_df = []
    for cohort in cohort_df["COHORT"].values:
        # TCGA_WXS_SARC results are looked up under the name TCGA_WXS_SOFT_TISSUE
        cohort = "TCGA_WXS_SOFT_TISSUE" if cohort == "TCGA_WXS_SARC" else cohort
        path_csv = f"{o3d_output_dir}/{cohort}/{cohort}.3d_clustering_genes.csv"
        if not os.path.exists(path_csv):
            # Missing cohorts are reported and skipped, not treated as fatal
            print(f"Path doesn't exist: {path_csv}")
            continue

        df_o3d = pd.read_csv(path_csv, low_memory=False)
        df_o3d = df_o3d[cols].dropna(subset=["pval"]).reset_index(drop=True)
        df_o3d["pval"] = df_o3d["pval"].replace(0, pval_floor)
        lst_df.append(df_o3d)

    if not lst_df:
        # pd.concat([]) would raise an uninformative "No objects to concatenate"
        raise ValueError(f"No 3d_clustering_genes.csv file found for any cohort in: {o3d_output_dir}")

    df = pd.concat(lst_df)
    df = df.sort_values(["Cohort", "pval"], ascending=[False, True])

    return df.reset_index(drop=True)


def load_cmap(path, uni_id, f):
    """
    Load the array stored in `<path>/<uni_id>-F<f>.npy` and binarize it:
    entries strictly greater than 0.5 become 1, all others become 0.
    """
    prob_cmap = np.load(f"{path}/{uni_id}-F{f}.npy")
    is_contact = prob_cmap > 0.5
    return is_contact.astype(int)


def get_contact(cmap, x):
    """Return the indices at which row `x` of `cmap` is equal to 1."""
    row_hits = cmap[x] == 1
    return np.nonzero(row_hits)[0]

In [29]:
# Locate the cohort list and the Oncodrive3D output of the selected run, then load the
# gene-level and position-level results of every cohort.
# WORKSPACE is joined the same way as in the other cells (no extra leading or trailing "/"),
# so the resulting paths no longer contain doubled slashes.
datasets = f"{WORKSPACE}/projects/clustering_3d/o3d_analysys/datasets"
input_path = f"{datasets}/input/cancer_202404"
cohort_df = pd.read_csv(f"{input_path}/cohorts.tsv", sep="\t")
# Drop every cohort whose name starts with "ICGC"
cohort_df = cohort_df[~cohort_df.COHORT.str.startswith("ICGC")].reset_index(drop=True)
o3d_output_dir = f"{datasets}/output/cancer_202404/o3d_output/{SUBDIR}/{RUN}"

df_genes = get_all_o3d_gene_result(cohort_df, o3d_output_dir)
df_res = get_all_o3d_res_result(cohort_df, o3d_output_dir)

Path doesn't exist: //workspace/projects/clustering_3d/o3d_analysys/datasets//output/cancer_202404/o3d_output/human_mane_raw/run_2024-07-01_16-04-14/PEDCBIOP_WXS_TALL_REL/PEDCBIOP_WXS_TALL_REL.3d_clustering_genes.csv
Path doesn't exist: //workspace/projects/clustering_3d/o3d_analysys/datasets//output/cancer_202404/o3d_output/human_mane_raw/run_2024-07-01_16-04-14/PEDCBIOP_WXS_TALL_REL/PEDCBIOP_WXS_TALL_REL.3d_clustering_pos.csv


In [30]:
# Significant cohort/gene pairs (qval < 0.01), each tagged with a "Cohort|Gene" key
genes_hit = (
    df_genes.loc[df_genes["qval"] < 0.01]
    .reset_index(drop=True)
    .assign(Cohort_Gene=lambda hits: hits["Cohort"] + "|" + hits["Gene"])
)
genes_hit

Unnamed: 0,Gene,Uniprot_ID,Cohort,pval,qval,C_pos,Score_obs_sim_top_vol,Mut_in_top_vol,Mut_in_gene,Cohort_Gene
0,BRAF,P15056,TCGA_WXS_WDTC,0.00001,0.0,[601 600],179.618281,287.0,287,TCGA_WXS_WDTC|BRAF
1,NRAS,P01111,TCGA_WXS_WDTC,0.00001,0.0,[61],15.507143,39.0,39,TCGA_WXS_WDTC|NRAS
2,HRAS,P01112,TCGA_WXS_WDTC,0.00001,0.0,[61],7.066159,17.0,17,TCGA_WXS_WDTC|HRAS
3,GNAQ,A0A024R240,TCGA_WXS_UM,0.00001,0.0,[209 48],18.591696,38.0,40,TCGA_WXS_UM|GNAQ
4,GNA11,P29992,TCGA_WXS_UM,0.00001,0.0,[209],14.753669,34.0,36,TCGA_WXS_UM|GNA11
...,...,...,...,...,...,...,...,...,...,...
614,G3BP1,Q13283,CBIOP_WXS_ANGS_TREATED_2020,0.00001,0.0,[4 5],3.180288,4.0,4,CBIOP_WXS_ANGS_TREATED_2020|G3BP1
615,RETSAT,Q6NUM9,CBIOP_WXS_ACY_2019,0.00001,0.0,[533 536],3.745953,6.0,6,CBIOP_WXS_ACY_2019|RETSAT
616,NOTCH2,Q04721,CBIOP_WXS_ACY_2019,0.00001,0.0,[3],3.656038,4.0,4,CBIOP_WXS_ACY_2019|NOTCH2
617,TP53,K7PPA8,CBIOP_WGS_STAD_ONCOSG_2018,0.00001,0.0,[239 242 245 236 248 273 175 195 237 179 132 2...,7.151944,26.0,47,CBIOP_WGS_STAD_ONCOSG_2018|TP53


In [31]:
# Directory of the contact-map arrays, and the sequence metadata (gene, Uniprot ID, F)
# restricted to the genes that appear in genes_hit
cmaps = f"{WORKSPACE}/nobackup/scratch/oncodrive3d/datasets_mane_240506/prob_cmaps"
seq_df = (
    pd.read_table(f"{WORKSPACE}/nobackup/scratch/oncodrive3d/datasets_mane_240506/seq_for_mut_prob.tsv")
    .loc[lambda seqs: seqs["Gene"].isin(genes_hit["Gene"])]
    .reset_index(drop=True)
    .loc[:, ["Gene", "Uniprot_ID", "F"]]
)
seq_df

Unnamed: 0,Gene,Uniprot_ID,F
0,ZNF93,P35789,1
1,ZNF91,Q05481,1
2,ZNF880,Q6PDB4,1
3,ZNF83,P51522,1
4,ZNF804A,Q7Z570,1
...,...,...,...
268,AHNAK,Q09666,24M
269,AGAP6,Q5VW22,1
270,ADH4,P08319,1
271,ADGRV1,Q8WXG9,26M


In [32]:
# Keep the positions flagged C == 1 that belong to a significant cohort/gene pair
df_res = (
    df_res
    .assign(Cohort_Gene=df_res["Cohort"] + "|" + df_res["Gene"])
    .loc[lambda res: res["Cohort_Gene"].isin(genes_hit["Cohort_Gene"])]
    .loc[lambda res: res["C"] == 1]
    .reset_index(drop=True)
)
# Extend the key with the position: "Cohort|Gene|Pos"
df_res["Cohort_Gene"] = df_res["Cohort_Gene"] + "|" + df_res["Pos"].astype(str)
df_res.head(60)

Unnamed: 0,Cohort,Gene,Pos,Mut_in_gene,Score,Score_obs_sim,pval,Rank,Res_in_cl,C,C_ext,Cohort_Gene
0,TCGA_WXS_WDTC,BRAF,600,287,1.0,179.618281,1e-05,1,2.0,1,0.0,TCGA_WXS_WDTC|BRAF|600
1,TCGA_WXS_WDTC,BRAF,601,287,1.0,159.730994,1e-05,0,2.0,1,0.0,TCGA_WXS_WDTC|BRAF|601
2,TCGA_WXS_WDTC,HRAS,61,17,1.0,7.066159,1e-05,0,1.0,1,0.0,TCGA_WXS_WDTC|HRAS|61
3,TCGA_WXS_WDTC,NRAS,61,39,1.0,15.507143,1e-05,0,1.0,1,0.0,TCGA_WXS_WDTC|NRAS|61
4,TCGA_WXS_UM,GNA11,209,36,0.880534,14.753669,1e-05,0,1.0,1,0.0,TCGA_WXS_UM|GNA11|209
5,TCGA_WXS_UM,GNAQ,48,40,0.887462,18.591696,1e-05,1,2.0,1,0.0,TCGA_WXS_UM|GNAQ|48
6,TCGA_WXS_UM,GNAQ,209,40,0.894889,16.664473,1e-05,0,2.0,1,0.0,TCGA_WXS_UM|GNAQ|209
7,TCGA_WXS_UM,SF3B1,625,18,1.0,11.743272,1e-05,0,4.0,1,0.0,TCGA_WXS_UM|SF3B1|625
8,TCGA_WXS_UM,SF3B1,662,18,1.0,13.488796,1e-05,2,4.0,1,0.0,TCGA_WXS_UM|SF3B1|662
9,TCGA_WXS_UM,SF3B1,663,18,1.0,14.147752,1e-05,3,4.0,1,0.0,TCGA_WXS_UM|SF3B1|663


In [42]:
# Reload the filtered position-level and gene-level tables from their CSV checkpoints.
# NOTE(review): this cell sits before the cell that writes these files; on a fresh
# "Run All" it reads whatever is already on disk -- confirm this ordering is intended.
df_res, genes_hit = (
    pd.read_csv("data/df_res.csv"),
    pd.read_csv("data/genes_hit.csv"),
)

In [36]:
# Save the filtered tables as CSV checkpoints.
# to_csv does not create missing directories, so make sure "data/" exists first.
os.makedirs("data", exist_ok=True)
df_res.to_csv("data/df_res.csv", index=False)
genes_hit.to_csv("data/genes_hit.csv", index=False)

In [41]:
# For every unique (gene, position) in df_res, collect the indices in contact with that
# position according to the gene's binarized contact map:
#     clusters_dict[gene][pos] -> array of contact indices
df_res_sorted = df_res.sort_values(["Gene", "Pos"]).reset_index(drop=True)
clusters = df_res_sorted["Gene"] + "|" + df_res_sorted["Pos"].astype(str)
clusters = clusters.unique()
clusters_dict = {}

current_gene = ""
for cluster in tqdm(clusters):
    gene, pos = cluster.split("|")
    pos = int(pos)
    if gene != current_gene:
        # Rows are sorted by gene, so each gene's positions are contiguous: the metadata
        # lookup and the contact-map load are needed only once per gene.
        # Columns are selected by name so the unpacking does not depend on column order.
        uni_id, f = seq_df.loc[seq_df["Gene"] == gene, ["Uniprot_ID", "F"]].iloc[0]
        clusters_dict[gene] = {}
        cmap = load_cmap(cmaps, uni_id, f)
        current_gene = gene
    # pos - 1: presumably Pos is 1-based while the contact map rows are 0-based -- TODO confirm
    clusters_dict[gene][pos] = get_contact(cmap, pos-1)

100%|██████████| 1436/1436 [00:01<00:00, 1364.65it/s]


In [None]:
# Inspect the first (Cohort, Gene) group only.
# `gene` holds the group key, i.e. the (Cohort, Gene) tuple; the original printed the
# undefined name `i`, which raises NameError on a fresh kernel.
for gene, df in df_res.groupby(["Cohort", "Gene"]):
    print(gene)
    display(df)
    break

('CBIOP_WGS_PRAD_EURUROL_2017', 'SPOP')


Unnamed: 0,Cohort,Gene,Pos,Mut_in_gene,Score,Score_obs_sim,pval,Rank,Res_in_cl,C,C_ext,Cohort_Gene
4111,CBIOP_WGS_PRAD_EURUROL_2017,SPOP,87,11,0.840076,5.499366,1e-05,1,5.0,1,0.0,CBIOP_WGS_PRAD_EURUROL_2017-SPOP-87
4112,CBIOP_WGS_PRAD_EURUROL_2017,SPOP,102,11,0.448503,3.155023,1e-05,2,5.0,1,0.0,CBIOP_WGS_PRAD_EURUROL_2017-SPOP-102
4113,CBIOP_WGS_PRAD_EURUROL_2017,SPOP,125,11,0.12842,1.013015,0.3533,4,5.0,1,1.0,CBIOP_WGS_PRAD_EURUROL_2017-SPOP-125
4114,CBIOP_WGS_PRAD_EURUROL_2017,SPOP,131,11,1.0,5.947184,1e-05,0,5.0,1,0.0,CBIOP_WGS_PRAD_EURUROL_2017-SPOP-131
4115,CBIOP_WGS_PRAD_EURUROL_2017,SPOP,133,11,0.381369,2.851625,1e-05,3,5.0,1,0.0,CBIOP_WGS_PRAD_EURUROL_2017-SPOP-133
