In [2]:
cd ..

In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
import optuna
import dill
from torch.utils.data import DataLoader
from pytorch_lightning.utilities.seed import isolate_rng
import torch
import seaborn as sns
from lifelines.statistics import multivariate_logrank_test
from lifelines import KaplanMeierFitter, CoxPHFitter
import matplotlib.pyplot as plt
from lifelines.plotting import add_at_risk_counts
from scipy.stats import chi2_contingency, kruskal, wilcoxon

from src import settings
from utils import MultiViewDataset, transform_full_dataset
from optimization import Optimization

In [6]:
# Read the tab-separated clinical table and index it by patient identifier.
clinical_data = pd.read_csv(settings.clinical_data_path, sep="\t").set_index("Patient ID")
# Drop the index name so the index carries no "Patient ID" label.
clinical_data.index.name = None
print("clinical_data.shape", clinical_data.shape)
clinical_data.head()

In [7]:
# Load the methylation matrix (";"-separated, decimal comma). The file is read
# with its first column as index and then transposed, so the file's columns
# become the rows of `methylation_data`.
methylation_data = pd.read_csv(settings.methylation_data_path, sep=";", index_col=0, decimal=",")
# Replace literal dots in the column labels with dashes.
# regex=False is required: on pandas < 2.0 `str.replace` defaults to regex=True,
# where "." matches ANY character and every label would collapse to dashes.
methylation_data.columns = methylation_data.columns.str.replace(".", "-", regex=False)
methylation_data = methylation_data.T.astype(np.float32)
print("methylation_data.shape", methylation_data.shape)
methylation_data.head()

In [8]:
# Load the RNA-seq matrix (";"-separated, decimal comma), transpose it so the
# file's columns become rows, and store the values as float32.
rnaseq_data = (
    pd.read_csv(settings.rnaseq_data_path, sep=";", index_col=0, decimal=",")
    .T
    .astype(np.float32)
)
print("rnaseq_data.shape", rnaseq_data.shape)
rnaseq_data.head()

In [9]:
# Restrict all three tables to the samples present in both omics views and
# put them in the same row order.
samples = methylation_data.index.intersection(rnaseq_data.index)
methylation_data, rnaseq_data = methylation_data.loc[samples], rnaseq_data.loc[samples]
assert methylation_data.index.equals(rnaseq_data.index)
# `.loc` raises KeyError if a common sample is missing from the clinical table.
clinical_data = clinical_data.loc[samples]
assert methylation_data.index.equals(clinical_data.index)
# View order used downstream: RNA-seq first, methylation second.
Xs = [rnaseq_data, methylation_data]
print("common samples:", len(samples))

In [30]:
# Run the project's transformation step on every view.
# NOTE(review): fit_pipelines=False presumably loads already-fitted pipelines
# from `settings.results_path` instead of fitting new ones — confirm in utils.
transformed_Xs = transform_full_dataset(
    Xs=Xs,
    fit_pipelines=False,
    results_folder=settings.results_path,
)
# Show the shape and first rows of each transformed view.
for transformed_X in transformed_Xs:
    print("transformed_X.shape", transformed_X.shape)
    display(transformed_X.head())

In [31]:
# Empty results table: one row per clinical variable, one column per
# clustering method. Cells are filled with p-values by the analysis cells.
clustering_statistical_table = pd.DataFrame(
    [],
    columns=["SNF", "iNMF", "jNMF"],
    index=[
        "Overall Survival",
        "Diagnosis age",
        "AJCC tumor stage",
        "AJCC metastasis stage",
        "AJCC neoplasm histologic grade",
        "Sex",
    ],
)


In [34]:
# Similarity Network Fusion (SNF) on the transformed views, followed by
# spectral clustering of the fused affinity matrix.
from snf import compute
from sklearn import cluster

affinities = compute.make_affinity(transformed_Xs, metric="euclidean")
fused = compute.snf(affinities)
# Use the first value returned by get_n_clusters as the number of clusters.
# NOTE(review): presumably the best-ranked estimate — confirm in the snfpy docs.
n_clusters_snf = compute.get_n_clusters(fused)[0]
# A fixed random_state makes the cluster labels (and every p-value derived
# from them) reproducible across runs; the original call was unseeded.
preds_snf = cluster.spectral_clustering(fused, n_clusters=n_clusters_snf, random_state=0)
clinical_data["preds_snf"] = preds_snf
# Bar chart of cluster sizes. The original also passed x="preds_snf", which
# Series.plot ignores, so it is omitted here.
ax = clinical_data["preds_snf"].value_counts().plot(
    kind="bar", title="Count of samples in clusters", ylabel="Number of samples"
)
for container in ax.containers:
    ax.bar_label(container)

In [35]:
# Multivariate log-rank test of overall-survival durations across SNF clusters.
# NOTE(review): event_observed is not supplied, so lifelines treats every
# sample as an observed event (no censoring) — confirm this is intended.
logrank_test = multivariate_logrank_test(
    event_durations=clinical_data["Overall Survival (Months)"],
    groups=clinical_data["preds_snf"],
)
logrank_test

In [36]:
# Kaplan-Meier curves per SNF cluster, annotated with the p-value of the
# log-rank test held in `logrank_test`.
# NOTE(review): no event_observed is passed to KaplanMeierFitter.fit, so every
# duration is treated as an observed event (no censoring) — confirm.
fig, ax = plt.subplots()
ax.set_title("Survival plot")
ax.text(0, 0, f"p-value= {round(logrank_test.p_value, 3)}")
# Store under the "SNF" column declared by the results table. The original
# wrote to "HC", a column shared by every section, so results overwrote
# each other and the declared columns stayed empty.
clustering_statistical_table.loc["Overall Survival", "SNF"] = logrank_test.p_value
kmfs = []
# `cluster_id`, not `cluster`: the latter would shadow the sklearn `cluster`
# module and break a re-run of the clustering cell.
for cluster_id in sorted(clinical_data["preds_snf"].unique()):
    duration = clinical_data.loc[clinical_data["preds_snf"] == cluster_id, "Overall Survival (Months)"]
    kmf = KaplanMeierFitter().fit(duration, label=str(cluster_id))
    kmfs.append(kmf)
    ax = kmf.plot(ax=ax)

add_at_risk_counts(*kmfs, ax=ax)
plt.tight_layout()

In [37]:
# Association between SNF clusters and tumor stage (chi-square test).
clinical_label = "American Joint Committee on Cancer Tumor Stage Code"
# Keep the last character of each stage code and parse it as an integer;
# samples with a missing code are dropped.
clinical_parameter = clinical_data[clinical_label].str[-1].astype(float).dropna().astype(int)
clinical_label = "AJN Tumor stage"
clinical_parameter = clinical_parameter.rename(clinical_label)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_snf"])
# Left panel: stage distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Reuse the contingency table instead of computing it a second time.
pval = chi2_contingency(crosstab).pvalue
# Store under the "SNF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["AJCC tumor stage", "SNF"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per stage and cluster.
clinical_parameter = clinical_parameter.to_frame()
clinical_parameter["preds_snf"] = clinical_data["preds_snf"]
sns.countplot(data=clinical_parameter, x=clinical_label, hue="preds_snf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [38]:
# Association between SNF clusters and histologic grade (chi-square test).
clinical_label = "Neoplasm Histologic Grade"
# Keep the last character of each grade code and parse it as an integer;
# samples with a missing grade are dropped.
clinical_parameter = clinical_data[clinical_label].str[-1].astype(float).dropna().astype(int)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_snf"])
# Left panel: grade distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Reuse the contingency table instead of computing it a second time.
pval = chi2_contingency(crosstab).pvalue
# Store under the "SNF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["AJCC neoplasm histologic grade", "SNF"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per grade and cluster.
clinical_parameter = clinical_parameter.to_frame()
clinical_parameter["preds_snf"] = clinical_data["preds_snf"]
sns.countplot(data=clinical_parameter, x=clinical_label, hue="preds_snf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [39]:
# Association between SNF clusters and sex (chi-square test).
clinical_label = "Sex"
clinical_parameter = clinical_data[clinical_label]
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_snf"])
# Left panel: sex distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Test the contingency table directly. The original dummy-encoded Sex first
# (get_dummies + drop_first), which silently counts samples with a missing
# value as the first category; for a complete two-level column the p-value
# is the same. The duplicated crosstab and an unused helper frame were removed.
pval = chi2_contingency(crosstab).pvalue
# Store under the "SNF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["Sex", "SNF"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per sex and cluster.
sns.countplot(data=clinical_data, x=clinical_label, hue="preds_snf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [40]:
# Association between SNF clusters and metastasis stage (chi-square test).
clinical_label = "American Joint Committee on Cancer Metastasis Stage Code"
clinical_parameter = clinical_data[clinical_label].str[-1]
# Codes ending in "X" are excluded before the numeric conversion;
# samples with a missing code are dropped.
clinical_parameter = clinical_parameter[clinical_parameter != "X"]
clinical_parameter = clinical_parameter.astype(float).dropna().astype(int)
clinical_label = "AJN Cancer Metastasis"
clinical_parameter = clinical_parameter.rename(clinical_label)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_snf"])
# Left panel: stage distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Reuse the contingency table instead of computing it a second time.
pval = chi2_contingency(crosstab).pvalue
# Store under the "SNF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["AJCC metastasis stage", "SNF"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per stage and cluster. The target axes is
# passed explicitly (the original relied on pyplot's "current axes"), and the
# second, unrounded p-value title was dropped to match the sibling cells.
clinical_parameter = clinical_parameter.to_frame()
clinical_parameter["preds_snf"] = clinical_data["preds_snf"]
sns.countplot(data=clinical_parameter, x=clinical_label, hue="preds_snf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [41]:
# Difference in diagnosis age between SNF clusters (Kruskal-Wallis test).
clinical_label = "Diagnosis Age"
# Pair every non-missing age with its cluster label so the grouping is aligned
# on the sample index. The original indexed the NaN-filtered ages with a
# boolean mask built from the full table, relying on implicit re-alignment.
clinical_parameter = clinical_data[[clinical_label, "preds_snf"]].dropna(subset=[clinical_label])
pval = kruskal(*[ages for _, ages in clinical_parameter.groupby("preds_snf")[clinical_label]]).pvalue
# Store under the "SNF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["Diagnosis age", "SNF"] = pval
ax = clinical_parameter.boxplot(column=clinical_label, by="preds_snf", grid=False)
# The text position is in data coordinates (x=1.3, age=50).
_ = ax.text(1.3, 50, f"p-value= {round(pval, 3)}")

In [65]:
# Integrative joint NMF on the two transformed views. Each sample is assigned
# to the column holding the largest value in its row of `model.w_cluster`.
from bignmf.models.jnmf.integrative import IntegrativeJnmf
from sklearn.preprocessing import MinMaxScaler

# Each view is rescaled with its own MinMaxScaler (pandas output) first.
# NOTE(review): the positional arguments 2 and 0.1 are presumably the number of
# components and a regularisation weight — confirm against the bignmf API.
model = IntegrativeJnmf(
    {
        view_idx: MinMaxScaler().set_output(transform="pandas").fit_transform(transformed_Xs[view_idx])
        for view_idx in (0, 1)
    },
    2,
    0.1,
)
# Run the model.
# NOTE(review): no random seed is set in this cell; if bignmf initialises its
# factors randomly the clusters will differ between runs — confirm.
model.run(50, 100, verbose=0)
model.cluster_data()
clinical_data["preds_inmf"] = np.argmax(model.w_cluster, axis=1)
# Bar chart of cluster sizes with the count printed on each bar.
ax = clinical_data["preds_inmf"].value_counts().plot(
    kind="bar",
    title="Count of samples in clusters",
    x="preds_inmf",
    ylabel="Number of samples",
)
for container in ax.containers:
    ax.bar_label(container)

In [66]:
# Multivariate log-rank test of overall-survival durations across iNMF clusters.
# NOTE(review): event_observed is not supplied, so lifelines treats every
# sample as an observed event (no censoring) — confirm this is intended.
logrank_test = multivariate_logrank_test(
    event_durations=clinical_data["Overall Survival (Months)"],
    groups=clinical_data["preds_inmf"],
)
logrank_test

In [67]:
# Kaplan-Meier curves per iNMF cluster, annotated with the p-value of the
# log-rank test held in `logrank_test`.
# NOTE(review): no event_observed is passed to KaplanMeierFitter.fit, so every
# duration is treated as an observed event (no censoring) — confirm.
fig, ax = plt.subplots()
ax.set_title("Survival plot")
ax.text(0, 0, f"p-value= {round(logrank_test.p_value, 3)}")
# Store under the "iNMF" column declared by the results table. The original
# wrote to "HC", a column shared by every section, so results overwrote
# each other and the declared columns stayed empty.
clustering_statistical_table.loc["Overall Survival", "iNMF"] = logrank_test.p_value
kmfs = []
# `cluster_id`, not `cluster`: the latter would shadow the sklearn `cluster`
# module imported by the SNF cells.
for cluster_id in sorted(clinical_data["preds_inmf"].unique()):
    duration = clinical_data.loc[clinical_data["preds_inmf"] == cluster_id, "Overall Survival (Months)"]
    kmf = KaplanMeierFitter().fit(duration, label=str(cluster_id))
    kmfs.append(kmf)
    ax = kmf.plot(ax=ax)

add_at_risk_counts(*kmfs, ax=ax)
plt.tight_layout()

In [68]:
# Association between iNMF clusters and tumor stage (chi-square test).
clinical_label = "American Joint Committee on Cancer Tumor Stage Code"
# Keep the last character of each stage code and parse it as an integer;
# samples with a missing code are dropped.
clinical_parameter = clinical_data[clinical_label].str[-1].astype(float).dropna().astype(int)
clinical_label = "AJN Tumor stage"
clinical_parameter = clinical_parameter.rename(clinical_label)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_inmf"])
# Left panel: stage distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Reuse the contingency table instead of computing it a second time.
pval = chi2_contingency(crosstab).pvalue
# Store under the "iNMF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["AJCC tumor stage", "iNMF"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per stage and cluster.
clinical_parameter = clinical_parameter.to_frame()
clinical_parameter["preds_inmf"] = clinical_data["preds_inmf"]
sns.countplot(data=clinical_parameter, x=clinical_label, hue="preds_inmf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [69]:
# Association between iNMF clusters and histologic grade (chi-square test).
clinical_label = "Neoplasm Histologic Grade"
# Keep the last character of each grade code and parse it as an integer;
# samples with a missing grade are dropped.
clinical_parameter = clinical_data[clinical_label].str[-1].astype(float).dropna().astype(int)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_inmf"])
# Left panel: grade distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Reuse the contingency table instead of computing it a second time.
pval = chi2_contingency(crosstab).pvalue
# Store under the "iNMF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["AJCC neoplasm histologic grade", "iNMF"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per grade and cluster.
clinical_parameter = clinical_parameter.to_frame()
clinical_parameter["preds_inmf"] = clinical_data["preds_inmf"]
sns.countplot(data=clinical_parameter, x=clinical_label, hue="preds_inmf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [70]:
# Association between iNMF clusters and sex (chi-square test).
clinical_label = "Sex"
clinical_parameter = clinical_data[clinical_label]
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_inmf"])
# Left panel: sex distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Test the contingency table directly. The original dummy-encoded Sex first
# (get_dummies + drop_first), which silently counts samples with a missing
# value as the first category; for a complete two-level column the p-value
# is the same. The duplicated crosstab and an unused helper frame were removed.
pval = chi2_contingency(crosstab).pvalue
# Store under the "iNMF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["Sex", "iNMF"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per sex and cluster.
sns.countplot(data=clinical_data, x=clinical_label, hue="preds_inmf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [71]:
# Association between iNMF clusters and metastasis stage (chi-square test).
clinical_label = "American Joint Committee on Cancer Metastasis Stage Code"
clinical_parameter = clinical_data[clinical_label].str[-1]
# Codes ending in "X" are excluded before the numeric conversion;
# samples with a missing code are dropped.
clinical_parameter = clinical_parameter[clinical_parameter != "X"]
clinical_parameter = clinical_parameter.astype(float).dropna().astype(int)
clinical_label = "AJN Cancer Metastasis"
clinical_parameter = clinical_parameter.rename(clinical_label)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_inmf"])
# Left panel: stage distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Reuse the contingency table instead of computing it a second time.
pval = chi2_contingency(crosstab).pvalue
# Store under the "iNMF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["AJCC metastasis stage", "iNMF"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per stage and cluster. The target axes is
# passed explicitly (the original relied on pyplot's "current axes"), and the
# second, unrounded p-value title was dropped to match the sibling cells.
clinical_parameter = clinical_parameter.to_frame()
clinical_parameter["preds_inmf"] = clinical_data["preds_inmf"]
sns.countplot(data=clinical_parameter, x=clinical_label, hue="preds_inmf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [72]:
# Difference in diagnosis age between iNMF clusters (Kruskal-Wallis test).
clinical_label = "Diagnosis Age"
# Pair every non-missing age with its cluster label so the grouping is aligned
# on the sample index. The original indexed the NaN-filtered ages with a
# boolean mask built from the full table, relying on implicit re-alignment.
clinical_parameter = clinical_data[[clinical_label, "preds_inmf"]].dropna(subset=[clinical_label])
pval = kruskal(*[ages for _, ages in clinical_parameter.groupby("preds_inmf")[clinical_label]]).pvalue
# Store under the "iNMF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["Diagnosis age", "iNMF"] = pval
ax = clinical_parameter.boxplot(column=clinical_label, by="preds_inmf", grid=False)
# The text position is in data coordinates (x=1.3, age=50).
_ = ax.text(1.3, 50, f"p-value= {round(pval, 3)}")

In [74]:
# Standard joint NMF on the two transformed views. Each sample is assigned to
# the column holding the largest value in its row of `model.w_cluster`.
from bignmf.models.jnmf.standard import StandardJnmf
from sklearn.preprocessing import MinMaxScaler

# Each view is rescaled with its own MinMaxScaler (pandas output) first.
# NOTE(review): the positional argument 2 is presumably the number of
# components — confirm against the bignmf API.
model = StandardJnmf(
    {
        view_idx: MinMaxScaler().set_output(transform="pandas").fit_transform(transformed_Xs[view_idx])
        for view_idx in (0, 1)
    },
    2,
)
# Run the model.
# NOTE(review): no random seed is set in this cell; if bignmf initialises its
# factors randomly the clusters will differ between runs — confirm.
model.run(50, 100, verbose=0)
model.cluster_data()
clinical_data["preds_jnmf"] = np.argmax(model.w_cluster, axis=1)
# Bar chart of cluster sizes with the count printed on each bar.
ax = clinical_data["preds_jnmf"].value_counts().plot(
    kind="bar",
    title="Count of samples in clusters",
    x="preds_jnmf",
    ylabel="Number of samples",
)
for container in ax.containers:
    ax.bar_label(container)

In [75]:
# Multivariate log-rank test of overall-survival durations across jNMF clusters.
# NOTE(review): event_observed is not supplied, so lifelines treats every
# sample as an observed event (no censoring) — confirm this is intended.
logrank_test = multivariate_logrank_test(
    event_durations=clinical_data["Overall Survival (Months)"],
    groups=clinical_data["preds_jnmf"],
)
logrank_test

In [76]:
# Kaplan-Meier curves per jNMF cluster, annotated with the p-value of the
# log-rank test held in `logrank_test`.
# NOTE(review): no event_observed is passed to KaplanMeierFitter.fit, so every
# duration is treated as an observed event (no censoring) — confirm.
fig, ax = plt.subplots()
ax.set_title("Survival plot")
ax.text(0, 0, f"p-value= {round(logrank_test.p_value, 3)}")
# Store under the "jNMF" column declared by the results table. The original
# wrote to "HC", a column shared by every section, so results overwrote
# each other and the declared columns stayed empty.
clustering_statistical_table.loc["Overall Survival", "jNMF"] = logrank_test.p_value
kmfs = []
# `cluster_id`, not `cluster`: the latter would shadow the sklearn `cluster`
# module imported by the SNF cells.
for cluster_id in sorted(clinical_data["preds_jnmf"].unique()):
    duration = clinical_data.loc[clinical_data["preds_jnmf"] == cluster_id, "Overall Survival (Months)"]
    kmf = KaplanMeierFitter().fit(duration, label=str(cluster_id))
    kmfs.append(kmf)
    ax = kmf.plot(ax=ax)

add_at_risk_counts(*kmfs, ax=ax)
plt.tight_layout()

In [77]:
# Association between jNMF clusters and tumor stage (chi-square test).
clinical_label = "American Joint Committee on Cancer Tumor Stage Code"
# Keep the last character of each stage code and parse it as an integer;
# samples with a missing code are dropped.
clinical_parameter = clinical_data[clinical_label].str[-1].astype(float).dropna().astype(int)
clinical_label = "AJN Tumor stage"
clinical_parameter = clinical_parameter.rename(clinical_label)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_jnmf"])
# Left panel: stage distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Reuse the contingency table instead of computing it a second time.
pval = chi2_contingency(crosstab).pvalue
# Store under the "jNMF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["AJCC tumor stage", "jNMF"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per stage and cluster.
clinical_parameter = clinical_parameter.to_frame()
clinical_parameter["preds_jnmf"] = clinical_data["preds_jnmf"]
sns.countplot(data=clinical_parameter, x=clinical_label, hue="preds_jnmf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [78]:
# Association between jNMF clusters and histologic grade (chi-square test).
clinical_label = "Neoplasm Histologic Grade"
# Keep the last character of each grade code and parse it as an integer;
# samples with a missing grade are dropped.
clinical_parameter = clinical_data[clinical_label].str[-1].astype(float).dropna().astype(int)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_jnmf"])
# Left panel: grade distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Reuse the contingency table instead of computing it a second time.
pval = chi2_contingency(crosstab).pvalue
# Store under the "jNMF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["AJCC neoplasm histologic grade", "jNMF"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per grade and cluster.
clinical_parameter = clinical_parameter.to_frame()
clinical_parameter["preds_jnmf"] = clinical_data["preds_jnmf"]
sns.countplot(data=clinical_parameter, x=clinical_label, hue="preds_jnmf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [79]:
# Association between jNMF clusters and sex (chi-square test).
clinical_label = "Sex"
clinical_parameter = clinical_data[clinical_label]
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_jnmf"])
# Left panel: sex distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Test the contingency table directly. The original dummy-encoded Sex first
# (get_dummies + drop_first), which silently counts samples with a missing
# value as the first category; for a complete two-level column the p-value
# is the same. The duplicated crosstab and an unused helper frame were removed.
pval = chi2_contingency(crosstab).pvalue
# Store under the "jNMF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["Sex", "jNMF"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per sex and cluster.
sns.countplot(data=clinical_data, x=clinical_label, hue="preds_jnmf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [80]:
# Association between jNMF clusters and metastasis stage (chi-square test).
clinical_label = "American Joint Committee on Cancer Metastasis Stage Code"
clinical_parameter = clinical_data[clinical_label].str[-1]
# Codes ending in "X" are excluded before the numeric conversion;
# samples with a missing code are dropped.
clinical_parameter = clinical_parameter[clinical_parameter != "X"]
clinical_parameter = clinical_parameter.astype(float).dropna().astype(int)
clinical_label = "AJN Cancer Metastasis"
clinical_parameter = clinical_parameter.rename(clinical_label)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_jnmf"])
# Left panel: stage distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Reuse the contingency table instead of computing it a second time.
pval = chi2_contingency(crosstab).pvalue
# Store under the "jNMF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["AJCC metastasis stage", "jNMF"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per stage and cluster. The target axes is
# passed explicitly (the original relied on pyplot's "current axes"), and the
# second, unrounded p-value title was dropped to match the sibling cells.
clinical_parameter = clinical_parameter.to_frame()
clinical_parameter["preds_jnmf"] = clinical_data["preds_jnmf"]
sns.countplot(data=clinical_parameter, x=clinical_label, hue="preds_jnmf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [81]:
# Difference in diagnosis age between jNMF clusters (Kruskal-Wallis test).
clinical_label = "Diagnosis Age"
# Pair every non-missing age with its cluster label so the grouping is aligned
# on the sample index. The original indexed the NaN-filtered ages with a
# boolean mask built from the full table, relying on implicit re-alignment.
clinical_parameter = clinical_data[[clinical_label, "preds_jnmf"]].dropna(subset=[clinical_label])
pval = kruskal(*[ages for _, ages in clinical_parameter.groupby("preds_jnmf")[clinical_label]]).pvalue
# Store under the "jNMF" column declared by the results table; the original
# wrote to a shared "HC" column that every section overwrote.
clustering_statistical_table.loc["Diagnosis age", "jNMF"] = pval
ax = clinical_parameter.boxplot(column=clinical_label, by="preds_jnmf", grid=False)
# The text position is in data coordinates (x=1.3, age=50).
_ = ax.text(1.3, 50, f"p-value= {round(pval, 3)}")

In [14]:
# Similarity Network Fusion (SNF) on the UNTRANSFORMED views (`Xs`), followed
# by spectral clustering of the fused affinity matrix.
# NOTE(review): this cell carries execution counts lower than the cells above
# it, and it overwrites clinical_data["preds_snf"] produced by the SNF run on
# `transformed_Xs`. Confirm whether this section should precede the
# transformed analysis or use its own prediction column.
from snf import compute
from sklearn import cluster

affinities = compute.make_affinity(Xs, metric="euclidean")
fused = compute.snf(affinities)
# Use the first value returned by get_n_clusters as the number of clusters.
# NOTE(review): presumably the best-ranked estimate — confirm in the snfpy docs.
n_clusters_snf = compute.get_n_clusters(fused)[0]
# A fixed random_state makes the cluster labels (and every p-value derived
# from them) reproducible across runs; the original call was unseeded.
preds_snf = cluster.spectral_clustering(fused, n_clusters=n_clusters_snf, random_state=0)
clinical_data["preds_snf"] = preds_snf
# Bar chart of cluster sizes. The original also passed x="preds_snf", which
# Series.plot ignores, so it is omitted here.
ax = clinical_data["preds_snf"].value_counts().plot(
    kind="bar", title="Count of samples in clusters", ylabel="Number of samples"
)
for container in ax.containers:
    ax.bar_label(container)

In [15]:
# Multivariate log-rank test of overall-survival durations across the clusters
# currently stored in clinical_data["preds_snf"].
# NOTE(review): event_observed is not supplied, so lifelines treats every
# sample as an observed event (no censoring) — confirm this is intended.
logrank_test = multivariate_logrank_test(
    event_durations=clinical_data["Overall Survival (Months)"],
    groups=clinical_data["preds_snf"],
)
logrank_test

In [18]:
# Kaplan-Meier curves per cluster for the SNF run on the untransformed views,
# annotated with the p-value of the log-rank test held in `logrank_test`.
# NOTE(review): no event_observed is passed to KaplanMeierFitter.fit, so every
# duration is treated as an observed event (no censoring) — confirm.
fig, ax = plt.subplots()
ax.set_title("Survival plot")
ax.text(0, 0, f"p-value= {round(logrank_test.p_value, 3)}")
# The original wrote to "HC", a column shared by every section, so results
# overwrote each other. This run gets its own "SNF (raw)" column, kept apart
# from the table's declared "SNF" column because it uses the untransformed
# views. NOTE(review): confirm the column label.
clustering_statistical_table.loc["Overall Survival", "SNF (raw)"] = logrank_test.p_value
kmfs = []
# `cluster_id`, not `cluster`: the latter would shadow the sklearn `cluster`
# module and break a re-run of the clustering cell.
for cluster_id in sorted(clinical_data["preds_snf"].unique()):
    duration = clinical_data.loc[clinical_data["preds_snf"] == cluster_id, "Overall Survival (Months)"]
    kmf = KaplanMeierFitter().fit(duration, label=str(cluster_id))
    kmfs.append(kmf)
    ax = kmf.plot(ax=ax)

add_at_risk_counts(*kmfs, ax=ax)
plt.tight_layout()

In [19]:
# Association between the raw-data SNF clusters and tumor stage (chi-square test).
clinical_label = "American Joint Committee on Cancer Tumor Stage Code"
# Keep the last character of each stage code and parse it as an integer;
# samples with a missing code are dropped.
clinical_parameter = clinical_data[clinical_label].str[-1].astype(float).dropna().astype(int)
clinical_label = "AJN Tumor stage"
clinical_parameter = clinical_parameter.rename(clinical_label)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_snf"])
# Left panel: stage distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Reuse the contingency table instead of computing it a second time.
pval = chi2_contingency(crosstab).pvalue
# The original wrote to a shared "HC" column that every section overwrote.
# This run gets its own "SNF (raw)" column, kept apart from the table's
# declared "SNF" column. NOTE(review): confirm the column label.
clustering_statistical_table.loc["AJCC tumor stage", "SNF (raw)"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per stage and cluster.
clinical_parameter = clinical_parameter.to_frame()
clinical_parameter["preds_snf"] = clinical_data["preds_snf"]
sns.countplot(data=clinical_parameter, x=clinical_label, hue="preds_snf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [20]:
# Association between the raw-data SNF clusters and histologic grade (chi-square test).
clinical_label = "Neoplasm Histologic Grade"
# Keep the last character of each grade code and parse it as an integer;
# samples with a missing grade are dropped.
clinical_parameter = clinical_data[clinical_label].str[-1].astype(float).dropna().astype(int)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_snf"])
# Left panel: grade distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Reuse the contingency table instead of computing it a second time.
pval = chi2_contingency(crosstab).pvalue
# The original wrote to a shared "HC" column that every section overwrote.
# This run gets its own "SNF (raw)" column, kept apart from the table's
# declared "SNF" column. NOTE(review): confirm the column label.
clustering_statistical_table.loc["AJCC neoplasm histologic grade", "SNF (raw)"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per grade and cluster.
clinical_parameter = clinical_parameter.to_frame()
clinical_parameter["preds_snf"] = clinical_data["preds_snf"]
sns.countplot(data=clinical_parameter, x=clinical_label, hue="preds_snf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [21]:
# Association between the raw-data SNF clusters and sex (chi-square test).
clinical_label = "Sex"
clinical_parameter = clinical_data[clinical_label]
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_snf"])
# Left panel: sex distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Test the contingency table directly. The original dummy-encoded Sex first
# (get_dummies + drop_first), which silently counts samples with a missing
# value as the first category; for a complete two-level column the p-value
# is the same. The duplicated crosstab and an unused helper frame were removed.
pval = chi2_contingency(crosstab).pvalue
# The original wrote to a shared "HC" column that every section overwrote.
# This run gets its own "SNF (raw)" column, kept apart from the table's
# declared "SNF" column. NOTE(review): confirm the column label.
clustering_statistical_table.loc["Sex", "SNF (raw)"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per sex and cluster.
sns.countplot(data=clinical_data, x=clinical_label, hue="preds_snf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [22]:
# Association between the raw-data SNF clusters and metastasis stage (chi-square test).
clinical_label = "American Joint Committee on Cancer Metastasis Stage Code"
clinical_parameter = clinical_data[clinical_label].str[-1]
# Codes ending in "X" are excluded before the numeric conversion;
# samples with a missing code are dropped.
clinical_parameter = clinical_parameter[clinical_parameter != "X"]
clinical_parameter = clinical_parameter.astype(float).dropna().astype(int)
clinical_label = "AJN Cancer Metastasis"
clinical_parameter = clinical_parameter.rename(clinical_label)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
crosstab = pd.crosstab(clinical_parameter, clinical_data["preds_snf"])
# Left panel: stage distribution inside each cluster, as a percentage of the cluster.
sns.heatmap(crosstab * 100 / crosstab.sum(0), annot=True, fmt=".1f", ax=axes[0])
# Reuse the contingency table instead of computing it a second time.
pval = chi2_contingency(crosstab).pvalue
# The original wrote to a shared "HC" column that every section overwrote.
# This run gets its own "SNF (raw)" column, kept apart from the table's
# declared "SNF" column. NOTE(review): confirm the column label.
clustering_statistical_table.loc["AJCC metastasis stage", "SNF (raw)"] = pval
axes[0].set_title(f"p-value= {round(pval, 3)}", fontsize=10)
# Right panel: raw sample counts per stage and cluster. The target axes is
# passed explicitly (the original relied on pyplot's "current axes"), and the
# second, unrounded p-value title was dropped to match the sibling cells.
clinical_parameter = clinical_parameter.to_frame()
clinical_parameter["preds_snf"] = clinical_data["preds_snf"]
sns.countplot(data=clinical_parameter, x=clinical_label, hue="preds_snf", ax=axes[1])
for container in axes[1].containers:
    axes[1].bar_label(container)

In [23]:
# Difference in diagnosis age between the raw-data SNF clusters (Kruskal-Wallis test).
clinical_label = "Diagnosis Age"
# Pair every non-missing age with its cluster label so the grouping is aligned
# on the sample index. The original indexed the NaN-filtered ages with a
# boolean mask built from the full table, relying on implicit re-alignment.
clinical_parameter = clinical_data[[clinical_label, "preds_snf"]].dropna(subset=[clinical_label])
pval = kruskal(*[ages for _, ages in clinical_parameter.groupby("preds_snf")[clinical_label]]).pvalue
# The original wrote to a shared "HC" column that every section overwrote.
# This run gets its own "SNF (raw)" column, kept apart from the table's
# declared "SNF" column. NOTE(review): confirm the column label.
clustering_statistical_table.loc["Diagnosis age", "SNF (raw)"] = pval
ax = clinical_parameter.boxplot(column=clinical_label, by="preds_snf", grid=False)
# The text position is in data coordinates (x=1.3, age=50).
_ = ax.text(1.3, 50, f"p-value= {round(pval, 3)}")