In [None]:
pip install git+https://github.com/theislab/cpa

In [None]:
import cpa
import scanpy as sc
import pandas as pd
import seaborn as sns
import scipy
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

In [None]:
# Plot resolution used by scanpy figures throughout the notebook.
FIGURE_DPI = 100
sc.settings.set_figure_params(dpi=FIGURE_DPI)

In [None]:
# Location of the raw scAtlas vascular-cell AnnData file.
RAW_H5AD_PATH = "/work/CPA_Healthy_hamstring/new_data_raw_fix/scAtlas_Human_vascular_cells_processed_RAW_1.h5ad"

# Load the full dataset; every later subset is derived from this object.
adata = sc.read(RAW_H5AD_PATH)

In [None]:
# Display the AnnData summary of the freshly loaded dataset.
adata

In [None]:
# Number of cells per (sex, cell_type) combination in the full dataset.
adata.obs.groupby(['sex', 'cell_type']).size()

In [None]:
# Leave-one-out split: male endothelial cells are held out, everything else is
# used for training.
is_male = adata.obs['sex'] == 'male'
is_endothelial = adata.obs['cell_type'] == 'endothelial cell'
subset_mask = is_male & is_endothelial

# Independent copies so later edits to one object do not touch the other.
subset_adata = adata[subset_mask].copy()
train_adata = adata[~subset_mask].copy()

In [None]:
# Display the AnnData summary of the training split.
train_adata

In [None]:
# Show the `sex` column of the held-out subset; given the mask used to build it,
# every entry should be 'male'.
subset_adata.obs["sex"]

In [None]:
# Number of cells per (sex, cell_type) combination in the training split; the
# ('male', 'endothelial cell') group should be empty because it was held out.
train_adata.obs.groupby(['sex', 'cell_type']).size()

In [None]:
# Snapshot the expression matrix into a 'counts' layer on the full `adata`.
# NOTE(review): `train_adata` and `subset_adata` were copied from `adata` in an
# earlier cell, so they do not receive this layer, and no visible cell reads
# `layers['counts']` -- confirm whether this should run before the split or be dropped.
adata.layers['counts'] = adata.X.copy()

In [None]:
# Register the training split with CPA: `sex` is the perturbation, 'female' is the
# control group, and no dosage or batch key is used.
# NOTE(review): 'sex' is passed both as `perturbation_key` and inside
# `categorical_covariate_keys` -- confirm that encoding it twice is intended.
setup_kwargs = dict(
    perturbation_key='sex',
    dosage_key=None,
    control_group='female',
    batch_key=None,
    is_count_data=True,
    categorical_covariate_keys=[
        'cell_type',
        'self_reported_ethnicity',
        'bmi_group',
        'donor_id',
        'surgery',
        'fat_type',
        'tissue',
        'sex',
    ],
    deg_uns_key=None,
    deg_uns_cat_key=None,
    max_comb_len=2,
)
cpa.CPA.setup_anndata(train_adata, **setup_kwargs)

In [None]:
# Architecture / likelihood settings forwarded to the `cpa.CPA` constructor.
model_params = dict(
    n_latent=64,
    recon_loss="nb",
    doser_type="linear",
    # Encoder
    n_hidden_encoder=128,
    n_layers_encoder=2,
    # Decoder
    n_hidden_decoder=512,
    n_layers_decoder=2,
    # Normalisation and dropout
    use_batch_norm_encoder=True,
    use_layer_norm_encoder=False,
    use_batch_norm_decoder=False,
    use_layer_norm_decoder=True,
    dropout_rate_encoder=0.0,
    dropout_rate_decoder=0.1,
    variational=False,
    seed=42,
)

# Training-plan settings forwarded to `model.train(plan_kwargs=...)`.
trainer_params = dict(
    # Warm-up / pre-training schedule
    n_epochs_kl_warmup=None,
    n_epochs_pretrain_ae=30,
    n_epochs_adv_warmup=50,
    n_epochs_mixup_warmup=0,
    mixup_alpha=0.0,
    # Adversarial classifier
    adv_steps=None,
    n_hidden_adv=64,
    n_layers_adv=3,
    use_batch_norm_adv=True,
    use_layer_norm_adv=False,
    dropout_rate_adv=0.3,
    reg_adv=20.0,
    pen_adv=5.0,
    # Optimiser settings
    lr=0.0003,
    wd=4e-07,
    adv_lr=0.0003,
    adv_wd=4e-07,
    adv_loss="cce",
    doser_lr=0.0003,
    doser_wd=4e-07,
    do_clip_grad=True,
    gradient_clip_value=1.0,
    step_size_lr=10,
)

In [None]:
# Build the CPA model on the training split with the architecture defined in `model_params`.
model = cpa.CPA(adata=train_adata, **model_params)

In [None]:
# Training run configuration; `trainer_params` is forwarded as the training plan.
# NOTE(review): `save_path` is not the directory that the later `load` cell reads
# from -- confirm which checkpoint the downstream analysis is meant to use.
train_kwargs = dict(
    max_epochs=400,
    use_gpu=True,
    batch_size=512,
    plan_kwargs=trainer_params,
    early_stopping_patience=50,
    check_val_every_n_epoch=50,
    save_path='CPA_project/Results_scAtlas_LOO_smooth_muscle_2000',
)
model.train(**train_kwargs)

In [None]:
# Plot the training history recorded on the model.
cpa.pl.plot_history(model)

In [None]:
# Restore a previously trained CPA model from disk and attach it to `train_adata`.
# `load` is called on the class rather than on the existing `model` instance, so this
# cell no longer depends on the training cell having been run in the current kernel.
# NOTE(review): this directory differs from the `save_path` given to `model.train(...)`,
# so everything downstream uses this checkpoint, not the model trained above -- confirm.
model = cpa.CPA.load(
    "/work/scAtlas_runs/CPA_ScAtlasdata/CPA_project/Results_scAtlas_LOO_Final_raw",
    adata=train_adata,
)

In [None]:
# Display the loaded model's summary.
model

In [None]:
# Batch size used when encoding the training split.
LATENT_BATCH_SIZE = 1024

# Encode the training cells; the result is indexed by key in the following cells.
latent_outputs = model.get_latent_representation(train_adata, batch_size=LATENT_BATCH_SIZE)

In [None]:
# List which latent representations were returned.
latent_outputs.keys()

In [None]:
# Set scanpy's logging verbosity to 3 for the remaining cells.
sc.settings.verbosity = 3

In [None]:
# Pull the two latent representations out of the result by key.
latent_basal_adata, latent_adata = (
    latent_outputs['latent_basal'],
    latent_outputs['latent_after'],
)

In [None]:
# Keep a copy of the expression matrix on the full `adata` under 'X_true'.
# NOTE(review): no visible cell reads `adata.layers['X_true']`, and the train/held-out
# copies were taken earlier so they do not carry it -- confirm it is still needed.
adata.layers['X_true'] = adata.X.copy()

In [None]:
# Female endothelial cells from the training split: the source population for both
# the baseline reconstruction and the female->male counterfactual.
is_female_endothelial = (
    (train_adata.obs["sex"] == "female")
    & (train_adata.obs["cell_type"] == "endothelial cell")
)
female_endothelial = train_adata[is_female_endothelial].copy()

In [None]:
# 5) Baseline reconstruction (female -> female): predict on the unmodified
#    female endothelial cells. Later cells read the prediction from
#    `female_endothelial.obsm["CPA_pred"]`.
baseline_adata = model.predict(
    adata=female_endothelial,
    batch_size=256,
    return_mean=True,
)

In [None]:
# Display whatever `model.predict` returned for the baseline run.
baseline_adata

In [None]:
# 6) Counterfactual (female -> male): same cells, with `sex` relabelled to 'male'.
# NOTE(review): confirm that editing `obs["sex"]` after `setup_anndata` actually
# changes the perturbation encoding that `model.predict` uses for these cells.
cf = female_endothelial.copy()

# Keep the observed expression next to the prediction for later comparison.
cf.obsm["X_true"] = cf.X.copy()

# Relabel every cell as male.
cf.obs["sex"] = "male"

In [None]:
# Run prediction on the relabelled copy; later cells read the result from
# cf.obsm['CPA_pred'] rather than from `cf_predict`.
cf_predict = model.predict(adata=cf, batch_size=256,return_mean=True)
#    cf.obsm['CPA_pred'] now holds the "what-if male" prediction for these cells.

In [None]:
def _mean_expression(adata_obj):
    """Return the per-gene mean of ``adata_obj.X`` as a flat 1-D NumPy array.

    ``X.mean(axis=0)`` is taken directly, so a sparse matrix is never densified
    (the previous version called ``toarray()`` on the whole matrix first).
    ``np.asarray(...).ravel()`` flattens the 1 x n_genes ``np.matrix`` that SciPy
    sparse matrices return, and is a no-op for a dense ``ndarray`` result.
    """
    return np.asarray(adata_obj.X.mean(axis=0)).ravel()


# Per-gene ground-truth means.
# Female ground truth: the observed female endothelial cells from the training split.
female_gt = _mean_expression(female_endothelial)

# Male ground truth: the held-out male endothelial cells.
male_gt = _mean_expression(subset_adata)


In [None]:
# Per-gene mean of the predictions stored in `obsm["CPA_pred"]`:
# baseline from the unmodified female cells, counterfactual from the relabelled copy.
baseline_pred, cf_pred = (
    predicted.obsm["CPA_pred"].mean(axis=0)
    for predicted in (female_endothelial, cf)
)

In [None]:
# Inspect the raw baseline (female -> female) prediction matrix.
female_endothelial.obsm["CPA_pred"]

In [None]:
# Inspect the raw counterfactual (female -> male) prediction matrix.
cf.obsm["CPA_pred"]

In [None]:
# Inspect `.X` of the object returned by the baseline `model.predict` call.
# NOTE(review): the other cells read predictions from `obsm["CPA_pred"]`; if
# `predict()` returns nothing, `baseline_adata` is None and this cell raises
# AttributeError -- confirm, and drop the cell if so.
baseline_adata.X

In [None]:
# Per-gene mean of the baseline prediction.
baseline_pred

In [None]:
# Per-gene mean of the counterfactual prediction.
cf_pred

In [None]:
def _rmse(predicted, truth):
    """Root-mean-squared error between two per-gene mean vectors."""
    return np.sqrt(np.mean((predicted - truth) ** 2))


# Naming: <prediction>_<ground truth>, with f = female and m = male.
f_m = _rmse(baseline_pred, male_gt)    # predicted female vs. true male
m_m = _rmse(cf_pred, male_gt)          # predicted male (counterfactual) vs. true male
f_f = _rmse(baseline_pred, female_gt)  # predicted female vs. true female
m_f = _rmse(cf_pred, female_gt)        # predicted male (counterfactual) vs. true female

print(f"RMSE pred F --- TM: {f_m:.4f}")
print(f"RMSE pred M --- TM (counterfactual)  : {m_m:.4f}")
print(f"RMSE pred F ----- TF : {f_f:.4f}")
print(f"RMSE pred M ---- TF (counterfactual)  : {m_f:.4f}")

In [None]:
# Render the RMSE values as a 2x2 table (rows = ground truth, columns = prediction).
# The cells are filled from the RMSE variables computed in the previous cell
# (`f_f`, `m_f`, `f_m`, `m_m`) instead of hand-copied numbers, so the table cannot
# go stale when the model or data change.
rmse_data = [
    [f"{f_f:.4f}", f"{m_f:.4f}"],  # True Female
    [f"{f_m:.4f}", f"{m_m:.4f}"],  # True Male
]

# Define labels
column_labels = ["Pred Female", "Pred Male (CF)"]
row_labels = ["True Female", "True Male"]

# Create the figure and axis
fig, ax = plt.subplots(figsize=(6, 2))
ax.axis('tight')
ax.axis('off')

# Create the table
table = ax.table(
    cellText=rmse_data,
    rowLabels=row_labels,
    colLabels=column_labels,
    cellLoc='center',
    loc='center'
)

table.scale(1, 2)  # Increase row height
table.auto_set_font_size(False)
table.set_fontsize(12)

# Colour the "True Male" row.
# Table cells are keyed (row, col): row 0 is the column-header row, so data rows
# start at 1; data columns start at 0 (the row-label column is -1).
table[(2, 0)].set_facecolor("orange")  # True Male, Pred Female
table[(2, 1)].set_facecolor("lightblue")  # True Male, Pred Male

plt.title("RMSE between Predictions and Ground truth", pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Compare predicted vs. observed per-gene means for the held-out male endothelial
# cells, on a log1p scale.
log_male_gt = np.log1p(male_gt)
log_cf_pred = np.log1p(cf_pred)
log_baseline_pred = np.log1p(baseline_pred)

fig, ax = plt.subplots(figsize=(10, 6))

# Each series gets a label so that `ax.legend` has something to show
# (previously the legend was requested with no labelled artists).
sns.scatterplot(
    x=log_male_gt, y=log_cf_pred,
    alpha=0.7, ax=ax,
    label="Counterfactual prediction (female -> male)",
)

sns.scatterplot(
    x=log_male_gt, y=log_baseline_pred,
    alpha=0.7, ax=ax,
    label="Baseline prediction (female -> female)",
)

# Identity line (in log space). The bounds span both axes, i.e. the ground truth and
# both prediction vectors; previously only the ground-truth range was used (the same
# array was passed twice), so the line could stop short of the plotted points.
mn = min(log_male_gt.min(), log_cf_pred.min(), log_baseline_pred.min())
mx = max(log_male_gt.max(), log_cf_pred.max(), log_baseline_pred.max())
ax.plot([mn, mx], [mn, mx], ls="--", color="red")


ax.set_xlabel("Log Ground Truth (Male)")
ax.set_ylabel("Log Predicted Mean Expression")
ax.set_title("Leave One Out analysis" )
ax.legend(loc="upper left")
ax.grid(False)
plt.tight_layout()
plt.show()
