In [None]:
pip install git+https://github.com/theislab/cpa

In [None]:
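# Workaround: run the command below in a terminal (not in this notebook). It deletes
# every line matching `from tkinter import N` from the installed cpa/_model.py
# (presumably a stray import that fails where tkinter is unavailable; confirm).
# sed -i '/from tkinter import N/d' ~/.local/lib/python3.10/site-packages/cpa/_model.py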

In [None]:
# Print the version string of the Python interpreter backing this kernel.
import sys
print(sys.version)


In [None]:
pip install scanpy

In [None]:
import cpa
import scanpy as sc
import pandas as pd
import seaborn as sns
import scipy
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

In [None]:
# Apply scanpy's figure settings with a resolution of 100 dpi.
sc.settings.set_figure_params(dpi=100)

In [None]:
# Load the dataset from an .h5ad file into `adata`.
# NOTE(review): absolute path; adjust it when running in another environment.
adata = sc.read("/work/CPA_Healthy_hamstring/new_data_raw_fix/healthy_hamstring_processed_adata_raw.h5ad")

In [None]:
# Display the summary of the loaded object.
adata

In [None]:
# Split the data set into train and test: 10% of the cells (chosen at random
# with a fixed seed) are labelled "test", all remaining cells "train".
from sklearn.model_selection import train_test_split

split_key = "split"
idx_train, idx_test = train_test_split(adata.obs_names, test_size=0.1, random_state=42)

# Every cell defaults to "train"; only the held-out cells need relabelling.
# NOTE(review): cpa.CPA appears to default to valid_split="test", so these
# held-out cells may be used for validation during training; confirm.
adata.obs[split_key] = "train"
adata.obs.loc[idx_test, split_key] = "test"

In [None]:
# Materialise independent copies of the train and test cells, selected by
# their label in the split column.
train_mask = adata.obs[split_key] == "train"
test_mask = adata.obs[split_key] == "test"
adata_train = adata[train_mask].copy()
adata_test = adata[test_mask].copy()

In [None]:
# Display the summary of the training subset.
adata_train

In [None]:
# Show the donor_id column of the per-cell metadata.
adata.obs["donor_id"]

In [None]:
# Count how many cells carry each split label.
adata.obs['split'].value_counts()

In [None]:
# Replace X with a copy of the "counts" layer.
# Note: adata_train / adata_test were copied from adata before this line, so
# their X is not changed by it.
adata.X = adata.layers["counts"].copy()

In [None]:
# Register `adata` with CPA. donor_id is given as the perturbation column with
# donor MSK0782 as its control group, and X is declared to hold count data.
# NOTE(review): donor_id is passed both as the perturbation key and as a
# categorical covariate; confirm this is intended.
cpa.CPA.setup_anndata(adata,
                      perturbation_key='donor_id',
                      control_group='MSK0782',
                      is_count_data=True,
                      categorical_covariate_keys=["cell_type", "sex", "donor_id"],
                      max_comb_len=1,
                     )

In [None]:
# Display the entry stored under uns['rank_genes_groups'] (presumably the
# output of scanpy's rank_genes_groups; confirm). Raises KeyError if absent.
adata.uns['rank_genes_groups']

In [None]:
# Hyperparameters forwarded to the CPA model constructor.
model_params = dict(
    n_latent=64,
    recon_loss="nb",
    doser_type="linear",
    n_hidden_encoder=128,
    n_layers_encoder=2,
    n_hidden_decoder=512,
    n_layers_decoder=2,
    use_batch_norm_encoder=True,
    use_layer_norm_encoder=False,
    use_batch_norm_decoder=False,
    use_layer_norm_decoder=True,
    dropout_rate_encoder=0.0,
    dropout_rate_decoder=0.1,
    variational=False,
    seed=42,
)

# Settings forwarded to the training plan (passed as plan_kwargs to train).
trainer_params = dict(
    n_epochs_kl_warmup=None,
    n_epochs_pretrain_ae=30,
    n_epochs_adv_warmup=50,
    n_epochs_mixup_warmup=0,
    mixup_alpha=0.0,
    adv_steps=None,
    n_hidden_adv=64,
    n_layers_adv=3,
    use_batch_norm_adv=True,
    use_layer_norm_adv=False,
    dropout_rate_adv=0.3,
    reg_adv=20.0,
    pen_adv=5.0,
    lr=0.0003,
    wd=4e-07,
    adv_lr=0.0003,
    adv_wd=4e-07,
    adv_loss="cce",
    doser_lr=0.0003,
    doser_wd=4e-07,
    do_clip_grad=True,
    gradient_clip_value=1.0,
    step_size_lr=10,
)

In [None]:
# Re-check the number of cells per split label before building the model.
adata.obs['split'].value_counts()

In [None]:
# Build the CPA model on `adata`, using the 'split' column to pick the
# training cells (label 'train') and the hyperparameters in model_params.
# NOTE(review): no valid_split/test_split is passed; check CPA's defaults to
# see how cells labelled 'test' are treated during training.
model = cpa.CPA(adata=adata,
                split_key='split',
                train_split='train',
                **model_params,
               )

In [None]:
# Train for at most 500 epochs with batch size 512, passing trainer_params as
# plan_kwargs. save_path is relative to the current working directory.
# NOTE(review): validation runs every 5 epochs, so a patience of 5 presumably
# means 25 epochs without improvement; confirm against the trainer.
# NOTE(review): use_gpu is deprecated in newer scvi-tools releases; confirm
# the installed version still accepts it.
model.train(max_epochs=500,
            use_gpu=True,
            batch_size=512,
            plan_kwargs=trainer_params,
            early_stopping_patience=5,
            check_val_every_n_epoch=5,
            save_path='CPA_Healthy_hamstring/new_raw_models_fixed_batch_effect3',
           )

In [None]:
# Load a training-history CSV into `df`.
# NOTE(review): this path differs from the save_path passed to model.train,
# and `df` is not used by any visible cell; confirm it is still needed.
df = pd.read_csv("/work/CPA_Healthy_hamstring/CPA_Healthy_hamstring/Results/history.csv")

In [None]:
# Plot the training history recorded on `model`.
cpa.pl.plot_history(model)

In [None]:
# Persist the trained model to disk.
# NOTE(review): re-running presumably fails if the directory already exists
# unless overwrite=True is passed; confirm.
model.save("/work/CPA_Healthy_hamstring/CPA_Healthy_hamstring/HH_removed_batch_effect_3")

In [None]:
# Reload a saved model from disk. `load` is called on the CPA class rather
# than on the existing `model` instance, so this cell also works on a fresh
# kernel where no model has been trained yet.
# NOTE(review): this directory ("HH_removed_batch_effect") is not the one
# written by model.save(...) ("HH_removed_batch_effect_3"), and `cpa_model`
# is not used by the later cells, which keep using `model`. Confirm which
# model the downstream predictions should come from.
cpa_model = cpa.CPA.load("/work/CPA_Healthy_hamstring/CPA_Healthy_hamstring/HH_removed_batch_effect", adata=adata)

In [None]:
# Load another training-history CSV into `df`, overwriting any earlier value.
# NOTE(review): `df` is not used by any visible cell; confirm it is needed.
df = pd.read_csv("/work/CPA_Healthy_hamstring/CPA_Healthy_hamstring/Results_more_covar/history.csv")

In [None]:
# Compute the model's latent representations for all cells in `adata`.
# NOTE(review): `latent_outputs` is not used by any visible later cell.
latent_outputs = model.get_latent_representation(adata, batch_size=1024)

In [None]:
# Display the summary of the held-out test subset.
adata_test

In [None]:
# adata_test was copied from adata before adata.X was replaced with the raw
# "counts" layer, so its X may still hold the original (non-count) values.
# Because it is registered with is_count_data=True, restore the counts from
# its own layer first (a no-op if X already held the counts).
adata_test.X = adata_test.layers["counts"].copy()

# Register the test subset with the same keys used for the full dataset.
# NOTE(review): registering the subset separately assumes it contains the same
# categories as the data the model was built on; confirm.
cpa.CPA.setup_anndata(adata_test,
                      perturbation_key='donor_id',
                      control_group='MSK0782',
                      is_count_data=True,
                      categorical_covariate_keys=["cell_type", "sex", "donor_id"],
                      max_comb_len=1,
                     )

In [None]:
# Predict on the test cells with the cell_type, sex and donor_id covariate
# embeddings added, and with the batch and perturbation embeddings left out
# (add_batch=False, add_pert=False).
# NOTE(review): donor_id is among covars_to_add; if donor is the batch effect
# to be removed, the output may still carry it. Confirm.
output_no_batch = model.custom_predict(adata= adata_test,
                   covars_to_add=["cell_type", "sex", "donor_id"],
                   add_batch=False,
                   add_pert=False,
                   batch_size=2048)


In [None]:
# List the entries returned by custom_predict.
print(output_no_batch.keys())

In [None]:
# Display the test subset summary again after prediction.
adata_test

In [None]:
# Take the X matrix of the "latent_x_pred" entry as the prediction.
CPA_pred = output_no_batch["latent_x_pred"].X


In [None]:
# Display the prediction matrix.
CPA_pred

In [None]:
# Get the prediction matrix and convert it to a dense array if it exposes
# toarray() (i.e. it is stored as a sparse matrix).
CPA_pred = output_no_batch["latent_x_pred"].X
if hasattr(CPA_pred, "toarray"):
    CPA_pred = CPA_pred.toarray()

# Add prediction to adata_test under obsm["CPA_pred"]
adata_test.obsm["CPA_pred"] = CPA_pred

# Save to file (the path is relative to the current working directory)
adata_test.write("adata_HH_with_CPA_pred_removed_batch.h5ad")
