# 2024-05-25-Preprocessing: Generating scGPT embeddings using the pretrained scGPT model

In [None]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import mode
import scanpy as sc
import sklearn
import warnings

import scgpt as scg

In [None]:
# Root directory for this notebook: the pretrained model directory and every
# input/output .h5ad path below are built from it.
data_cache_dir = '../perturbench_data' ## Change this to your local data directory

In [None]:
# The whole-human pretrained scGPT model is downloaded from the scGPT model zoo:
# https://github.com/bowang-lab/scGPT?tab=readme-ov-file#pretrained-scgpt-model-zoo

# Directory passed to scg.tasks.embed_data as the model location; it is expected
# to contain the downloaded checkpoint (nothing in the cells shown here fetches it).
model_dir = Path(f"{data_cache_dir}/pretrained_models/scGPT_human")


### norman19

In [8]:
# Input file: the processed norman19 dataset under data_cache_dir.
datapath = f'{data_cache_dir}/norman19_processed.h5ad'

In [24]:
# Load the processed dataset as an AnnData object.
adata = sc.read_h5ad(datapath)

In [11]:
# Work on a copy so that swapping .X for the embedding step leaves `adata` untouched.
adata_copy = adata.copy()

In [12]:
# Make the 'counts' layer the copy's main matrix; the copy is what gets passed to
# scg.tasks.embed_data.
# NOTE(review): 'counts' is presumably the raw count matrix — confirm against the
# upstream preprocessing.
adata_copy.X = adata_copy.layers['counts']

In [None]:
# Embed the cells of the copy with the pretrained scGPT model stored in model_dir.
adata_copy_embeddings = scg.tasks.embed_data(
    adata_copy,
    model_dir,
    gene_col='gene_symbol',  # presumably the var column scGPT reads gene names from — confirm in scGPT docs
    batch_size=128,
    return_new_adata=True,  # the result is a separate object whose .X is read as the embedding matrix
)

In [20]:
# Sanity check: shape of the embedding matrix; the first dimension should equal
# the number of cells in `adata`.
adata_copy_embeddings.X.shape

(91168, 512)

In [25]:
# Attach the embeddings to the original object under obsm.
# NOTE(review): the key is spelled 'scgpt_embbeddings' (double 'b'). It is saved
# into the output file as-is, so anything reading that file must use the same
# spelling — check downstream consumers before renaming.
# Assumes embed_data returns rows in the same cell order as `adata` — TODO confirm.
adata.obsm['scgpt_embbeddings'] = adata_copy_embeddings.X

In [26]:
# Display the AnnData summary after attaching the embeddings.
adata

AnnData object with n_obs × n_vars = 91168 × 5575
    obs: 'orig.ident', 'ncounts', 'ngenes', 'cell_barcode', 'guide_identity', 'read_count', 'UMI_count', 'coverage', 'gemgroup', 'good_coverage', 'number_of_cells', 'percent_mito', 'batch', 'condition', 'dose_val', 'control', 'split1', 'split2', 'split3', 'split4', 'split5', 'perturbation', 'perturbation_type', 'dataset', 'cell_type', 'treatment', 'ood_split', 'ood_split_0', 'ood_split_1', 'ood_split_2', 'ood_split_3', 'dose', 'perturbation_raw', 'cov_merged', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'cov_drug_dose_name'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'hvg', 'gene_symbol', 'n_cells', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_coun

In [29]:
# Output file for norman19 with the scGPT embeddings attached.
# NOTE(review): the stem is 'norman19_preprocessed…' while the input file is
# 'norman19_processed.h5ad'; confirm what downstream code expects before renaming.
outfile = f'{data_cache_dir}/norman19_preprocessed_with_embeddings.h5ad'

In [30]:
# Save the AnnData, now including the obsm embeddings, to `outfile`.
adata.write_h5ad(outfile)

### mcfaline23

In [3]:
# Input file: the processed mcfaline23 (gxe) dataset under data_cache_dir.
datapath = f'{data_cache_dir}/mcfaline23_gxe_processed.h5ad'

In [4]:
# Load the processed dataset as an AnnData object (rebinds `adata`).
adata = sc.read_h5ad(datapath)

In [5]:
# Inspect the loaded AnnData (dimensions, obs/var columns, layers) before embedding.
adata

AnnData object with n_obs × n_vars = 878229 × 15009
    obs: 'orig.ident', 'ncounts', 'ngenes', 'cell', 'sample', 'Size_Factor', 'n.umi', 'PCR_plate', 'new_cell', 'dose', 'treatment', 'gRNA_id', 'gene_id', 'guide_number', 'cell_type', 'drug_dose', 'perturbation_type', 'dataset', 'gene_dose', 'perturbation', 'pert_cl_tr', 'condition', 'condition_plus_treatment', 'cov_merged', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'control', 'dose_val', 'cov_drug_dose_name'
    var: 'ensembl_id', 'n_cells', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'hvg', 'log1p', 'rank_genes_groups_cov'
    layers: 'counts'

In [6]:
# Work on a copy so that swapping .X for the embedding step leaves `adata` untouched.
adata_copy = adata.copy()

In [7]:
# Make the 'counts' layer the copy's main matrix; the copy is what gets passed to
# scg.tasks.embed_data.
# NOTE(review): 'counts' is presumably the raw count matrix — confirm against the
# upstream preprocessing.
adata_copy.X = adata_copy.layers['counts']

In [None]:
# Embed the cells of the copy with the pretrained scGPT model stored in model_dir.
adata_copy_embeddings = scg.tasks.embed_data(
    adata_copy,
    model_dir,
    # This dataset's var has no 'gene_symbol' column, so 'index' is passed instead.
    # Presumably this makes scGPT use the var index as gene names — confirm in the
    # scGPT docs, and that the index holds symbols present in the model vocabulary.
    gene_col='index',
    batch_size=128,
    return_new_adata=True,  # the result is a separate object whose .X is read as the embedding matrix
)

In [9]:
# Attach the embeddings to the original object under obsm.
# NOTE(review): the key is spelled 'scgpt_embbeddings' (double 'b'). It is saved
# into the output file as-is, so anything reading that file must use the same
# spelling — check downstream consumers before renaming.
# Assumes embed_data returns rows in the same cell order as `adata` — TODO confirm.
adata.obsm['scgpt_embbeddings'] = adata_copy_embeddings.X

In [10]:
# Confirm the embeddings are attached: the summary should now list
# 'scgpt_embbeddings' under obsm.
adata

AnnData object with n_obs × n_vars = 878229 × 15009
    obs: 'orig.ident', 'ncounts', 'ngenes', 'cell', 'sample', 'Size_Factor', 'n.umi', 'PCR_plate', 'new_cell', 'dose', 'treatment', 'gRNA_id', 'gene_id', 'guide_number', 'cell_type', 'drug_dose', 'perturbation_type', 'dataset', 'gene_dose', 'perturbation', 'pert_cl_tr', 'condition', 'condition_plus_treatment', 'cov_merged', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'control', 'dose_val', 'cov_drug_dose_name'
    var: 'ensembl_id', 'n_cells', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'hvg', 'log1p', 'rank_genes_groups_cov'
    obsm: 'scgpt_embbeddings'
    layers: 'counts'

In [11]:
# Output file for mcfaline23 with the scGPT embeddings attached
# (input file name plus a '_with_embeddings' suffix).
outfile = f'{data_cache_dir}/mcfaline23_gxe_processed_with_embeddings.h5ad'

In [12]:
# Save the AnnData, now including the obsm embeddings, to `outfile`.
adata.write_h5ad(outfile)

### srivatsan20

In [13]:
# Input file: the processed srivatsan20 dataset under data_cache_dir.
datapath = f'{data_cache_dir}/srivatsan20_processed.h5ad'

In [14]:
# Load the processed dataset as an AnnData object (rebinds `adata`).
adata = sc.read_h5ad(datapath)

In [15]:
# Inspect the loaded AnnData (dimensions, obs/var columns) before embedding.
adata

AnnData object with n_obs × n_vars = 178213 × 8630
    obs: 'ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'chembl-ID', 'dataset', 'cell_type', 'treatment', 'condition', 'dose', 'perturbation_raw', 'pert_cell_type', 'ood_split', 'cov_merged', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'control', 'dose_val', 'cov_drug_dose_name', '_scvi_cell_type'
    var: 'ensembl_id', 'ncounts', 'ncells', 'gene_symbol', 'n_cells', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'highly_variable_rank', 'm

In [16]:
# Work on a copy so that swapping .X for the embedding step leaves `adata` untouched.
adata_copy = adata.copy()

In [17]:
# Make the 'counts' layer the copy's main matrix; the copy is what gets passed to
# scg.tasks.embed_data.
# NOTE(review): 'counts' is presumably the raw count matrix — confirm against the
# upstream preprocessing.
adata_copy.X = adata_copy.layers['counts']

In [None]:
# Embed the cells of the copy with the pretrained scGPT model stored in model_dir.
adata_copy_embeddings = scg.tasks.embed_data(
    adata_copy,
    model_dir,
    # NOTE(review): this dataset's var has a 'gene_symbol' column (see the AnnData
    # summary printed earlier in this section), yet 'index' is passed here.
    # Presumably this makes scGPT use the var index as gene names — confirm the
    # index holds symbols present in the model vocabulary.
    gene_col='index',
    batch_size=128,
    return_new_adata=True,  # the result is a separate object whose .X is read as the embedding matrix
)

In [19]:
# Attach the embeddings to the original object under obsm.
# NOTE(review): the key is spelled 'scgpt_embbeddings' (double 'b'). It is saved
# into the output file as-is, so anything reading that file must use the same
# spelling — check downstream consumers before renaming.
# Assumes embed_data returns rows in the same cell order as `adata` — TODO confirm.
adata.obsm['scgpt_embbeddings'] = adata_copy_embeddings.X

In [20]:
# Display the AnnData summary after attaching the embeddings.
adata

AnnData object with n_obs × n_vars = 178213 × 8630
    obs: 'ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'chembl-ID', 'dataset', 'cell_type', 'treatment', 'condition', 'dose', 'perturbation_raw', 'pert_cell_type', 'ood_split', 'cov_merged', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'control', 'dose_val', 'cov_drug_dose_name', '_scvi_cell_type'
    var: 'ensembl_id', 'ncounts', 'ncells', 'gene_symbol', 'n_cells', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'highly_variable_rank', 'm

In [21]:
# Output file for srivatsan20 with the scGPT embeddings attached.
# NOTE(review): the stem adds '_highest', which does not appear in the input file
# name 'srivatsan20_processed.h5ad'; confirm what downstream code expects before
# renaming.
outfile = f'{data_cache_dir}/srivatsan20_highest_processed_with_embeddings.h5ad'

In [22]:
# Save the AnnData, now including the obsm embeddings, to `outfile`.
adata.write_h5ad(outfile)