Preface
-------

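I'm running this on my Windows laptop because I don't have a machine with CUDA at hand. To make that work, I'm limiting how much data is processed: only part of the count matrix is read, and only a small subset of the single cells is analysed.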

In [1]:
# Execute task1.ipynb before anything else. The names used further down that this
# notebook never defines itself (pd, ad, GeneformerConfig, Geneformer, embedData,
# perturbGene) are presumably provided by that notebook — TODO confirm.
%run task1.ipynb

INFO:datasets:PyTorch version 2.6.0 available.


Define directories and data files
---------------------------------

In [2]:
import os

# Root folder that holds the input data, relative to the notebook's working directory.
dataDir = "helical"

# The paths are assembled with os.path.join rather than hard-coded "\" separators:
# sequences such as "\G", "\p" or "\g" are invalid string escapes (Python warns about
# them) and backslash-separated paths only resolve on Windows. On Windows the
# resulting strings are identical to the previous literals.

# File with cell annotation
cellAnnFile = os.path.join(dataDir, "GSE144236", "patient_metadata_new.txt")
# File with count matrix
countsFile = os.path.join(dataDir, "GSE144236", "merge10pts_counts.txt")
# Gencode annotation file
gencodeFile = os.path.join(dataDir, "gencode.v47.annotation.gtf")

Process Gencode gene annotation
-------------------------------

In [3]:
# Build a gene-symbol -> Ensembl-ID lookup table (`gencode`) from the Gencode GTF.
# Result: a DataFrame with the columns "gene_id" and "gene_name".

# Tab-separated file without a header row; the first 5 lines are skipped
# (presumably the "##" header lines of the GTF — TODO confirm for other releases).
gtf = pd.read_csv(gencodeFile, delimiter="\t", skiprows=5, header=None)

# Keep only gene entries: column 2 holds the feature type, column 8 the attribute string
isGene = gtf.iloc[:, 2] == "gene"
geneInfo = gtf.loc[isGene].iloc[:, 8]

# Extract Ensembl ID and gene symbol into a fresh frame. Building a new DataFrame
# (instead of assigning columns on a filtered slice) avoids pandas'
# SettingWithCopy warnings.
gencode = pd.DataFrame(
    {
        # \w+ stops at the first non-word character, so a trailing ".<version>"
        # suffix is not part of the captured ID.
        "gene_id": geneInfo.str.extract(r'gene_id "(\w+)', expand=False),
        # Capture everything up to the closing quote. The previous pattern (\w+)"
        # yielded NaN for every symbol containing a non-word character
        # such as "-" or "." (e.g. HLA-A, MT-ND1).
        "gene_name": geneInfo.str.extract(r'gene_name "([^"]+)"', expand=False),
    }
)

# Drop exact duplicates, then keep one row (the first) per gene symbol: the table is
# joined on "gene_name" later, and several IDs per symbol would multiply rows there.
gencode = gencode.drop_duplicates().drop_duplicates(subset="gene_name", keep="first")

In [4]:
# Cell annotation table (tab-separated, first line is the header)
cells = pd.read_table(cellAnnFile)
# Count matrix (tab-separated). Only the first 1000 data rows are loaded,
# which keeps the notebook runnable on a laptop.
counts = pd.read_table(countsFile, nrows=1000)

### Create the anndata object

In [5]:
# Build the AnnData object: observations come from the columns of `counts`,
# variables from its rows. The first two rows of `counts` are dropped
# (presumably per-cell metadata rows rather than genes — TODO confirm).
countsByCell = counts[2:].transpose()  # computed once, reused for X and var

adata = ad.AnnData(countsByCell)
# NOTE(review): `cells` is attached as-is, i.e. its rows are assumed to be in the
# same order as the columns of the count matrix — confirm against the input files.
adata.obs = cells
# This is expected by Geneformer
adata.obs = adata.obs.rename(columns={"nCount_RNA": "n_counts", "tum.norm": "tum_norm"})

# Gene annotation: one row per column of the matrix, holding the gene symbol
geneTable = pd.DataFrame({"gene_name": countsByCell.columns.to_numpy()})

# Add the Ensembl ID.
# The lookup is reduced to one row per symbol first: a left merge against a table
# with several IDs per symbol returns more rows than there are genes, and the
# assignment to adata.var then no longer matches the matrix. `validate` makes any
# remaining ambiguity fail loudly at the merge instead.
symbolToId = gencode.drop_duplicates(subset="gene_name", keep="first")
geneTable = geneTable.merge(symbolToId, on="gene_name", how="left", validate="many_to_one")

adata.var = geneTable.rename(columns={"gene_id": "ensembl_id"})

### Subset the dataset

Subsetting the dataset happens for several reasons:  
 1. It makes the computations feasible on a laptop.
 2. We're not interested in all cell types: perturbations are primarily relevant in tumor cells and normal cells.  

In [6]:
# Tumor group: the first 100 cells whose tum_norm is "Tumor" and whose
# level2_celltype is "Tumor_KC_Basal"
isTumor = adata.obs.tum_norm == "Tumor"
isBasalTumorKC = adata.obs.level2_celltype == "Tumor_KC_Basal"
adataT = adata[isTumor & isBasalTumorKC][:100]

# Normal group: the first 100 cells whose tum_norm is "Normal" (any cell type)
isNormal = adata.obs.tum_norm == "Normal"
adataN = adata[isNormal][:100]

# Stack both groups into the working dataset, then re-attach the gene annotation
# from the tumor group (presumably needed because the concatenated object does not
# keep the var columns — TODO confirm).
adataSubset = ad.concat([adataT, adataN])
adataSubset.var = adataT.var


  getattr(self, attr).index = value





Get the Geneformer model and pre-process the data
--------------------------------------------------

In [7]:
# Geneformer settings: model name, batch size of 10, inference on the CPU
geneformerSettings = {
    "model_name": "gf-12L-95M-i4096-CLcancer",
    "batch_size": 10,
    "device": "cpu",
}
model_config = GeneformerConfig(**geneformerSettings)
# Instantiate the model from that configuration
gf = Geneformer(configurer=model_config)

INFO:helical.models.geneformer.model:Model finished initializing.
INFO:helical.models.geneformer.model:'gf-12L-95M-i4096-CLcancer' model is in 'eval' mode, on device 'cpu' with embedding mode 'cell'.


Perturb gene
------------

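Up-regulate a single gene (Ensembl ID `ENSG00000188761`) in the subset and compute the embeddings before and after the perturbation.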

In [8]:
# Baseline: embed the unperturbed subset first (kept ahead of the perturbation step)
originalResult = embedData(adataSubset)
gfEmbeddingOriginal, umapOriginal = originalResult

# Perturbation: up-regulate one gene, identified by its Ensembl ID
targetGene = "ENSG00000188761"
perturbation = "upregulate"
adataPerturbed = perturbGene(targetGene, perturbation, adataSubset, gf)

# Embed the perturbed data the same way as the baseline
perturbedResult = embedData(adataPerturbed)
gfEmbeddingPerturbed, umapPerturbed = perturbedResult

INFO:helical.models.geneformer.model:Processing data for Geneformer.
INFO:pyensembl.sequence_data:Loaded sequence dictionary from C:\Users\andre\AppData\Local\pyensembl\GRCh38\ensembl110\pyensembl\GRCh38\ensembl110\Cache\Homo_sapiens.GRCh38.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from C:\Users\andre\AppData\Local\pyensembl\GRCh38\ensembl110\pyensembl\GRCh38\ensembl110\Cache\Homo_sapiens.GRCh38.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from C:\Users\andre\AppData\Local\pyensembl\GRCh38\ensembl110\pyensembl\GRCh38\ensembl110\Cache\Homo_sapiens.GRCh38.pep.all.fa.gz.pickle
  adata.var["ensembl_id"] = pd.Series([None] * len(adata.var), index=adata.var.index)

INFO:helical.utils.mapping:Mapped 773 genes to Ensembl IDs from a total of 998 genes.
  data.var["gene_ids_collapsed"] = gene_ids_collapsed

  getattr(self, attr).index = value


  utils.warn_names_duplicates("var")

INFO:helical.models.geneformer.geneformer_tokeniz

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

INFO:helical.models.geneformer.model:Successfully processed the data for Geneformer.
INFO:helical.models.geneformer.model:Started getting embeddings:


  0%|          | 0/20 [00:00<?, ?it/s]

INFO:helical.models.geneformer.model:Finished getting embeddings.
INFO:helical.models.geneformer.model:Processing data for Geneformer.
INFO:pyensembl.sequence_data:Loaded sequence dictionary from C:\Users\andre\AppData\Local\pyensembl\GRCh38\ensembl110\pyensembl\GRCh38\ensembl110\Cache\Homo_sapiens.GRCh38.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from C:\Users\andre\AppData\Local\pyensembl\GRCh38\ensembl110\pyensembl\GRCh38\ensembl110\Cache\Homo_sapiens.GRCh38.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from C:\Users\andre\AppData\Local\pyensembl\GRCh38\ensembl110\pyensembl\GRCh38\ensembl110\Cache\Homo_sapiens.GRCh38.pep.all.fa.gz.pickle
  adata.var["ensembl_id"] = pd.Series([None] * len(adata.var), index=adata.var.index)

  utils.warn_names_duplicates("var")

  utils.warn_names_duplicates("var")

INFO:helical.utils.mapping:Mapped 773 genes to Ensembl IDs from a total of 998 genes.
  data.var["gene_ids_collapsed"] = gen

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

INFO:helical.models.geneformer.model:Successfully processed the data for Geneformer.
INFO:helical.models.geneformer.model:Started getting embeddings:


  0%|          | 0/20 [00:00<?, ?it/s]

INFO:helical.models.geneformer.model:Finished getting embeddings.
