In [5]:
from helical.models.geneformer import GeneformerConfig, GeneformerFineTuningModel
import anndata as ad

# Number of cells used for this small fine-tuning demo, and the model batch size.
# Both were the literal 10 in several places; naming them keeps the cell subset
# and the label subset from drifting apart when one of them is changed.
N_CELLS = 10
BATCH_SIZE = 10

# Load the data
ann_data = ad.read_h5ad("../yolksac_human.h5ad")

# Take the demo subset once so that the processed cells and their labels are
# guaranteed to come from exactly the same rows.
ann_subset = ann_data[:N_CELLS]

# Get the column for fine-tuning (one "LVL1" label per selected cell)
cell_types = list(ann_subset.obs["LVL1"])
label_set = set(cell_types)

# Create a GeneformerConfig object
geneformer_config = GeneformerConfig(
    model_name="gf-12L-95M-i4096",
    batch_size=BATCH_SIZE,
)

# Create a GeneformerFineTuningModel object with one output per distinct label
geneformer_fine_tune = GeneformerFineTuningModel(
    geneformer_config=geneformer_config,
    fine_tuning_head="classification",
    output_size=len(label_set),
)

# Process the data
dataset = geneformer_fine_tune.process_data(ann_subset)

# Add the label column to the dataset
dataset = dataset.add_column('cell_types', cell_types)

# Create a dictionary to map cell types to ids.
# Iterating a set of strings directly gives an order that can change between
# interpreter runs (string hash randomisation), which would silently change
# which id -- and therefore which logit column -- each cell type gets.
# Sorting first makes the mapping reproducible; key=str keeps the sort working
# even if a non-string value (e.g. a missing label) is present.
class_id_dict = {
    label: class_id
    for class_id, label in enumerate(sorted(label_set, key=str))
}

def classes_to_ids(example):
    """Swap the string label of one dataset row for its integer class id.

    The row's "cell_types" value is looked up in the module-level
    ``class_id_dict`` and overwritten in place with the matching id.
    A label that is missing from ``class_id_dict`` raises ``KeyError``.

    Returns the same ``example`` object that was passed in.
    """
    label_name = example["cell_types"]
    label_id = class_id_dict[label_name]
    example["cell_types"] = label_id
    return example

# Replace every string label in the 'cell_types' column with its integer id
dataset = dataset.map(
    classes_to_ids,
    num_proc=1,
)

# Fine-tune the model, using the integer 'cell_types' column as the label
geneformer_fine_tune.train(
    train_dataset=dataset,
    label="cell_types",
)

# Logits from the fine-tuned model for the same dataset; show the first 10 rows
outputs = geneformer_fine_tune.get_outputs(dataset)
print(outputs[:10])

# Embeddings from the fine-tuned model for the same dataset; show the first 10 rows
embeddings = geneformer_fine_tune.get_embeddings(dataset)
print(embeddings[:10])

INFO:helical.models.geneformer.model:Model finished initializing.
INFO:helical.models.geneformer.model:'gf-12L-95M-i4096' model is in 'eval' mode, on device 'cpu' with embedding mode 'cell'.
INFO:helical.models.geneformer.model:Processing data for Geneformer.
  adata.var["index"] = adata.var.index

INFO:pyensembl.database:Creating database: /iridisfs/ddnb/Ahmed/AI_hackathon25/.cache/pyensembl/GRCh38/ensembl110/Homo_sapiens.GRCh38.110.gtf.db
INFO:pyensembl.database:Reading GTF from /iridisfs/ddnb/Ahmed/AI_hackathon25/.cache/pyensembl/GRCh38/ensembl110/Homo_sapiens.GRCh38.110.gtf.gz
INFO:root:Extracted GTF attributes: ['gene_id', 'gene_version', 'gene_name', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_biotype', 'transcript_support_level', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'ccds_id']
INFO:datacache.database_helpers:Creating database /iridisfs/ddnb/Ahmed/AI_hackathon25/.cache/pyensembl/GRCh38/ensembl110/Homo

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

INFO:helical.models.geneformer.model:Successfully processed the data for Geneformer.


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

INFO:helical.models.geneformer.fine_tuning_model:Freezing the first 2 encoder layers of the Geneformer model during fine-tuning.
INFO:helical.models.geneformer.fine_tuning_model:Starting Fine-Tuning
Fine-Tuning: epoch 1/1: 100%|██████████| 1/1 [00:19<00:00, 19.82s/it, loss=1.16]
INFO:helical.models.geneformer.fine_tuning_model:Fine-Tuning Complete. Epochs: 1
Generating Outputs: 100%|█████████████████████████| 1/1 [00:06<00:00,  6.16s/it]
INFO:helical.models.geneformer.model:Started getting embeddings:


[[ 0.285794    0.01204093 -0.30726266]
 [ 0.21766442  0.15150346 -0.38557863]
 [ 0.3452555  -0.06431133 -0.5028102 ]
 [ 0.2683251  -0.14803466 -0.5695817 ]
 [ 0.17190824 -0.26257074 -0.4268266 ]
 [ 0.03220536  0.07862708 -0.49298882]
 [ 0.05168021  0.23596346 -0.29532525]
 [ 0.1862576   0.00183347 -0.600322  ]
 [ 0.49341303  0.00272126 -0.5205361 ]
 [ 0.22576949  0.11065894 -0.4939588 ]]


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:helical.models.geneformer.model:Finished getting embeddings.


[[ 0.01619959  0.11678661  0.21124226 ...  0.7615435  -0.302199
   0.08557646]
 [ 0.00982001  0.06953491  0.11761754 ...  0.44828066 -0.18884479
  -0.00693384]
 [ 0.01174849  0.08122256  0.18548074 ...  0.4932952  -0.30384162
   0.10719164]
 ...
 [-0.07201622  0.1854285   0.17602563 ...  0.75571764 -0.29450884
  -0.07167692]
 [-0.03774584  0.04044073  0.13247138 ...  0.37551144 -0.17332639
   0.02853031]
 [ 0.01157347  0.09756953  0.21961123 ...  0.49387243 -0.34522453
   0.05067428]]


In [3]:
import os
# Show where the kernel is running, since the notebook loads data via a relative path
current_directory = os.getcwd()
print("Current working directory: " + current_directory)


Current working directory: /iridisfs/ddnb/Ahmed/AI_hackathon25/notebooks
