In [6]:
from helical.models.geneformer import GeneformerConfig, GeneformerFineTuningModel
import anndata as ad

# Load the data
ann_data = ad.read_h5ad("../yolksac_human.h5ad")

# Number of cells used in this small demo run. Kept in one place so the label
# list and the AnnData slice passed to `process_data` always cover the same rows.
N_CELLS = 10

# Get the column for fine-tuning: labels of the first N_CELLS cells.
# `.iloc` makes the positional intent of the slice explicit.
cell_types = list(ann_data.obs["LVL1"].iloc[:N_CELLS])
label_set = set(cell_types)

# Create a GeneformerConfig object
geneformer_config = GeneformerConfig(model_name="gf-12L-95M-i4096", batch_size=10)

# Create a GeneformerFineTuningModel object with one output per distinct label
geneformer_fine_tune = GeneformerFineTuningModel(geneformer_config=geneformer_config, fine_tuning_head="classification", output_size=len(label_set))

# Process the data (the same first N_CELLS cells the labels were taken from)
dataset = geneformer_fine_tune.process_data(ann_data[:N_CELLS])

# Add column to the dataset
dataset = dataset.add_column('cell_types', cell_types)

# Create a dictionary to map cell types to ids.
# Iterating a set of strings directly yields an order that can change between
# interpreter runs (string hashing is randomized), which would assign different
# ids to the same label on each run. Sorting first makes the mapping
# reproducible; `key=str` keeps the sort well-defined even if a label is not a
# string (e.g. a missing value).
class_id_dict = {label: class_id for class_id, label in enumerate(sorted(label_set, key=str))}

def classes_to_ids(example):
    """Swap the label stored under ``"cell_types"`` for its integer class id.

    The id is looked up in the module-level ``class_id_dict``. The passed-in
    ``example`` mapping is updated in place and also returned.

    Raises:
        KeyError: if the label is not a key of ``class_id_dict``.
    """
    label = example["cell_types"]
    example.update({"cell_types": class_id_dict[label]})
    return example

# Convert cell types to ids: run `classes_to_ids` over every example so the
# "cell_types" column holds integer class ids instead of label strings
dataset = dataset.map(classes_to_ids, num_proc=1)

# Fine-tune the model, using the "cell_types" column as the training label
geneformer_fine_tune.train(train_dataset=dataset, label="cell_types")

# Get logits from the fine-tuned model
# NOTE(review): this reuses the dataset the model was just trained on, so the
# printed values are not a held-out evaluation — confirm that is intended.
outputs = geneformer_fine_tune.get_outputs(dataset)
print(outputs[:10])

# Get embeddings from the fine-tuned model (same dataset as above)
embeddings = geneformer_fine_tune.get_embeddings(dataset)
print(embeddings[:10])

INFO:helical.models.geneformer.model:Model finished initializing.
INFO:helical.models.geneformer.model:'gf-12L-95M-i4096' model is in 'eval' mode, on device 'cpu' with embedding mode 'cell'.
INFO:helical.models.geneformer.model:Processing data for Geneformer.
  adata.var["index"] = adata.var.index

INFO:pyensembl.database:Creating database: /iridisfs/ddnb/Ahmed/AI_hackathon25/.cache/pyensembl/GRCh38/ensembl110/Homo_sapiens.GRCh38.110.gtf.db
INFO:pyensembl.database:Reading GTF from /iridisfs/ddnb/Ahmed/AI_hackathon25/.cache/pyensembl/GRCh38/ensembl110/Homo_sapiens.GRCh38.110.gtf.gz
INFO:root:Extracted GTF attributes: ['gene_id', 'gene_version', 'gene_name', 'gene_biotype', 'transcript_id', 'transcript_version', 'transcript_name', 'transcript_biotype', 'transcript_support_level', 'exon_number', 'exon_id', 'exon_version', 'protein_id', 'protein_version', 'ccds_id']
INFO:datacache.database_helpers:Creating database /iridisfs/ddnb/Ahmed/AI_hackathon25/.cache/pyensembl/GRCh38/ensembl110/Homo

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

INFO:helical.models.geneformer.model:Successfully processed the data for Geneformer.


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

INFO:helical.models.geneformer.fine_tuning_model:Freezing the first 2 encoder layers of the Geneformer model during fine-tuning.
INFO:helical.models.geneformer.fine_tuning_model:Starting Fine-Tuning
Fine-Tuning: epoch 1/1: 100%|██████████| 1/1 [00:19<00:00, 19.39s/it, loss=1.08]
INFO:helical.models.geneformer.fine_tuning_model:Fine-Tuning Complete. Epochs: 1
Generating Outputs: 100%|█████████████████████████| 1/1 [00:06<00:00,  6.18s/it]
INFO:helical.models.geneformer.model:Started getting embeddings:


[[ 0.15081748 -0.06346078 -0.56110454]
 [-0.00679038  0.02107927 -0.6898344 ]
 [ 0.0529997  -0.21104313 -0.6146271 ]
 [-0.05096861 -0.12723312 -0.43499652]
 [-0.01879785 -0.02841391 -0.13580431]
 [-0.0149862   0.23591338 -0.440272  ]
 [ 0.07100906  0.2652189  -0.45508307]
 [ 0.17037587  0.11608269 -0.6285559 ]
 [-0.12839933  0.11869971 -0.5772698 ]
 [ 0.12664373  0.22253424 -0.5146089 ]]


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:helical.models.geneformer.model:Finished getting embeddings.


[[ 3.82001474e-02  1.35832861e-01  2.00044245e-01 ...  7.83510447e-01
  -3.24058771e-01  9.67910737e-02]
 [ 2.63552703e-02  8.81082714e-02  1.08287781e-01 ...  4.40737635e-01
  -1.95383936e-01 -3.65763757e-04]
 [ 2.92194672e-02  1.03832863e-01  1.76487669e-01 ...  4.98648375e-01
  -3.19269329e-01  1.17930502e-01]
 ...
 [-5.35750389e-02  2.06559882e-01  1.58944502e-01 ...  7.81549692e-01
  -3.26168060e-01 -6.59578145e-02]
 [-2.49309912e-02  5.23413159e-02  1.26074329e-01 ...  3.68590921e-01
  -1.78751573e-01  3.24401967e-02]
 [ 3.30880769e-02  1.19081765e-01  2.11631417e-01 ...  5.12477517e-01
  -3.67965877e-01  6.01701476e-02]]


In [3]:
import os
# Report the kernel's working directory; relative paths in this notebook are
# resolved against it.
current_directory = os.getcwd()
print("Current working directory: " + current_directory)


Current working directory: /iridisfs/ddnb/Ahmed/AI_hackathon25/notebooks
