In [1]:
import os
from pathlib import Path
import pandas as pd
import pyranges as pr
import peft
from peft import PeftType
from functools import partial

from greyhound.model import GreyhoundConfig, Greyhound
from greyhound.model.locon import add_locon
from greyhound.data import ChromatinDataset, train_filter, val_filter, test_filter
from enformer_pytorch import GenomeIntervalDataset
from transformers import Trainer, TrainingArguments

  import pkg_resources
  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Configure Greyhound on top of the pretrained Borzoi checkpoint. The options
# in `borzoi_kwargs` are handed to the config as given; here they switch the
# mouse head off.
model_config = GreyhoundConfig(
    borzoi_model_name="johahi/borzoi-replicate-0",
    n_labels=2,
    use_autocast=True,
    borzoi_kwargs=dict(enable_mouse_head=False),
)

# Instantiate the model from the config, then load the Borzoi weights into it
# (the cell output reports which checkpoint was loaded).
model = Greyhound(config=model_config)
model.init_borzoi_weights()

Loaded Borzoi weights from johahi/borzoi-replicate-0 into Greyhound model.


In [11]:
# Display the model's repr to inspect its module tree.
model

Greyhound(
  (borzoi): Borzoi(
    (conv_dna): ConvDna(
      (conv_layer): Conv1d(4, 512, kernel_size=(15,), stride=(1,), padding=same)
      (max_pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (_max_pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (res_tower): Sequential(
      (0): ConvBlock(
        (norm): BatchNorm1d(512, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (activation): GELU(approximate='tanh')
        (conv_layer): Conv1d(512, 608, kernel_size=(5,), stride=(1,), padding=same)
      )
      (1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (2): ConvBlock(
        (norm): BatchNorm1d(608, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (activation): GELU(approximate='tanh')
        (conv_layer): Conv1d(608, 736, kernel_size=(5,), stride=(1,), padding=same)
      )
      (3): MaxPool1d(kernel_size=2, strid

In [None]:
# Attach the LoCon adapters to the model (r=8, lora_alpha=16, conv_select=4).
# NOTE(review): this rebinds `model`, so re-running the cell would feed the
# already-adapted model back into add_locon. Re-run the model-construction
# cell first if this cell needs to be executed again.
model = add_locon(model, r=8, lora_alpha=16, conv_select=4)

params added/unfrozen by locon: 722684


In [13]:
# Display the model again; the repr below shows it wrapped in a PEFT LoraModel.
model

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): Greyhound(
      (borzoi): Borzoi(
        (conv_dna): ConvDna(
          (conv_layer): Conv1d(4, 512, kernel_size=(15,), stride=(1,), padding=same)
          (max_pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        )
        (_max_pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
        (res_tower): Sequential(
          (0): ConvBlock(
            (norm): BatchNorm1d(512, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
            (activation): GELU(approximate='tanh')
            (conv_layer): Conv1d(512, 608, kernel_size=(5,), stride=(1,), padding=same)
          )
          (1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
          (2): ConvBlock(
            (norm): BatchNorm1d(608, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
            (activation): GELU(approximate='tanh'

# Check with a bit of data

In [6]:
# Input locations, defined once so every dataset below reads the same files.
# NOTE(review): these are machine-specific absolute paths; point them at your
# own copies of the data before running this cell.
BED_FILE = "/Users/asmith/Documents/software/greyhound/data/resources/sequences_human.bed.gz"
FASTA_FILE = "/Users/asmith/Documents/reference/hg38.fa"
BIGWIG_DIR = "/Users/asmith/Desktop/borzoi-training-data/"

# Fold assignment passed to all three split filters.
TEST_FOLD = 3  # Adjust these values as needed
VAL_FOLD = 4  # Adjust these values as needed

# Split name -> filter function used to select that split's rows.
split_filters = {"train": train_filter, "val": val_filter, "test": test_filter}

# Build one GenomeIntervalDataset per split. Each split gets its own filter,
# bound to the shared fold assignment; every other setting is identical.
genome_datasets = {}
for name, filter_func in split_filters.items():
    genome_datasets[name] = GenomeIntervalDataset(
        bed_file=BED_FILE,
        fasta_file=FASTA_FILE,
        return_augs=True,
        rc_aug=True,
        return_seq_indices=False,
        shift_augs=[-3, 3],
        context_length=524_288,
        filter_df_fn=partial(filter_func, test_fold=TEST_FOLD, val_fold=VAL_FOLD),
    )

# Wrap each split's genome dataset in a ChromatinDataset reading from the same
# bigwig directory. Construction order (train, val, test) matches the names on
# the left-hand side.
ds_train, ds_val, ds_test = (
    ChromatinDataset(genome_dataset=genome_datasets[name], bigwig_dir=BIGWIG_DIR)
    for name in ("train", "val", "test")
)


In [7]:
# Disabled training cell: everything below is commented out. Uncommenting it
# builds TrainingArguments and a Trainer for `model`, then calls trainer.train().
# NOTE(review): `eval_dataset=ds_test` combined with `load_best_model_at_end=True`
# would pick the best checkpoint on the test split. Confirm whether `ds_val`
# was intended before re-enabling.
# training_args = TrainingArguments(
#     bf16_full_eval=False,
#     bf16=False,
#     dataloader_num_workers=4,
#     dataloader_pin_memory=True,
#     eval_accumulation_steps=10,
#     eval_steps=5,
#     eval_strategy="steps",
#     gradient_accumulation_steps=8,
#     label_names=["labels"],
#     learning_rate=1e-4,
#     load_best_model_at_end=True,
#     logging_steps=10,
#     logging_dir="logs",
#     num_train_epochs=5,
#     output_dir="checkpoints/locon",
#     per_device_eval_batch_size=1,
#     per_device_train_batch_size=2,
#     prediction_loss_only=False,
#     remove_unused_columns=False,
#     report_to="wandb",
#     save_steps=5,
#     lr_scheduler_type="cosine",
#     save_strategy="steps",
#     weight_decay=1e-6,
#     use_mps_device=True
# )
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=ds_train,
#     eval_dataset=ds_test,
# )

# ##### TRAINING #####
# trainer.train()