[GROVER](https://www.nature.com/articles/s42256-024-00872-0) ("Genome Rules Obtained Via Extracted Representations") is a foundation DNA language model with an optimized vocabulary for the human genome.  
This is the Python code for the paper published in Nature Machine Intelligence (2024): https://www.nature.com/articles/s42256-024-00872-0. The R code can be found in a separate document.  

Melissa Sanabria, Jonas Hirsch, Pierre M. Joubert, Anna R. Poetsch

Biomedical Genomics, Biotechnology Center, Center for Molecular and Cellular Bioengineering, Technische Universität Dresden  
melissa.sanabria@tu-dresden.de, arpoetsch@gmail.com  

## Figure 2

Performance based selection of the vocabulary identifies 600 cycles of Byte-Pair Tokenization as optimal

### Figure 2A. 
Selection of the optimal vocabulary through accuracy of next-token prediction as a fine-tuning task for the foundation models using prediction of 2 to 6 nucleotide long next-k-mers as readout. Depicted is accuracy with a loess fit and 95% confidence interval.

In [21]:
root_folder = "chr21/"
# The following cells address the same data root as `root_path`; bind both
# names so those cells do not fail with a NameError.
root_path = root_folder

In [None]:
import glob
import json
import math
import os
import pickle
import random
from itertools import product
from typing import List

import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
)
from sklearn.preprocessing import OneHotEncoder
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import (
    AdamW,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertConfig,
    BertForMaskedLM,
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
    get_linear_schedule_with_warmup,
)

In [None]:
## Dataset: 400000 sequences from chromosome 21 for training and 100000 sequences from chromosome 21 for testing.

train_file_path = f"{root_folder}train.csv"
test_file_path = f"{root_folder}test.csv"

In [22]:
## Pickle file holding the vocabulary information of every iteration
vocabs_file_path = f"{root_folder}vocabs_chr21.pkl"

In [None]:
## Tokenize sequences

def find_indices(seq, first, second):
    """Locate the non-overlapping occurrences of the bigram (first, second).

    Returns a dict whose keys are the positions in `seq` of the first token
    of every match; every value is 1.
    """
    matches = {}
    previous_matched = False
    for position, pair in enumerate(zip(seq, seq[1:])):
        # A match directly following another match is skipped, so that
        # AAA is tokenized as AA A and not as AA AA.
        if pair == (first, second) and not previous_matched:
            matches[position] = 1
            previous_matched = True
        else:
            previous_matched = False
    return matches

def merge_tokens(seq, ixs, first, second):
    """Apply one merge rule at the positions listed in `ixs`.

    Every position in `ixs` is overwritten in `seq` (in place) with the
    merged token `first + second`; the returned list is a new list in which
    the token following each of those positions has been dropped.
    """
    merged = first + second
    for position in ixs:
        seq[position] = merged

    kept = []
    for position, token in enumerate(seq):
        # The token right after a merge position was absorbed by the merge.
        if position - 1 in ixs:
            continue
        kept.append(token)
    return kept

# Load the vocabulary information of every iteration. The context manager
# closes the file handle, which the bare pickle.load(open(...)) left open.
# NOTE: pickle must only be used on trusted files.
with open(vocabs_file_path, "rb") as vocabs_file:
    vocabs = pickle.load(vocabs_file)

for iteration in vocabs:
    iteration_path = root_path + str(iteration) + "/"
    if os.path.exists(iteration_path):
        # The folder of this iteration already exists: skip the iteration.
        continue
    os.mkdir(iteration_path)

    data_path = iteration_path + "data/"
    os.makedirs(data_path, exist_ok=True)

    # Element 1 of every vocabulary entry is the (first, second) merge rule;
    # extract the rules once instead of once per sample.
    merge_rules = [info[1] for info in vocabs[iteration].values()]

    for set_name in ["train", "test"]:
        with open(root_path + set_name + ".txt", "r") as samples_file, \
                open(data_path + set_name + ".txt", "w") as iter_set_file:
            # Stream the file line by line instead of loading it with readlines().
            for sample in samples_file:
                tokenized_sequence = list(sample.replace("\n", ""))
                for merge_rule in merge_rules:
                    ixs = find_indices(tokenized_sequence, merge_rule[0], merge_rule[1])
                    tokenized_sequence = merge_tokens(tokenized_sequence, ixs, merge_rule[0], merge_rule[1])
                # Only the first 512 tokens of every sample are kept.
                sample = " ".join(tokenized_sequence[:512])
                iter_set_file.write(sample + "\n")

The following code is an example for one of the vocabulary iterations. You can repeat that for each one of them

In [None]:
iteration = 600
iteration_path = f"{root_path}{iteration}/"
vocab_file_path = iteration_path + "vocab.txt"

# Special tokens first, then element 0 of every vocabulary entry, one per line.
with open(vocab_file_path, "w") as vocab_file:
    vocab_file.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
    vocab_file.writelines(entry[0] + "\n" for entry in vocabs[iteration].values())

tokenizer = BertTokenizer.from_pretrained(vocab_file_path)


class LineByLineTextDataset(Dataset):
    """Dataset with one encoded example per non-empty line of a text file.

    Args:
        tokenizer: object providing `batch_encode_plus`.
        file_path: path of the text file, one sample per line.
        block_size: maximum number of token ids per example (special tokens
            included); longer samples are truncated.
    """

    def __init__(self, tokenizer: "PreTrainedTokenizer", file_path: str, block_size=512):
        with open(file_path, encoding="utf-8") as f:
            # Keep every non-blank line. Slicing with [:-1] dropped the last
            # sample whenever the file did not end with an empty line.
            lines = [line for line in f.read().splitlines() if line and not line.isspace()]

        # truncation=True makes max_length effective, as in the encode_plus
        # call of GroverDataSet.
        self.examples = tokenizer.batch_encode_plus(
            lines, add_special_tokens=True, max_length=block_size, truncation=True
        )["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)


# Tokenized test and train samples of the selected vocabulary iteration
test_dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=f"{root_path}{iteration}/data/test.txt")
train_dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=f"{root_path}{iteration}/data/train.txt")
    

In [None]:
# Masking probability of the language-modeling collator. It has to be defined
# before the collator is built; it is not a TrainingArguments field.
mlm_probability = 0.022

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
        mlm_probability=mlm_probability)


# BertForMaskedLM expects a configuration object, not the path of the JSON file.
model = BertForMaskedLM(config=BertConfig.from_json_file(root_path + "config.json"))
output_path = ""

# Only fields that TrainingArguments defines are passed:
# - the block size (512) is applied by LineByLineTextDataset,
# - the output folder is called `output_dir`,
# - num_train_epochs is omitted because max_steps takes precedence over it,
# - evaluation every 500 steps replaces the former `evaluate_during_training`.
training_args = TrainingArguments(
    output_dir=output_path,
    gradient_accumulation_steps=25,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=6,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    save_steps=500,
    save_total_limit=20,
    max_steps=20000,
    learning_rate=4e-4,
    adam_epsilon=1e-6,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.98,
    warmup_steps=1000,
)

# Initialize our Trainer; without `args` the settings above would be ignored.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

## Introduction

Scripts to reproduce each figure can be found in the R code; they contain all the necessary data to reproduce the figures of the paper.

However, the following code shows how every data file was generated. You will find a flag **FINAL_FILE** that indicates the generation of a file that will be used in the R code. 

trainer.train()
trainer.save_model()

# Perplexity on the evaluation set = exp(evaluation loss).
perplexity_history = {}

step = 500  # fallback key when a log entry carries no 'step' field
for log_history in trainer.state.log_history:
    if 'eval_loss' in log_history:
        # Evaluation entries hold 'eval_loss'; reading 'loss' from them fails.
        perplexity_history[log_history.get('step', step)] = math.exp(log_history['eval_loss'])
        step += 500

with open(root_path + "perplexity.pkl", "wb") as perplexity_file:
    pickle.dump(perplexity_history, perplexity_file)

#### Fine-tuning on next-k-mer

We choose the training step with best performance (best perplexity) on the test set.

Here we will show an example to predict next-2mer. The process for the other next-k-mers is the same.

In [None]:
## Creation of the next-k-mer dataset
next_kmer = 2

# Class id (as a string) of every possible k-mer, in the order AA, AC, AG, AT, CA, ...
kmer_vocab = {
    "".join(nucleotides): str(class_id)
    for class_id, nucleotides in enumerate(product("ACGT", repeat=next_kmer))
}


def _next_kmer_frame(tokenized_file):
    """Build the (X, class) frame of one tokenized file.

    X holds the first 50 tokens of every sample; the class is derived from
    the first `next_kmer` nucleotides that follow those 50 tokens.
    """
    data = pd.read_csv(tokenized_file, header=None, names=["sequence"])
    tokens = data["sequence"].apply(lambda x: x.split(" "))

    frame = pd.DataFrame()
    frame["X"] = tokens.apply(lambda t: t[:50])
    frame["next_kmer"] = tokens.apply(lambda t: "".join(t[50:])[:next_kmer])
    # Samples that are too short to provide a complete next k-mer (or whose
    # next k-mer is not in the vocabulary) have no class and are dropped
    # instead of raising a KeyError.
    frame["class"] = frame["next_kmer"].map(kmer_vocab)
    frame = frame.dropna(subset=["class"]).reset_index(drop=True)
    return frame[["X", "class"]]


test_data_grover = _next_kmer_frame(root_path + str(iteration)+"/data/test.txt")
train_data_grover = _next_kmer_frame(root_path + str(iteration)+"/data/train.txt")

In [None]:
class GroverDataSet(Dataset):
    """Sequences with one-hot encoded class labels for next-k-mer prediction.

    Args:
        sequences: indexable collection of samples handed to the tokenizer.
        y: class ids, one per sample (anything np.array can cast to float32).
        tokenizer: object providing `encode_plus`.
        max_length: length every sample is padded / truncated to.
        num_classes: number of classes of the one-hot encoding. The default
            of 16 corresponds to the next-2-mer task; pass 4**k for k-mers.
    """

    def __init__(self, sequences, y, tokenizer, max_length=50, num_classes=16):
        print("Loading GROVER Dataset")
        self.BERTtokenizer = tokenizer # to convert ATG ATCGA CG -> [CLS] ATG ATCGA CG [SEP] -> [2, 123, 456, 789, 101, 3]
        self.sequences = sequences
        self.max_length = max_length
        self.y = np.array(y, dtype=np.float32).reshape(-1, 1)

        # Fixed categories keep the label width stable even when a class is absent from y.
        self.label_encoder = OneHotEncoder(sparse_output=False, categories=[list(range(num_classes))])
        self.label_encoder.fit(self.y)
        self.y = self.label_encoder.transform(self.y)

        self.y = torch.tensor(self.y)
        print(self.y.shape)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return (input ids, one-hot label, attention mask, sample index)."""
        seq = self.sequences[idx]
        tokenizer_res = self.BERTtokenizer.encode_plus(seq, add_special_tokens=True, padding="max_length", return_tensors="pt", max_length=self.max_length, truncation=True)
        ids = tokenizer_res["input_ids"].squeeze(0)
        # NOTE(review): unlike the ids, the attention mask keeps its leading
        # batch dimension — confirm this is what the consumer expects.
        attention_masks = tokenizer_res["attention_mask"]
        return ids, self.y[idx], attention_masks, idx



In [None]:
# Tokenizer saved with the pre-trained model, shared by both splits
grover_tokenizer = BertTokenizer.from_pretrained(output_path)
train_grover = GroverDataSet(
    sequences=train_data_grover["X"].values,
    y=train_data_grover["class"],
    tokenizer=grover_tokenizer,
    max_length=50,
)
test_grover = GroverDataSet(
    sequences=test_data_grover["X"].values,
    y=test_data_grover["class"],
    tokenizer=grover_tokenizer,
    max_length=50,
)

In [None]:
# `device` was used without being defined; use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

grover = BertForSequenceClassification.from_pretrained(output_path, num_labels=len(kmer_vocab)).to(device)
output_path_next_kmer = ""

# Evaluation, logging and checkpointing all happen every 1000 steps.
# `output_dir` and `evaluation_strategy` replace the unsupported
# `output_path` and `evaluate_during_training` keyword arguments.
training_args = TrainingArguments(
    output_dir=output_path_next_kmer,
    num_train_epochs=100,
    learning_rate=1e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=1000,
)
def compute_metrics(pred):
    """Return the accuracy of the arg-max predictions.

    `pred` provides `predictions` (scores, one column per class) and
    `label_ids`, which may hold either class ids or one-hot rows.
    """
    labels = np.asarray(pred.label_ids)
    if labels.ndim > 1:
        # One-hot labels (as produced by GroverDataSet) -> class ids, so they
        # are comparable with the arg-max predictions.
        labels = labels.argmax(axis=1)
    preds = pred.predictions.argmax(axis=1)
    return {
        'accuracy': (preds == labels).mean()
    }

# Initialize our Trainer. `args` must be passed explicitly, otherwise the
# TrainingArguments defined above are ignored.
trainer = Trainer(
    model=grover,
    args=training_args,
    train_dataset=train_grover,
    eval_dataset=test_grover,
    tokenizer=grover_tokenizer,
    compute_metrics = compute_metrics
    
)

In [None]:
trainer.train()
trainer.save_model()

# Accuracy on the evaluation set at every evaluation step. The accuracy is
# stored as reported; it must not be exponentiated like a loss.
accuracy_history = {}

step = 1000  # fallback key when a log entry carries no 'step' field
for log_history in trainer.state.log_history:
    if 'eval_accuracy' in log_history:
        accuracy_history[log_history.get('step', step)] = log_history['eval_accuracy']
        step += 1000

with open(output_path_next_kmer + "accuracy.pkl", "wb") as accuracy_file:
    pickle.dump(accuracy_history, accuracy_file)

In [None]:
## Creating performance file
iterations = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 4000, 5000]
n_mers = range(2, 7)

results_dict = {}
for iteration in iterations:
    results_dict[iteration] = {}
    for n_mer in n_mers:
        # NOTE(review): the same path is read for every (iteration, n_mer)
        # pair; point output_path_next_kmer at the folder of the matching
        # fine-tuning run before loading.
        with open(output_path_next_kmer + "accuracy.pkl", "rb") as accuracy_file:
            results_dict[iteration][n_mer] = pickle.load(accuracy_file)

performance_to_save = {}
for n_mer in n_mers:
    performance_to_save[n_mer] = {}
    for iteration in iterations:
        # Each pickle holds a {step: accuracy} dict; keep the best accuracy.
        # np.max over the dict itself does not reduce over its values.
        performance_to_save[n_mer][iteration] = max(results_dict[iteration][n_mer].values())

df = pd.DataFrame.from_dict(performance_to_save)
df.to_csv("performance.csv", index=False)
## FINAL_FILE

### Figure 2B. 
Performance comparison using accuracy of next-k-mer prediction as a fine-tuning task. Compared are GROVER with 600 cycles of Byte-Pair Tokenization with models based on k-mer-tokenization, with length of 4, 5, and 6 nucleotides.

In [None]:
letters = ['A', 'C', 'G', 'T']
for kmer in [4, 5, 6]:
    kmer_path = root_path + str(kmer)+"/"
    if os.path.exists(kmer_path):
        # The folder of this k-mer size already exists: skip it.
        continue
    os.mkdir(kmer_path)

    with open(kmer_path +"vocab.txt", "w") as vocab_file:
        vocab_file.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
        # product() yields tuples of letters; join them into the k-mer string
        # (concatenating the tuple with "\n" raises a TypeError).
        for token in product(letters, repeat=kmer):
            vocab_file.write("".join(token) + "\n")

    data_path = kmer_path + "data/"
    os.makedirs(data_path, exist_ok=True)
    for set_name in ["train", "test"]:
        with open(root_path + set_name + ".txt", "r") as samples_file, \
                open(data_path + set_name + ".txt", "w") as iter_set_file:
            for sample in samples_file:
                # Start positions of the non-overlapping k-mers; at most 511
                # k-mers are written per sample. `sample` is the raw line as
                # read from the file (its newline, if any, is included in
                # len(sample)).
                starts = range(0, len(sample) - kmer, kmer)[:511]
                new_line = " ".join(sample[init: init + kmer] for init in starts)
                iter_set_file.write(new_line + "\n")

We will show how to do it for one k-mer; it can be repeated for the others.

In [None]:
kmer = 4

# Folder holding the vocabulary and the data of this k-mer size
kmer_path = f"{root_path}{kmer}/"

tokenizer = BertTokenizer.from_pretrained(f"{kmer_path}vocab.txt")


class LineByLineTextDataset(Dataset):
    """Dataset with one encoded example per non-empty line of a text file.

    Args:
        tokenizer: object providing `batch_encode_plus`.
        file_path: path of the text file, one sample per line.
        block_size: maximum number of token ids per example (special tokens
            included); longer samples are truncated.
    """

    def __init__(self, tokenizer: "PreTrainedTokenizer", file_path: str, block_size=512):
        with open(file_path, encoding="utf-8") as f:
            # Keep every non-blank line. Slicing with [:-1] dropped the last
            # sample whenever the file did not end with an empty line.
            lines = [line for line in f.read().splitlines() if line and not line.isspace()]

        # truncation=True makes max_length effective, as in the encode_plus
        # call of GroverDataSet.
        self.examples = tokenizer.batch_encode_plus(
            lines, add_special_tokens=True, max_length=block_size, truncation=True
        )["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)


# Tokenized test and train samples of the selected k-mer size
test_dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=f"{kmer_path}/data/test.txt")
train_dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=f"{kmer_path}/data/train.txt")

In [None]:
# Masking probability of the language-modeling collator. It has to be defined
# before the collator is built; it is not a TrainingArguments field.
mlm_probability = 0.022

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
        mlm_probability=mlm_probability)


# BertForMaskedLM expects a configuration object, not the path of the JSON file.
model = BertForMaskedLM(config=BertConfig.from_json_file(root_path + "config_4mer.json"))
output_path = ""

# Only fields that TrainingArguments defines are passed:
# - the block size (512) is applied by LineByLineTextDataset,
# - the output folder is called `output_dir`,
# - num_train_epochs is omitted because max_steps takes precedence over it,
# - evaluation every 500 steps replaces the former `evaluate_during_training`.
training_args = TrainingArguments(
    output_dir=output_path,
    gradient_accumulation_steps=25,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=6,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    save_steps=500,
    save_total_limit=20,
    max_steps=20000,
    learning_rate=4e-4,
    adam_epsilon=1e-6,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.98,
    warmup_steps=1000,
)

# Initialize our Trainer; without `args` the settings above would be ignored.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
trainer.train()
trainer.save_model()

# Perplexity on the evaluation set = exp(evaluation loss).
perplexity_history = {}

step = 500  # fallback key when a log entry carries no 'step' field
for log_history in trainer.state.log_history:
    if 'eval_loss' in log_history:
        # Evaluation entries hold 'eval_loss'; reading 'loss' from them fails.
        perplexity_history[log_history.get('step', step)] = math.exp(log_history['eval_loss'])
        step += 500

with open(root_path + "perplexity.pkl", "wb") as perplexity_file:
    pickle.dump(perplexity_history, perplexity_file)

#### Fine-tuning on next-k-mers

We choose the training step with best performance on the test set.

Here we will show an example to predict next-2mer. The process for the other next-k-mer is the same.

In [None]:
## Creation of the next-k-mers dataset
next_kmer = 2

# Class id (as a string) of every possible k-mer, in the order AA, AC, AG, AT, CA, ...
kmer_vocab = {
    "".join(nucleotides): str(class_id)
    for class_id, nucleotides in enumerate(product("ACGT", repeat=next_kmer))
}


def _next_kmer_frame(tokenized_file):
    """Build the (X, class) frame of one tokenized file.

    X holds the first 50 tokens of every sample; the class is derived from
    the first `next_kmer` nucleotides that follow those 50 tokens.
    """
    data = pd.read_csv(tokenized_file, header=None, names=["sequence"])
    tokens = data["sequence"].apply(lambda x: x.split(" "))

    frame = pd.DataFrame()
    frame["X"] = tokens.apply(lambda t: t[:50])
    frame["next_kmer"] = tokens.apply(lambda t: "".join(t[50:])[:next_kmer])
    # Samples that are too short to provide a complete next k-mer (or whose
    # next k-mer is not in the vocabulary) have no class and are dropped
    # instead of raising a KeyError.
    frame["class"] = frame["next_kmer"].map(kmer_vocab)
    frame = frame.dropna(subset=["class"]).reset_index(drop=True)
    return frame[["X", "class"]]


# The classes come from the k-mer tokenized files themselves (not from the
# frames built for the GROVER tokenization).
test_data_kmer = _next_kmer_frame(kmer_path+"/data/test.txt")
train_data_kmer = _next_kmer_frame(kmer_path+"/data/train.txt")

In [None]:
class GroverDataSet(Dataset):
    """Sequences with one-hot encoded class labels for next-k-mer prediction.

    Args:
        sequences: indexable collection of samples handed to the tokenizer.
        y: class ids, one per sample (anything np.array can cast to float32).
        tokenizer: object providing `encode_plus`.
        max_length: length every sample is padded / truncated to.
        num_classes: number of classes of the one-hot encoding. The default
            of 16 corresponds to the next-2-mer task; pass 4**k for k-mers.
    """

    def __init__(self, sequences, y, tokenizer, max_length=50, num_classes=16):
        print("Loading GROVER Dataset")
        self.BERTtokenizer = tokenizer # to convert ATG ATCGA CG -> [CLS] ATG ATCGA CG [SEP] -> [2, 123, 456, 789, 101, 3]
        self.sequences = sequences
        self.max_length = max_length
        self.y = np.array(y, dtype=np.float32).reshape(-1, 1)

        # Fixed categories keep the label width stable even when a class is absent from y.
        self.label_encoder = OneHotEncoder(sparse_output=False, categories=[list(range(num_classes))])
        self.label_encoder.fit(self.y)
        self.y = self.label_encoder.transform(self.y)

        self.y = torch.tensor(self.y)
        print(self.y.shape)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return (input ids, one-hot label, attention mask, sample index)."""
        seq = self.sequences[idx]
        tokenizer_res = self.BERTtokenizer.encode_plus(seq, add_special_tokens=True, padding="max_length", return_tensors="pt", max_length=self.max_length, truncation=True)
        ids = tokenizer_res["input_ids"].squeeze(0)
        # NOTE(review): unlike the ids, the attention mask keeps its leading
        # batch dimension — confirm this is what the consumer expects.
        attention_masks = tokenizer_res["attention_mask"]
        return ids, self.y[idx], attention_masks, idx

In [None]:
# Tokenizer saved with the pre-trained k-mer model, shared by both splits
kmer_tokenizer = BertTokenizer.from_pretrained(output_path)
train_kmer = GroverDataSet(
    sequences=train_data_kmer["X"].values,
    y=train_data_kmer["class"],
    tokenizer=kmer_tokenizer,
    max_length=50,
)
test_kmer = GroverDataSet(
    sequences=test_data_kmer["X"].values,
    y=test_data_kmer["class"],
    tokenizer=kmer_tokenizer,
    max_length=50,
)

In [None]:
num_labels=len(kmer_vocab)

# NOTE(review): `model_name` and `model_path` are not defined in the visible
# cells; set them before running this cell.
# Model and tokenizer are chosen together so that they always match.
if "NT" in model_name:
    checkpoint = "InstaDeepAI/nucleotide-transformer-500m-human-ref"
    kmer_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels, trust_remote_code=True)
    kmer_tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    # Reuse this tokenizer's own padding token as its end-of-sequence token;
    # the pad token of the unrelated BERT `tokenizer` was used before.
    kmer_tokenizer.eos_token = kmer_tokenizer.pad_token
elif "DNABERT2" in model_name:
    checkpoint = "zhihan1996/DNABERT-2-117M"
    kmer_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels, trust_remote_code=True)
    kmer_tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
else:
    kmer_model = BertForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)
    kmer_tokenizer = BertTokenizer.from_pretrained(output_path)

kmer_model.to('cuda')

In [None]:
output_path_next_kmer = ""

# Evaluate, log and checkpoint once per epoch; the checkpoint with the best
# "f1" metric is reloaded at the end of training.
training_args = TrainingArguments(
    output_dir=output_path_next_kmer,
    num_train_epochs=5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    learning_rate=5e-6,
    weight_decay=0.01,
    warmup_steps=100,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    seed=50,
    dataloader_num_workers=10,
)

def compute_metrics(logits: np.ndarray, labels: np.ndarray = None):
    """Compute classification metrics from logits and labels.

    Can be called as `compute_metrics(logits, labels)` or, as the Trainer
    does, with a single argument that unpacks into `(logits, labels)`.
    Labels equal to -100 are ignored. One-hot label rows are converted to
    class ids first.

    Returns a dict with accuracy, macro F1, Matthews correlation, macro
    precision and macro recall.
    """
    if labels is None:
        # Single-argument call: unpack the (predictions, label_ids) pair.
        logits, labels = logits
    if isinstance(logits, tuple):
        # Some models return extra outputs; the logits come first.
        logits = logits[0]

    labels = np.asarray(labels)
    if labels.ndim > 1:
        labels = labels.argmax(axis=-1)

    predictions = np.argmax(logits, axis=-1)
    valid_mask = labels != -100 
    valid_predictions = predictions[valid_mask]
    valid_labels = labels[valid_mask]
    return {
        "accuracy": accuracy_score(valid_labels, valid_predictions),
        "f1": f1_score(
            valid_labels, valid_predictions, average="macro", zero_division=0
        ),
        "matthews_correlation": matthews_corrcoef(
            valid_labels, valid_predictions
        ),
        "precision": precision_score(
            valid_labels, valid_predictions, average="macro", zero_division=0
        ),
        "recall": recall_score(
            valid_labels, valid_predictions, average="macro", zero_division=0
        ),
    }

# Initialize our Trainer. `args` must be passed explicitly, otherwise the
# TrainingArguments defined above are ignored.
trainer = Trainer(
    model=kmer_model,
    args=training_args,
    train_dataset=train_kmer,
    eval_dataset=test_kmer,
    tokenizer=kmer_tokenizer,
    compute_metrics = compute_metrics
    
)

In [None]:
trainer.train()

trainer.save_state()
# NOTE(review): safe_save_model_for_hf_trainer is not defined in the visible
# cells, and it writes to `output_path` (the pre-training folder) rather than
# `output_path_next_kmer` — confirm both.
safe_save_model_for_hf_trainer(trainer=trainer, output_dir=output_path)

# Evaluate on the next-k-mer test set; `test_dataset` is the masked-language-
# modeling dataset and does not carry class labels.
results = trainer.evaluate(eval_dataset=test_kmer)
with open(os.path.join(output_path_next_kmer, "eval_results.json"), "w") as f:
    json.dump(results, f)

### Figure 2C. 
Comparison of accuracy to Term Frequency-Inverse Document Frequency (TF-IDF) models, which use 2 to 6 nucleotide long k-mers and the GROVER vocabulary (BPE-600). These models take only token frequencies into account, which are used to train a random forest model. They do not learn context between tokens.

In [None]:
kmer = 2 # 3,4,5,6


def _overlapping_kmers(sequences, k):
    """Turn every sequence into a space-separated string of its overlapping k-mers."""
    return [
        " ".join(text[i: i + k] for i in range(len(text) - k + 1))
        for text in sequences
    ]


X_train = _overlapping_kmers(train.sequence, kmer)
X_test = _overlapping_kmers(test.sequence, kmer)
X_val = _overlapping_kmers(val.sequence, kmer)

# The vectorizer is fitted on the training split only.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)

with open(output_path + "/TfidfVectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

best_val_mcc = -1

for n_estimators in range(100, 2001, 200):
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=0, n_jobs=-1)

    clf.fit(X_train, train.label)

    y_val = clf.predict(X_val)

    scores_val = calculate_metric_with_sklearn(val.label, y_val)

    if scores_val["matthews_correlation"] > best_val_mcc:
        # Model selection is driven by the validation score only; the test
        # score of the selected model is what gets reported.
        best_val_mcc = scores_val["matthews_correlation"]

        y_test = clf.predict(X_test)

        scores_test = calculate_metric_with_sklearn(test.label, y_test)
        with open(os.path.join(output_path, "eval_results.json"), "w") as f:
            json.dump(scores_test, f)

In [None]:
# Documents made of the tokens produced by the tokenizer
X_train = [" ".join(tokenizer.tokenize(text)) for text in train.sequence]
X_test = [" ".join(tokenizer.tokenize(text)) for text in test.sequence]
X_val = [" ".join(tokenizer.tokenize(text)) for text in val.sequence]

# The vectorizer is fitted on the training split only.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)

with open(output_path + "/TfidfVectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

best_val_mcc = -1

for n_estimators in range(100, 2001, 200):
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=0, n_jobs=-1)

    clf.fit(X_train, train.label)

    y_val = clf.predict(X_val)

    scores_val = calculate_metric_with_sklearn(val.label, y_val)

    if scores_val["matthews_correlation"] > best_val_mcc:
        # Model selection is driven by the validation score only; the test
        # score of the selected model is what gets reported.
        best_val_mcc = scores_val["matthews_correlation"]

        y_test = clf.predict(X_test)

        scores_test = calculate_metric_with_sklearn(test.label, y_test)
        with open(os.path.join(output_path, "eval_results.json"), "w") as f:
            json.dump(scores_test, f)

### Figure 2D. 
Performance assessment of GROVER with 600 cycles Byte-Pair Tokenization using accuracy for the masked token being predicted as the TOP 1 token, up to TOP 60, i.e. the TOP 10%.

First, we will explain how we obtained the data from the whole reference genome. Then we will show how to train GROVER.

In [21]:
root_folder = "wg/"
# The training cells below address the same data root as `root_path`; bind
# both names so they point at the whole-genome folder.
root_path = root_folder

The tokenized chromosomes can be found in the folder tokenized_chromosomes. In case you want to know how they were generated, these are the steps to tokenize the whole genome.

In [None]:
import random
import glob
import pickle
import os
import argparse
from Bio import SeqIO

def n_intervals(seq):
    """Return the [start, end) intervals of `seq` that are free of N.

    The IUPAC ambiguity codes R, Y, S, W, K, M, B, D, H and V are treated
    like N. Every maximal stretch without such a character is reported as a
    `[start, end]` list (end exclusive), including stretches at the very
    beginning or end of the sequence. A sequence without any N yields a
    single interval covering it; an empty sequence yields an empty list.
    """
    import re

    # str() lets the function accept plain strings as well as sequence
    # objects that convert to their letters.
    return [
        [match.start(), match.end()]
        for match in re.finditer(r"[^NRYSWKMBDHV]+", str(seq))
    ]

def find_indices(seq, first, second):
    """Locate the non-overlapping occurrences of the bigram (first, second).

    Returns a dict whose keys are the positions in `seq` of the first token
    of every match; every value is 1.
    """
    matches = {}
    previous_matched = False
    for position, pair in enumerate(zip(seq, seq[1:])):
        # A match directly following another match is skipped, so that
        # AAA is tokenized as AA A and not as AA AA.
        if pair == (first, second) and not previous_matched:
            matches[position] = 1
            previous_matched = True
        else:
            previous_matched = False
    return matches

def merge_tokens(seq, ixs, first, second):
    """Apply one merge rule at the positions listed in `ixs`.

    Every position in `ixs` is overwritten in `seq` (in place) with the
    merged token `first + second`; the returned list is a new list in which
    the token following each of those positions has been dropped.
    """
    merged = first + second
    for position in ixs:
        seq[position] = merged

    kept = []
    for position, token in enumerate(seq):
        # The token right after a merge position was absorbed by the merge.
        if position - 1 in ixs:
            continue
        kept.append(token)
    return kept

# The vocabulary is the same for every chromosome: load it once, outside the
# loop, and close the file handle afterwards.
vocab_iter_file = root_folder + "/vocab_info.pkl"
with open(vocab_iter_file, "rb") as vocab_file:
    vocab_iter = pickle.load(vocab_file)
# Element 1 of every vocabulary entry is the (first, second) merge rule.
merge_rules = [value[1] for value in vocab_iter.values()]

for chrom in range(1, 25):

    chrom_file = "seq_chr_"+str(chrom)+".fasta" ## Fasta file per chromosome
    chrom_seq = SeqIO.read(chrom_file, "fasta").seq
    these_n_intervals = n_intervals(chrom_seq)
    sequences = [chrom_seq[interval[0] : interval[1]] for interval in these_n_intervals]

    tok_chrom = []
    for sequence in sequences:
        tokenized_sequence = list(sequence)
        # Apply the merge rules in vocabulary order.
        for rule in merge_rules:
            ixs = find_indices(tokenized_sequence, rule[0], rule[1])
            tokenized_sequence = merge_tokens(tokenized_sequence, ixs, rule[0], rule[1])

        tok_chrom.append(tokenized_sequence)

    with open("chr_"+str(chrom)+".pkl", "wb") as tok_chrom_file:
        pickle.dump(tok_chrom, tok_chrom_file)

You can access the train and test data from the root folder.

In case you want to know how we generated the data, this is the code.

In [None]:
## Create windows
import random
import pickle
from random import shuffle

# Cut every tokenized sequence into consecutive windows
# [chromosome, sequence index, start token, end token].
windows = []
for chrom in range(1, 25):
    tok_chrom = pickle.load(open("tokenized_chromosomes/chr_"+str(chrom)+".pkl", "rb"))
    for s, sequence in enumerate(tok_chrom):
        length_seq = len(sequence)
        window_start = 0
        # Stop as soon as 510 tokens or fewer remain in the sequence.
        while length_seq - window_start > 510:
            # About half of the windows are 510 tokens long, the others get
            # a random length between 20 and 510 tokens.
            if random.uniform(0, 1) < 0.5:
                length = 510
            else:
                length = random.randint(20, 510)
            window_end = window_start + length
            windows.append([chrom, s, window_start, window_end])
            window_start = window_end

            
# Shuffle the windows, then assign them to the train / test split by their
# position in the shuffled list, grouped per chromosome.
# NOTE(review): `train_length` is not defined in the visible cells.
shuffle(windows)
windows_per_chromosome = {}
for w, window in enumerate(windows):
    chrom = window[0]
    if chrom not in windows_per_chromosome:
        windows_per_chromosome[chrom] = {"train": [], "test": []}
    split = "test" if w > train_length else "train"

    # The chromosome is the dictionary key, so only the rest of the window is stored.
    windows_per_chromosome[chrom][split].append(window[1:])
    
samples_per_chromosome = {}
train_factor = 3
test_factor = 2

nb_train_windows = 0
nb_test_windows = 0

# Number of random sub-windows drawn from every window, per split. An exact
# lookup replaces the substring tests `split in "train"` / `split in "test"`.
factors = {"train": train_factor, "test": test_factor}

for chrom in windows_per_chromosome:
    for split in windows_per_chromosome[chrom]:
        factor = factors[split]

        new_windows = []
        for window in windows_per_chromosome[chrom][split]:
            window_start = window[-2]
            window_end = window[-1]
            window_length = window_end - window_start
            # Only windows longer than 50 tokens are sub-sampled.
            if window_length > 50:
                for _ in range(factor):
                    # Every sub-window spans at least 20 tokens.
                    random_start = random.randint(window_start + 1, window_end - 20)
                    random_end = random.randint(random_start + 20, window_end)
                    new_windows.append([window[0], random_start, random_end])
        windows_per_chromosome[chrom][split].extend(new_windows)
        
# Write the train and test samples in a single pass, so that every
# chromosome pickle is loaded once instead of twice, and close every file.
# NOTE(review): the windows were created from "tokenized_chromosomes/chr_*.pkl"
# while the samples are read from "chr_*.pkl" — confirm both point at the same files.
with open(root_folder + "train.txt", "w") as train_file, \
        open(root_folder + "test.txt", "w") as test_file:
    for chrom in range(1, 25):
        with open("chr_"+str(chrom)+".pkl", "rb") as tok_chrom_file:
            tok_chrom = pickle.load(tok_chrom_file)
        for split, split_file in (("train", train_file), ("test", test_file)):
            for window in windows_per_chromosome[chrom][split]:
                # window = [sequence index, start token, end token]
                sample = tok_chrom[window[0]][window[1]: window[2]]
                split_file.write(" ".join(sample) + "\n")

In [None]:
## Training GROVER

# NOTE(review): `vocabs` and `iteration` are not set in this part of the
# notebook; presumably they come from the chromosome 21 cells — confirm.
vocab_file_path = root_path + "vocab.txt"

# Special tokens first, then element 0 of every vocabulary entry, one per line.
with open(vocab_file_path, "w") as vocab_file:
    vocab_file.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
    vocab_file.writelines(entry[0] + "\n" for entry in vocabs[iteration].values())

tokenizer = BertTokenizer.from_pretrained(vocab_file_path)


class LineByLineTextDataset(Dataset):
    """Dataset with one encoded example per non-empty line of a text file.

    Args:
        tokenizer: object providing `batch_encode_plus`.
        file_path: path of the text file, one sample per line.
        block_size: maximum number of token ids per example (special tokens
            included); longer samples are truncated.
    """

    def __init__(self, tokenizer: "PreTrainedTokenizer", file_path: str, block_size=512):
        with open(file_path, encoding="utf-8") as f:
            # Keep every non-blank line. Slicing with [:-1] dropped the last
            # sample whenever the file did not end with an empty line.
            lines = [line for line in f.read().splitlines() if line and not line.isspace()]

        # truncation=True makes max_length effective, as in the encode_plus
        # call of GroverDataSet.
        self.examples = tokenizer.batch_encode_plus(
            lines, add_special_tokens=True, max_length=block_size, truncation=True
        )["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)


# Whole-genome test and train samples
test_dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=f"{root_path}/test.txt")
train_dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=f"{root_path}/train.txt")
    

In [None]:
# Masking probability of the language-modeling collator. It has to be defined
# before the collator is built; it is not a TrainingArguments field.
mlm_probability = 0.022

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
        mlm_probability=mlm_probability)


# BertForMaskedLM expects a configuration object, not the path of the JSON file.
model = BertForMaskedLM(config=BertConfig.from_json_file(root_path + "config.json"))
output_path = ""

# Only fields that TrainingArguments defines are passed:
# - the block size (512) is applied by LineByLineTextDataset,
# - the output folder is called `output_dir`,
# - num_train_epochs is omitted because max_steps takes precedence over it,
# - evaluation every 500 steps replaces the former `evaluate_during_training`.
training_args = TrainingArguments(
    output_dir=output_path,
    gradient_accumulation_steps=25,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=6,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    save_steps=500,
    save_total_limit=20,
    max_steps=20000,
    learning_rate=4e-4,
    adam_epsilon=1e-6,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.98,
    warmup_steps=1000,
)

# Initialize our Trainer; without `args` the settings above would be ignored.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
trainer.train()
trainer.save_model()

# Perplexity on the evaluation set = exp(evaluation loss).
perplexity_history = {}

step = 500  # fallback key when a log entry carries no 'step' field
for log_history in trainer.state.log_history:
    if 'eval_loss' in log_history:
        # Evaluation entries hold 'eval_loss'; reading 'loss' from them fails.
        perplexity_history[log_history.get('step', step)] = math.exp(log_history['eval_loss'])
        step += 500

with open(root_path + "perplexity.pkl", "wb") as perplexity_file:
    pickle.dump(perplexity_history, perplexity_file)

In [None]:
## Prediction of a masked token per sample

from sklearn.metrics import top_k_accuracy_score
from scipy.special import softmax

def collate(examples: List[torch.Tensor]):
    """Pad the tensors in ``examples`` to a common length and stack them batch-first.

    The tokenizer's pad token id is used as the fill value when the tokenizer
    defines a pad token; otherwise ``pad_sequence`` uses its own default.
    """
    pad_options = {}
    if tokenizer._pad_token is not None:
        pad_options["padding_value"] = tokenizer.pad_token_id
    return pad_sequence(examples, batch_first=True, **pad_options)

import numpy as np
from tqdm import tqdm
from transformers import BertForMaskedLM

eval_sampler = SequentialSampler(test_dataset)
eval_dataloader = DataLoader(
    test_dataset, sampler=eval_sampler, batch_size=32, collate_fn=collate)


device = "cuda:0"
# The masked-LM head is required to obtain one score per vocabulary entry;
# a bare BertModel only returns hidden states.
model = BertForMaskedLM.from_pretrained(output_path, config=root_path + "config.json")
model.to(device)
model.eval()

mask_token_id = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
predictions = []
labels = []
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    with torch.no_grad():

        # One random score per position; special and padding positions are set
        # to 0 so that argmax picks a random regular token in every sample.
        probability_matrix = torch.rand(batch.shape)
        special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in batch.tolist()]
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        if tokenizer._pad_token is not None:
            padding_mask = batch.eq(tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)

        token_per_sample = torch.argmax(probability_matrix, dim=1)
        sample_rows = torch.arange(batch.shape[0])

        # The true ids must be stored before they are overwritten by the mask token.
        labels.append(batch[sample_rows, token_per_sample].numpy())
        batch[sample_rows, token_per_sample] = mask_token_id

        outputs = model(batch.to(device))
        # outputs[0] holds the vocabulary logits; keep only the masked position of each sample.
        prediction = outputs[0].detach().cpu()[sample_rows, token_per_sample].numpy()
        predictions.append(softmax(prediction, axis=-1))

labels = np.hstack(labels)
predictions = np.vstack(predictions)

max_k = 300
vocabulary_ids = np.arange(predictions.shape[1])
top_k = np.zeros(max_k)
# top_k[k - 1] stores the top-k accuracy, for k = 1 .. max_k inclusive.
for k in range(1, max_k + 1):

    top_k[k-1] = top_k_accuracy_score(labels, predictions, k=k, labels=vocabulary_ids)

    print(k, top_k[k-1] )

with open(root_path + "top_k.pkl", "wb") as top_k_file: ## FINAL_FILE
    pickle.dump(top_k, top_k_file)
with open(root_path + "labels.pkl", "wb") as labels_file:
    pickle.dump(labels, labels_file)
with open(root_path + "predictions.pkl", "wb") as predictions_file:
    pickle.dump(predictions, predictions_file)

### Figure 2E.  
Performance assessment of GROVER with 600 cycles Byte-Pair Tokenization using perplexity, divided by the total number of words in the dictionary. Comparison with models based on k-mer-tokenization, with length of 4, 5, and 6 nucleotides.

## Figure 3
The frequency balanced GROVER vocabulary shows differential learning performance by token length

To generate Figure 3F. and Figure 3G. we need the metrics per token.

In [None]:
from sklearn.metrics import roc_auc_score

def roc_auc_score_multiclass(actual_class, pred_class, average = "weighted"):
    """Compute a one-vs-rest ROC AUC for every class that occurs in ``actual_class``.

    Args:
        actual_class: iterable of true class labels, one per sample.
        pred_class: iterable of predicted class labels (not probabilities),
            aligned with ``actual_class``.
        average: forwarded unchanged to ``sklearn.metrics.roc_auc_score``.

    Returns:
        dict mapping each class found in ``actual_class`` to its ROC AUC.

    A sample counts as positive for a class only when its label equals that
    class. Predicted labels that never occur in ``actual_class`` are therefore
    negatives for every class.
    """
    roc_auc_dict = {}
    # One binary problem per class present in the ground truth.
    for per_class in set(actual_class):

        # marking the current class as 1 and all other classes as 0
        new_actual_class = [1 if x == per_class else 0 for x in actual_class]
        new_pred_class = [1 if x == per_class else 0 for x in pred_class]

        # using the sklearn metrics method to calculate the roc_auc_score
        roc_auc_dict[per_class] = roc_auc_score(new_actual_class, new_pred_class, average = average)

    return roc_auc_dict


with open(root_path + "labels.pkl", "rb") as labels_file:
    labels = pickle.load(labels_file)
with open(root_path + "predictions.pkl", "rb") as predictions_file:
    predictions = pickle.load(predictions_file)

num_tokens = 609
max_k = 300
vocabulary_ids = np.arange(num_tokens)

# acc_per_token[t, k - 1] is the top-k accuracy over the samples whose true token is t.
acc_per_token = np.zeros((num_tokens, max_k))
for t in range(num_tokens):
    # The sample selection does not depend on k, so it is done once per token.
    this_token_ids = np.where(labels == t)[0]
    if len(this_token_ids) == 0:
        continue
    this_token_labels = labels[this_token_ids]
    this_token_predictions = predictions[this_token_ids]
    for k in range(1, max_k + 1):
        acc_per_token[t, k - 1] = top_k_accuracy_score(this_token_labels, this_token_predictions, k=k, labels=vocabulary_ids)


# roc_auc_score_multiclass compares class labels, so the probability matrix is
# reduced to the most likely token of each sample first.
predicted_tokens = np.argmax(predictions, axis=1)
auc_per_token = roc_auc_score_multiclass(labels, predicted_tokens, "macro")


# NOTE(review): `tokens` is not defined in this cell; in this notebook it is
# read from vocab.txt in the Figure 5A cell, so that code has to run first.
output_file_path = "metrics_per_token.csv" ## FINAL_FILE
with open(output_file_path, "w") as output:
    header_columns = ["token_id", "token", "auc"] + ["top" + str(k) for k in range(1, max_k + 1)]
    output.write(",".join(header_columns) + "\n")
    for t, token in enumerate(tokens):
        # Tokens that never occur as a label have no AUC and are written as 0.
        auc = auc_per_token.get(t, 0)
        row = [str(t), token, str(auc)] + [str(acc_per_token[t, k]) for k in range(max_k)]
        output.write(",".join(row) + "\n")

## Figure 4
Average GROVER token embedding shows learning of genome information content

In [None]:
from gensim.models import Word2Vec
from sklearn.decomposition import PCA

# Train a Word2Vec baseline on the tokenized training text (the last line of
# the file is dropped and at most the first 300000 lines are used).
with open(root_path + "/train.txt") as train_file:
    data = train_file.readlines()[:-1]
sent = [row.split() for row in data[:300000]]
# Only the class `Word2Vec` is imported in this cell; the module name `gensim`
# is never bound, so the class has to be called directly.
w2v_model = Word2Vec(sent, min_count = 1, vector_size = 768, window = 5, workers=10)

with open(root_path + "iter600_w2v.pkl", "wb") as vectors_file: ## FINAL_FILE
    pickle.dump(w2v_model.wv.vectors, vectors_file)

with open(root_path + "iter600_w2v_vocab.txt", "w") as vocab_file: ## FINAL_FILE
    for word in w2v_model.wv.index_to_key:
        vocab_file.write(word + "\n")

In [None]:
# Load the trained encoder and export its input (word) embedding matrix.
model = BertModel.from_pretrained(output_path, config=root_path + "config.json")

# Space-separated vocabulary string built from every vocab.txt line after the
# first five; each entry is followed by a single space.
with open(root_path +"vocab.txt", "r") as vocabulary_file:
    vocabulary_lines = vocabulary_file.readlines()
vocabulary = "".join(entry.replace("\n", "") + " " for entry in vocabulary_lines[5:])

word_embedding_weights = model.embeddings.word_embeddings.weight
embedding_matrix = word_embedding_weights.to("cpu").detach().numpy()

with open("vocab_embedding.pkl", "wb") as embedding_file: ## FINAL_FILE
    pickle.dump(embedding_matrix, embedding_file)

## Figure 5
GROVER learns token context and genome annotation

### Figure 5A.
Self-similarity per token sequence as extracted by cosine similarity of the same token in different contexts throughout the 12 transformer layers.

In [None]:
def collate(examples: List[torch.Tensor]):
    """Pad the tensors in ``examples`` to a common length and stack them batch-first.

    The tokenizer's pad token id is used as the fill value when the tokenizer
    defines a pad token; otherwise ``pad_sequence`` uses its own default.
    """
    pad_options = {}
    if tokenizer._pad_token is not None:
        pad_options["padding_value"] = tokenizer.pad_token_id
    return pad_sequence(examples, batch_first=True, **pad_options)

eval_sampler = SequentialSampler(test_dataset)
eval_dataloader = DataLoader(
    test_dataset, sampler=eval_sampler, batch_size=32, collate_fn=collate)

device = "cuda:0"
embeddings_dir = "embeddings_test/"
os.makedirs(embeddings_dir, exist_ok=True)

model = BertModel.from_pretrained(output_path, config=root_path + "config.json")
model.to(device)  # inputs are moved to the GPU below, so the model has to live there too
model.eval()
count = 0
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    with torch.no_grad():
        batch = batch.to(device)
        outputs = model(batch, output_hidden_states=True)
        # hidden_states holds the embedding output followed by one tensor per
        # transformer layer; the second output of BertModel is the pooled
        # vector and cannot be indexed by layer.
        embeddings = torch.stack(list(outputs.hidden_states), dim=0).detach().cpu().numpy()
        # One pickle per batch, named after the batch counter.
        with open(embeddings_dir + str(count) + ".pkl", "wb") as embeddings_file:
            pickle.dump(embeddings, embeddings_file)
        del embeddings
        count += 1


## Group the embeddings per token
# `tokens` holds the vocabulary entries in file order; a token's position in
# this list is used as its id below (tokens[token_id]).
tokens = []
with open(root_path +"vocab.txt") as vocab_file:
    for line in vocab_file.readlines():
        tokens.append(line.replace("\n", ""))

# Ids that are collected: 7 and every id from 9 up to the end of the vocabulary.
# NOTE(review): presumably this skips special / unwanted vocabulary entries — confirm against vocab.txt.
tokens_to_get = np.asarray([[7] + list(range(9, len(tokens)))][0])

# Target number of embeddings per token; once reached, the token is skipped.
total_samples = 5000

# For every layer index 1..12, walk through the per-batch pickles written to
# embeddings_test/ and collect the embedding of each occurrence of each token.
for layer in range(1,13):
    token_embeddings = [[] for _ in range(len(tokens_to_get))]
    token_counts = [0 for _ in range(len(tokens_to_get))]
    count = 0
    # The dataloader is iterated again only to recover the token ids of each
    # batch; `count` selects the pickle that belongs to the same batch.
    for batch in eval_dataloader:
        batch = batch.numpy()
        # Stop as soon as there is no stored pickle for this batch number.
        if not os.path.exists("embeddings_test/"+str(count)+".pkl"):
            break
        # Keep only the slice of the stored array that belongs to the current layer.
        embeddings = pickle.load(open("embeddings_test/"+str(count)+".pkl", "rb"))[layer]
        for t_idx, token_id in enumerate(tokens_to_get):
            if token_counts[t_idx] >= total_samples:
                print("Token ",tokens[token_id], "is full")
                continue   
            # Index arrays of every position in the batch that holds this token id.
            condition = np.nonzero(batch == token_id)
            times = len(condition[0])
            if times > 0:   
                # All occurrences of a batch are added at once, so a count can
                # end up above total_samples; the self-similarity step slices
                # the loaded array to its first 5000 rows.
                token_embeddings[t_idx].append(embeddings[condition])
                token_counts[t_idx] += times


        del embeddings
        count += 1
        # Leave the batch loop early once every token has reached the target.
        all_full = True
        for t_idx in range(len(tokens_to_get)):
            if token_counts[t_idx] < total_samples:
                all_full = False
                break
        if all_full:
            break
    # One file per (token id, layer) in the current working directory.
    # NOTE(review): np.vstack is called even when nothing was collected for a
    # token; an empty list would make it fail — confirm every token occurs.
    for t_idx, token_id in enumerate(tokens_to_get):
        print("Saving file for token ", tokens[token_id])
        pickle.dump(np.vstack(token_embeddings[t_idx]), open(str(token_id)+"_layer_"+str(layer)+".pkl", "wb"))
        
# Mean pairwise cosine similarity between the stored embeddings of one token,
# computed separately for each of the 12 layers.
layer_numbers = range(1, 13)
self_sim_per_token = np.zeros((len(tokens_to_get), 12))
for t_idx, token_id in enumerate(tokens_to_get):
    for l_idx, layer in enumerate(layer_numbers):
        with open(f"{token_id}_layer_{layer}.pkl", "rb") as embedding_file:
            token_embeddings = pickle.load(embedding_file)[:5000]
        pairwise = cosine_similarity(token_embeddings)
        # The strict upper triangle contains every unordered pair exactly once
        # and leaves out the diagonal (similarity of an embedding with itself).
        pair_rows, pair_cols = np.triu_indices(len(token_embeddings), k = 1)
        self_sim_per_token[t_idx, l_idx] = pairwise[pair_rows, pair_cols].mean()
        
        
# Write one CSV row per collected token: id, token text and the 12 layer values.
output_file_path = "self_similarity_per_token.csv" ## FINAL_FILE
layer_columns = ["layer" + str(layer_number) for layer_number in range(1, 13)]
with open(output_file_path, "w") as output:
    output.write(",".join(["token_id", "token"] + layer_columns) + "\n")
    for t_idx, t in enumerate(tokens_to_get):
        row = [str(t), tokens[t]]
        row.extend(str(self_sim_per_token[t_idx, layer_idx]) for layer_idx in range(12))
        output.write(",".join(row) + "\n")

### Figure 5B,5C,5D.
Embedding of regions in the genome 510 tokens in size

In [None]:
import os.path
import pickle
from tqdm import tqdm
import argparse

from typing import List
from torch.nn.utils.rnn import pad_sequence
import torch
from transformers import BertTokenizer,  BertModel, PreTrainedTokenizer
from torch.utils.data import DataLoader, SequentialSampler, Dataset


class notNsample(Dataset):
    """Fixed-size windows over a tokenized chromosome that contain no "N" token.

    The [start, end) token indices of all accepted windows are computed once
    and cached in a pickle file; later instances load that file instead of
    rescanning the chromosome.

    Args:
        tokenizer: object providing ``convert_tokens_to_ids`` and
            ``build_inputs_with_special_tokens``.
        tokenized_chromosome: sequence of tokens for one chromosome.
        indices_list_path: path of the pickle used to cache the window indices.
        sample_size: number of tokens per window.
        max_length: stored on the instance; not used by this class itself.
        stride: distance in tokens between the starts of consecutive windows.
    """

    def __init__(self, tokenizer: "PreTrainedTokenizer", tokenized_chromosome, indices_list_path, sample_size, max_length=512, stride=510):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.tokenized_chromosome = tokenized_chromosome
        self.indices_list = []

        if os.path.exists(indices_list_path):
            print(f"Loading indices_list from cached file {indices_list_path}")
            with open(indices_list_path, "rb") as handle:
                self.indices_list = pickle.load(handle)
        else:
            print("Creating features from dataset file at", indices_list_path)
            # "+ 1" keeps the window that ends exactly on the last token.
            last_start = len(tokenized_chromosome) - sample_size
            for start in range(0, last_start + 1, stride):
                this_sample = tokenized_chromosome[start: start + sample_size]
                if "N" not in this_sample:
                    self.indices_list.append([start, start + sample_size])
            with open(indices_list_path, "wb") as handle:
                pickle.dump(self.indices_list, handle)

    def __len__(self):
        return len(self.indices_list)

    def __getitem__(self, i):
        start, end = self.indices_list[i][0], self.indices_list[i][1]
        sample = self.tokenized_chromosome[start:end]
        # Tokens -> ids, then let the tokenizer add its special tokens.
        tokenized_text = self.tokenizer.convert_tokens_to_ids(sample)
        tokenized_text = self.tokenizer.build_inputs_with_special_tokens(tokenized_text)
        return torch.tensor(tokenized_text, dtype=torch.long)

def main():
    """Embed every N-free 510-token window of one chromosome and store the first-position vectors.

    Command line:
        --chromosome  identifier used in the input file name
                      ``tokenized_chromosomes/chr_<chromosome>.pkl`` and in the
                      names of the output files.

    Writes the cached window indices and a pickle holding one embedding
    (a list of floats) per window.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--chromosome", default=None, type=str, required=True
    )
    args = parser.parse_args()
    chromosome = args.chromosome
    # NOTE(review): output_path and model_path are empty in the notebook and
    # have to be filled in before running.
    output_path = ""
    # os.makedirs("") raises, so the directory is only created when one is configured.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    ## Create data
    with open("tokenized_chromosomes/chr_"+chromosome+".pkl", "rb") as chromosome_file:
        tokenized_chromosome = pickle.load(chromosome_file)
    sample_size = 510
    batch_size = 32

    device = "cuda"
    model_path = ""
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertModel.from_pretrained(model_path)
    model.to(device)

    def collate(examples):
        # Pad to the longest sample and mark the padded positions with 0 in the mask.
        samples = pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

        attention_mask = torch.ones_like(samples)
        attention_mask[samples == tokenizer.pad_token_id] = 0

        return {"samples": samples, "attention_mask": attention_mask}

    indices_list_path = output_path + "samples_chr_"+chromosome+"_indices_list.pkl"
    eval_dataset = notNsample(tokenizer, tokenized_chromosome, indices_list_path, sample_size)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=batch_size, collate_fn=collate
    )
    model.eval()


    ## Predicting

    embeddings = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        with torch.no_grad():
            samples = batch["samples"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            # A single forward pass that honours the attention mask; [:, 0]
            # selects the vector at the first position of every sample.
            outputs = model(samples, attention_mask=attention_mask)[0][:, 0]
            embeddings += outputs.detach().cpu().tolist()

    with open(output_path + "chr_"+chromosome+"_cls_tokens.pkl", "wb") as embeddings_file:
        pickle.dump(embeddings, embeddings_file)

if __name__ == "__main__":
    main()

## Figure 6.
GROVER outperforms other models for biological fine-tuning tasks

###### Figure 6B,6C,6E,6F,6H,6I

#### Datasets

annotations come from: https://epd.epfl.ch/human/human_database.php?db=human

In [None]:
root_path = "finetuning_tasks/Prom300/"

In [None]:
# Load the promoter ranges and order them by chromosome and start position.
annotations = pd.read_csv(root_path+'promoterRangesHg19.bed', sep='\t', header=None)
annotations.columns = ['chr', 'start', 'end', 'name', 'score', 'strand']
annotations.sort_values(by=['chr', 'start'], inplace=True)

# Strand-dependent extension of each range: "+" rows get 249 subtracted from
# start and 50 added to end; all other rows get 50 and 249 respectively.
on_plus_strand = annotations["strand"] == "+"
annotations.loc[on_plus_strand, "start"] -= 249
annotations.loc[~on_plus_strand, "start"] -= 50
annotations.loc[on_plus_strand, "end"] += 50
annotations.loc[~on_plus_strand, "end"] += 249


# load BP chromosome mapper
def load_mapper(BP_chr):
    """Create a mapper from nucleotide position to token (BP) position.

    Args:
        BP_chr: sequence of tokens; ``len(token)`` is the number of
            nucleotide positions the token covers.

    Returns:
        list in which entry ``p`` is the index in ``BP_chr`` of the token
        that covers nucleotide position ``p``.
    """
    mapper = []
    for token_index, token in enumerate(BP_chr):
        # One entry per nucleotide of the token, all pointing at the token.
        mapper.extend([token_index] * len(token))
    return mapper


# iterate through each chromosome
# File names use 23 and 24 for chromosomes X and Y.
sex_chromosome_numbers = {"X": 23, "Y": 24}
BP_seqs = []
for chrom in annotations["chr"].unique():
    # Drop the first three characters of the chromosome name to get its number.
    chrom_nr = chrom[3:]
    chrom_nr = sex_chromosome_numbers.get(chrom_nr, chrom_nr)
    # slice the annotations for the current chromosome
    annotations_chr = annotations.loc[annotations["chr"] == chrom]
    # load BP tokenized chromosome
    with open(f"tokenized_chromosomes/chr_{chrom_nr}.pkl", "rb") as f:
        BP_chr = pickle.load(f)

    # load mapper that maps nucleotide position to BP position
    print(f"loading mapper {chrom}")
    mapper = load_mapper(BP_chr)

    # Translate nucleotide coordinates into token indices and cut out the tokens.
    BP_starts = annotations_chr["start"].map(mapper.__getitem__)
    BP_ends = annotations_chr["end"].map(mapper.__getitem__)
    for start, end in zip(BP_starts, BP_ends):
        BP_seqs.append(BP_chr[start:end])

# The sequences were collected chromosome by chromosome, in row order.
annotations["BPseq"] = BP_seqs
# Character-level view: the tokens of a row joined and split into single characters.
annotations["sequence"] = [list("".join(token_seq)) for token_seq in BP_seqs]

##### Random mutate

Preprocessing like https://github.com/egochao/DeePromoter

Procedure for creating the negative dataset, as described in the paper:

    Step 1: Break the sequence in N parts(20 as in the paper)

    Old Step 2: Randomly choose M parts of the original sequence to keep, and randomly initialize the rest

    New Step 2: Randomly choose M parts of the original sequence to keep, and shuffle the rest around (preserves sequence attributes like GC content and AG balance)

In [None]:
# function that returns the indices of all elements of k chunks of size (len(lst) // k) in a list
def get_chunks(lst, k):
    """Return the indices of ``lst`` grouped into consecutive chunks.

    The chunk length is ``len(lst) // k``, but never less than 1, so a list
    shorter than ``k`` yields one chunk per element instead of failing on a
    zero step. When ``len(lst)`` is not a multiple of the chunk length, a
    shorter trailing chunk is added, so more than ``k`` chunks can be returned.

    Args:
        lst: sequence whose positions are chunked.
        k: requested number of chunks.

    Returns:
        list of lists of integer indices; empty for an empty ``lst``.
    """
    length = len(lst)
    chunkSize = max(1, length // k)
    # min() keeps the last chunk inside the list bounds.
    return [list(range(i, min(i + chunkSize, length))) for i in range(0, length, chunkSize)]

def choose_random_chunks(chunks, nrOfChunksToMutate):
    """Pick ``nrOfChunksToMutate`` distinct chunks at random, in the drawn order."""
    drawn_positions = np.random.choice(len(chunks), nrOfChunksToMutate, replace=False)
    picked = []
    for position in drawn_positions:
        picked.append(chunks[position])
    return picked

def unroll_chunks(chunks):
    """Flatten a list of chunks into one list, keeping the element order."""
    flat = []
    for chunk in chunks:
        flat.extend(chunk)
    return flat

def shuffleIndices(indices):
    """Return a randomly reordered copy of ``indices``; the argument itself is left untouched."""
    reordered = indices.copy()
    # np.random.shuffle works in place, which is why the copy is taken first.
    np.random.shuffle(reordered)
    return reordered

def random_mutate(seq, k, nrOfChunksToMutate):
    """Return a copy of ``seq`` in which the elements of randomly chosen chunks are shuffled.

    The positions of ``seq`` are split into chunks, ``nrOfChunksToMutate`` of
    them are drawn at random, and the elements at all drawn positions are
    permuted among those positions. Elements outside the drawn chunks keep
    their place, and the result contains exactly the same elements as ``seq``.

    Args:
        seq: sequence supporting ``.copy()`` and item assignment (e.g. a list of tokens).
        k: number of chunks requested from ``get_chunks``.
        nrOfChunksToMutate: number of chunks whose elements are shuffled.

    Returns:
        the mutated copy; ``seq`` itself is not modified.
    """
    mutated = seq.copy()
    chunks = get_chunks(seq, k=k)
    chunksToMutate = choose_random_chunks(chunks, nrOfChunksToMutate=nrOfChunksToMutate)
    indicesToMutate = unroll_chunks(chunksToMutate)
    shuffledIndicesToMutate = shuffleIndices(indicesToMutate)
    # Values are read from the untouched input. Reading from the copy that is
    # being written would pick up already-overwritten positions and duplicate
    # some elements while losing others.
    for target, source in zip(indicesToMutate, shuffledIndicesToMutate):
        mutated[target] = seq[source]
    return mutated

##### Shuffle GROVER sequence

In [None]:
seq = annotations.loc[1, 'BPseq']
nrOfChunks = 8 # in how many splits do we want to divide the sequence
nrOfChunksToMutate = 6 # how many of these splits do we want to mutate (randomly shuffle)

# One mutated counterpart per original token sequence, generated in row order.
annotations["BPseqMutated"] = [
    random_mutate(token_seq, nrOfChunks, nrOfChunksToMutate) for token_seq in annotations["BPseq"]
]

# Two frames with identical metadata columns: original sequences are labelled
# 1, mutated sequences are labelled 0.
labelled_frames = []
for sequence_column, label in (("BPseq", 1), ("BPseqMutated", 0)):
    frame = annotations[[sequence_column, "start", "end", "name", "strand", "chr"]]
    frame = frame.rename(columns={sequence_column: "X"})
    frame.insert(1, "y", label)
    labelled_frames.append(frame)
data_non_mutated, data_mutated = labelled_frames
data = pd.concat([data_non_mutated, data_mutated], ignore_index=True)

In [None]:
# split data into train, val, test: 80%, 10%, 10%
split_seed = 42
train = data.sample(frac=0.8, random_state=split_seed)
# Everything not drawn for training is divided evenly between val and test.
val_test = data.drop(index=train.index)
val = val_test.sample(frac=0.5, random_state=split_seed)
test = val_test.drop(index=val.index)

for file_name, subset in (("train.tsv", train), ("validate.tsv", val), ("test.tsv", test)):
    subset.to_csv(root_path + file_name, index=False, sep='\t')

In [None]:
root_path = "finetuning_tasks/PromScan/"

In [None]:
annotations = pd.read_csv(root_path + 'promoterRangesHg19.bed', sep='\t', header=None)
annotations.columns = ['chr', 'start', 'end', 'name', 'score', 'strand']

# Remember the original start coordinate as "tss" before the range is widened
# by 5000 positions on both sides.
flank = 5000
annotations["tss"] = annotations["start"].copy()
annotations["start"] = annotations["start"].sub(flank)
annotations["end"] = annotations["end"].add(flank)

In [None]:
## Split long sequences in overlapping windows of size 1001 nucleotides and stride of 300 nucl.

def split_in_overlapping_windows(start, end, window_size, stride, tss_pos, chr, strand, name):
    """Describe the windows of ``window_size`` that start every ``stride`` positions in [start, end).

    Each window is a dict with the keys "start", "end", "chr", "is_ tss",
    "strand" and "name". "is_ tss" is true when ``tss_pos`` lies inside the
    window, both borders included. A window's end is start + window_size and
    may lie beyond ``end``.
    """
    return [
        {
            "start": window_start,
            "end": window_start + window_size,
            "chr": chr,
            "is_ tss": window_start <= tss_pos <= window_start + window_size,
            "strand": strand,
            "name": name,
        }
        for window_start in range(start, end, stride)
    ]

# One list of window dicts per annotation row (window size 1001, stride 300).
stacked_windows = annotations.apply(lambda x: split_in_overlapping_windows(x["start"], x["end"], 1001, 300, x["tss"], x["chr"], x["strand"], x["name"]), axis=1)


# Flatten the per-row lists and turn them into columns; the window key
# "is_ tss" is stored under the column name "is_tss".
flat_windows = [window_info for row_windows in stacked_windows for window_info in row_windows]
column_sources = (
    ("start", "start"),
    ("end", "end"),
    ("chr", "chr"),
    ("is_tss", "is_ tss"),
    ("strand", "strand"),
    ("name", "name"),
)
windows_dict = {
    column: [window_info[source_key] for window_info in flat_windows]
    for column, source_key in column_sources
}

window_df = pd.DataFrame(windows_dict)

In [None]:
# load BP chromosome mapper
def load_mapper(BP_chr):
    """Create a mapper from nucleotide position to token (BP) position.

    Args:
        BP_chr: sequence of tokens; ``len(token)`` is the number of
            nucleotide positions the token covers.

    Returns:
        list in which entry ``p`` is the index in ``BP_chr`` of the token
        that covers nucleotide position ``p``.
    """
    mapper = []
    for token_index, token in enumerate(BP_chr):
        # One entry per nucleotide of the token, all pointing at the token.
        mapper.extend([token_index] * len(token))
    return mapper


# iterate through each chromosome
# Sequences are written to the row position of their window, so the result does
# not depend on the windows being grouped by chromosome in window_df.
BP_seqs = [None] * len(window_df)
for chrom in annotations["chr"].unique():
    chrom_nr = chrom[3:]
    if chrom_nr == "X":
        chrom_nr = 23
    elif chrom_nr == "Y":
        chrom_nr = 24
    # slice the windows for the current chromosome
    in_chrom = (window_df["chr"] == chrom).to_numpy()
    annotations_chr = window_df.loc[in_chrom]
    # load BP tokenized chromosome
    with open(f"tokenized_chromosomes/chr_{chrom_nr}.pkl", "rb") as f:
        BP_chr = pickle.load(f)

    # load mapper that maps nucleotide position to BP position
    print(f"loading mapper {chrom}")
    mapper = load_mapper(BP_chr)

    # NOTE(review): a negative window start indexes the mapper from its end and
    # a position past the chromosome raises IndexError — confirm the widened
    # ranges stay inside the chromosome.
    BP_starts = annotations_chr["start"].apply(lambda x: mapper[x])
    BP_ends = annotations_chr["end"].apply(lambda x: mapper[x])
    for row_position, start, end in zip(np.flatnonzero(in_chrom), BP_starts, BP_ends):
        BP_seqs[row_position] = BP_chr[start:end]

window_df["BPseq"] = BP_seqs
# The token sequences live in window_df; `annotations` has no "BPseq" column here.
window_df["sequence"] = window_df["BPseq"].apply(lambda x: list("".join(x)))

window_df.rename(columns={"BPseq": "X", "is_tss": "y"}, inplace=True)

In [None]:
# split into train, val, test: 80/10/10
# Plain positional slicing of the shuffled frame; this gives the same three
# parts as np.split without going through DataFrame.swapaxes, which newer
# pandas versions deprecate.
shuffled_windows = window_df.sample(frac=1, random_state=42)
train_end = int(.8*len(window_df))
validate_end = int(.9*len(window_df))
train = shuffled_windows.iloc[:train_end]
validate = shuffled_windows.iloc[train_end:validate_end]
test = shuffled_windows.iloc[validate_end:]

In [None]:
# Write the three splits as tab-separated files next to the input data.
for split_name, split_frame in (("train", train), ("validate", validate), ("test", test)):
    split_frame.to_csv(root_path + split_name + ".tsv", index=False, sep="\t")

CTCF peaks from https://www.encodeproject.org/experiments/ENCSR000BIE/

In [None]:
root_path = "finetuning_tasks/TF_binding/"

In [None]:
# Both inputs are tab-separated files without a header row; the first line of
# the GFF file is skipped.
peaks_path = root_path + 'ENCFF915BIE.bed'
motif_sites_path = root_path + 'CTCF_motif_sites.gff'
peaks = pd.read_csv(peaks_path, sep='\t', header=None)
motif_sites  = pd.read_csv(motif_sites_path, sep='\t', header=None, skiprows=1)

run in command line:
- intersectBed -loj -a CTCF_motif_sites.gff -b ENCFF915BIE.bed > CTCF_motifs_with_peak_annotation.bed

In [None]:
# Result of the intersectBed command: tab-separated, no header row.
motifs_with_peaks_path = root_path + 'CTCF_motifs_with_peak_annotation.bed'
motifs_with_peaks = pd.read_csv(motifs_with_peaks_path, sep='\t', header=None)

In [None]:
## Getting the center of motif
motif_start = motifs_with_peaks[3]
motif_end = motifs_with_peaks[4]
motifs_with_peaks["center_of_motif"] = motif_start + (motif_end - motif_start)//2

## Get 1kb area around motif
half_bin = 500
motif_center = motifs_with_peaks["center_of_motif"]
motifs_with_peaks["start_of_bin"] = motif_center - half_bin
motifs_with_peaks["end_of_bin"] = motif_center + half_bin
motifs_with_peaks["width"] = motifs_with_peaks["end_of_bin"] - motifs_with_peaks["start_of_bin"]

## Add target annotation column for machine learning task
# 1 when column 9 holds anything other than '.', 0 otherwise.
motifs_with_peaks["y"] = (motifs_with_peaks[9] != '.').astype("int64")

In [None]:
## Retrieve GROVER tokens

bin_columns = [0, "start_of_bin", "end_of_bin", "y"]
data = motifs_with_peaks[bin_columns]
# Keep rows whose name in column 0 has at most 5 characters.
# some random lines with chr1_gl000191_random etc
has_short_name = data[0].map(len).le(5)
data = data.loc[has_short_name]
data = data.sort_values(by=[0, "start_of_bin"])

In [None]:
def load_mapper(BP_chr):
    """Create a mapper from nucleotide position to token (BP) position.

    Entry ``p`` of the returned list is the index in ``BP_chr`` of the token
    that covers nucleotide position ``p``; each token contributes one entry
    per element of its length.
    """
    return [
        token_index
        for token_index, token in enumerate(BP_chr)
        for _ in range(len(token))
    ]

# iterate through each chromosome
# File names use 23 and 24 for chromosomes X and Y.
sex_chromosome_numbers = {"X": 23, "Y": 24}
BP_seqs = []
for chrom in data[0].unique():
    # Drop the first three characters of the chromosome name to get its number.
    chrom_nr = chrom[3:]
    chrom_nr = sex_chromosome_numbers.get(chrom_nr, chrom_nr)
    # slice the data for the current chromosome
    data_chr = data.loc[data[0] == chrom]
    # load BP tokenized chromosome
    with open(f"tokenized_chromosomes/chr_{chrom_nr}.pkl", "rb") as f:
        BP_chr = pickle.load(f)

    # load mapper that maps nucleotide position to BP position
    print(f"loading mapper {chrom}")
    mapper = load_mapper(BP_chr)

    # Translate the bin borders into token indices and cut out the tokens.
    BP_starts = data_chr["start_of_bin"].map(mapper.__getitem__)
    BP_ends = data_chr["end_of_bin"].map(mapper.__getitem__)
    for start, end in zip(BP_starts, BP_ends):
        BP_seqs.append(BP_chr[start:end])

In [None]:
data["X"] = BP_seqs

## create train val test (80/10/10)
split_seed = 0
train = data.sample(frac=0.8, random_state=split_seed)
# The remaining rows are divided evenly between test and val.
held_out = data.drop(index=train.index)
test = held_out.sample(frac=0.5, random_state=split_seed)
val = held_out.drop(index=test.index)

for file_name, subset in (("CTCF_train.tsv", train), ("CTCF_val.tsv", val), ("CTCF_test.tsv", test)):
    subset.to_csv(root_path + file_name, index=False, sep="\t")

In [None]:
import argparse
import pandas as pd
import json
import os
import numpy as np
from typing import Dict, Sequence
from sklearn.metrics import matthews_corrcoef, precision_score, recall_score, accuracy_score, f1_score

from transformers import BertForSequenceClassification, TrainingArguments, BertTokenizer, PreTrainedTokenizerFast, AutoTokenizer, AutoModelForSequenceClassification
import transformers
import torch
from torch.utils.data import Dataset
from dataclasses import dataclass, field

@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """``transformers.TrainingArguments`` with this project's fine-tuning defaults.

    Every field can still be overridden when the object is created. The class
    deliberately reuses the name of the base class, so inside this script
    ``TrainingArguments`` refers to this subclass.
    """
    run_name: str = field(default="run")
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(default=512, metadata={"help": "Maximum sequence length."})
    gradient_accumulation_steps: int = field(default=1)
    per_device_train_batch_size: int = field(default=1)
    per_device_eval_batch_size: int = field(default=1)
    num_train_epochs: int = field(default=1)
    fp16: bool = field(default=False)
    logging_steps: int = field(default=100)
    save_steps: int = field(default=100)
    eval_steps: int = field(default=100)
    # No trailing comma here: with one, the default becomes a one-element tuple
    # instead of the string "steps".
    evaluation_strategy: str = field(default="steps")
    warmup_steps: int = field(default=50)
    weight_decay: float = field(default=0.01)
    learning_rate: float = field(default=1e-4)
    save_total_limit: int = field(default=3)
    load_best_model_at_end: bool = field(default=True)
    output_dir: str = field(default="output")
    find_unused_parameters: bool = field(default=False)
    checkpointing: bool = field(default=False)
    dataloader_pin_memory: bool = field(default=False)
    eval_and_save_results: bool = field(default=True)
    save_model: bool = field(default=False)
    seed: int = field(default=42)

@dataclass
class DataCollatorForSupervisedDataset(object):
    """Turn a list of ``{"input_ids", "labels"}`` instances into one padded batch.

    ``input_ids`` are padded to a common length with the tokenizer's pad token
    id; the attention mask is true wherever ``input_ids`` differs from that id.
    """

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        id_sequences = [instance["input_ids"] for instance in instances]
        label_values = [instance["labels"] for instance in instances]

        padded_ids = torch.nn.utils.rnn.pad_sequence(
            id_sequences, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        label_tensor = torch.Tensor(label_values).long()
        return {
            "input_ids": padded_ids,
            "labels": label_tensor,
            "attention_mask": padded_ids.ne(self.tokenizer.pad_token_id),
        }

class SupervisedDataset(Dataset):
    """Tokenized sequences with one label each, for supervised fine-tuning.

    Args:
        texts: iterable of token sequences; the tokens of one sequence are
            concatenated into a single string before tokenization.
        labels: one label per text, in the same order (list, array or pandas Series).
        tokenizer: callable tokenizer returning ``input_ids`` and ``attention_mask``.
        max_length: maximum tokenized length; longer inputs are truncated.
        kmer: when truthy, every string is rewritten as its overlapping
            k-mers of this length, separated by spaces, before tokenization.
    """

    def __init__(self, texts, labels, tokenizer, max_length, kmer):

        super().__init__()

        texts = ["".join(text) for text in texts]

        if kmer:
            # Sliding window with step 1: "ACGT" with kmer=3 becomes "ACG CGT".
            sequences = [
                " ".join(text[i: i + kmer] for i in range(len(text) - kmer + 1))
                for text in texts
            ]
        else:
            sequences = texts

        output = tokenizer(
            sequences,
            add_special_tokens=True,
            max_length=max_length,
            padding="longest",
            return_tensors="pt",
            truncation=True
        )

        self.input_ids = output["input_ids"]
        self.attention_mask = output["attention_mask"]
        # Stored as a plain list so that __getitem__ is positional even when a
        # pandas Series with a non-default index is passed in.
        self.labels = list(labels)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(input_ids=self.input_ids[i], labels=self.labels[i])
    
def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
    """Copy the model's state dict to CPU and save it through the trainer.

    Nothing is written when ``trainer.args.should_save`` is false.
    """
    model_state = trainer.model.state_dict()
    if not trainer.args.should_save:
        return
    cpu_state_dict = {name: tensor.cpu() for name, tensor in model_state.items()}
    # Drop the reference to the original tensors before saving.
    del model_state
    trainer._save(output_dir, state_dict=cpu_state_dict)

def calculate_metric_with_sklearn(logits: np.ndarray, labels: np.ndarray):
    """Compute classification metrics from logits, ignoring labels equal to -100.

    The predicted class is the argmax over the last axis of ``logits``.
    Returns a dict with accuracy, macro F1, Matthews correlation, macro
    precision and macro recall.
    """
    predicted = np.argmax(logits, axis=-1)
    keep = labels != -100
    y_pred = predicted[keep]
    y_true = labels[keep]
    macro_options = {"average": "macro", "zero_division": 0}
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, **macro_options),
        "matthews_correlation": matthews_corrcoef(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, **macro_options),
        "recall": recall_score(y_true, y_pred, **macro_options),
    }

def compute_metrics(eval_pred):
    """Unpack a ``(logits, labels)`` pair and compute the metric dict.

    When the logits arrive as a tuple, only its first element is used.
    """
    logits, labels = eval_pred
    logits = logits[0] if isinstance(logits, tuple) else logits
    return calculate_metric_with_sklearn(logits, labels)

def main():
    """Fine-tune a sequence classifier on one task and score it on the test split.

    Command-line arguments:
        --task: Sub-directory of the data folder that holds ``train.tsv``,
            ``val.tsv`` and ``test.tsv`` (required).
        --num_labels: Number of target classes (default 2).
        --model_path: Checkpoint for ``BertForSequenceClassification``; used
            when ``--model_name`` contains neither "NT" nor "DNABERT2".
        --tokenizer_path: Location of the tokenizer files for those models.
        --epochs: Number of training epochs (default 5).
        --model_name: Run name (required); it selects the model family and
            names the output directory.
        --kmer: k-mer size; when set, a ``BertTokenizer`` is loaded.
        --max_length: Maximum tokenized length (default 512).
        --data_path: Folder containing the task directories. It is
            concatenated as-is, so it must end with a path separator.

    Checkpoints and ``eval_results.json`` (metrics on the test split) are
    written to ``<data_path><task>/<model_name>/``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--task", default=None, type=str, required=True
    )
    parser.add_argument(
        "--num_labels", default=2, type=int
    )
    parser.add_argument(
        "--model_path", default=None
    )
    parser.add_argument(
        "--tokenizer_path", default=None
    )
    parser.add_argument(
        "--epochs", default=5, type=int
    )
    parser.add_argument(
        "--model_name", default=None, required=True
    )
    parser.add_argument(
        "--kmer", type=int
    )
    parser.add_argument(
        "--max_length", default=512, type=int
    )
    # Previously hard-coded; the default keeps the original location.
    parser.add_argument(
        "--data_path", default='/beegfs/ws/1/mesa972e-paper_revisions/finetuning_tasks/'
    )

    args = parser.parse_args()
    num_labels = args.num_labels
    task_path = args.task
    data_path = args.data_path
    model_path = args.model_path
    tokenizer_path = args.tokenizer_path
    max_length = args.max_length
    epochs = args.epochs
    model_name = args.model_name
    kmer = args.kmer

    output_path = data_path + task_path + "/"+model_name+"/"
    os.makedirs(output_path, exist_ok=True)

    # NOTE(security): `eval` executes whatever the X / y cells contain, so
    # these TSV files must come from a trusted source.
    cell_parsers = {'X': eval, 'y': eval}
    train = pd.read_csv(data_path + task_path + "/train.tsv", sep='\t', converters=cell_parsers)
    test = pd.read_csv(data_path + task_path + "/test.tsv", sep='\t', converters=cell_parsers)
    val = pd.read_csv(data_path + task_path + "/val.tsv", sep='\t', converters=cell_parsers)

    if "NT" in model_name:
        model = AutoModelForSequenceClassification.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref", num_labels=num_labels,trust_remote_code=True)
    elif "DNABERT2" in model_name:
        model = AutoModelForSequenceClassification.from_pretrained("zhihan1996/DNABERT-2-117M", num_labels=num_labels, trust_remote_code=True)
    else:
        model = BertForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)

    model.to('cuda')

    if "NT" in model_name:
        tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref",trust_remote_code=True)
        tokenizer.eos_token = tokenizer.pad_token
    elif "DNABERT2" in model_name:
        tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
    elif kmer:
        tokenizer = BertTokenizer.from_pretrained(tokenizer_path,
                                                    do_lower_case=False,
                                                    padding_side="right",
                                                    add_special_tokens=True
                                                    )
    else:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path + "tokenizer.json",
                                                    do_lower_case=False,
                                                    padding_side="right"
                                                    )
    # NOTE(review): these BERT-style special tokens are assigned for every
    # tokenizer, including the NT and DNABERT2 ones - confirm this is intended.
    tokenizer.pad_token = "[PAD]"
    tokenizer.sep_token = "[SEP]"
    tokenizer.mask_token = "[MASK]"
    tokenizer.cls_token = "[CLS]"

    train_dataset = SupervisedDataset(train.X, train.y, tokenizer, max_length, kmer)
    test_dataset = SupervisedDataset(test.X, test.y, tokenizer, max_length, kmer)
    val_dataset = SupervisedDataset(val.X, val.y, tokenizer, max_length, kmer)

    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

    optim = "adamw_torch"
    per_device_train_batch_size = 16
    per_device_eval_batch_size = 16
    # This name was passed to TrainingArguments without being assigned in this
    # function. 1 (no accumulation) is the Hugging Face default.
    gradient_accumulation_steps = 1
    fp16 = True

    evaluation_strategy = "epoch"
    save_strategy = "epoch"
    logging_strategy = "epoch"
    warmup_steps = 50
    weight_decay = 0.01
    learning_rate = 1e-6
    save_total_limit = 3
    load_best_model_at_end = True
    find_unused_parameters = False
    checkpointing = False
    dataloader_pin_memory = False
    eval_and_save_results = True
    save_model = False
    seed = 42
    overwrite_output_dir = True

    # NOTE(review): find_unused_parameters, checkpointing,
    # eval_and_save_results, save_model and model_max_length are presumably
    # fields of an extended TrainingArguments class defined or imported
    # earlier in the file - verify, as the stock transformers class may
    # reject them.
    train_args = TrainingArguments(
                                per_device_train_batch_size=per_device_train_batch_size,
                                per_device_eval_batch_size=per_device_eval_batch_size,
                                num_train_epochs=epochs,
                                evaluation_strategy=evaluation_strategy,
                                save_strategy=save_strategy,
                                logging_strategy=logging_strategy,
                                load_best_model_at_end=load_best_model_at_end,
                                # metric_for_best_model="matthews_correlation",
                                learning_rate=learning_rate,
                                output_dir=output_path,
                                warmup_steps=warmup_steps,
                                weight_decay=weight_decay,
                                seed=seed,
                                save_total_limit = save_total_limit,
                                find_unused_parameters=find_unused_parameters,
                                checkpointing=checkpointing,
                                dataloader_pin_memory=dataloader_pin_memory,
                                eval_and_save_results=eval_and_save_results,
                                save_model=save_model,
                                optim=optim,
                                model_max_length=max_length,
                                gradient_accumulation_steps=gradient_accumulation_steps,
                                fp16 = fp16,
                                overwrite_output_dir=overwrite_output_dir,
                                eval_accumulation_steps=2
                                )

    trainer = transformers.Trainer(
                                model=model,
                                tokenizer=tokenizer,
                                compute_metrics=compute_metrics,
                                train_dataset=train_dataset,
                                eval_dataset=val_dataset,
                                data_collator=data_collator,
                                args = train_args
                                   )
    trainer.train()

    trainer.save_state()
    safe_save_model_for_hf_trainer(trainer=trainer, output_dir=output_path)

    # Score the model on the held-out test split and store the metrics.
    results = trainer.evaluate(eval_dataset=test_dataset)
    os.makedirs(output_path, exist_ok=True)
    with open(os.path.join(output_path, "eval_results.json"), "w") as f:
        json.dump(results, f)


if __name__ == "__main__":
    main()

In [None]:
## kmer TF-IDF

from sklearn.metrics import matthews_corrcoef, precision_score, recall_score, accuracy_score, f1_score
import numpy as np
import argparse
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import os
import pickle
import json

def calculate_metric_with_sklearn(logits: np.ndarray, labels: np.ndarray):
    """Compute classification metrics for already-decoded class predictions.

    Args:
        logits: Values handed to scikit-learn as ``y_pred`` (no argmax is
            applied here).
        labels: Values handed to scikit-learn as ``y_true``.

    Returns:
        Dict with ``accuracy``, macro ``f1``, ``matthews_correlation``,
        macro ``precision`` and macro ``recall``.
    """
    y_true, y_pred = labels, logits
    macro_options = dict(average="macro", zero_division=0)
    scores = {}
    scores["accuracy"] = accuracy_score(y_true, y_pred)
    scores["f1"] = f1_score(y_true, y_pred, **macro_options)
    scores["matthews_correlation"] = matthews_corrcoef(y_true, y_pred)
    scores["precision"] = precision_score(y_true, y_pred, **macro_options)
    scores["recall"] = recall_score(y_true, y_pred, **macro_options)
    return scores

def _kmer_sentences(sequences, kmer):
    """Rewrite each sequence as a space-separated string of its overlapping k-mers."""
    return [
        " ".join(seq[i: i + kmer] for i in range(len(seq) - kmer + 1))
        for seq in sequences
    ]


def main():
    """Train TF-IDF + random-forest baselines and keep the best one by validation MCC.

    Command-line arguments:
        --task: Sub-directory of ``finetuning_tasks/`` holding ``train.csv``,
            ``val.csv`` and ``test.csv`` with ``sequence`` and ``label``
            columns (required).
        --kmer: Length of the overlapping k-mers used as TF-IDF tokens
            (required).

    Forests with 100 to 5000 trees (step 100) are fitted. Each time the
    validation Matthews correlation improves, the forest is pickled and its
    test-set metrics overwrite ``eval_results.json`` in
    ``finetuning_tasks/<task>/TF-IDF/<kmer>mer/``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--task", default=None, type=str, required=True # Prom300
    )
    # The k-mer size is used in arithmetic below, so it cannot be omitted.
    parser.add_argument(
        "--kmer", type=int, required=True
    )

    args = parser.parse_args()

    task = args.task
    kmer = args.kmer

    print("kmer", kmer)

    print("-task", task)
    data_path = 'finetuning_tasks/'

    output_path = data_path + task + "/TF-IDF/"+str(kmer)+"mer/"
    os.makedirs(output_path, exist_ok=True)

    train = pd.read_csv(data_path + task + "/train.csv")
    test = pd.read_csv(data_path + task + "/test.csv")
    val = pd.read_csv(data_path + task + "/val.csv")

    X_train = _kmer_sentences(train.sequence, kmer)
    X_test = _kmer_sentences(test.sequence, kmer)
    X_val = _kmer_sentences(val.sequence, kmer)

    # Fit the vocabulary and IDF weights on the training split only.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_val = vectorizer.transform(X_val)
    X_test = vectorizer.transform(X_test)

    with open(os.path.join(output_path, "TfidfVectorizer.pkl"), "wb") as f:
        pickle.dump(vectorizer, f)

    # Start below any attainable MCC so the first forest is always recorded
    # and a results file is guaranteed to exist.
    best_val_mcc = float("-inf")

    for n_estimators in range(100, 5001, 100):
        clf = RandomForestClassifier(n_estimators=n_estimators, random_state=0, n_jobs=-1)

        clf.fit(X_train, train.label)

        y_val = clf.predict(X_val)

        # Keywords keep predictions and references in the right slots;
        # swapping them would exchange the reported precision and recall.
        scores_val = calculate_metric_with_sklearn(logits=y_val, labels=val.label)

        if scores_val["matthews_correlation"] > best_val_mcc:
            # Model selection tracks the validation score only; the test
            # score is merely reported for the selected model.
            best_val_mcc = scores_val["matthews_correlation"]

            with open(os.path.join(output_path, "RF_"+str(n_estimators)+".pkl"), "wb") as f:
                pickle.dump(clf, f)

            y_test = clf.predict(X_test)

            scores_test = calculate_metric_with_sklearn(logits=y_test, labels=test.label)
            with open(os.path.join(output_path, "eval_results.json"), "w") as f:
                json.dump(scores_test, f)


if __name__ == "__main__":
    main()

#### Figure 6J
Performance on the NT tasks, i.e. the downstream tasks of the Nucleotide Transformer study for which human data are available.

In [None]:
from transformers import BertForSequenceClassification, TrainingArguments, BertTokenizer, PreTrainedTokenizerFast, AutoTokenizer, AutoModelForSequenceClassification, Trainer
from sklearn.metrics import matthews_corrcoef, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import argparse
import os
from datasets import load_dataset, Dataset
from sklearn.metrics import matthews_corrcoef, precision_score, recall_score, accuracy_score, f1_score
import json

def safe_save_model_for_hf_trainer(trainer: Trainer, output_dir: str):
    """Write a CPU copy of the trainer's model weights to ``output_dir``.

    The weights are gathered unconditionally; saving (through
    ``trainer._save``) only happens when ``trainer.args.should_save`` is true.
    """
    model_weights = trainer.model.state_dict()
    if not trainer.args.should_save:
        return
    cpu_weights = dict(
        (name, tensor.cpu()) for name, tensor in model_weights.items()
    )
    # Release the original tensors before the copy is written out.
    del model_weights
    trainer._save(output_dir, state_dict=cpu_weights)

def calculate_metric_with_sklearn(logits: np.ndarray, labels: np.ndarray):
    """Evaluate argmax predictions on every position whose label is not -100.

    Args:
        logits: Array reduced with ``argmax`` over its last axis to get the
            predicted class.
        labels: Reference labels; entries equal to -100 are left out.

    Returns:
        Dict with ``accuracy``, macro ``f1``, ``matthews_correlation``,
        macro ``precision`` and macro ``recall``.
    """
    predicted = np.argmax(logits, axis=-1)
    is_scored = labels != -100
    y_pred, y_true = predicted[is_scored], labels[is_scored]

    results = {"accuracy": accuracy_score(y_true, y_pred)}
    results["f1"] = f1_score(y_true, y_pred, average="macro", zero_division=0)
    results["matthews_correlation"] = matthews_corrcoef(y_true, y_pred)
    results["precision"] = precision_score(
        y_true, y_pred, average="macro", zero_division=0
    )
    results["recall"] = recall_score(
        y_true, y_pred, average="macro", zero_division=0
    )
    return results

def compute_metrics(eval_pred):
    """Unpack an evaluation pair and score it.

    ``eval_pred`` is unpacked into logits and labels; logits delivered as a
    tuple are reduced to their first element before scoring.
    """
    model_outputs, references = eval_pred
    if not isinstance(model_outputs, tuple):
        return calculate_metric_with_sklearn(model_outputs, references)
    return calculate_metric_with_sklearn(model_outputs[0], references)
     

def main():
    """Fine-tune a classifier on a Nucleotide Transformer downstream task.

    Command-line arguments:
        --task: Configuration name of the
            ``InstaDeepAI/nucleotide_transformer_downstream_tasks`` dataset
            (required); also the first component of the output directory.
        --num_labels: Number of target classes (default 2).
        --model_path: Checkpoint for ``BertForSequenceClassification``; used
            when ``--model_name`` contains neither "NT" nor "DNABERT2".
        --tokenizer_path: Location of the tokenizer files for those models.
        --epochs: Number of training epochs (default 5).
        --model_name: Run name (required); selects the model family.
        --kmer: k-mer size; when set, a ``BertTokenizer`` is loaded.
        --max_length: Maximum tokenized length (default 512).

    5% of the training split is held out for validation. The best checkpoint
    by validation Matthews correlation is kept, and the test-split metrics
    are printed and written to ``<task>/<model_name>/eval_results.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", default=None, type=str, required=True)
    parser.add_argument("--num_labels", default=2, type=int)
    parser.add_argument("--model_path", default=None)
    parser.add_argument("--tokenizer_path", default=None)
    parser.add_argument("--epochs", default=5, type=int)
    parser.add_argument("--model_name", default=None, required=True)
    parser.add_argument("--kmer", type=int)
    parser.add_argument("--max_length", default=512, type=int)
    cli = parser.parse_args()

    num_labels = int(cli.num_labels)
    epochs = int(cli.epochs)
    task_path = cli.task
    model_path = cli.model_path
    tokenizer_path = cli.tokenizer_path
    max_length = cli.max_length
    model_name = cli.model_name
    kmer = cli.kmer

    output_path = f"{task_path}/{model_name}/"
    os.makedirs(output_path, exist_ok=True)

    # Choose model and tokenizer together; the model is loaded first in
    # every branch.
    nt_checkpoint = "InstaDeepAI/nucleotide-transformer-500m-human-ref"
    dnabert2_checkpoint = "zhihan1996/DNABERT-2-117M"
    if "NT" in model_name:
        model = AutoModelForSequenceClassification.from_pretrained(
            nt_checkpoint, num_labels=num_labels, trust_remote_code=True
        )
        tokenizer = AutoTokenizer.from_pretrained(nt_checkpoint, trust_remote_code=True)
        tokenizer.eos_token = tokenizer.pad_token
    elif "DNABERT2" in model_name:
        model = AutoModelForSequenceClassification.from_pretrained(
            dnabert2_checkpoint, num_labels=num_labels, trust_remote_code=True
        )
        tokenizer = AutoTokenizer.from_pretrained(dnabert2_checkpoint, trust_remote_code=True)
    else:
        model = BertForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)
        if kmer:
            tokenizer = BertTokenizer.from_pretrained(
                tokenizer_path,
                do_lower_case=False,
                padding_side="right",
                add_special_tokens=True,
            )
        else:
            tokenizer = PreTrainedTokenizerFast(
                tokenizer_file=tokenizer_path + "tokenizer.json",
                do_lower_case=False,
                padding_side="right",
            )

    # The same special tokens are assigned whichever tokenizer was loaded.
    special_tokens = (
        ("pad_token", "[PAD]"),
        ("sep_token", "[SEP]"),
        ("mask_token", "[MASK]"),
        ("cls_token", "[CLS]"),
    )
    for attribute, token in special_tokens:
        setattr(tokenizer, attribute, token)

    model.to('cuda')

    dataset_name = task_path
    hub_repo = "InstaDeepAI/nucleotide_transformer_downstream_tasks"
    train_dataset = load_dataset(hub_repo, dataset_name, split="train", streaming=False)
    test_dataset = load_dataset(hub_repo, dataset_name, split="test", streaming=False)

    # Hold out 5% of the training examples as the validation split.
    all_train_sequences = train_dataset['sequence']
    all_train_labels = train_dataset['label']
    (
        train_sequences,
        validation_sequences,
        train_labels,
        validation_labels,
    ) = train_test_split(
        all_train_sequences, all_train_labels, test_size=0.05, random_state=42
    )

    test_sequences = test_dataset['sequence']
    test_labels = test_dataset['label']

    split_columns = {
        "train": (train_sequences, train_labels),
        "validation": (validation_sequences, validation_labels),
        "test": (test_sequences, test_labels),
    }
    raw_splits = {
        split: Dataset.from_dict({"data": sequences, 'labels': labels})
        for split, (sequences, labels) in split_columns.items()
    }

    def tokenize_function(examples):
        return tokenizer(
            examples["data"], max_length=max_length, padding="longest", truncation=True
        )

    # Tokenize each split and drop the raw sequence column.
    tokenized_splits = {
        split: dataset.map(tokenize_function, batched=True, remove_columns=["data"])
        for split, dataset in raw_splits.items()
    }

    train_batch_size = 8
    training_args = TrainingArguments(
        remove_unused_columns=False,
        evaluation_strategy="steps",
        save_strategy="steps",
        learning_rate=1e-5,
        per_device_train_batch_size=train_batch_size,
        gradient_accumulation_steps=1,
        per_device_eval_batch_size=64,
        num_train_epochs=epochs,
        logging_steps=100,
        eval_steps=100,
        save_steps=100,
        load_best_model_at_end=True,  # reload the best checkpoint after training
        metric_for_best_model="matthews_correlation",
        label_names=["labels"],
        output_dir=output_path,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_splits["train"],
        eval_dataset=tokenized_splits["validation"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    trainer.save_state()
    safe_save_model_for_hf_trainer(trainer=trainer, output_dir=output_path)

    # Final evaluation on the test split.
    results = trainer.evaluate(eval_dataset=tokenized_splits["test"])
    print(results)
    os.makedirs(output_path, exist_ok=True)
    results_file = os.path.join(output_path, "eval_results.json")
    with open(results_file, "w") as f:
        json.dump(results, f)


if __name__ == "__main__":
    main()

     


In [None]:
from sklearn.metrics import matthews_corrcoef, precision_score, recall_score, accuracy_score, f1_score
import numpy as np
from itertools import product
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import os
import pickle
import json
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
import joblib
import argparse

def calculate_metric_with_sklearn(logits: np.ndarray, labels: np.ndarray):
    """Compute classification metrics for predicted class labels.

    Args:
        logits: Values passed to scikit-learn as ``y_pred`` (used as given,
            without an argmax).
        labels: Values passed to scikit-learn as ``y_true``.

    Returns:
        Dict with ``accuracy``, macro ``f1``, ``matthews_correlation``,
        macro ``precision`` and macro ``recall``.
    """
    pair = (labels, logits)  # scikit-learn order: (y_true, y_pred)
    macro = {"average": "macro", "zero_division": 0}
    return {
        "accuracy": accuracy_score(*pair),
        "f1": f1_score(*pair, **macro),
        "matthews_correlation": matthews_corrcoef(*pair),
        "precision": precision_score(*pair, **macro),
        "recall": recall_score(*pair, **macro),
    }

def _kmer_sentences(sequences, kmer):
    """Rewrite each sequence as a space-separated string of its overlapping k-mers."""
    return [
        " ".join(seq[i: i + kmer] for i in range(len(seq) - kmer + 1))
        for seq in sequences
    ]


def main():
    """Train TF-IDF + random-forest baselines on a Nucleotide Transformer task.

    Command-line arguments:
        --task: Configuration name of the
            ``InstaDeepAI/nucleotide_transformer_downstream_tasks`` dataset
            (required).
        --kmer: Length of the overlapping k-mers used as TF-IDF tokens
            (required).

    5% of the training split is held out for validation. Forests with 100 to
    2000 trees (step 100) are fitted. Each time the validation Matthews
    correlation improves, the forest replaces ``RF_model.joblib`` and its
    test-set metrics overwrite ``eval_results.json`` in
    ``<task>/TF-IDF/<kmer>mer/``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--task", default=None, type=str, required=True # "enhancers", "enhancers_types", "promoter_all", "promoter_no_tata", "promoter_tata", "splice_sites_acceptors","splice_sites_all","splice_sites_donors"
    )
    # The k-mer size is used in arithmetic below, so it cannot be omitted.
    parser.add_argument(
        "--kmer", type=int, required=True
    )

    args = parser.parse_args()

    task = args.task
    kmer = args.kmer

    print("kmer", kmer)

    print("-task", task)

    dataset_name = task
    train_dataset = load_dataset(
        "InstaDeepAI/nucleotide_transformer_downstream_tasks",
        dataset_name,
        split="train",
        streaming= False,
    )
    test_dataset = load_dataset(
        "InstaDeepAI/nucleotide_transformer_downstream_tasks",
        dataset_name,
        split="test",
        streaming= False,
    )

    output_path = task + "/TF-IDF/"+str(kmer)+"mer/"
    os.makedirs(output_path, exist_ok=True)

    # Get training data
    train_sequences = train_dataset['sequence']
    train_labels = train_dataset['label']

    # Split the dataset into a training and a validation dataset
    train_sequences, validation_sequences, train_labels, validation_labels = train_test_split(train_sequences,
                                                                                train_labels, test_size=0.05, random_state=42)

    # Get test data
    test_sequences = test_dataset['sequence']
    test_labels = test_dataset['label']

    X_train = _kmer_sentences(train_sequences, kmer)
    X_test = _kmer_sentences(test_sequences, kmer)
    X_val = _kmer_sentences(validation_sequences, kmer)

    # Fit the vocabulary and IDF weights on the training split only.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    X_val = vectorizer.transform(X_val)
    X_test = vectorizer.transform(X_test)

    with open(os.path.join(output_path, "TfidfVectorizer.pkl"), "wb") as f:
        pickle.dump(vectorizer, f)

    # Start below any attainable MCC so the first forest is always recorded.
    best_val_mcc = float("-inf")

    for n_estimators in range(100, 2001, 100):

        clf = RandomForestClassifier(n_estimators=n_estimators, random_state=0, n_jobs=-1)

        clf.fit(X_train, train_labels)

        y_val = clf.predict(X_val)

        # Keywords keep predictions and references in the right slots;
        # swapping them would exchange the reported precision and recall.
        scores_val = calculate_metric_with_sklearn(logits=y_val, labels=validation_labels)

        if scores_val["matthews_correlation"] > best_val_mcc:
            best_val_mcc = scores_val["matthews_correlation"]

            joblib.dump(clf, output_path + "/RF_model.joblib", compress=3)

            y_test = clf.predict(X_test)

            scores_test = calculate_metric_with_sklearn(logits=y_test, labels=test_labels)
            with open(os.path.join(output_path, "eval_results.json"), "w") as f:
                json.dump(scores_test, f)


if __name__ == "__main__":
    main()