In [None]:
def mask_token(sentence, tokenizer):
    # Tokenize the sentence and get the non-padding tokens
    tokens = tokenizer.encode_plus(sentence, add_special_tokens=True, return_tensors="pt",max_length = 15, padding="max_length", truncation=True)
    
    input_ids = tokens["input_ids"][0]
    non_padding_tokens = [token for token in input_ids if token != tokenizer.pad_token_id]
    
    # Determine the index of the word to mask
    if len(non_padding_tokens) > 0:
        middle_index = len(input_ids) // 2
        if input_ids[middle_index] == tokenizer.pad_token_id:
            mask_index = 1
        else:
            mask_index = middle_index
    else:
        return None

    # Mask the selected token and get its label
    masked_tokens = input_ids.detach().clone()
    masked_tokens[mask_index] = tokenizer.mask_token_id
    label = input_ids[mask_index]
    
    # Convert the masked tokens and label back to strings
    masked_sentence = tokenizer.decode(masked_tokens, skip_special_tokens=False)
    label = tokenizer.decode([label], skip_special_tokens=False)

    return masked_sentence, label

def divide_into_sentences(input_string, sentence_length=10, step_size=1):
    words = input_string.split()
    sentences = []
    for i in range(0, len(words) - sentence_length + 1, step_size):
        sentences.append(" ".join(words[i: i + sentence_length]))    
    return sentences


def split_data(data, train_ratio=0.8, validate_ratio=0.1, test_ratio=0.1, seed=123):
    random.seed(seed)
    random.shuffle(data)
    n = len(data)
    train_end = int(train_ratio * n)
    validate_end = int((train_ratio + validate_ratio) * n)
    train_data = data[:train_end]
    validate_data = data[train_end:validate_end]
    test_data = data[validate_end:]
    return train_data, validate_data, test_data



In [1]:
import pickle
import os

In [None]:
import re
from collections import Counter
from transformers import BertTokenizer
import os
from tqdm import tqdm


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModelForMaskedLM, BertConfig
import random

model_name = 'bert-base-uncased'

# Load BERT model with new vocabulary
config = AutoModelForMaskedLM.from_scratch(model_name, num_labels=len(tokens))
model = BertForMaskedLM.from_pretrained(model_name, config=config)
model.resize_token_embeddings(len(tokenizer))


loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

all_sentences = []
absolute_path_genomes = "/Users/daulettoibazar/Desktop/Research/models_and_data/annotation_extended/"
path_genome_files = os.listdir("/Users/daulettoibazar/Desktop/Research/models_and_data/annotation_extended/")

for file_name in tqdm(path_genome_files):
    with open(os.path.join(absolute_path_genomes, file_name), "r", encoding="latin_1") as infile:
        content = infile.read()
        short_sentences = divide_into_sentences(content)
        all_sentences+=short_sentences

train_sentences, validation_sentences, test_sentences = split_data(all_sentences)

for epoch in range(1, 5):
    running_loss = 0.0
    model.train()
    for i, sentence in enumerate(train_sentences, 0):
        masked_sentence, label  = mask_token(sentence)
        
        tokenized_text = masked_sentence.split()
        masked_index = tokenized_text.index('[MASK]')
        tokenized_text[masked_index] = '[MASK]'
        
        tokens = tokenizer.encode_plus(masked_sentence, add_special_tokens=False, return_tensors="pt",max_length = 15, padding="max_length", truncation=True)
        input_ids = tokens["input_ids"][0]
        labels = tokenizer.encode_plus(masked_sentence, add_special_tokens=False, return_tensors="pt",max_length = 15, padding="max_length", truncation=True)
        ids_to_check = labels["input_ids"][0]
        

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, labels = ids_to_check, attention_mask = tokens["attention_mask"][0])
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        # Print the statistics
        running_loss += loss.item()
        if i % 1000 == 999:    # Print every 1000 sentences
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 1000))
            running_loss = 0.0
        

print('Finished Training')


AttributeError: type object 'AutoModelForMaskedLM' has no attribute 'from_scratch'

In [None]:


with torch.no_grad():
    correct = 0
    total = 0
    for sentence in validation_sentences:
        masked_sentence, label  = mask_token(sentence)
        
        tokenized_text = masked_sentence.split()
        masked_index = tokenized_text.index('[MASK]')
        tokenized_text[masked_index] = '[MASK]'
        
        tokens = tokenizer.encode_plus(masked_sentence, add_special_tokens=False, return_tensors="pt",max_length = 15, padding="max_length", truncation=True)
        input_ids = tokens["input_ids"][0]
        labels = tokenizer.encode_plus(masked_sentence, add_special_tokens=False, return_tensors="pt",max_length = 15, padding="max_length", truncation=True)
        ids_to_check = labels["input_ids"][0]
        


        # Forward pass
        outputs = model(input_ids)
        logits = outputs.logits[0, masked_index, :]
        probs = torch.softmax(logits, dim=-1)
        predicted_token_id = torch.argmax(logits, dim=-1)
        predicted_token = tokenizer.convert_ids_to_tokens(predicted_token_id)
        target = tokenizer.convert_ids_to_tokens(ids_to_check)
        # Count the number of correct predictions
        total += 1
        correct += (predicted == target).sum().item()

    # Print the accuracy
    accuracy = 100.0 * correct / total
    print('Accuracy: %.2f %%' % accuracy)

In [None]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for sentence in test_sentences:
        masked_sentence, label  = mask_token(sentence)
        
        tokenized_text = masked_sentence.split()
        masked_index = tokenized_text.index('[MASK]')
        tokenized_text[masked_index] = '[MASK]'
        
        tokens = tokenizer.encode_plus(masked_sentence, add_special_tokens=False, return_tensors="pt",max_length = 15, padding="max_length", truncation=True)
        input_ids = tokens["input_ids"][0]
        labels = tokenizer.encode_plus(masked_sentence, add_special_tokens=False, return_tensors="pt",max_length = 15, padding="max_length", truncation=True)
        ids_to_check = labels["input_ids"][0]
        

        # Forward pass
        outputs = model(input_ids)
        logits = outputs.logits[0, masked_index, :]
        probs = torch.softmax(logits, dim=-1)
        predicted_token_id = torch.argmax(logits, dim=-1)
        predicted_token = tokenizer.convert_ids_to_tokens(predicted_token_id)
        target = tokenizer.convert_ids_to_tokens(ids_to_check)
        # Count the number of correct predictions
        total += 1
        correct += (predicted == target).sum().item()

    # Print the accuracy
    accuracy = 100.0 * correct / total
    print('Accuracy: %.2f %%' % accuracy)
