In [276]:
"""
Create k-mer from sequence.
@param  sequence: a string.
@param  size_k: size of k in k-mer.
"""
def create_kmer(sequence, size_k):
    """
    Remove 'N' if found in sequence and leave only A, T, G, and C in sequence.
    """
    kmers = ''.join([s if s not in ['N', 'n'] else '' for s in sequence])
    kmers = [kmers[i:i+size_k] for i in range(len(kmers)+1-size_k)]
    return ' '.join([k for k in kmers])

import pandas as pd
from data_preparation import kmer

def get_sequences(csv_path, n_sample=10, random_state=1337):
    r"""
    Get sequence from certain CSV. CSV has header such as 'sequence', 'label_prom', 'label_ss', 'label_polya' XXX.
    """
    df = pd.read_csv(csv_path)
    if (n_sample > 0):
        df = df.sample(n=n_sample, random_state=random_state)
    sequence = list(df['sequence'])
    label_prom = list(df['label_prom'])
    label_ss = list(df['label_ss'])
    label_polya = list(df['label_polya'])

    return sequence, label_prom, label_ss, label_polya

import torch
def preprocessing(data, tokenizer, size_k=3, max_length=512):
    """
    Preprocessing for pretrained BERT.
    @param  data (np.array): array of texts to be processed.
    @param  tokenizer (Tokenizer): tokenizer initialized from pretrained values.
    @return input_ids (torch.Tensor): tensor of token ids to be fed to model.
    @return attention_masks (torch.Tensor): tensor of indices (a bunch of 'indexes') specifiying which token needs to be attended by model.
    """
    input_ids = []
    attention_masks = []

    for sequence in data:
        if len(sequence) > max_length:
            sequence = sequence[0:512]
        t = kmer(sequence, size_k=size_k)
        encoded_sent = tokenizer.encode_plus(
            text=t,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert input_ids and attention_masks to tensor.
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

"""
Initialize tokenizer using BertTokenizer with pretrained weights from DNABert.
"""
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('./pretrained/3-new-12w-0')

In [284]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

sequences, label_prom, label_ss, label_polya = get_sequences('./sample/training_sample.csv', n_sample=-1)
val_seq, val_label_prom, val_label_ss, val_label_polya = get_sequences('./sample/testing_sample.csv', n_sample=-1)

test_seq, test_label_prom, test_label_ss, test_label_polya = get_sequences('./sample/testing_sample.csv', n_sample=-1)

"""
Create dataloader.
"""
BATCH_SIZE = 1
EPOCH_SIZE = 4

train_label_prom = torch.tensor(label_prom)
train_label_ss = torch.tensor(label_ss)
train_label_polya = torch.tensor(label_polya)

val_label_prom = torch.tensor(val_label_prom)
val_label_ss = torch.tensor(val_label_ss)
val_label_polya = torch.tensor(val_label_polya)

test_label_prom = torch.tensor(test_label_prom)
test_label_ss = torch.tensor(test_label_ss)
test_label_polya = torch.tensor(test_label_polya)

train_inputs_ids, train_masks = preprocessing(sequences, tokenizer)
train_data = TensorDataset(train_inputs_ids, train_masks, train_label_prom, train_label_ss, train_label_polya)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

val_input_ids, val_masks = preprocessing(val_seq, tokenizer)
val_data = TensorDataset(val_input_ids, val_masks, val_label_prom, val_label_ss, val_label_polya)
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)

test_input_ids, test_masks = preprocessing(test_seq, tokenizer)
test_data = TensorDataset(test_input_ids, test_masks, test_label_prom, test_label_ss, test_label_polya)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

print('# of training data: {}'.format(len(sequences)))
print(('# of validation data: {}'.format(len(val_seq))))
print(('# of testing data: {}'.format(len(test_seq))))


# of training data: 640
# of validation data: 80
# of testing data: 80


In [285]:
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss
import numpy as np

model.to(_device)
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
training_steps = len(train_dataloader) * EPOCH_SIZE
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=training_steps)
loss_fn = CrossEntropyLoss()

def evaluate(model, dataloader, device='cpu'):
    model.eval()
    model.to(device)
    val_prom_acc = []
    val_prom_loss = []
    val_ss_acc = []
    val_ss_loss = []
    val_polya_acc = []
    val_polya_loss = []

    for batch in dataloader:
        b_input_ids, b_attn_masks, b_label_prom, b_label_ss, b_label_polya = tuple(t.to(device) for t in batch)

        # Compute logits.
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_masks)

            prom_logits = logits['prom']
            ss_logits = logits['ss']
            polya_logits = logits['polya']

            # Compute loss.
            prom_loss = loss_fn(prom_logits, b_label_prom)
            ss_loss = loss_fn(ss_logits, b_label_ss)
            polya_loss = loss_fn(polya_logits, b_label_polya)
            val_prom_loss.append(prom_loss)
            val_ss_loss.append(ss_loss)
            val_polya_loss.append(polya_loss)

            # Prediction.
            preds_prom = torch.argmax(prom_logits, dim=1).flatten()
            preds_ss = torch.argmax(ss_logits, dim=1).flatten()
            preds_polya = torch.argmax(polya_logits, dim=1).flatten()

            # Accuracy
            prom_acc = (preds_prom == b_label_prom).cpu().numpy().mean() * 100
            ss_acc = (preds_ss == b_label_ss).cpu().numpy().mean() * 100
            polya_acc = (preds_polya == b_label_polya).cpu().numpy().mean() * 100
            val_prom_acc.append(prom_acc)
            val_ss_acc.append(ss_acc)
            val_polya_acc.append(polya_acc)

    # Compute average acc and loss.
    avg_prom_acc = np.mean(val_prom_acc)
    avg_ss_acc = np.mean(val_ss_acc)
    avg_polya_acc = np.mean(val_polya_acc)
    avg_prom_loss = np.mean(val_prom_loss)
    avg_ss_loss = np.mean(val_ss_loss)
    avg_polya_loss = np.mean(val_polya_loss)

    return avg_prom_acc, avg_ss_acc, avg_polya_acc, avg_prom_loss, avg_ss_loss, avg_polya_loss

    


In [286]:
import torch

def train(dataloader, model, loss_fn, optimizer, scheduler, batch_size, epoch_size, device='cpu', eval=False, val_dataloader=None):
    size = len(dataloader.dataset)
    model.to(device)
    model.train()
    batch_counts = 0
    batch_loss = 0
    batch_loss_prom, batch_loss_ss, batch_loss_polya = 0, 0, 0
    total_loss = 0
    total_loss_prom, total_loss_ss, total_loss_polya = 0, 0, 0

    for i in range(epoch_size):
        for step, batch in enumerate(dataloader):
            batch_counts += 1
            batch_loss 

            # Load batch to device.
            b_input_ids, b_attn_masks, b_labels_prom, b_labels_ss, b_labels_polya = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients.
            model.zero_grad()
            
            # Perform forward pass.
            outputs = model(b_input_ids, b_attn_masks)

            # Compute error.
            loss_prom = loss_fn(outputs['prom'], b_labels_prom)
            loss_ss = loss_fn(outputs['ss'], b_labels_ss)
            loss_polya = loss_fn(outputs['polya'], b_labels_polya)

            # Following MTDNN (Liu et. al., 2019), loss is summed.
            loss = loss_prom + loss_ss + loss_polya

            # Compute this batch error.
            batch_loss_prom += loss_prom
            batch_loss_ss += loss_ss
            batch_loss_polya += loss_polya
            batch_loss += loss

            # Backpropagation.
            loss.backward()

            # Update parameters and learning rate.
            optimizer.step()
            scheduler.step()

            # Print training process.
            if (step % batch_size == 0 and step != 0) or (step == len(dataloader) - 1):
                print('batch loss {}, batch loss prom: {}, batch loss ss: {}, batch loss polya: {}'.format(batch_loss, batch_loss_prom, batch_loss_ss, batch_loss_polya))
                batch_loss = 0
                batch_loss_prom = 0
                batch_loss_ss = 0
                batch_loss_polya = 0
                batch_counts = 0
        # endfor batch.

        # Evaluate.
        if eval:
            pa, ssa, pola, pl, ssl, poll = evaluate(model, val_dataloader)
            print('-----')
            print('prom acc: {}, prom loss: {}'.format(pa, pl))
            print('ss acc: {}, ss loss: {}'.format(ssa, ssl))
            print('polya acc: {}, polya loss: {}'.format(pola, poll))
            print('-----')
    # endfor epoch.
        
def test(dataloader, model, loss_fn, optimizer, batch_size, device="cpu"):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval() # Set model on evaluation model.
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            test_loss /= num_batches
            correct /= size
            print(f"Test error: \n Accuracy: {(100*correct):>0.1f}% \n Avg Loss: {test_loss:>8f} \n")



In [287]:
train(train_dataloader, model, loss_fn, optimizer, scheduler, BATCH_SIZE, EPOCH_SIZE, 'cpu', eval=True, val_dataloader=val_dataloader)

batch loss 0.0923478826880455, batch loss prom: 0.015425542369484901, batch loss ss: 0.06967712938785553, batch loss polya: 0.007245213259011507
batch loss 0.032309390604496, batch loss prom: 0.010534040629863739, batch loss ss: 0.021281111985445023, batch loss polya: 0.0004942387458868325
batch loss 0.01371399313211441, batch loss prom: 0.004375289659947157, batch loss ss: 0.0032876271288841963, batch loss polya: 0.006051077041774988
batch loss 0.022069502621889114, batch loss prom: 0.01163435634225607, batch loss ss: 0.004250302445143461, batch loss polya: 0.006184843368828297
batch loss 0.711395263671875, batch loss prom: 0.005767726805061102, batch loss ss: 0.704287588596344, batch loss polya: 0.0013399679446592927
batch loss 4.128385543823242, batch loss prom: 0.8188514709472656, batch loss ss: 3.3033692836761475, batch loss polya: 0.006164702586829662
batch loss 1.7927719354629517, batch loss prom: 1.65945303440094, batch loss ss: 0.13291916251182556, batch loss polya: 0.00039974

In [290]:
#torch.save(model, './result/24012022.pth')
model.shared_layer.save_pretrained('./result/24012022/')

In [195]:
s = "GTACGATCGACTAGACACTATATATA"
prom = 0
ss = 0
polya = 0

kmer = create_kmer(s, 3)
tokenizer = BertTokenizer.from_pretrained('./pretrained/3-new-12w-0')
output = tokenizer.encode_plus(text=kmer, padding='max_length', return_attention_mask=True)
ids = []
attns = []
prom_labels = []
ss_labels = []
polya_labels = []
ids.append(output['input_ids'])
attns.append(output['attention_mask'])
prom_labels.append(0)
ss_labels.append(1)
polya_labels.append(0)

input_ids = torch.tensor(ids)
attention_masks = torch.tensor(ids)
prom_labels = torch.tensor(prom_labels)
ss_labels = torch.tensor(ss_labels)
polya_labels = torch.tensor(polya_labels)


outputs = model(input_ids, attention_masks)
prom = outputs['prom']
ss = outputs['ss']
polya = outputs['polya']
print(outputs)
print('prom pred {}; label {}'.format(prom, prom_labels))
print('ss pred {}; label {}'.format(ss, ss_labels))
print('polya pred {}; label {}'.format(polya, polya_labels))
loss_fn = torch.nn.CrossEntropyLoss()

loss_prom = loss_fn(prom, prom_labels)
loss_ss = loss_fn(ss, ss_labels)
loss_polya = loss_fn(polya, polya_labels)
print('loss prom {}, ss {}, polya {}'.format(loss_prom, loss_ss, loss_polya))


{'prom': tensor([[ 1.6117, -2.9922]], grad_fn=<AddmmBackward0>), 'ss': tensor([[ 2.3533, -1.9821]], grad_fn=<AddmmBackward0>), 'polya': tensor([[-0.2783,  1.3930]], grad_fn=<AddmmBackward0>)}
prom pred tensor([[ 1.6117, -2.9922]], grad_fn=<AddmmBackward0>); label tensor([0])
ss pred tensor([[ 2.3533, -1.9821]], grad_fn=<AddmmBackward0>); label tensor([1])
polya pred tensor([[-0.2783,  1.3930]], grad_fn=<AddmmBackward0>); label tensor([0])
loss prom 0.009962832555174828, ss 4.348450660705566, polya 1.8435773849487305


In [199]:
bert = model.shared_layer
Y = bert(input_ids=input_ids, attention_mask=attention_masks)
Y

tensor([[[-0.2962,  0.0700,  0.0424,  ...,  0.3942,  0.8422,  0.2371],
         [-0.2962,  0.0700,  0.0423,  ...,  0.3942,  0.8423,  0.2370],
         [-0.2963,  0.0697,  0.0428,  ...,  0.3942,  0.8423,  0.2371],
         ...,
         [-0.2962,  0.0700,  0.0428,  ...,  0.3942,  0.8423,  0.2371],
         [-0.2962,  0.0700,  0.0424,  ...,  0.3943,  0.8423,  0.2370],
         [-0.2962,  0.0700,  0.0424,  ...,  0.3942,  0.8422,  0.2371]]],
       grad_fn=<NativeLayerNormBackward0>)

In [270]:
print(len(Y[0][0]))
print(len(Y[0][0][0]))

512
768


In [1]:
from transformers import BertForMaskedLM, BertTokenizer
from dnabert import initialize_training_model, initialize_sequence_labelling_model

pretrained_path = './pretrained/3-new-12w-0'
mtl_model = initialize_training_model(pretrained_path)
dnaseq_model = initialize_sequence_labelling_model(pretrained_path)


Some weights of the model checkpoint at ./pretrained/3-new-12w-0 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
dnaseq_model

DNASeqLabelling(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(69, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)