In [1]:
"""
Check if CUDA is supported.
"""
import torch
torch.cuda.is_available()
torch.device('cuda:0')
torch.cuda.get_device_name(0)
_device = torch.device('cuda:0')


In [1]:
import pandas as pd
from data_preparation import kmer

def get_sequences(csv_path, n_sample=10, random_state=1337):
    r"""
    Get sequence from certain CSV. CSV has header such as 'sequence', 'label_prom', 'label_ss', 'label_polya'.
    """
    df = pd.read_csv(csv_path)
    if (n_sample > 0):
        df = df.sample(n=n_sample, random_state=random_state)
    sequence = list(df['sequence'])
    label_prom = list(df['label_prom'])
    label_ss = list(df['label_ss'])
    label_polya = list(df['label_polya'])

    return sequence, label_prom, label_ss, label_polya

import torch
def preprocessing(data, tokenizer):
    """
    Preprocessing for pretrained BERT.
    @param  data (string): string containing kmers separated by spaces.
    @param  tokenizer (Tokenizer): tokenizer initialized from pretrained values.
    @return input_ids (torch.Tensor): tensor of token ids to be fed to model.
    @return attention_masks (torch.Tensor): tensor of indices (a bunch of 'indexes') specifiying which token needs to be attended by model.
    """
    input_ids = []
    attention_masks = []

    _count = 0
    _len_data = len(data)
    for sequence in data:
        """
        Sequence is 512 characters long.
        """
        _count += 1
        if _count < _len_data:
            print("Seq length = {} [{}/{}]".format(len(sequence.split(' ')), _count, _len_data), end='\r')
        else:
            print("Seq length = {} [{}/{}]".format(len(sequence.split(' ')), _count, _len_data))
        encoded_sent = tokenizer.encode_plus(
            text=sequence,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert input_ids and attention_masks to tensor.
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

"""
Initialize tokenizer using BertTokenizer with pretrained weights from DNABert.
"""
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('./pretrained/3-new-12w-0')

In [4]:
sum([0.1, 0.9])

1.0

In [1]:
"""
Split dataset into two parts: train and validation.
"""
from data_dir import workspace_dir, dataset_full_dir
from data_preparation import split_and_store_csv
_src_csv = "{}/train.csv".format(dataset_full_dir)
_fractions = [0.99, 0.01]
_store_paths = [
    "{}/{}".format(workspace_dir, 'train.csv'),
    "{}/{}".format(workspace_dir, 'validation.csv'),
]
print("Splitting source {}: {}".format(_src_csv, split_and_store_csv(_src_csv, fractions=_fractions, store_paths=_store_paths)))

In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from data_dir import workspace_dir

train_seq, train_label_prom, train_label_ss, train_label_polya = get_sequences('{}/train.csv'.format(workspace_dir), n_sample=10000)
validation_seq, val_label_prom, val_label_ss, val_label_polya = get_sequences('{}/validation.csv'.format(workspace_dir), n_sample=100)

"""
Create dataloader.
"""
BATCH_SIZE = 1
EPOCH_SIZE = 4

_device = torch.device('cuda:0')
train_label_prom = torch.tensor(train_label_prom, device=_device)
train_label_ss = torch.tensor(train_label_ss, device=_device)
train_label_polya = torch.tensor(train_label_polya, device=_device)

train_inputs_ids, train_masks = preprocessing(train_seq, tokenizer)
train_data = TensorDataset(train_inputs_ids, train_masks, train_label_prom, train_label_ss, train_label_polya)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

val_label_prom = torch.tensor(val_label_prom, device=_device)
val_label_ss = torch.tensor(val_label_ss, device=_device)
val_label_polya = torch.tensor(val_label_polya, device=_device)

val_input_ids, val_masks = preprocessing(validation_seq, tokenizer)
val_data = TensorDataset(val_input_ids, val_masks, val_label_prom, val_label_ss, val_label_polya)
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)

print('# of training data: {}'.format(len(train_seq)))  

Seq length = 510 [100/100]
# of training data: 100


In [4]:
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss
import numpy as np

from multitask_learning import PolyAHead, PromoterHead, SpliceSiteHead, MTModel
from transformers import BertForMaskedLM
from data_dir import pretrained_3kmer_dir

polya_head = PolyAHead(_device)
promoter_head = PromoterHead(_device)
splice_head = SpliceSiteHead(_device)

dnabert_3_pretrained = pretrained_3kmer_dir
shared_parameter = BertForMaskedLM.from_pretrained(dnabert_3_pretrained).bert

model = MTModel(shared_parameters=shared_parameter, promoter_head=promoter_head, polya_head=polya_head, splice_site_head=splice_head).to(_device)
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
training_steps = len(train_dataloader) * EPOCH_SIZE
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=training_steps)
loss_fn = CrossEntropyLoss()



In [17]:
import torch
import datetime
import os

_now = datetime.datetime.now()

_log_folder = "./logs/{}".format(_now.strftime("%y-%m-%d"))
if not os.path.exists(_log_folder):
    os.mkdir(_log_folder)
_log_name = "{}.csv".format(_now.strftime("%y-%m-%d-%H-%M-%S"))
_log_path = "{}/{}".format(_log_folder, _log_name)
print(_log_path)



./logs/22-02-13/22-02-13-18-11-30.csv


In [18]:
from multitask_learning import train
train(train_dataloader, model, loss_fn, optimizer, scheduler, BATCH_SIZE, EPOCH_SIZE, _log_path, _device, eval=False)

Epoch 3, Step 99

True

In [14]:
#torch.save(model, './result/24012022.pth')
import datetime
_now = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
model.shared_layer.save_pretrained('./result/gpu/{}/'.format(_now))

In [14]:
_device = torch.device('cuda:0')
train(train_dataloader, model, loss_fn, optimizer, scheduler, BATCH_SIZE, EPOCH_SIZE, _device, eval=True, val_dataloader=val_dataloader)
import datetime
_now = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
model.shared_layer.save_pretrained('./result/gpu/{}/'.format(_now))

batch loss 4.678438186645508, batch loss prom: 1.5165265798568726, batch loss ss: 1.4876850843429565, batch loss polya: 1.6742267608642578
batch loss 1.7870014905929565, batch loss prom: 0.5639010667800903, batch loss ss: 0.6062026023864746, batch loss polya: 0.6168978214263916
batch loss 1.5076560974121094, batch loss prom: 0.5132335424423218, batch loss ss: 0.4427833557128906, batch loss polya: 0.5516392588615417
batch loss 1.681549310684204, batch loss prom: 0.49047523736953735, batch loss ss: 0.3959461450576782, batch loss polya: 0.7951279878616333
batch loss 1.3229281902313232, batch loss prom: 0.4208376705646515, batch loss ss: 0.3389909863471985, batch loss polya: 0.5630995631217957
batch loss 1.431191325187683, batch loss prom: 0.374896377325058, batch loss ss: 0.3177795112133026, batch loss polya: 0.7385154366493225
batch loss 1.2189515829086304, batch loss prom: 0.30863240361213684, batch loss ss: 0.2716456651687622, batch loss polya: 0.6386735439300537
batch loss 1.220835924

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper__index_select)

In [195]:
s = "GTACGATCGACTAGACACTATATATA"
prom = 0
ss = 0
polya = 0

kmer = create_kmer(s, 3)
tokenizer = BertTokenizer.from_pretrained('./pretrained/3-new-12w-0')
output = tokenizer.encode_plus(text=kmer, padding='max_length', return_attention_mask=True)
ids = []
attns = []
prom_labels = []
ss_labels = []
polya_labels = []
ids.append(output['input_ids'])
attns.append(output['attention_mask'])
prom_labels.append(0)
ss_labels.append(1)
polya_labels.append(0)

input_ids = torch.tensor(ids)
attention_masks = torch.tensor(ids)
prom_labels = torch.tensor(prom_labels)
ss_labels = torch.tensor(ss_labels)
polya_labels = torch.tensor(polya_labels)


outputs = model(input_ids, attention_masks)
prom = outputs['prom']
ss = outputs['ss']
polya = outputs['polya']
print(outputs)
print('prom pred {}; label {}'.format(prom, prom_labels))
print('ss pred {}; label {}'.format(ss, ss_labels))
print('polya pred {}; label {}'.format(polya, polya_labels))
loss_fn = torch.nn.CrossEntropyLoss()

loss_prom = loss_fn(prom, prom_labels)
loss_ss = loss_fn(ss, ss_labels)
loss_polya = loss_fn(polya, polya_labels)
print('loss prom {}, ss {}, polya {}'.format(loss_prom, loss_ss, loss_polya))


{'prom': tensor([[ 1.6117, -2.9922]], grad_fn=<AddmmBackward0>), 'ss': tensor([[ 2.3533, -1.9821]], grad_fn=<AddmmBackward0>), 'polya': tensor([[-0.2783,  1.3930]], grad_fn=<AddmmBackward0>)}
prom pred tensor([[ 1.6117, -2.9922]], grad_fn=<AddmmBackward0>); label tensor([0])
ss pred tensor([[ 2.3533, -1.9821]], grad_fn=<AddmmBackward0>); label tensor([1])
polya pred tensor([[-0.2783,  1.3930]], grad_fn=<AddmmBackward0>); label tensor([0])
loss prom 0.009962832555174828, ss 4.348450660705566, polya 1.8435773849487305


In [199]:
bert = model.shared_layer
Y = bert(input_ids=input_ids, attention_mask=attention_masks)
Y

tensor([[[-0.2962,  0.0700,  0.0424,  ...,  0.3942,  0.8422,  0.2371],
         [-0.2962,  0.0700,  0.0423,  ...,  0.3942,  0.8423,  0.2370],
         [-0.2963,  0.0697,  0.0428,  ...,  0.3942,  0.8423,  0.2371],
         ...,
         [-0.2962,  0.0700,  0.0428,  ...,  0.3942,  0.8423,  0.2371],
         [-0.2962,  0.0700,  0.0424,  ...,  0.3943,  0.8423,  0.2370],
         [-0.2962,  0.0700,  0.0424,  ...,  0.3942,  0.8422,  0.2371]]],
       grad_fn=<NativeLayerNormBackward0>)

In [270]:
print(len(Y[0][0]))
print(len(Y[0][0][0]))

512
768


In [1]:
from transformers import BertForMaskedLM, BertTokenizer
from dnabert import initialize_training_model, initialize_sequence_labelling_model

pretrained_path = './pretrained/3-new-12w-0'
mtl_model = initialize_training_model(pretrained_path)
dnaseq_model = initialize_sequence_labelling_model(pretrained_path)


Some weights of the model checkpoint at ./pretrained/3-new-12w-0 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
dnaseq_model

DNASeqLabelling(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(69, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)