In [2]:
"""
Check if CUDA is supported.
"""
import torch
# Select the compute device once and report what was found. The bare
# expressions previously used here were evaluated and thrown away (only the
# last expression of a cell is displayed, and the last statement was an
# assignment), and they raised on machines without a CUDA device.
if torch.cuda.is_available():
    _device = torch.device('cuda:0')
    print('CUDA is available, using device 0: {}'.format(torch.cuda.get_device_name(0)))
else:
    # Fall back to CPU so the rest of the notebook can still be executed.
    _device = torch.device('cpu')
    print('CUDA is not available, falling back to CPU.')


In [3]:
import pandas as pd
from data_preparation import kmer

def get_sequences(csv_path, n_sample=10, random_state=1337):
    r"""
    Read sequences and their three label columns from a CSV file.

    The CSV must have the header columns 'sequence', 'label_prom', 'label_ss'
    and 'label_polya'.

    @param  csv_path (string): path of the CSV file to read.
    @param  n_sample (int or None): number of rows to draw at random without
            replacement. A value of None, zero or a negative number keeps every
            row in file order. A value larger than the number of rows is
            clamped to the number of rows instead of raising ValueError.
    @param  random_state (int): seed passed to the row sampling so the draw is
            reproducible.
    @return (sequence, label_prom, label_ss, label_polya): four lists of equal
            length whose i-th entries come from the same CSV row.
    """
    df = pd.read_csv(csv_path)
    if n_sample is not None and n_sample > 0:
        # DataFrame.sample refuses to draw more rows than exist when sampling
        # without replacement, so never ask for more than the file contains.
        df = df.sample(n=min(n_sample, len(df)), random_state=random_state)
    sequence = list(df['sequence'])
    label_prom = list(df['label_prom'])
    label_ss = list(df['label_ss'])
    label_polya = list(df['label_polya'])

    return sequence, label_prom, label_ss, label_polya

import torch
def preprocessing(data, tokenizer):
    """
    Tokenize every sequence and pack the results into tensors for pretrained BERT.
    @param  data (list of string): sized collection of strings, each holding kmers separated by spaces.
    @param  tokenizer (Tokenizer): tokenizer initialized from pretrained values; its `encode_plus`
            is called with padding='max_length' and return_attention_mask=True.
    @return input_ids (torch.Tensor): tensor of token ids to be fed to model, one row per sequence.
    @return attention_masks (torch.Tensor): tensor of attention-mask values returned by the tokenizer,
            one row per sequence.
    """
    total = len(data)
    id_rows = []
    mask_rows = []

    for position, sequence in enumerate(data, start=1):
        # Progress line: rewritten in place with a carriage return until the
        # final sequence, which terminates the line with a newline.
        line_end = '\r' if position < total else '\n'
        print("Seq length = {} [{}/{}]".format(len(sequence.split(' ')), position, total), end=line_end)

        encoding = tokenizer.encode_plus(
            text=sequence,
            padding='max_length',
            return_attention_mask=True
        )
        id_rows.append(encoding.get('input_ids'))
        mask_rows.append(encoding.get('attention_mask'))

    # Convert the collected rows to tensors (ids first, then masks).
    return torch.tensor(id_rows), torch.tensor(mask_rows)

"""
Initialize tokenizer using BertTokenizer with pretrained weights from DNABert.
"""
# Load the tokenizer once; later cells pass it to `preprocessing` as an argument.
# NOTE(review): the checkpoint path is hard-coded here, whereas other cells of
# this notebook pass `pretrained_3kmer_dir` from `data_dir` — confirm both refer
# to the same directory and prefer the shared constant.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('./pretrained/3-new-12w-0')

In [4]:
"""
Split dataset into two parts: train and validation.
"""
from data_dir import workspace_dir, dataset_full_dir
from data_preparation import split_and_store_csv
# Source CSV that gets split, and the share of rows for each output file.
_src_csv = f"{dataset_full_dir}/train.csv"
_fractions = [0.9, 0.1]
# One destination per fraction, in the same order as `_fractions`.
_store_paths = [f"{workspace_dir}/{_file_name}" for _file_name in ('train.csv', 'validation.csv')]
# The split runs while the message is being built; its return value is shown after the source path.
print(f"Splitting source {_src_csv}: {split_and_store_csv(_src_csv, fractions=_fractions, store_paths=_store_paths)}")

Splitting and storing split to ./workspace/train.csv
Splitting and storing split to ./workspace/validation.csv
Splitting source ./dataset/full/train.csv: True


In [4]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from data_dir import workspace_dir

# Load the sampled training and validation rows: sequences plus the three label lists.
train_seq, train_label_prom, train_label_ss, train_label_polya = get_sequences('{}/train.csv'.format(workspace_dir), n_sample=500000)
validation_seq, val_label_prom, val_label_ss, val_label_polya = get_sequences('{}/validation.csv'.format(workspace_dir), n_sample=100)

# Create dataloader.
BATCH_SIZE = 2
EPOCH_SIZE = 4

# Use the first CUDA device when present, otherwise fall back to CPU so the
# cell does not crash on machines without a GPU.
_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# NOTE(review): the label tensors are created on `_device`, while the tensors
# returned by `preprocessing` are built with plain `torch.tensor(...)` and no
# device argument. Batches therefore mix devices; confirm the training loop
# moves the input ids and masks explicitly.
train_label_prom = torch.tensor(train_label_prom, device=_device)
train_label_ss = torch.tensor(train_label_ss, device=_device)
train_label_polya = torch.tensor(train_label_polya, device=_device)

train_inputs_ids, train_masks = preprocessing(train_seq, tokenizer)
train_data = TensorDataset(train_inputs_ids, train_masks, train_label_prom, train_label_ss, train_label_polya)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

val_label_prom = torch.tensor(val_label_prom, device=_device)
val_label_ss = torch.tensor(val_label_ss, device=_device)
val_label_polya = torch.tensor(val_label_polya, device=_device)

val_input_ids, val_masks = preprocessing(validation_seq, tokenizer)
val_data = TensorDataset(val_input_ids, val_masks, val_label_prom, val_label_ss, val_label_polya)
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)

print('# of training data: {}'.format(len(train_seq)))
# The second count is the validation set; it was previously mislabelled as training data.
print('# of validation data: {}'.format(len(validation_seq)))

Seq length = 510 [500000/500000]
Seq length = 510 [100/100]
# of training data: 500000
# of training data: 100


In [1]:
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss
import numpy as np
import os

from multitask_learning import init_model_mtl
from transformers import BertForMaskedLM
from data_dir import pretrained_3kmer_dir

# Earlier construction kept for reference:
# model = MTModel(shared_parameters=shared_parameter, promoter_head=promoter_head, polya_head=polya_head, splice_site_head=splice_head)
# Build the multitask model from the pretrained 3-mer checkpoint directory,
# passing head="bert" and the JSON config located at models/config/mtl.json.
model = init_model_mtl(pretrained_3kmer_dir, head="bert", config=os.path.join("models", "config", "mtl.json"))
# Optimise every model parameter with AdamW (learning rate 5e-5, epsilon 1e-8).
# NOTE(review): `AdamW` is imported from `transformers` in this cell; newer
# transformers releases deprecate that class in favour of `torch.optim.AdamW`
# — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

In [2]:
import torch
import datetime
import os
from data_preparation import kmer

# Timestamp taken when the cell runs.
_now = datetime.datetime.now()

# Path of the CSV log file for this notebook session.
_log_file = os.path.join('logs', 'notebooks', '2022-02.24.csv')
# Create only the parent directory. Calling makedirs on the file path itself
# creates a *directory* named '2022-02.24.csv', which then prevents the log
# file from ever being opened for writing.
os.makedirs(os.path.dirname(_log_file), exist_ok=True)

# Toy input: a single synthetic 512-character DNA string ("ATGC" repeated 128
# times). The three-sequence variant is kept commented out, here and in the
# trailing comments of the label lists below.
#seqs = ["ATGC" * 128, "GATC" * 128, "CCAT" * 128]
seqs = ["ATGC" * 128]
# Turn each raw sequence into a space-separated string of the pieces returned by kmer(s, 3).
seqs = [' '.join(kmer(s, 3)) for s in seqs]
# One label per sequence for each task (promoter, splice site, poly-A).
prom_labels = [1] #, 0, 0]
ss_labels = [0] #, 1, 0]
polya_labels = [0] #, 0, 1]

def _format_prom_label(label):
    """
    Wrap a promoter label in a one-element list.

    Collecting these lists and converting them with `torch.tensor` yields a
    tensor with one column per row rather than a flat vector.
    """
    return [label]

def _format_other_label(label):
    """
    Return the label unchanged.

    Used for the splice-site and poly-A labels, which are stored as plain
    values rather than wrapped in a list.
    """
    return label

"""
Initialize BERT tokenizer.
"""
from transformers import BertTokenizer
from data_dir import pretrained_3kmer_dir
import torch
# Tokenizer loaded from the pretrained 3-mer checkpoint directory.
tokenizer = BertTokenizer.from_pretrained(pretrained_3kmer_dir)

# Per-sequence accumulators; each is converted to a tensor after the loop.
arr_input_ids = []
arr_attention_mask = []
arr_prom_label = []
arr_ss_label = []
arr_polya_label = []
# Walk the sequences by index so the three label lists can be read at the same
# position; a label list shorter than `seqs` raises IndexError here.
for i in range(len(seqs)):
    s = seqs[i]
    prom = prom_labels[i]
    ss = ss_labels[i]
    polya = polya_labels[i]

    # Encode one k-mer string, requesting 'max_length' padding and the attention mask.
    encoded = tokenizer.encode_plus(text=s, padding="max_length", return_attention_mask=True)
    arr_input_ids.append(encoded.get('input_ids'))
    arr_attention_mask.append(encoded.get('attention_mask'))
    # Promoter labels are wrapped in a list; the other two are stored as-is.
    arr_prom_label.append(_format_prom_label(prom))
    arr_ss_label.append(_format_other_label(ss))
    arr_polya_label.append(_format_other_label(polya))
#endfor
arr_input_ids = torch.tensor(arr_input_ids)
arr_attention_mask = torch.tensor(arr_attention_mask)
# From here on the *_labels names hold tensors instead of the Python lists
# defined earlier in this cell, so re-running only this part of the cell
# iterates over tensors rather than lists.
prom_labels = torch.tensor(arr_prom_label)
ss_labels = torch.tensor(arr_ss_label)
polya_labels = torch.tensor(arr_polya_label)

from torch.utils.data import DataLoader, TensorDataset
# Bundle ids, masks and the three label tensors; batches are served one sample at a time.
dataset = TensorDataset(arr_input_ids, arr_attention_mask, prom_labels, ss_labels, polya_labels)
dataloader = DataLoader(dataset, batch_size=1)

In [18]:
import torch
# Scratch check: reshape a one-element 1-D tensor to shape (1, 1).
target = torch.tensor([1])
# The reshaped tensor is not assigned; it is only displayed as the cell's last
# expression (the recorded output is tensor([[1]])). `target` itself stays 1-D.
target.view(1, 1)

tensor([[1]])

In [22]:
from tqdm import tqdm
import torch
from torch.nn import CrossEntropyLoss, BCELoss
from utils.data_generator import _data_generator_mtl
from multitask_learning import init_model_mtl
from data_dir import pretrained_3kmer_dir

# Debug pass over the generated multitask batches: run the shared encoder and
# the three task heads, print predictions next to labels and losses, and report
# per-task accuracy. No backward pass or optimizer step is performed here.
dataloader = _data_generator_mtl()
model = init_model_mtl(pretrained_3kmer_dir)
crossentropy_function = CrossEntropyLoss()
binary_crossentropy_function = BCELoss()
model.zero_grad()
model.train()
count_prom_correct = 0
count_ss_correct = 0
count_polya_correct = 0
# Number of samples seen. Accuracy is divided by this rather than by
# len(dataloader), which counts batches and only matches for batch size 1.
count_samples = 0
for step, batch in enumerate(dataloader):
    in_ids, attn_mask, label_prom, label_ss, label_polya = batch
    bert_output = model.shared_layer(in_ids, attn_mask)

    pred_prom = model.promoter_layer(bert_output)
    pred_ss = model.splice_site_layer(bert_output)
    pred_polya = model.polya_layer(bert_output)

    print(pred_prom, label_prom)
    print(pred_ss, label_ss)
    print(pred_polya, label_polya)

    # BCELoss needs a float target shaped like the prediction. The recorded
    # output shows the promoter prediction as one column per sample, so reshape
    # to (-1, 1); the previous view(1, 1) only worked for batch size 1.
    label_prom = label_prom.view(-1, 1).to(torch.float32)
    loss_prom = binary_crossentropy_function(pred_prom, label_prom)
    # NOTE(review): the recorded output shows the splice-site and poly-A
    # predictions carrying grad_fn=<SoftmaxBackward0>, i.e. the heads appear to
    # return probabilities. CrossEntropyLoss applies log-softmax itself, so
    # confirm whether the heads should return raw logits instead.
    loss_ss = crossentropy_function(pred_ss, label_ss)
    loss_polya = crossentropy_function(pred_polya, label_polya)

    print(pred_prom, label_prom, loss_prom)
    print(pred_ss, label_ss, loss_ss)
    print(pred_polya, label_polya, loss_polya)

    # Accuracy bookkeeping, written to work for any batch size.
    count_samples += label_prom.size(0)
    # Promoter: a single probability per sample, rounded to 0/1.
    count_prom_correct += (torch.round(pred_prom) == label_prom).sum().item()
    # Splice site and poly-A: the predicted class is the index of the largest score.
    count_ss_correct += (torch.argmax(pred_ss, dim=1) == label_ss).sum().item()
    count_polya_correct += (torch.argmax(pred_polya, dim=1) == label_polya).sum().item()

# Guard against an empty dataloader so the report never divides by zero.
_denominator = max(count_samples, 1)
print('accuracy prom: {}'.format(count_prom_correct / _denominator * 100))
print('accuracy ss: {}'.format(count_ss_correct / _denominator * 100))
print('accuracy polya: {}'.format(count_polya_correct / _denominator * 100))


Data sample 3
tensor([[0.4280]], grad_fn=<SigmoidBackward0>) tensor([1])
tensor([[0.4357, 0.5643]], grad_fn=<SoftmaxBackward0>) tensor([0])
tensor([[0.5731, 0.4269]], grad_fn=<SoftmaxBackward0>) tensor([0])


TypeError: 'torch.dtype' object is not callable

In [21]:
from data_dir import pretrained_3kmer_dir
from transformers import BertForMaskedLM
from torch import nn

# Inspect the encoder outputs for every batch without tracking gradients.
# NOTE(review): this cell relies on `model`, `dataloader` and `torch` already
# existing in the kernel — none of them is defined or imported in this cell.
model.eval()
with torch.no_grad():
    for step, batch in enumerate(dataloader):
        input_ids, attn_mask, label_prom, label_ss, label_polya = tuple(t for t in batch)
        # Request every layer's hidden states and the attention weights in addition to the final output.
        bert_output = model.bert(input_ids, attention_mask=attn_mask, output_hidden_states=True, output_attentions=True)
        hidden_states = bert_output.hidden_states
        print(bert_output.last_hidden_state)
        # In the recorded output the entry at index 12 prints the same values as last_hidden_state.
        print(hidden_states[12])
        #dropout = nn.Dropout(p=0.1)(hidden_states[1])

        #print(dropout)
        #print(bert_output.last_hidden_state)
        #print(bert_output.pooler_output)
        #print(bert_output.hidden_states)
        #print(len(bert_output[1]))
        #print(len(bert_output[2]))


tensor([[[-0.4187,  0.8725,  0.9659,  ..., -0.0091,  0.8986, -1.3403],
         [-1.4170, -0.9827, -0.7155,  ...,  0.0354,  0.6974, -0.8663],
         [-0.6982, -0.7672,  1.1986,  ..., -1.6839,  0.2338, -0.5892],
         ...,
         [-1.3530, -0.8583, -0.9080,  ...,  0.0355,  0.7268, -0.8151],
         [-0.7084, -0.7193,  1.1986,  ..., -1.6446,  0.1983, -0.5487],
         [-0.4187,  0.8725,  0.9659,  ..., -0.0091,  0.8986, -1.3403]]])
tensor([[[-0.4187,  0.8725,  0.9659,  ..., -0.0091,  0.8986, -1.3403],
         [-1.4170, -0.9827, -0.7155,  ...,  0.0354,  0.6974, -0.8663],
         [-0.6982, -0.7672,  1.1986,  ..., -1.6839,  0.2338, -0.5892],
         ...,
         [-1.3530, -0.8583, -0.9080,  ...,  0.0355,  0.7268, -0.8151],
         [-0.7084, -0.7193,  1.1986,  ..., -1.6446,  0.1983, -0.5487],
         [-0.4187,  0.8725,  0.9659,  ..., -0.0091,  0.8986, -1.3403]]])


In [16]:
from transformers import BertForMaskedLM
from data_dir import pretrained_3kmer_dir

# Load the pretrained 3-mer checkpoint as a masked-LM model and print its
# module tree to inspect the layer layout.
bert_for_masked_lm = BertForMaskedLM.from_pretrained(pretrained_3kmer_dir)
print(bert_for_masked_lm)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(69, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)

In [15]:
from transformers import BertForSequenceClassification
from data_dir import pretrained_3kmer_dir

# Load the same checkpoint as a sequence-classification model and print its
# module tree. The recorded warning reports that the checkpoint's
# `cls.predictions.*` weights are not used and that some weights of this model
# are not initialized from the checkpoint.
bert_for_seq_cls = BertForSequenceClassification.from_pretrained(pretrained_3kmer_dir)
print(bert_for_seq_cls)

Some weights of the model checkpoint at pretrained\3-new-12w-0 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at pretrained\3-new-12w-0

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(69, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [127]:
from torch.nn import BCELoss, CrossEntropyLoss
from torch import tensor
# Toy comparison of the two loss functions used for the task heads.
bce = BCELoss()
cross = CrossEntropyLoss()

# Binary head: one probability per sample against a float target of the same shape.
prom_pred, prom_target = tensor([[0.2]]), tensor([[0.]])
# Two-class head: one score per class against an integer class index.
other_pred, other_target = tensor([[0.1, 1.5]]), tensor([0])

# Print both losses, one per line.
print(bce(prom_pred, prom_target), cross(other_pred, other_target), sep='\n')

tensor(0.2231)
tensor(1.6204)


In [None]:
import torch
import datetime
import os
from data_preparation import kmer
from transformers import BertTokenizer
# NOTE(review): an incomplete `from torch` statement stood here and made the
# whole cell a SyntaxError. It named nothing to import and nothing below needs
# an extra torch name, so it was dropped.
from data_dir import pretrained_3kmer_dir

# Timestamp taken when the cell runs.
_now = datetime.datetime.now()

# Path of the CSV log file; only its parent directory is created. Calling
# makedirs on the file path itself would create a directory named like the file.
_log_file = os.path.join('logs', 'notebooks', '2022-02.24.csv')
os.makedirs(os.path.dirname(_log_file), exist_ok=True)

# Three synthetic 512-character sequences, each turned into a space-separated
# string of the pieces returned by kmer(s, 3), with one label per task.
seqs = ["ATGC" * 128, "GATC" * 128, "CCAT" * 128]
seqs = [' '.join(kmer(s, 3)) for s in seqs]
prom_labels = [1, 0, 0]
ss_labels = [0, 1, 0]
polya_labels = [0, 0, 1]

tokenizer = BertTokenizer.from_pretrained(pretrained_3kmer_dir)
# Encode the first two sequences together as a sentence pair and show the
# resulting encoding and tokens.
# NOTE(review): no truncation is requested and the printed model summaries in
# this notebook show 512 position embeddings — check that the combined pair
# fits before feeding this encoding to the model.
encoded = tokenizer(seqs[0], seqs[1])
print(encoded)
print(tokenizer.convert_ids_to_tokens(encoded['input_ids']))



In [None]:
from multitask_learning import init_model_mtl
from data_dir import pretrained_3kmer_dir
import os

# Build the multitask model from the pretrained 3-mer checkpoint directory,
# passing head="bert" and the JSON config located at models/config/mtl.json.
model_mtl_bert = init_model_mtl(pretrained_3kmer_dir, head="bert", config=os.path.join('models', 'config', 'mtl.json'))
# NOTE(review): the model is called with no arguments. Other cells in this
# notebook pass input ids and an attention mask to the model's layers — confirm
# what the forward call expects; this cell has no recorded execution.
preds = model_mtl_bert()

In [4]:
# Scratch check of zip(): pair each title with the value at the same position.
titles = ['percentage', 'percentile']
tuples = ('0.1', 0.2)

for a, b in zip(titles, tuples):
    # Same text as print(a, b): both values rendered with str(), separated by one space.
    print("{} {}".format(a, b))

percentage 0.1
percentile 0.2
