In [None]:
from transformers import BertForTokenClassification, BertModel
from data_dir import pretrained_3kmer_dir

# Load the checkpoint stored at `pretrained_3kmer_dir` into a BERT model with a
# token-classification head.
bertForTokenClassification = BertForTokenClassification.from_pretrained(
    pretrained_3kmer_dir
)

In [4]:
from data_dir import chr24_index_csv, chr24_fasta, labseq_dir, labseq_names
# Inputs for chromosome 24, held as parallel one-element lists.
chr_indices = [chr24_index_csv]
chr_fastas = [chr24_fasta]
# Only the last entry of `labseq_names` is used for the output path.
chr_labseq_path = ["{}/{}".format(labseq_dir, labseq_names[-1])]

# Show the three path lists, one per line.
for listing in (chr_indices, chr_fastas, chr_labseq_path):
    print(listing)

['./data/genome/grch38/exon/NC_000024.10.csv']
['./data/chr/NC_000024.10.fasta']
['./data/genome/labseq/chr24.csv']


In [5]:
from data_dir import chr24_index_csv, chr24_fasta, labseq_dir, labseq_names
from data_preparation import generate_sequence_labelling
# Parallel one-element lists: index CSV, FASTA file and output path for chr24.
chr_indices, chr_fastas = [chr24_index_csv], [chr24_fasta]
chr_labseq_path = ["{}/{}".format(labseq_dir, labseq_names[-1])]

report = "Generating sequential labelling for index {}, from fasta {}, to {}: {}"
for src, fasta, target in zip(chr_indices, chr_fastas, chr_labseq_path):
    # Run the generation first, then report its return value together with
    # the paths it was given.
    outcome = generate_sequence_labelling(src, fasta, target, do_expand=True, expand_size=512)
    print(report.format(src, fasta, target, outcome))

Processing index ./data/genome/grch38/exon/NC_000024.10.csv, with fasta ./data/chr/NC_000024.10.fasta, to seq. labelling ./data/genome/labseq/chr24.csv, expanding [5431760/57226904]

In [1]:
from transformers import BertTokenizer
from data_dir import pretrained_3kmer_dir
from sequential_labelling import preprocessing

# `randint` stays imported so any cell that already relies on it keeps working.
from random import Random, randint
from data_preparation import kmer
from sequential_labelling import process_sequence_and_label, create_dataloader

# Initialize tokenizer from the pretrained 3-mer checkpoint directory.
tokenizer = BertTokenizer.from_pretrained(pretrained_3kmer_dir)

# Create sample data for sequential labelling: four 512-character sequences,
# each with one random label ('E' or '.') per character.
# A dedicated, seeded generator makes the sample labels identical on every
# "Restart & Run All" without touching the global `random` state.
SAMPLE_LABEL_SEED = 42
label_rng = Random(SAMPLE_LABEL_SEED)

sequences = ['ATGC' * 128, 'TGAC' * 128, 'GATC' * 128, "AGCC" * 128]
labels = [
    ['E' if label_rng.randint(0, 255) % 2 == 0 else '.' for _ in range(len(s))]
    for s in sequences
]

# Both the bases and the per-base labels go through `kmer(..., 3)` and are
# joined with spaces.
# NOTE(review): presumably `kmer` yields 3-character windows -- confirm in
# data_preparation.
kmer_seq = [' '.join(kmer(sequence, 3)) for sequence in sequences]
kmer_label = [' '.join(kmer(''.join(label), 3)) for label in labels]

# Collect the four per-sample outputs of `process_sequence_and_label` into
# parallel lists, in the order `create_dataloader` receives them below.
arr_input_ids = []
arr_attn_mask = []
arr_label_repr = []
arr_token_type_ids = []
for seq, label in zip(kmer_seq, kmer_label):
    input_ids, attn_mask, token_type_ids, label_repr = process_sequence_and_label(seq, label, tokenizer)
    arr_input_ids.append(input_ids)
    arr_attn_mask.append(attn_mask)
    arr_token_type_ids.append(token_type_ids)
    arr_label_repr.append(label_repr)

dataloader = create_dataloader(arr_input_ids, arr_attn_mask, arr_token_type_ids, arr_label_repr, batch_size=1)

In [3]:
from data_dir import pretrained_3kmer_dir
from transformers import BertForMaskedLM
from models.seq2seq import Seq2SeqHead
from tqdm import tqdm
from torch.nn import Softmax, CrossEntropyLoss, NLLLoss
from torch.cuda import empty_cache
# Forward-pass smoke test: run every batch of `dataloader` through the BERT
# encoder plus a Seq2SeqHead and print the per-batch cross-entropy loss.
model = BertForMaskedLM.from_pretrained(pretrained_3kmer_dir)
# Only the encoder sub-module is used; the masked-LM head is never called here.
bert = model.bert
bert = bert.to('cuda')
# NOTE(review): [768, 10] presumably means 768 input features and 10 output
# classes -- confirm against models.seq2seq.Seq2SeqHead.
seq2seq = Seq2SeqHead([768, 10])
seq2seq.to('cuda')
# Softmax over the last axis of the head output; used for display only.
softmax = Softmax(dim=2)
loss_func = CrossEntropyLoss()
for step, batch in enumerate(dataloader):
    input_ids, attn_mask, token_type_ids, label = tuple(t.to('cuda') for t in batch)
    output_bert = bert(input_ids, attention_mask=attn_mask, token_type_ids=token_type_ids)
    # First element of the encoder output is fed to the head.
    output_seq2seq = seq2seq(output_bert[0])
    output = softmax(output_seq2seq)

    loss = 0
    for logits, probs, target in zip(output_seq2seq, output, label):
        print(probs, target)
        # CrossEntropyLoss applies log-softmax itself, so it must receive the
        # raw head output. Passing the already-normalised `probs` would apply
        # softmax twice and flatten the loss.
        loss += loss_func(logits, target)
    print(loss)

    # Release cached GPU memory between batches.
    empty_cache()
    

tensor([[0.0957, 0.1316, 0.1041,  ..., 0.0761, 0.1100, 0.1061],
        [0.0928, 0.0975, 0.0976,  ..., 0.0714, 0.1303, 0.0872],
        [0.0953, 0.1054, 0.1036,  ..., 0.0807, 0.1185, 0.0926],
        ...,
        [0.0862, 0.0936, 0.0937,  ..., 0.1239, 0.1213, 0.0837],
        [0.0955, 0.1037, 0.1038,  ..., 0.0809, 0.1166, 0.0928],
        [0.0995, 0.1367, 0.1081,  ..., 0.0790, 0.0755, 0.1102]],
       device='cuda:0', grad_fn=<UnbindBackward0>) tensor([0, 3, 4, 1, 1, 1, 1, 1, 2, 3, 4, 2, 3, 4, 1, 1, 2, 3, 7, 3, 4, 1, 1, 1,
        2, 5, 6, 4, 2, 3, 7, 3, 4, 1, 2, 5, 6, 4, 2, 5, 6, 4, 1, 2, 5, 8, 8, 8,
        6, 7, 3, 4, 1, 2, 3, 4, 2, 5, 6, 4, 2, 5, 6, 7, 3, 4, 1, 1, 2, 3, 4, 2,
        3, 4, 1, 2, 5, 6, 4, 2, 5, 8, 6, 4, 1, 1, 1, 1, 1, 2, 5, 8, 8, 8, 8, 8,
        6, 7, 3, 4, 2, 3, 4, 1, 2, 5, 6, 7, 5, 8, 8, 8, 6, 4, 2, 3, 4, 1, 1, 1,
        1, 1, 2, 5, 8, 8, 8, 6, 7, 3, 7, 5, 6, 4, 1, 1, 2, 3, 7, 5, 8, 6, 7, 3,
        7, 3, 4, 1, 1, 1, 2, 3, 7, 3, 7, 5, 6, 7, 3, 4, 1, 2, 3, 7, 3, 

In [4]:
from sequential_labelling import DNABERTSeq2Seq, train, init_adamw_optimizer
from transformers import get_linear_schedule_with_warmup
from data_dir import pretrained_3kmer_dir
import os
# Run configuration: schedule length, device and output locations.
epoch_size = 10
warmup = 10
device = "cpu"
log_path = os.path.join("logs", "sample_log", "seq2seq", "log.2022-03-03.txt")
save_path = os.path.join("result", "samples", "seq2seq", "2022-03-03")

# Model, optimizer and a linear warmup/decay schedule spanning
# `epoch_size` passes over `dataloader`.
model = DNABERTSeq2Seq(pretrained_3kmer_dir)
optimizer = init_adamw_optimizer(model.parameters())
training_steps = epoch_size * len(dataloader)
optim_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup,
    num_training_steps=training_steps,
)
model.to(device)
model.train()

# Play with result.
# NOTE(review): the positional 10 looks like the epoch count (it equals
# `epoch_size`); the meaning of 2 is not visible here -- confirm both against
# the signature of sequential_labelling.train.
trained_model = train(model, optimizer, optim_scheduler, dataloader, 10, 2, log_path, save_path, device)

100%|██████████| 4/4 [00:15<00:00,  3.83s/it]
100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
100%|██████████| 4/4 [00:12<00:00,  3.24s/it]
100%|██████████| 4/4 [00:13<00:00,  3.33s/it]
100%|██████████| 4/4 [00:13<00:00,  3.39s/it]
100%|██████████| 4/4 [00:13<00:00,  3.38s/it]
100%|██████████| 4/4 [00:13<00:00,  3.40s/it]
100%|██████████| 4/4 [00:12<00:00,  3.10s/it]
100%|██████████| 4/4 [00:12<00:00,  3.12s/it]
100%|██████████| 4/4 [00:12<00:00,  3.10s/it]


Some weights of the model checkpoint at pretrained\3-new-12w-0 were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at pretrained\3-new-12w-0 and are new

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(69, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a