In [None]:
from transformers import BertForTokenClassification, BertModel
from data_dir import pretrained_3kmer_dir

bertForTokenClassification = BertForTokenClassification.from_pretrained(pretrained_3kmer_dir)

In [4]:
from data_dir import chr24_index_csv, chr24_fasta, labseq_dir, labseq_names
chr_indices = [chr24_index_csv]
chr_fastas = [chr24_fasta]
chr_labseq_path = ["{}/{}".format(labseq_dir, fname) for fname in [labseq_names[-1]]]
print(chr_indices)
print(chr_fastas)
print(chr_labseq_path)

['./data/genome/grch38/exon/NC_000024.10.csv']
['./data/chr/NC_000024.10.fasta']
['./data/genome/labseq/chr24.csv']


In [5]:
from data_dir import chr24_index_csv, chr24_fasta, labseq_dir, labseq_names
from data_preparation import generate_sequence_labelling
chr_indices = [chr24_index_csv]
chr_fastas = [chr24_fasta]
chr_labseq_path = ["{}/{}".format(labseq_dir, fname) for fname in [labseq_names[-1]]]
for src, fasta, target in zip(chr_indices, chr_fastas, chr_labseq_path):
    print("Generating sequential labelling for index {}, from fasta {}, to {}: {}".format(src, fasta, target, generate_sequence_labelling(src, fasta, target, do_expand=True, expand_size=512)))

Processing index ./data/genome/grch38/exon/NC_000024.10.csv, with fasta ./data/chr/NC_000024.10.fasta, to seq. labelling ./data/genome/labseq/chr24.csv, expanding [5431760/57226904]

In [2]:
from transformers import BertTokenizer
from data_dir import pretrained_3kmer_dir
from sequential_labelling import preprocessing

"""
Initialize tokenizer.
"""
tokenizer = BertTokenizer.from_pretrained(pretrained_3kmer_dir)

"""
Create sample data sequential labelling.
"""
from random import randint
from data_preparation import kmer
from sequential_labelling import process_sequence_and_label, create_dataloader
sequences = ['ATGC' * 128, 'TGAC' * 128, 'GATC' * 128, "AGCC" * 128]
labels = [['E' if randint(0, 255) % 2 == 0 else '.' for i in range(len(s))] for s in sequences]

kmer_seq = [' '.join(kmer(sequence, 3)) for sequence in sequences]
kmer_label = [' '.join(kmer(''.join(label), 3)) for label in labels]

arr_input_ids = []
arr_attn_mask = []
arr_label_repr = []
for seq, label in zip(kmer_seq, kmer_label):
    input_ids, attn_mask, label_repr = process_sequence_and_label(seq, label, tokenizer)
    arr_input_ids.append(input_ids)
    arr_attn_mask.append(attn_mask)
    arr_label_repr.append(label_repr)

dataloader = create_dataloader(arr_input_ids, arr_attn_mask, arr_label_repr, batch_size=1)

In [3]:
from sequential_labelling import DNABERTSeq2Seq, train, init_adamw_optimizer
from transformers import get_linear_schedule_with_warmup
from data_dir import pretrained_3kmer_dir
import os
epoch_size = 10
warmup = 10
model = DNABERTSeq2Seq(pretrained_3kmer_dir)
optimizer = init_adamw_optimizer(model.parameters())
training_steps = len(dataloader) * epoch_size
optim_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup, num_training_steps=training_steps)
device = "cpu"
log_path = os.path.join("logs", "sample_log", "seq2seq", "log.2022-03-34.txt")
save_path = os.path.join("result", "samples", "seq2seq", "2022-03-34")
model.to(device)
model.train()
"""
Play with result.
"""
trained_model = train(model, optimizer, optim_scheduler, dataloader, 10, 2, log_path, save_path, device)

100%|██████████| 4/4 [00:14<00:00,  3.52s/it]
100%|██████████| 4/4 [00:14<00:00,  3.69s/it]
100%|██████████| 4/4 [00:14<00:00,  3.59s/it]
100%|██████████| 4/4 [00:14<00:00,  3.58s/it]
100%|██████████| 4/4 [00:14<00:00,  3.60s/it]
100%|██████████| 4/4 [00:13<00:00,  3.43s/it]
100%|██████████| 4/4 [00:13<00:00,  3.44s/it]
100%|██████████| 4/4 [00:13<00:00,  3.38s/it]
100%|██████████| 4/4 [00:14<00:00,  3.57s/it]
100%|██████████| 4/4 [00:14<00:00,  3.53s/it]


In [None]:
100%|██████████| 2/2 [00:07<00:00,  3.58s/it]
[-0.09367253631353378, -0.0955008864402771, -0.09063666313886642, -0.09563980996608734]
[-0.09989060461521149, -0.09998640418052673, -0.1000804752111435, -0.09985078871250153]

In [3]:
import torch
from torch import nn
y_pred = torch.tensor([
    [
        [0.05, 0.95, 2],    # First token 
        [0.1, 0.8, 1.5]     # Second token
    ], # sequence
] # batch
)
y_softmax = torch.nn.Softmax(dim=2)(y_pred)
y_true = torch.tensor([[1, 0]])
print(y_softmax.shape, y_true.shape)

for p, t in zip(y_softmax, y_true):
    print(p.shape, t.shape)
    print(nn.NLLLoss()(p, t))

torch.Size([1, 2, 3]) torch.Size([1, 2])
torch.Size([2, 3]) torch.Size([2])
tensor(-0.1880)


In [85]:
import torch
loss = nn.CrossEntropyLoss()
activate = nn.Softmax(dim=2)
input = torch.randn(3, 5, 10, requires_grad=True, dtype=torch.float)
input = activate(input)
target = torch.empty(3, 5, 10, dtype=torch.float).random_(5)
output = loss(input, target)
print(input.shape, input)
print(target.shape, target)

torch.Size([3, 5, 10]) tensor([[[0.0761, 0.0309, 0.1782, 0.3150, 0.0041, 0.0667, 0.0346, 0.0324,
          0.1923, 0.0697],
         [0.0491, 0.0709, 0.1599, 0.1308, 0.0236, 0.0327, 0.2852, 0.0152,
          0.0579, 0.1746],
         [0.1077, 0.0813, 0.0841, 0.0973, 0.0452, 0.0425, 0.0752, 0.0313,
          0.1997, 0.2358],
         [0.1600, 0.1957, 0.1299, 0.1179, 0.0709, 0.0377, 0.1201, 0.0344,
          0.0467, 0.0868],
         [0.1917, 0.0216, 0.0058, 0.1390, 0.0640, 0.0325, 0.2813, 0.0407,
          0.0276, 0.1961]],

        [[0.0189, 0.2757, 0.0623, 0.0221, 0.0229, 0.1040, 0.0658, 0.0579,
          0.0653, 0.3052],
         [0.0463, 0.0238, 0.0840, 0.4257, 0.0258, 0.0374, 0.1712, 0.0945,
          0.0349, 0.0564],
         [0.0105, 0.0182, 0.2384, 0.0794, 0.0994, 0.2283, 0.1128, 0.1145,
          0.0499, 0.0485],
         [0.0164, 0.0236, 0.1261, 0.0135, 0.1104, 0.3142, 0.1054, 0.0131,
          0.1640, 0.1134],
         [0.0067, 0.0832, 0.1345, 0.1329, 0.1143, 0.0975, 0.0780, 