In [None]:
from transformers import BertForTokenClassification, BertModel
from data_dir import pretrained_3kmer_dir

# Instantiate the token-classification BERT head from the checkpoint stored in
# the directory that `pretrained_3kmer_dir` points to.
bertForTokenClassification = BertForTokenClassification.from_pretrained(pretrained_3kmer_dir)

In [4]:
from data_dir import chr24_index_csv, chr24_fasta, labseq_dir, labseq_names
# Single-chromosome configuration: one index CSV, one FASTA file, and one
# labelled-sequence path built from the last entry of `labseq_names`.
chr_indices, chr_fastas = [chr24_index_csv], [chr24_fasta]
chr_labseq_path = ["{}/{}".format(labseq_dir, labseq_names[-1])]

# Show the three lists, one per line.
print(chr_indices, chr_fastas, chr_labseq_path, sep="\n")

['./data/genome/grch38/exon/NC_000024.10.csv']
['./data/chr/NC_000024.10.fasta']
['./data/genome/labseq/chr24.csv']


In [5]:
from data_dir import chr24_index_csv, chr24_fasta, labseq_dir, labseq_names
from data_preparation import generate_sequence_labelling
# Inputs for the labelling run: index CSV, FASTA source and output CSV,
# one entry each (the output name is the last item of `labseq_names`).
chr_indices, chr_fastas = [chr24_index_csv], [chr24_fasta]
chr_labseq_path = ["{}/{}".format(labseq_dir, labseq_names[-1])]

report = "Generating sequential labelling for index {}, from fasta {}, to {}: {}"
for src, fasta, target in zip(chr_indices, chr_fastas, chr_labseq_path):
    # Run the generation first, then report its return value together with the paths.
    outcome = generate_sequence_labelling(src, fasta, target, do_expand=True, expand_size=512)
    print(report.format(src, fasta, target, outcome))

Processing index ./data/genome/grch38/exon/NC_000024.10.csv, with fasta ./data/chr/NC_000024.10.fasta, to seq. labelling ./data/genome/labseq/chr24.csv, expanding [5431760/57226904]

In [1]:
from transformers import BertTokenizer
from data_dir import pretrained_3kmer_dir
from sequential_labelling import preprocessing, initialize_seq2seq

# ---------------------------------------------------------------------------
# Initialize model and tokenizer.
# Both are created from the same pretrained directory.
# ---------------------------------------------------------------------------
tokenizer = BertTokenizer.from_pretrained(pretrained_3kmer_dir)

# Dimension list handed to initialize_seq2seq: 768 followed by three 512s.
# NOTE(review): presumably 768 is the encoder hidden size and the rest are
# head layer widths — confirm against initialize_seq2seq.
in_out_dimensions = [768] + [512] * 3
model = initialize_seq2seq(pretrained_3kmer_dir, in_out_dimensions)

"""
Create sample data sequential labelling.
"""
from random import randint
from data_preparation import kmer
from sequential_labelling import process_sequence_and_label, create_dataloader
# Use a private, seeded generator so the synthetic labels (and therefore every
# tensor and loss value printed further down) are identical on each
# Restart & Run All, without touching the global `random` state.
from random import Random

SAMPLE_SEED = 42
rng = Random(SAMPLE_SEED)

# Three synthetic sequences: a 4-letter motif repeated 128 times (512 bases each).
sequences = ['ATGC' * 128, 'TGAC' * 128, 'GATC' * 128]

# One random label character per base, 'E' or '.' with equal probability.
# NOTE(review): 'E' presumably marks exon positions — confirm with data_preparation.
labels = [[rng.choice('E.') for _ in s] for s in sequences]

# Tokenise bases and labels the same way: kmer(..., 3) pieces joined by spaces.
kmer_seq = [' '.join(kmer(sequence, 3)) for sequence in sequences]
kmer_label = [' '.join(kmer(''.join(label), 3)) for label in labels]

# Encode every (sequence, label) pair and collect the three outputs per sample.
arr_input_ids, arr_attn_mask, arr_label_repr = [], [], []
for seq, label in zip(kmer_seq, kmer_label):
    input_ids, attn_mask, label_repr = process_sequence_and_label(seq, label, tokenizer)
    arr_input_ids.append(input_ids)
    arr_attn_mask.append(attn_mask)
    arr_label_repr.append(label_repr)

# One sample per batch.
dataloader = create_dataloader(arr_input_ids, arr_attn_mask, arr_label_repr, batch_size=1)

In [3]:
"""
Play with result.
"""
from tqdm import tqdm
# Walk the dataloader once and unpack each batch into its three parts;
# nothing is computed here, the loop only checks that batches unpack cleanly.
n_batches = len(dataloader)
for step, batch in tqdm(enumerate(dataloader), total=n_batches):
    input_ids, attn_mask, label_repr = tuple(batch)
    

100%|██████████| 3/3 [00:00<00:00, 230.68it/s]

tensor([[0, 7, 5, 6, 7, 3, 7, 3, 4, 2, 5, 6, 4, 1, 2, 5, 8, 6, 4, 1, 1, 2, 5, 6,
         7, 5, 6, 4, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 1, 1, 2, 3, 4, 1, 1, 2, 3, 4,
         2, 5, 8, 8, 6, 7, 5, 6, 7, 5, 8, 8, 8, 6, 7, 5, 6, 4, 2, 3, 4, 1, 1, 1,
         2, 3, 7, 5, 8, 6, 4, 1, 1, 1, 1, 2, 5, 8, 8, 6, 4, 2, 3, 7, 3, 4, 2, 3,
         4, 1, 1, 2, 5, 8, 6, 7, 3, 4, 1, 2, 5, 8, 6, 4, 1, 1, 1, 2, 5, 8, 6, 7,
         5, 6, 7, 5, 8, 8, 6, 4, 1, 1, 1, 2, 3, 4, 2, 5, 8, 6, 4, 1, 2, 3, 7, 5,
         6, 7, 5, 6, 7, 5, 6, 4, 1, 2, 3, 4, 1, 1, 2, 3, 7, 5, 6, 4, 2, 5, 8, 8,
         6, 4, 2, 5, 8, 6, 4, 2, 3, 7, 3, 7, 5, 8, 8, 6, 7, 5, 8, 8, 6, 7, 5, 6,
         7, 5, 6, 4, 1, 1, 1, 2, 3, 7, 5, 8, 8, 8, 8, 6, 4, 1, 2, 5, 8, 6, 7, 5,
         6, 4, 1, 2, 3, 4, 2, 5, 6, 4, 2, 3, 4, 1, 1, 1, 1, 2, 5, 6, 4, 2, 3, 4,
         2, 3, 7, 5, 8, 8, 8, 8, 6, 7, 3, 7, 3, 7, 3, 7, 3, 7, 5, 6, 4, 2, 5, 6,
         7, 3, 4, 2, 3, 7, 3, 4, 2, 3, 4, 1, 2, 3, 4, 2, 5, 6, 7, 3, 7, 5, 8, 8,
         8, 8, 6, 4, 2, 3, 7




In [60]:

# print(model)


In [65]:
# `nn` is not imported by any earlier cell, so import it here; otherwise this
# cell raises NameError on a fresh kernel.
from torch import nn

# Smoke test: run every sample batch through the model, print a slice of the
# output next to the matching slice of the labels, then compute a loss and
# back-propagate it.
model.eval()  # evaluation mode; autograd is still active, so backward() works

# The criterion does not change between iterations, so build it once.
loss_func = nn.CrossEntropyLoss()

for step, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
    input_ids, attention_mask, label = batch
    output = model(input_ids, attention_mask)
    print(output[0][1:5])
    print(label[0][1:5])
    # NOTE(review): with floating-point targets CrossEntropyLoss treats the
    # target as per-class probabilities, yet the labels printed above look like
    # integer class ids. The resulting loss is probably not the intended
    # objective. Switching to class-index targets needs integer labels and an
    # output with a class dimension — confirm the model's output shape first.
    loss = loss_func(output, label.float())
    print(loss)
    # No optimizer step follows; gradients simply accumulate across batches.
    loss.backward()

  0%|          | 0/3 [00:00<?, ?it/s]

tensor([0., 0., 0., 0.], grad_fn=<SliceBackward0>)
tensor([3., 7., 3., 7.], dtype=torch.float64)
tensor(14461.8037, grad_fn=<DivBackward1>)


 33%|███▎      | 1/3 [00:03<00:06,  3.03s/it]

tensor([0.0000, 0.0110, 0.0611, 0.0000], grad_fn=<SliceBackward0>)
tensor([7., 3., 7., 5.], dtype=torch.float64)
tensor(13898.5840, grad_fn=<DivBackward1>)


 67%|██████▋   | 2/3 [00:05<00:02,  2.98s/it]

tensor([0.0000, 0.0648, 0.0000, 0.0000], grad_fn=<SliceBackward0>)
tensor([5., 8., 6., 7.], dtype=torch.float64)
tensor(15300.6055, grad_fn=<DivBackward1>)


100%|██████████| 3/3 [00:09<00:00,  3.00s/it]
