In [None]:
from transformers import BertForTokenClassification, BertModel
from data_dir import pretrained_3kmer_dir

# Load the token-classification BERT variant from the checkpoint location held in pretrained_3kmer_dir.
# NOTE(review): this object is not referenced by any later cell visible here — confirm it is still needed.
bertForTokenClassification = BertForTokenClassification.from_pretrained(pretrained_3kmer_dir)

In [4]:
from data_dir import chr24_index_csv, chr24_fasta, labseq_dir, labseq_names
# Single-entry lists: index CSV, FASTA source and labelled-sequence target.
chr_indices = [chr24_index_csv]
chr_fastas = [chr24_fasta]
# Only the last entry of labseq_names is used as the target file name.
chr_labseq_path = ["{}/{}".format(labseq_dir, labseq_names[-1])]
# One list per output line.
print(chr_indices, chr_fastas, chr_labseq_path, sep="\n")

['./data/genome/grch38/exon/NC_000024.10.csv']
['./data/chr/NC_000024.10.fasta']
['./data/genome/labseq/chr24.csv']


In [5]:
from data_dir import chr24_index_csv, chr24_fasta, labseq_dir, labseq_names
from data_preparation import generate_sequence_labelling
chr_indices = [chr24_index_csv]
chr_fastas = [chr24_fasta]
# Only the last entry of labseq_names is used as the target file name.
chr_labseq_path = ["{}/{}".format(labseq_dir, labseq_names[-1])]
# For every (index, fasta, target) triple, run the generator and report its return value.
# The generator call is an argument of format(), so the message is printed only after it finishes.
for src, fasta, target in zip(chr_indices, chr_fastas, chr_labseq_path):
    print(
        "Generating sequential labelling for index {}, from fasta {}, to {}: {}".format(
            src,
            fasta,
            target,
            generate_sequence_labelling(src, fasta, target, do_expand=True, expand_size=512),
        )
    )

Processing index ./data/genome/grch38/exon/NC_000024.10.csv, with fasta ./data/chr/NC_000024.10.fasta, to seq. labelling ./data/genome/labseq/chr24.csv, expanding [5431760/57226904]

In [15]:
"""
Preprocess data
"""
from torch.utils.data import DataLoader, TensorDataset
from torch import tensor
from tqdm import tqdm
from sequential_labelling import Label_Begin, Label_End
def prepare_sequence(sequence, label, tokenizer, label_dictionary, label_begin=Label_Begin, label_end=Label_End):
    """
    Turn one k-mer sequence and its k-mer label string into numeric lists.

    @param  sequence (string): a sequence in kmer format; surrounding whitespace is stripped.
    @param  label (string): label in kmer format, tokens separated by a single space.
    @param  tokenizer (BERT tokenizer): initialized BERT-related tokenizer; its `encode_plus`
            is called with `padding='max_length'`.
    @param  label_dictionary (map): object to map each label token to integer.
    @param  label_begin: token placed before the label tokens (must be a key of label_dictionary).
    @param  label_end: token placed after the label tokens (must be a key of label_dictionary).
    @return input_ids, attention_mask, label_kmer in numeric format.
            The label list holds (number of label tokens + 2) entries and is NOT padded here.
    """
    encoding = tokenizer.encode_plus(text=sequence.strip(), return_attention_mask=True, padding='max_length')

    # Surround the label tokens with the begin/end markers, then map every token to its id.
    label_tokens = [label_begin, *label.strip().split(' '), label_end]
    label_ids = [label_dictionary[token] for token in label_tokens]

    return encoding.get('input_ids'), encoding.get('attention_mask'), label_ids

def prepare_seq2seq_data(sequences, labels, batch_size, tokenizer, label_dictionary):
    """
    Encode every (sequence, label) pair and wrap the result in a DataLoader.

    @param      sequences (list of string): sequences in kmer format.
    @param      labels (list of string): sequential labels in kmer format, one per sequence.
    @param      batch_size (int): batch size handed to the DataLoader.
    @param      tokenizer (object): initialized BERT-related tokenizer.
    @param      label_dictionary (map): dictionary to convert kmer label into integer.
    @return     dataloader (torch.utils.data.DataLoader): yields (input_ids, attention_mask, label) tensors.
    @raise      Exception: when `sequences` and `labels` differ in length.
    """
    if len(sequences) != len(labels):
        raise Exception("Sequence and labels size are not matched.")

    # Encode pair by pair, with a progress bar over the whole collection.
    pairs = tqdm(zip(sequences, labels), total=len(sequences))
    encoded_rows = [prepare_sequence(seq, label, tokenizer, label_dictionary) for seq, label in pairs]

    # Each row is (input_ids, attention_mask, label_kmer); stack every column into one tensor.
    dataset = TensorDataset(
        tensor([row[0] for row in encoded_rows]),
        tensor([row[1] for row in encoded_rows]),
        tensor([row[2] for row in encoded_rows]),
    )
    return DataLoader(dataset, batch_size=batch_size)

In [2]:
"""
Create sample data sequential labelling.
"""
from random import randint
from data_preparation import kmer
sequence = 'ATGC' * 128
# One random label character per nucleotide: 'E' for an even draw, '.' for an odd one.
label = ['.' if randint(0, 255) % 2 else 'E' for _ in sequence]

# Space-join the output of kmer(..., 3) for both the sequence and its label string.
kmer_seq = ' '.join(kmer(sequence, 3))
kmer_label = ' '.join(kmer(''.join(label), 3))

In [16]:
from transformers import BertTokenizer
from data_dir import pretrained_3kmer_dir
from sequential_labelling import Label_Dictionary

tokenizer = BertTokenizer.from_pretrained(pretrained_3kmer_dir)
# Encode the sample once directly ...
input_ids, attention_mask, label_kmer = prepare_sequence(
    sequence=kmer_seq,
    label=kmer_label,
    tokenizer=tokenizer,
    label_dictionary=Label_Dictionary,
)
# ... and once through the DataLoader builder, as a single-item dataset with batch size 1.
dataloader = prepare_seq2seq_data(
    sequences=[kmer_seq],
    labels=[kmer_label],
    batch_size=1,
    tokenizer=tokenizer,
    label_dictionary=Label_Dictionary,
)

100%|██████████| 1/1 [00:00<00:00, 142.86it/s]


In [19]:
# Dump the current value of input_ids, wrapped in an outer list (hence the nested brackets in the output).
# NOTE(review): prints every id; consider slicing if only a preview is needed.
print([input_ids])


[[2, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12, 35, 61, 38, 12,

In [40]:
"""
Initialize simple model for sequential labelling.
"""
from torch import nn
class Seq2SeqHead(nn.Module):
    """
    Feed-forward head made of Linear layers, each followed by a ReLU.

    `dims` lists the layer widths. Every consecutive pair (dims[i], dims[i+1])
    becomes one Linear layer, so N widths yield N-1 Linear+ReLU pairs. The ReLU
    is applied after every Linear layer, including the last one, so outputs are
    never negative. With fewer than two widths the stack is empty and `forward`
    returns its input unchanged.
    """
    def __init__(self, dims):
        super().__init__()
        # Plain Python list kept for direct access; the layers are registered as
        # sub-modules through self.stack below.
        self.hidden_layers = [nn.Linear(n_in, n_out) for n_in, n_out in zip(dims[:-1], dims[1:])]
        self.stack = nn.Sequential()
        for position, linear_layer in enumerate(self.hidden_layers, start=1):
            self.stack.add_module("hidden-{}".format(position), linear_layer)
            self.stack.add_module("relu-{}".format(position), nn.ReLU())

    def forward(self, input):
        """Run `input` through the Linear/ReLU stack and return the result."""
        return self.stack(input)

class DNABertSeq2Seq(nn.Module):
    """
    Chains an encoder (`bert_layer`) with a head (`seq2seq_head`).

    The forward pass takes element 0 of the encoder output, keeps index 0 along
    its second axis, and feeds that slice to the head.
    NOTE(review): for a BERT encoder that slice is presumably the hidden state
    of the first ([CLS]) token — confirm.
    """
    def __init__(self, bert_layer, seq2seq_head):
        super().__init__()
        self.bert_layer = bert_layer
        self.seq2seq_head = seq2seq_head

    def forward(self, input_ids, attention_masks):
        """
        @param  input_ids: passed as the first positional argument to `bert_layer`.
        @param  attention_masks: passed as the second positional argument to `bert_layer`.
        @return output of `seq2seq_head` applied to `encoder_output[0][:, 0, :]`.
        """
        hidden_states = self.bert_layer(input_ids, attention_masks)[0]
        first_position_state = hidden_states[:, 0, :]
        return self.seq2seq_head(first_position_state)

from transformers import BertForMaskedLM
def initialize_seq2seq(bert_pretrained_path, in_out_dims):
    """
    Build a DNABertSeq2Seq model from a pretrained checkpoint and a fresh head.

    @param  bert_pretrained_path: location passed to `BertForMaskedLM.from_pretrained`.
    @param  in_out_dims (list): layer widths handed to `Seq2SeqHead`.
    @return DNABertSeq2Seq wrapping the loaded encoder and the new head.
    """
    head = Seq2SeqHead(in_out_dims)
    # Only the `.bert` sub-module of the masked-LM model is kept.
    encoder = BertForMaskedLM.from_pretrained(bert_pretrained_path).bert
    return DNABertSeq2Seq(encoder, head)


In [41]:
from transformers import BertModel
from data_dir import pretrained_3kmer_dir
# Layer widths for the head: consecutive pairs become Linear layers (768->512, 512->512, 512->512).
# NOTE(review): 768 presumably matches the hidden size of the pretrained encoder — confirm.
in_out_dimensions = [768, 512, 512, 512]
model = initialize_seq2seq(pretrained_3kmer_dir, in_out_dimensions)

In [48]:
from torch import no_grad

# Switch the model to evaluation mode.
model.eval()
# Inference only: turn autograd off so the forward pass does not build a graph
# (previously the printed output carried a grad_fn, i.e. gradients were tracked).
with no_grad():
    for step, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
        # Each batch holds (input_ids, attention_mask, label), as assembled by prepare_seq2seq_data.
        input_ids, attention_mask, label = tuple(batch)
        output = model(input_ids, attention_mask)
        # Peek at positions 1-4 of the first sample: model output vs. label ids.
        print(output[0][1:5])
        print(label[0][1:5])

100%|██████████| 1/1 [00:01<00:00,  1.02s/it]

tensor([0.0000, 0.0494, 0.0375, 0.0648], grad_fn=<SliceBackward0>)
tensor([7, 5, 6, 4])



