In [2]:
"""
Check if CUDA is supported.
"""
import torch


def select_device(index=0):
    """
    Pick the torch device used by this notebook and report it.
    @param  index (int): ordinal of the CUDA device to use when CUDA is available.
    @return (torch.device): 'cuda:<index>' when CUDA is available, otherwise 'cpu'.
    """
    if torch.cuda.is_available():
        # get_device_name() is only queried when CUDA is present; it raises
        # on machines without a usable GPU.
        print("Using CUDA device {}: {}".format(index, torch.cuda.get_device_name(index)))
        return torch.device('cuda:{}'.format(index))
    print("CUDA is not available; falling back to CPU.")
    return torch.device('cpu')


_device = select_device(0)


In [3]:
import pandas as pd
from data_preparation import kmer

def get_sequences(csv_path, n_sample=10, random_state=1337):
    r"""
    Load sequences and their three task labels from a CSV file.

    The CSV must have the header columns 'sequence', 'label_prom', 'label_ss'
    and 'label_polya'.

    @param  csv_path (string): path of the CSV file to read.
    @param  n_sample (int): number of rows to draw at random, without replacement.
            Zero, a negative value or None keeps every row in file order.
            A value larger than the number of rows is clamped to the number of rows.
    @param  random_state (int): seed passed to the sampler so the draw is repeatable.
    @return sequence (list): values of the 'sequence' column.
    @return label_prom (list): values of the 'label_prom' column.
    @return label_ss (list): values of the 'label_ss' column.
    @return label_polya (list): values of the 'label_polya' column.
    """
    df = pd.read_csv(csv_path)
    if n_sample is not None and n_sample > 0:
        # DataFrame.sample raises ValueError when asked for more rows than the
        # frame holds, so cap the request at the available row count.
        df = df.sample(n=min(n_sample, len(df)), random_state=random_state)

    columns = ('sequence', 'label_prom', 'label_ss', 'label_polya')
    sequence, label_prom, label_ss, label_polya = (list(df[name]) for name in columns)

    return sequence, label_prom, label_ss, label_polya

import torch
def preprocessing(data, tokenizer):
    """
    Tokenize kmer sentences for a pretrained BERT model.
    @param  data (sized iterable of string): each item is a string of kmers separated by spaces.
    @param  tokenizer (Tokenizer): tokenizer initialized from pretrained values; its `encode_plus` is called once per item.
    @return input_ids (torch.Tensor): tensor of token ids to be fed to the model, one row per item.
    @return attention_masks (torch.Tensor): tensor of attention-mask values specifying which tokens the model should attend to, one row per item.
    """
    total = len(data)
    ids_per_sentence = []
    mask_per_sentence = []

    for position, sentence in enumerate(data, start=1):
        # Progress line: overwritten in place with '\r' until the last item,
        # which ends with a normal newline so the final count stays visible.
        line_end = '\r' if position < total else '\n'
        n_kmers = len(sentence.split(' '))
        print("Seq length = {} [{}/{}]".format(n_kmers, position, total), end=line_end)

        encoding = tokenizer.encode_plus(
            text=sentence,
            padding='max_length',
            return_attention_mask=True
        )
        ids_per_sentence.append(encoding.get('input_ids'))
        mask_per_sentence.append(encoding.get('attention_mask'))

    # Stack the per-sentence lists into tensors.
    input_ids = torch.tensor(ids_per_sentence)
    attention_masks = torch.tensor(mask_per_sentence)
    return input_ids, attention_masks

"""
Initialize tokenizer using BertTokenizer with pretrained weights from DNABert.
"""
from transformers import BertTokenizer

# Directory holding the pretrained tokenizer files, relative to the
# notebook's working directory.
_tokenizer_dir = './pretrained/3-new-12w-0'
tokenizer = BertTokenizer.from_pretrained(_tokenizer_dir)

In [4]:
"""
Split dataset into two parts: train and validation.
"""
from data_dir import workspace_dir, dataset_full_dir
from data_preparation import split_and_store_csv

# Source CSV and the fractions handed to split_and_store_csv; the i-th
# fraction is paired with the i-th store path below.
_src_csv = "{}/train.csv".format(dataset_full_dir)
_fractions = [0.9, 0.1]
_store_paths = [
    "{}/{}".format(workspace_dir, _file_name)
    for _file_name in ('train.csv', 'validation.csv')
]

# Run the split first, then report its return value next to the source path.
_split_result = split_and_store_csv(_src_csv, fractions=_fractions, store_paths=_store_paths)
print("Splitting source {}: {}".format(_src_csv, _split_result))

Splitting and storing split to ./workspace/train.csv
Splitting and storing split to ./workspace/validation.csv
Splitting source ./dataset/full/train.csv: True


In [4]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from data_dir import workspace_dir

# Load the sampled sequences and their three label columns for each split.
train_seq, train_label_prom, train_label_ss, train_label_polya = get_sequences('{}/train.csv'.format(workspace_dir), n_sample=500000)
validation_seq, val_label_prom, val_label_ss, val_label_polya = get_sequences('{}/validation.csv'.format(workspace_dir), n_sample=100)

# Create dataloader.
BATCH_SIZE = 2
EPOCH_SIZE = 4

# Labels are created directly on `_device`. The token ids and masks returned
# by preprocessing() are built without a device argument, so they stay on
# torch's default device.
_device = torch.device('cuda:0')
train_label_prom = torch.tensor(train_label_prom, device=_device)
train_label_ss = torch.tensor(train_label_ss, device=_device)
train_label_polya = torch.tensor(train_label_polya, device=_device)

# Each dataset item is (input_ids, attention_mask, label_prom, label_ss, label_polya).
train_inputs_ids, train_masks = preprocessing(train_seq, tokenizer)
train_data = TensorDataset(train_inputs_ids, train_masks, train_label_prom, train_label_ss, train_label_polya)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

val_label_prom = torch.tensor(val_label_prom, device=_device)
val_label_ss = torch.tensor(val_label_ss, device=_device)
val_label_polya = torch.tensor(val_label_polya, device=_device)

val_input_ids, val_masks = preprocessing(validation_seq, tokenizer)
val_data = TensorDataset(val_input_ids, val_masks, val_label_prom, val_label_ss, val_label_polya)
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)

print('# of training data: {}'.format(len(train_seq)))
# The second count is the validation split; the original message mislabelled
# it as training data.
print('# of validation data: {}'.format(len(validation_seq)))

Seq length = 510 [500000/500000]
Seq length = 510 [100/100]
# of training data: 500000
# of training data: 100


In [1]:
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss
import numpy as np

from multitask_learning import PolyAHead, PromoterHead, SpliceSiteHead, MTModel
from transformers import BertForMaskedLM
from data_dir import pretrained_3kmer_dir

# Device string handed to every head and to the assembled model.
_device = "cuda"

# One head per task, each constructed with the device string.
polya_head = PolyAHead(_device)
promoter_head = PromoterHead(_device)
splice_head = SpliceSiteHead(_device)

# The shared part of the model is the `.bert` attribute of the pretrained
# masked-LM model loaded from `pretrained_3kmer_dir`.
dnabert_3_pretrained = pretrained_3kmer_dir
shared_parameter = BertForMaskedLM.from_pretrained(dnabert_3_pretrained).bert

# Assemble the multitask model from the shared part and the three heads, then
# move it to the device.
model = MTModel(
    shared_parameters=shared_parameter,
    promoter_head=promoter_head,
    polya_head=polya_head,
    splice_site_head=splice_head,
).to(_device)

# A single optimizer over all model parameters, plus the loss function.
optimizer = AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-8,
)
loss_fn = CrossEntropyLoss()



In [2]:
import torch
import datetime
import os
from data_preparation import kmer

# Timestamp taken when this cell runs.
_now = datetime.datetime.now()

# Path of the CSV log file. Only its parent directory is created here:
# passing the file path itself to os.makedirs would create a directory named
# like the CSV file, and the log could then never be opened as a file.
_log_file = os.path.join('logs', 'notebooks', '2022-02.24.csv')
os.makedirs(os.path.dirname(_log_file), exist_ok=True)

from transformers import BertTokenizer
from data_dir import pretrained_3kmer_dir
import torch
from torch.utils.data import DataLoader, TensorDataset


def _as_cuda_tensor(values):
    """Build a tensor from `values` and move it to the 'cuda' device."""
    return torch.tensor(values).to('cuda')


# Three toy sequences (a 4-letter motif repeated 128 times), each turned into
# a space-separated sentence of whatever kmer(s, 3) yields.
seqs = ["ATGC" * 128, "GATC" * 128, "CCAT" * 128]
seqs = [' '.join(kmer(s, 3)) for s in seqs]

# One label per sequence for each of the three tasks.
prom_labels, ss_labels, polya_labels = [1, 0, 0], [0, 1, 0], [0, 0, 1]

# Initialize BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained(pretrained_3kmer_dir)

# Encode each sentence, collecting token ids and attention masks side by side.
arr_input_ids, arr_attention_mask = [], []
for s in seqs:
    encoded = tokenizer.encode_plus(text=s, padding="max_length", return_attention_mask=True)
    arr_input_ids.append(encoded.get('input_ids'))
    arr_attention_mask.append(encoded.get('attention_mask'))

# Convert everything to CUDA tensors, inputs first and then the labels.
arr_input_ids = _as_cuda_tensor(arr_input_ids)
arr_attention_mask = _as_cuda_tensor(arr_attention_mask)
prom_labels = _as_cuda_tensor(prom_labels)
ss_labels = _as_cuda_tensor(ss_labels)
polya_labels = _as_cuda_tensor(polya_labels)

# Each dataset item is (input_ids, attention_mask, prom_label, ss_label, polya_label).
dataset = TensorDataset(arr_input_ids, arr_attention_mask, prom_labels, ss_labels, polya_labels)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.