In [64]:
import numpy as np
import pandas as pd
import torch
import transformers
import os
import csv
import nltk

In [65]:
#first thing to do is to get the data ready

#first let us get our tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2', do_lower_case=False)

## Preprocessing data

1. We read the tsv file 
2. Break corpus into sentences and tags 

In [66]:
class SentenceFetch(object):
  
  def __init__(self, data):
    self.data = data
    self.sentences = []
    self.tags = []
    self.sent = []
    self.tag = []
    
    # make tsv file readable
    with open(self.data) as tsv_f:
      reader = csv.reader(tsv_f, delimiter='\t')
      for row in reader:
        if len(row) == 0:
          if len(self.sent) != len(self.tag):
            break
          self.sentences.append(self.sent)
          self.tags.append(self.tag)
          self.sent = []
          self.tag = []
        else:
          self.sent.append(row[0])
          self.tag.append(row[1])   

  def getSentences(self):
    return self.sentences
  
  def getTags(self):
    return self.tags
     

In [89]:
def extractFromDirectories(parent_dir, corpus_path, file_type = 'train.tsv'):
    sentences, tags = [], []
    for path, dirs, files in os.walk(parent_dir):
        for file in files:
            if path == corpus_path:
                if file == file_type:
                    current_path = os.path.join(path,file)
                    sentence = SentenceFetch(current_path).getSentences()
                    tag = SentenceFetch(current_path).getTags()
                    sentences.extend(sentence)
                    tags.extend(tag)
    
    print('Number of samples: ',len(sentences))

    return sentences, tags

PARENT_DIR = 'BioNLP'
CORPUS_PATH = 'BioNLP\BioNLP09-IOB'

train_sentences, train_tags = extractFromDirectories(parent_dir=PARENT_DIR,
                                                     corpus_path=CORPUS_PATH,
                                                     file_type='train.tsv')

val_sentences, val_tags = extractFromDirectories(parent_dir=PARENT_DIR,
                                                     corpus_path=CORPUS_PATH,
                                                     file_type='devel.tsv')

test_sentences, test_tags = extractFromDirectories(parent_dir=PARENT_DIR,
                                                     corpus_path=CORPUS_PATH,
                                                     file_type='test.tsv')

Number of samples:  7462
Number of samples:  1448
Number of samples:  2446


In [113]:
train_sentences[0]



['Reactive',
 'oxygen',
 'intermediate',
 '-',
 'dependent',
 'NF',
 '-',
 'kappaB',
 'activation',
 'by',
 'interleukin',
 '-',
 '1beta',
 'requires',
 '5',
 '-',
 'lipoxygenase',
 'or',
 'NADPH',
 'oxidase',
 'activity',
 '.']

In [132]:
encoded_sent = tokenizer.encode_plus(train_sentences[0], is_split_into_words=True, truncation=True, padding = 'max_length', max_length = 50)

In [130]:
encoded_sent['input_ids'].tokenizer.

{'input_ids': [101, 11336, 19667, 7621, 9533, 118, 7449, 151, 2271, 118, 24181, 13059, 2064, 14915, 1118, 9455, 1513, 17041, 1179, 118, 122, 16632, 1161, 5315, 126, 118, 4764, 10649, 1183, 4915, 6530, 1137, 151, 14569, 2101, 3048, 184, 8745, 9028, 1162, 3246, 119, 102, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]}

In [133]:
tokenizer.convert_ids_to_tokens(encoded_sent['input_ids'])

['Re',
 '##active',
 'oxygen',
 'intermediate',
 '-',
 'dependent',
 'N',
 '##F',
 '-',
 'ka',
 '##ppa',
 '##B',
 'activation',
 'by',
 'inter',
 '##le',
 '##uki',
 '##n',
 '-',
 '1',
 '##bet',
 '##a',
 'requires',
 '5',
 '-',
 'lip',
 '##ox',
 '##y',
 '##gen',
 '##ase',
 'or',
 'N',
 '##AD',
 '##P',
 '##H',
 'o',
 '##xi',
 '##das',
 '##e',
 'activity',
 '.',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [119]:
encoded_sentence = tokenizer(train_sentences[0], padding = 'max_length', max_length = 50, truncation=True)

encoded_sentence['input_ids']

[[101,
  11336,
  19667,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [101,
  7621,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [101,
  9533,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [101,
  118,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


## Tokenizing data and keeping labels intact

1. Now we zip sentences and tags 
2. We tokenize each word; note that some words are broken into sub-words.
3. To deal with that, we just extend the label to all subwords generated

In [100]:
from keras_preprocessing.sequence import pad_sequences
import itertools

MAX_LENGTH = 40

def tokenizeAndLabelSample(sentence, text_labels):
  """Generate tokens for words in a text sequence while keeping labels intact. This function does tokenization on the sentence level"""

  tokens = []
  labels = []

  #zip sentence and label
  for word, label in zip(sentence, text_labels):
    token = tokenizer.tokenize(word)
    tokens.extend(token)

    #make sure label is copied for each sub-word IF tokenizer generates sub-words for given word
    #len(token) is > 1 if sub-words generated
    labels.extend([label]*len(token)) 
  
  return tokens, labels

def tokenizeDataset(sentences, labels):
  """This funciton uses tokenizeAndLabelSample and runs it on the entire dataset"""
  tokenized_text = []
  text_labels = []

  for i in range(len(sentences)):
    sent_tokens, sent_labels = tokenizeAndLabelSample(sentences[i],labels[i])
    tokenized_text.append(sent_tokens)
    text_labels.append(sent_labels)
  
  return tokenized_text, text_labels

def generateInputIds(tokenized_text):
  """Generate input_ids"""
  input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(text_sequence) for text_sequence in tokenized_text],
                            maxlen = MAX_LENGTH, dtype='long',value=0.0, truncating='post',padding='post')
  
  return input_ids

def generateAttentionMask(input_ids):
  """Generates attention mask for input ids"""
  attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

  return attention_masks

def alignLabels(tags, text_labels):
  """Ensures that labels match """
  #get all unique tag values
  tag_values = list(set(itertools.chain.from_iterable(tags)))
  #append PAD token to tag_values
  tag_values.append("PAD")
  #create a dictionary mapping tag values to ids
  tag_id_dict = {t: i for i,t in enumerate(tag_values)}
  #pad 
  tag_ids = pad_sequences([[tag_id_dict.get(l) for l in lab] for lab in text_labels],
                     maxlen=MAX_LENGTH, value=tag_id_dict["PAD"], padding="post",
                     dtype="long", truncating="post")
  
  return tag_ids, tag_id_dict

def generateInputs(sentences, tags):
  """A wrapped function that does the whole processing of tokenization and aligning of labels"""

  #tokenize dataset
  tokenized_text, text_labels = tokenizeDataset(sentences, tags)

  #generate input ids
  input_ids = generateInputIds(tokenized_text)

  #generate attention mask
  attention_masks = generateAttentionMask(input_ids)

  #align labels
  tag_ids, tag_id_dict = alignLabels(tags, text_labels)

  return input_ids, attention_masks, tag_ids, tag_id_dict


In [101]:
train_input_ids, train_attention_masks, train_tag_ids, train_tag_id_dict = generateInputs(train_sentences,train_tags)

In [116]:
train_input_ids[0]

array([11336, 19667,  7621,  9533,   118,  7449,   151,  2271,   118,
       24181, 13059,  2064, 14915,  1118,  9455,  1513, 17041,  1179,
         118,   122, 16632,  1161,  5315,   126,   118,  4764, 10649,
        1183,  4915,  6530,  1137,   151, 14569,  2101,  3048,   184,
        8745,  9028,  1162,  3246])

In [70]:
tokenized_text, text_labels = tokenizeDataset(sentences,tags)

In [71]:
#now lets get our input ids 
from keras_preprocessing.sequence import pad_sequences
import itertools

input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(text_sequence) for text_sequence in tokenized_text],
                            maxlen = 100, dtype='long',value=0.0, truncating='post',padding='post')

tag_values = list(set(itertools.chain.from_iterable(tags)))

tag_values.append("PAD")

tag2idx = {t: i for i,t in enumerate(tag_values)}

tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in text_labels],
                     maxlen=100, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [72]:
#generate attendtion masks
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [9]:
type(tags[0][0])

numpy.int32

## Creating Dataloaders

In [73]:
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

tr_inputs = torch.tensor(input_ids)
tr_tags = torch.tensor(tags)
tr_masks = torch.tensor(attention_masks)

train_dataset = TensorDataset(tr_inputs,tr_masks,tr_tags)

BATCH_SIZE = 16

train_loader = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset),
    batch_size = BATCH_SIZE
)


In [54]:
tag_values

['O', 'I-Protein', 'E-Protein', 'S-Protein', 'B-Protein', 'PAD']

In [74]:
from transformers import BertForTokenClassification, AdamW

model = BertForTokenClassification.from_pretrained('dmis-lab/biobert-base-cased-v1.2', num_labels = len(tag_values))

optimizer = AdamW(model.parameters(), lr=5e-6)

#check for cuda
device = torch.device("cuda")

model = model.cuda()

# for num, batch in enumerate(train_loader):
#     label = batch[2].type(torch.LongTensor).to(device)

#     output = model(input_ids = batch[0].to(device), attention_mask = batch[1].to(device), labels = label)
#     loss = output[0]
#     loss.backward()
#     optimizer.step()
#     print(loss)

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

In [75]:
#specify number of epochs
epochs = 6

total_steps = len(train_loader)*epochs

training_stats = []


for epoch_i in range(0,epochs):

    #keeping track of ep
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    #flush out total loss after each epoch
    total_train_loss = 0

    #put model in training mode
    model.train()

    for step, batch in enumerate(train_loader):
        batch_input_ids = batch[0].to(device)
        batch_attention_mask = batch[1].to(device)
        batch_label = batch[2].type(torch.LongTensor).to(device)

        model.zero_grad()

        output = model(input_ids = batch_input_ids,
                       attention_mask = batch_attention_mask,
                       labels = batch_label)
        
        loss = output[0]

        total_train_loss += loss.item()

        #backward pass
        loss.backward()

        #clip the norm of gradients to 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
    
    average_train_loss = total_train_loss / len(train_loader)
    print('Training loss: ',average_train_loss)


Training...
Training loss:  0.11376170058101416

Training...
Training loss:  0.04043763983175158

Training...
Training loss:  0.03095139863193035

Training...
Training loss:  0.02555634008385241

Training...
Training loss:  0.021814862606860698

Training...
Training loss:  0.01945531152449548


In [76]:
#save the model
output_dir = './model_save/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to ./model_save/


('./model_save/tokenizer_config.json',
 './model_save/special_tokens_map.json',
 './model_save/vocab.txt',
 './model_save/added_tokens.json')

In [None]:
#loading model and tokenizer
from transformers import BertForTokenClassification, BertTokenizer

input_dir = './model_save/'

device = "cuda"

model = BertForTokenClassification.from_pretrained(input_dir)
tokenizer = BertTokenizer.from_pretrained(input_dir)

# Copy the model to the GPU.
model.to(device)

In [2]:
#Using the model for predictions
text = """In addition to their essential catalytic role in protein biosynthesis, aminoacyl-tRNA synthetases participate in numerous other functions, including regulation of gene expression and amino acid biosynthesis via transamidation pathways. Herein, we describe a class of aminoacyl-tRNA synthetase-like (HisZ) proteins based on the catalytic core of the contemporary class II histidyl-tRNA synthetase whose members lack aminoacylation activity but are instead essential components of the first enzyme in histidine biosynthesis ATP phosphoribosyltransferase (HisG). Prediction of the function of HisZ in Lactococcus lactis was assisted by comparative genomics, a technique that revealed a link between the presence or the absence of HisZ and a systematic variation in the length of the HisG polypeptide. HisZ is required for histidine prototrophy, and three other lines of evidence support the direct involvement of HisZ in the transferase function. (i) Genetic experiments demonstrate that complementation of an in-frame deletion of HisG from Escherichia coli (which does not possess HisZ) requires both HisG and HisZ from L. lactis. (ii) Coelution of HisG and HisZ during affinity chromatography provides evidence of direct physical interaction. (iii) Both HisG and HisZ are required for catalysis of the ATP phosphoribosyltransferase reaction. This observation of a common protein domain linking amino acid biosynthesis and protein synthesis implies an early connection between the biosynthesis of amino acids and proteins."""


nltk.download('punkt')

sent_text = nltk.sent_tokenize(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adil.ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [29]:
sent_text[0]

'In addition to their essential catalytic role in protein biosynthesis, aminoacyl-tRNA synthetases participate in numerous other functions, including regulation of gene expression and amino acid biosynthesis via transamidation pathways.'

In [7]:
tokenized_text = []
for sentence in sent_text:
    tokenized_text.append(nltk.word_tokenize(sentence))

In [28]:
tokenized_text

[['In',
  'addition',
  'to',
  'their',
  'essential',
  'catalytic',
  'role',
  'in',
  'protein',
  'biosynthesis',
  ',',
  'aminoacyl-tRNA',
  'synthetases',
  'participate',
  'in',
  'numerous',
  'other',
  'functions',
  ',',
  'including',
  'regulation',
  'of',
  'gene',
  'expression',
  'and',
  'amino',
  'acid',
  'biosynthesis',
  'via',
  'transamidation',
  'pathways',
  '.'],
 ['Herein',
  ',',
  'we',
  'describe',
  'a',
  'class',
  'of',
  'aminoacyl-tRNA',
  'synthetase-like',
  '(',
  'HisZ',
  ')',
  'proteins',
  'based',
  'on',
  'the',
  'catalytic',
  'core',
  'of',
  'the',
  'contemporary',
  'class',
  'II',
  'histidyl-tRNA',
  'synthetase',
  'whose',
  'members',
  'lack',
  'aminoacylation',
  'activity',
  'but',
  'are',
  'instead',
  'essential',
  'components',
  'of',
  'the',
  'first',
  'enzyme',
  'in',
  'histidine',
  'biosynthesis',
  'ATP',
  'phosphoribosyltransferase',
  '(',
  'HisG',
  ')',
  '.'],
 ['Prediction',
  'of',
  'th

In [10]:
example = tokenized_text[0]

len(example)

32

In [12]:
tokenized_sentence = []

for word in example:
    tokenized_word = tokenizer.tokenize(word)
    tokenized_sentence.extend(tokenized_word)
    

In [21]:
input_ids = tokenizer.convert_tokens_to_ids(tokenized_sentence)
input_attentions = [[1]*len(input_ids)]

In [None]:
actual_sentences = []
pred_labels = []

for x,y in zip(input_ids,input_attentions):
    x = torch.tensor(x).cuda()
    y = torch.tensor(y).cuda()
    #x = x.view(-1,x.size()[-1])
    #y = y.view(-1,y.size()[-1])
    with torch.no_grad():
        _,y_hat = model(x,y,None)
    label_indices = y_hat.to('cpu').numpy()

In [52]:
test_example = ['The',
 'Cdc6',
 'protein',
 'is',
 'ubiquitinated',
 'in',
 'vivo',
 'for',
 'proteolysis',
 'in',
 'Saccharomyces',
 'cerevisiae',
 '.']

In [81]:
#test_example = sent_text[0]

text = tokenizer(sentences[1], padding = 'max_length', max_length = 100, truncation=True, return_tensors ='pt')

input_id = text['input_ids'].to(device)
mask = text['attention_mask'].to(device)

with torch.no_grad():
    output = model(input_id,mask,None)

In [82]:
logits = output['logits'][0]
predictions = logits.argmax(dim=1).tolist()

In [83]:
predictions

[5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5]