### Install required Packages

In [20]:
!pip install transformers
!pip install seqeval

### Import packages

In [21]:
from transformers import BertTokenizer, BertForTokenClassification, get_scheduler, AdamW
import pandas as pd
import numpy as np
import torch
import transformers
from tqdm.auto import tqdm
from seqeval.metrics import f1_score
from sklearn.utils import shuffle
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset
import datetime as dt
import pickle

## Data Pre-processing

In [22]:
model_name = 'bert-base-multilingual-cased'  # 'bert-large-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [54]:
raw_data = pd.read_excel('dataset_version_one .xlsx')
raw_data.head()

Unnamed: 0,indexes,keywords,label,code
0,"13-19, 4-11","String, _field1","variable_type, variable_name",var _field1: String? = nil
1,"13-18, 4-11","Int32, _field2","variable_type, variable_name",var _field2: Int32? = nil
2,"8-26, 8-26, 8-34, 4-6","SDTTopLevelMessage, SDTTopLevelMessage, SDTTop...","object_name, object_name, variable_type, varia...",var _o: SDTTopLevelMessage.OneOf_O?
3,"29-42, 11-26","_StorageClass, defaultInstance","object_name, variable_name",static let defaultInstance = _StorageClass()
4,8-12,init,object_name,private init() {}


In [27]:
def sentences_format(indexes, label, code, tokenizer):

    tokens = tokenizer(code, is_split_into_words= True, add_special_tokens=False)
    sentence = tokenizer.convert_ids_to_tokens(tokens.input_ids)
    tag_labels = [None] * len(sentence)
    label = label
    for index, lbl in zip(indexes.split(','), label.split(',')):
        idx = index.split('-')
        s_tokens = code[int(idx[0]):int(idx[1])]
        t_tokens = tokenizer(s_tokens, is_split_into_words= True, add_special_tokens=False)
        tag_tokens = tokenizer.convert_ids_to_tokens(t_tokens.input_ids)
       #print(f'{lbl.strip()} {code[int(idx[0]):int(idx[1])]}')    
        if len(tag_tokens)>1:
            local_index = []
            for itr, tag in enumerate(tag_tokens):
                for sitr, s_tag in enumerate(sentence):
                    if tag == s_tag:
                        if itr+1 < len(tag_tokens) and sitr+1 < len(sentence):
                            if tag_tokens[itr+1] == sentence[sitr+1]:
                                local_index.append(sitr)
                                local_index.append(sitr+1)

            for i in range(len(local_index)):
                if i == 0:
                    tag_labels[local_index[i]]= f'B-{lbl.strip()}'
                else:
                    tag_labels[local_index[i]] = f'I-{lbl.strip()}'

        else:
            t_idx = sentence.index(tag_tokens[0])
            tag_labels[t_idx] = f'B-{lbl.strip()}'
        pass
    return sentence, tag_labels

In [28]:
all_tokens = []
all_tags = []
sent_id = []
sent_count = 0
for index, row in raw_data.iterrows():
    sentence_tokens, tag_labels = sentences_format(row['indexes'], row['label'], row['code'], tokenizer)
    sent_count += 1
    for token in sentence_tokens:
      all_tokens.append(token)
    for lbl in tag_labels:
      if lbl == None:
        all_tags.append('O')
      else:
        all_tags.append(lbl)
      sent_id.append(sent_count)

    if index % 1000 == 0:
      print(f'Processed Rows {index}')


Processed Rows 0


In [29]:
data_tuples = list(zip(sent_id,all_tokens, all_tags))

In [30]:
token_data = pd.DataFrame(data_tuples, columns =['Sentence Id', 'Token', 'Tag'] )
token_data.to_csv('processed_data_.csv')

### Loading Processed Data

In [31]:
dataset = pd.read_csv('processed_data_.csv')
dataset.head()
dataset = dataset
agg_func = lambda s: [(w, t) for w, t in zip(s["Token"].values.tolist(),
                                             s["Tag"].values.tolist())]
processed_sentences = dataset.groupby("Sentence Id").apply(agg_func).tolist()


### Data processing (formating Data)

In [32]:
tags_vals = dataset['Tag'].unique().tolist()
tag2idx = {t: i for i, t in enumerate(tags_vals)}
sentences = [' '.join([str(s[0]) for s in sent]) for sent in processed_sentences]
labels = [[s[1] for s in sent] for sent in processed_sentences]
labels = [[tag2idx.get(l) for l in lab] for lab in labels]

# TO DO: Train, Test Split...
# Creating the dataset and dataloader for the neural network

train_percent = 0.8
train_size = int(train_percent*len(sentences))
# train_dataset=df.sample(frac=train_size,random_state=200).reset_index(drop=True)
# test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_sentences = sentences[0:train_size]
train_labels = labels[0:train_size]

test_sentences = sentences[train_size:]
test_labels = labels[train_size:]

print("FULL Dataset: {}".format(len(sentences)))
print("TRAIN Dataset: {}".format(len(train_sentences)))
print("TEST Dataset: {}".format(len(test_sentences)))


FULL Dataset: 10
TRAIN Dataset: 8
TEST Dataset: 2


In [33]:
def calc_max_len_in_sentences(sentences):
    #Confirm maximum word count
    max_len = []
    #Sentence by sentence processing
    for sent in sentences:
        # Tokenization
        token_words = tokenizer.tokenize(sent)
        # Get the number of sentences and store them in the list.
        max_len.append(len(token_words))
    max_length=max(max_len, default = 0)+2
    if max_length>512:
        return 512 #BERT allows 512 words
    else:
        return max_length # Maximum number of words plus +2 of Special token ([CLS], [SEP])

def convert_sentences_for_bert(sentences, max_length):
    input_ids = []
    attention_masks = []
    #1 Sentence by sentence processing
    for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
                            sent,                      
                            add_special_tokens = True, # Add Special Token
                            max_length = max_length,   # Fixed sentence length (Padding/Transcatinating)
                            padding='max_length',      #Fill with PADDING
                            return_attention_mask = True,   # Create Attention Masks
                            return_tensors = 'pt',     # return with Pytorch tensors
                            truncation=True, 
                            return_token_type_ids=True
                    )

       # Get word ID
        input_ids.append(encoded_dict['input_ids'])

        # Attention mask acquisition
        attention_masks.append(encoded_dict['attention_mask'])

   # Combine the listed tesor longitudinally (dim=0)
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    return input_ids, attention_masks

In [34]:
def process_labels(labels, tag2idx, max_length):
  processed_labels = []
  for indx in range(len(labels)):
    label = labels[indx]
    label.extend([tag2idx['O']] * max_length)
    processed_labels.append(label[:max_length])
  return processed_labels

In [35]:
#train_input_ids.size(), train_attention_masks.size(), tensor_labels.size()

In [36]:
# get max length from the training sentences
max_length = max(calc_max_len_in_sentences(train_sentences), calc_max_len_in_sentences(test_sentences))

# Conversion to BERT format
train_input_ids,  train_attention_masks = convert_sentences_for_bert(train_sentences, max_length)
test_input_ids, test_attention_masks = convert_sentences_for_bert(test_sentences, max_length)

# train dataset
train_processed_labels = process_labels(train_labels, tag2idx, max_length)
train_tensor_labels = torch.Tensor(train_processed_labels)
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_tensor_labels)

# test dataset
test_processed_labels = process_labels(test_labels, tag2idx, max_length)
test_tensor_labels = torch.Tensor(test_processed_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_tensor_labels)

In [None]:
num_labels = len(tags_vals)
model = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

# device selection GPU/CPU, prefer GPU 
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [38]:
# hyper parameters
batch_size = 1
num_epochs = 100
optimizer = AdamW(model.parameters(), lr=5e-5)

def worker_init_fn(worker_id):
    random.seed(worker_id)

# train data loder with specific batches along with shuffling
train_dataloader = DataLoader(
            train_dataset,  
            sampler =  SequentialSampler(train_dataset),
            #shuffle = True,
            batch_size = batch_size,
            worker_init_fn=worker_init_fn
        )
num_training_steps = num_epochs * len(train_dataloader)

test_dataloader = DataLoader(
            test_dataset,  
            sampler =  SequentialSampler(test_dataset),
            #shuffle = True,
            batch_size = batch_size,
            worker_init_fn=worker_init_fn
        )
num_testing_steps =  len(test_dataloader)

# learning rate scheduler
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [18]:
# save model
def save_model(model):
  datetime=str(dt.datetime.now()).split(".")[0].replace(":","").replace(" ","").replace("-","")
  # location with file name of model 
  model_file = 'model_%s.pickle'%datetime
  with open(model_file, mode='wb') as bertf:
    pickle.dump(model, bertf)
    print("Model saved as " + model_file)

# load model
def load_model(model_file):
  with open(model_file, mode='rb') as bertf:
    model= pickle.load(bertf)
    print("Model loaded from " + model_file)
  return model

In [None]:
# progress bar (tqdm)
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    
    for batch in train_dataloader:
        b_input_ids = batch[0].to(device, dtype=torch.long)
        b_input_mask = batch[1].to(device, dtype=torch.long)
        b_labels = batch[2].to(device, dtype=torch.long)
        outputs = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        progress_bar.set_postfix_str(f'Loss: {loss.item():.5f}')
    if epoch % 10 == 0:
      save_model(model)

HBox(children=(FloatProgress(value=0.0, max=800.0), HTML(value='')))

Model saved as model_20210712160750.pickle
Model saved as model_20210712161136.pickle
Model saved as model_20210712161519.pickle
Model saved as model_20210712161903.pickle
Model saved as model_20210712162250.pickle
Model saved as model_20210712162634.pickle
Model saved as model_20210712163016.pickle
Model saved as model_20210712163407.pickle
Model saved as model_20210712163759.pickle


In [40]:
def flat_accuracy(preds, labels):
    flat_preds = np.argmax(preds, axis=2).flatten()
    flat_labels = labels.flatten()
    return np.sum(flat_preds == flat_labels)/len(flat_labels)

In [41]:
def valid(model, testing_loader):
    model.eval()
    eval_loss = 0; eval_accuracy = 0
    n_correct = 0; n_wrong = 0; total = 0
    predictions , true_labels = [], []
    nb_eval_steps, nb_eval_examples = 0, 0
    progress_bar = tqdm(range(num_testing_steps))
    with torch.no_grad():
        for batch in test_dataloader:
          b_input_ids = batch[0].to(device, dtype=torch.long)
          b_input_mask = batch[1].to(device, dtype=torch.long)
          b_labels = batch[2].to(device, dtype=torch.long)
          outputs = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)
          loss = outputs.loss
          logits = outputs.logits
          logits = logits.detach().cpu().numpy()
          label_ids = b_labels.to('cpu').numpy()
          predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
          true_labels.append(label_ids)
          accuracy = flat_accuracy(logits, label_ids)
          eval_loss += loss.mean().item()
          eval_accuracy += accuracy
          nb_eval_examples += b_input_ids.size(0)
          nb_eval_steps += 1
          progress_bar.update(1)
        eval_loss = eval_loss/nb_eval_steps
        print("Validation loss: {}".format(eval_loss))
        print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
        pred_tags = [[tags_vals[p_i]] for p in predictions for p_i in p]
        #print(pred_tags)
        valid_tags = [[tags_vals[l_ii]] for l in true_labels for l_i in l for l_ii in l_i]
        #print(valid_tags)
        print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))

In [42]:
# To get the results on the validation set. This data is not seen by the model
valid(model, test_dataloader)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

Validation loss: 0.5999726429581642
Validation Accuracy: 0.9673913043478262
F1-Score: 0.6666666666666665


### Predictions

In [43]:
def predictions(model, sentence, tokenizer):
  pt_sentence = tokenizer(sentence, 
                          add_special_tokens = True, # Add Special Token
                          return_attention_mask = True,   # Create Attention Masks
                          return_tensors = 'pt',     # return with Pytorch tensors
                          return_token_type_ids=True)
  output = model(**pt_sentence.to(device))
  loss = output.loss
  logits = output.logits
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  predictions = [list(p) for p in np.argmax(logits, axis=2)]
  pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
  sentence = tokenizer.convert_ids_to_tokens(pt_sentence.input_ids[0])
  return sentence, pred_tags

In [52]:
sentence = 'private init() {}'
predictions(model, sentence, tokenizer)

(['[CLS]', 'private', 'init', '(', ')', '{', '}', '[SEP]'],
 ['O', 'B-object_name', 'O', 'O', 'O', 'O', 'O', 'O'])