In [None]:
!pip install wget
!pip install transformers

In [270]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import pickle
import json
import wget
import os

In [271]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print( torch.cuda.device_count())
    print('Available:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


4
Available: NVIDIA A100-SXM4-80GB


In [272]:
def word_shape_features(word):
    return np.array([word.istitle(), word.islower(), word.isupper(), len(word),
                     word.isdigit(),  word.isalpha(),word.isalnum(), word.isnumeric()])

def get_word_features(word):
    return word_shape_features(word)

def get_sent_features(sent):
    ret = []
    for word in sent:
        ret.append(get_word_features(word))
    return ret

In [274]:
TAGS =  pickle.load(open( "tags.pickle", "rb" ))
TAGS.remove("O") 
NUM_TAGS = len(TAGS)

tag2id = {}
for id,label in enumerate(TAGS):
    tag2id[label] = id 

def label2id(labels):
    ret = []
    prev_label = ""
    for label in labels:
        if label == "O":
            ret.append([2*NUM_TAGS])
        elif label == prev_label:
            l =[tag2id[t]+ NUM_TAGS for t in label]
            ret.append(l)
        else:
            l =[tag2id[t] for t in label]
            ret.append(l)
        prev_label = label
    return ret 

In [275]:
def get_label(label_id):
    if label_id == (2*NUM_TAGS):
        return "O"
    elif label_id >= NUM_TAGS:
        return [TAGS[label_id-NUM_TAGS]]
    else:
        return TAGS[label_id]

def id2label(labels):
    ret = []
    for label in labels:
        l = [get_label(x) for x in label]
        if len(l) == 1 and l[0] == "O":
            l = "O"
        ret.append(l)
    return ret 


In [276]:
import regex as re
def clean_text(sent):
    '''
    This is text cleaning function
    '''
    ret_sent= []
    for txt in sent:
#       if len(txt) < 1:
# #         print("HI",txt, sent)
#         return -1
      fil_txt = re.sub('[^A-Za-z0-9]+', '', str(txt))
      if len(fil_txt) == 0:
        fil_txt  = txt [0]
      ret_sent.append(fil_txt)
    assert(len(ret_sent) == len(sent))
    return ret_sent

In [277]:
clean_text(["---ABC", "--"])

['ABC', '-']

In [278]:
from transformers import BertTokenizer
import numpy as np
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [279]:
from tqdm import tqdm
from torch.utils.data import TensorDataset, random_split


In [280]:
def to_bool_vec(y_id):
    y_bool = np.zeros(2*NUM_TAGS+1, np.int32)
    num_labels = len(y_id)
    for id in y_id:
        # for l in label:
          y_bool[id] = 1
    return y_bool


In [372]:
from transformers import BertForTokenClassification, AdamW, BertConfig
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
model = joblib.load('model_0.h5')
model.cuda()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [373]:
from torch import nn
from scipy.special import expit
import random
import joblib
from warnings import simplefilter
from tqdm import tqdm

m = nn.Sigmoid()
criterion = nn.BCELoss()

def compute_loss(pred,target):
        sum_mat = torch.sum(target, axis=2)
        flat_pred = pred[sum_mat > 0][:] 
        flat_target  = target[ sum_mat > 0][:]
        # print(criterion(flat_pred, flat_target))
        return criterion(m(flat_pred), flat_target)

In [374]:
def get_test_vecs():
    f = open('data/test.json')
    test_data = json.load(f)
    f.close()
    a = [d["sent"] for d in test_data]
    set_ = set()
    for idx,s in enumerate(a):
        for t in s:
            if len(t)<1:
                set_.add(idx)
    test_data = [test_data[i] for i in range(len(test_data)) if i not in set_]
    df = pd.DataFrame(test_data)
    df["sent"] = df["sent"].map( lambda x: clean_text(x))
    df["features"] = df["sent"].map(lambda x: get_sent_features(x))
    df["labels"] = df["tags"].map(lambda x: label2id(x))
    sentences = list(df["sent"])
    labels = list(df["labels"])
#     from warnings import simplefilter
    simplefilter(action='ignore', category=FutureWarning)

    input_ids = []
    attention_masks = []

    for sent in tqdm(sentences):

        sent_str = ' '.join(sent)
    #     print(len(sent))
    #     print(sent_str)
        encoded_dict = tokenizer.encode_plus(
                            sent_str,                 
                            add_special_tokens = False,
                            truncation = True,
                            max_length = 105,           
                            pad_to_max_length = True,
                            return_attention_mask = True,   
                            return_tensors = 'pt',     
                       )


        input_ids.append(encoded_dict['input_ids'][0])

        # And its attention mask
        attention_masks.append(encoded_dict['attention_mask'][0])
    new_labels = []

    # The special label ID we'll give to "extra" tokens.
    to_remove_idx = []
    null_label_id =  np.zeros(2*NUM_TAGS+1, np.int32) #-100
    idx = 0
    for (tokens, masks, orig_labels) in zip(input_ids, attention_masks, labels):

        padded_labels = []
        ty = 0
        orig_labels_i = 0 
        # print(tokens, masks, orig_labels)

        for token_id,mask_id in zip(tokens,masks):
          token_id = token_id.numpy().item()

          if mask_id.numpy().item() == 0:
            padded_labels.append(null_label_id)


          elif (token_id == tokenizer.pad_token_id) or \
              (token_id == tokenizer.cls_token_id) or \
              (token_id == tokenizer.sep_token_id):

              padded_labels.append(null_label_id)

          elif tokenizer.ids_to_tokens[token_id][0:2] == '##':

              padded_labels.append(null_label_id)

          else:
            # print(tokenizer.ids_to_tokens[token_id], orig_labels_i, len(orig_labels))
            if orig_labels_i >= len(orig_labels):
              ty+=1
              break
            else:

              padded_labels.append(to_bool_vec(orig_labels[orig_labels_i]))
            orig_labels_i += 1

        # assert(len(sen) == len(padded_labels))    
        if ty == 0:
          new_labels.append(padded_labels)
        else:
          to_remove_idx.append(idx)
        idx+=1
    filtered_attention_masks = [attention_masks[idx] for idx in range(len(attention_masks)) if idx not in to_remove_idx]
    filtered_input_ids = [input_ids[idx] for idx in range(len(input_ids)) if idx not in to_remove_idx]
    pt_input_ids = torch.stack(filtered_input_ids, dim=0)

    pt_attention_masks = torch.stack(filtered_attention_masks, dim=0)

    pt_labels = torch.tensor(new_labels, dtype=torch.float32)
    return pt_input_ids, pt_attention_masks, pt_labels

In [352]:
pt_input_ids, pt_attention_masks, pt_labels = get_test_vecs()

100%|███████████████████████████████████████| 278/278 [00:00<00:00, 1613.19it/s]


In [353]:
pt_input_ids.size(), pt_attention_masks.size(), pt_labels.size()

(torch.Size([278, 105]), torch.Size([278, 105]), torch.Size([278, 105, 227]))

In [354]:
from torch.utils.data import  SequentialSampler

pt_input_ids, pt_attention_masks, pt_labels = get_test_vecs()
batch_size = 256
prediction_data = TensorDataset(pt_input_ids, pt_attention_masks, pt_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


100%|███████████████████████████████████████| 278/278 [00:00<00:00, 1626.96it/s]


In [375]:
print('Predicting labels for {:,} test sentences...'.format(len(pt_input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 

  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]
  logits = m(logits)

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 278 test sentences...
    DONE.


In [376]:
all_predictions = np.concatenate(predictions, axis=0)
all_true_labels = np.concatenate(true_labels, axis=0)

In [377]:
all_predictions.shape, all_true_labels.shape

((278, 105, 227), (278, 105, 227))

In [378]:
sum_mat = np.sum(all_true_labels, axis=2)
predicted_label_ids = all_predictions [sum_mat > 0.1][:]
all_true_labels = all_true_labels [sum_mat > 0.1][:]

In [359]:
xt = [i for i in range(all_true_labels.shape[0])if np.round(all_true_labels[i][226]) != 1]

In [379]:
predicted_label_ids.shape, all_true_labels.shape

((6944, 227), (6944, 227))

In [388]:
t = 0.53
pred = []
true = []
for p in predicted_label_ids:
    rt = [i for (i,x) in enumerate(p) if x >t]
    pred.append(rt)
for p in all_true_labels:
    rt = [i for (i,x) in enumerate(p) if round(x)  == 1]
    true.append(rt)
    
    

In [384]:
def f1(p, r):
            if r == 0.:
                return 0.
            return 2 * p * r / float( p + r )
def loose_macro(true, pred):
    num_entities = len(true)
    p = 0.
    r = 0.
    for true_labels, predicted_labels in zip(true, pred):
        if len(predicted_labels) > 0:
            p += len(set(predicted_labels).intersection(set(true_labels))) / float(len(predicted_labels))
        if len(true_labels):
            r += len(set(predicted_labels).intersection(set(true_labels))) / float(len(true_labels))
    precision = p / num_entities
    recall = r / num_entities
    return precision, recall, f1( precision, recall)
def loose_micro(true, pred):
    num_predicted_labels = 0.
    num_true_labels = 0.
    num_correct_labels = 0.
    for true_labels, predicted_labels in zip(true, pred):
        num_predicted_labels += len(predicted_labels)
        num_true_labels += len(true_labels)
        num_correct_labels += len(set(predicted_labels).intersection(set(true_labels))) 
    if num_predicted_labels > 0:
        precision = num_correct_labels / num_predicted_labels
    else:
        precision = 0.
    recall = num_correct_labels / num_true_labels
    return precision, recall, f1( precision, recall)

In [390]:
d

{0: {'0.1': {'macro': (0.8457726846923989,
    0.9856464906090527,
    0.9103681885200792),
   'micro': (0.6642058175161466, 0.9767978286079013, 0.7907292611212928)},
  '0.11': {'macro': (0.8481239759216714,
    0.9841831493156995,
    0.9111019808151525),
   'micro': (0.6695542540194391, 0.9749335658922077, 0.7938896336301721)},
  '0.12': {'macro': (0.8502150061694125,
    0.9825966880165531,
    0.9116249659625671),
   'micro': (0.6744995038302551, 0.9729144745780921, 0.796679327689346)},
  '0.13': {'macro': (0.8522855155357258,
    0.981046996851189,
    0.9121445671494535),
   'micro': (0.6794295506052733, 0.9709617383775859, 0.7994468971868763)},
  '0.13999999999999999': {'macro': (0.8541551253088029,
    0.9793013570571559,
    0.9124571882642316),
   'micro': (0.6841649596793988, 0.9688288954401398, 0.8019857788728508)},
  '0.14999999999999997': {'macro': (0.8560589973093382,
    0.9774022072919138,
    0.9127151983826365),
   'micro': (0.6886975121017674, 0.966525425067698, 0.8

In [389]:
loose_macro(true,pred), loose_micro(true,pred)

((0.8893913210445468, 0.8951732910906296, 0.892272939307375),
 (0.8808111463650159, 0.8658801193382154, 0.8732818163167614))