In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import pickle
import json
import wget
import os

In [96]:
# device

device(type='cuda')

In [2]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print( torch.cuda.device_count())
    print('Available:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


4
Available: NVIDIA A100-SXM4-80GB


In [None]:
!pip install wget
!pip install transformers

In [None]:
# url_data = ''
# wget.download(url_data)


In [3]:
def word_shape_features(word):
    return np.array([word.istitle(), word.islower(), word.isupper(), len(word),
                     word.isdigit(),  word.isalpha(),word.isalnum(), word.isnumeric()])

def get_word_features(word):
    return word_shape_features(word)

def get_sent_features(sent):
    ret = []
    for word in sent:
        ret.append(get_word_features(word))
    return ret

In [4]:
TAGS =  pickle.load(open( "tags.pickle", "rb" ))
TAGS.remove("O") 
NUM_TAGS = len(TAGS)

tag2id = {}
for id,label in enumerate(TAGS):
    tag2id[label] = id 

def label2id(labels):
    ret = []
    prev_label = ""
    for label in labels:
        if label == "O":
            ret.append([2*NUM_TAGS])
        elif label == prev_label:
            l =[tag2id[t]+ NUM_TAGS for t in label]
            ret.append(l)
        else:
            l =[tag2id[t] for t in label]
            ret.append(l)
        prev_label = label
    return ret 

In [5]:
def get_label(label_id):
    if label_id == (2*NUM_TAGS):
        return "O"
    elif label_id >= NUM_TAGS:
        return [TAGS[label_id-NUM_TAGS]]
    else:
        return TAGS[label_id]

def id2label(labels):
    ret = []
    for label in labels:
        l = [get_label(x) for x in label]
        if len(l) == 1 and l[0] == "O":
            l = "O"
        ret.append(l)
    return ret 


In [6]:
import regex as re
def clean_text(sent):
    '''
    This is text cleaning function
    '''
    ret_sent= []
    for txt in sent:
#       if len(txt) < 1:
# #         print("HI",txt, sent)
#         return -1
      fil_txt = re.sub('[^A-Za-z0-9]+', '', str(txt))
      if len(fil_txt) == 0:
        fil_txt  = txt [0]
      ret_sent.append(fil_txt)
    assert(len(ret_sent) == len(sent))
    return ret_sent

In [9]:
clean_text(["---ABC", "--"])

['ABC', '-']

In [17]:
from transformers import BertTokenizer
import numpy as np
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [26]:
from tqdm import tqdm
from torch.utils.data import TensorDataset, random_split


In [18]:
def to_bool_vec(y_id):
    y_bool = np.zeros(2*NUM_TAGS+1, np.int32)
    num_labels = len(y_id)
    for id in y_id:
        # for l in label:
          y_bool[id] = 1
    return y_bool


In [234]:
from transformers import BertForTokenClassification, AdamW, BertConfig
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
model = joblib.load('model_0.h5')
model.cuda()

In [262]:
from torch import nn
from scipy.special import expit
import random
import joblib
from warnings import simplefilter
from tqdm import tqdm

m = nn.Sigmoid()
criterion = nn.BCELoss()

def compute_loss(pred,target):
        sum_mat = torch.sum(target, axis=2)
        flat_pred = pred[sum_mat > 0][:] 
        flat_target  = target[ sum_mat > 0][:]
        # print(criterion(flat_pred, flat_target))
        return criterion(m(flat_pred), flat_target)

In [255]:
def get_test_vecs():
#     f = open('data/test.json')
    f = open('data/dev.json')

    test_data = json.load(f)
    f.close()
    a = [d["sent"] for d in test_data]
    set_ = set()
    for idx,s in enumerate(a):
        for t in s:
            if len(t)<1:
                set_.add(idx)
    test_data = [test_data[i] for i in range(len(test_data)) if i not in set_]
    df = pd.DataFrame(test_data)
    df["sent"] = df["sent"].map( lambda x: clean_text(x))
    df["features"] = df["sent"].map(lambda x: get_sent_features(x))
    df["labels"] = df["tags"].map(lambda x: label2id(x))
    sentences = list(df["sent"])
    labels = list(df["labels"])
#     from warnings import simplefilter
    simplefilter(action='ignore', category=FutureWarning)

    input_ids = []
    attention_masks = []

    for sent in tqdm(sentences):

        sent_str = ' '.join(sent)
    #     print(len(sent))
    #     print(sent_str)
        encoded_dict = tokenizer.encode_plus(
                            sent_str,                 
                            add_special_tokens = False,
                            truncation = True,
                            max_length = 105,           
                            pad_to_max_length = True,
                            return_attention_mask = True,   
                            return_tensors = 'pt',     
                       )


        input_ids.append(encoded_dict['input_ids'][0])

        # And its attention mask
        attention_masks.append(encoded_dict['attention_mask'][0])
    new_labels = []

    # The special label ID we'll give to "extra" tokens.
    to_remove_idx = []
    null_label_id =  np.zeros(2*NUM_TAGS+1, np.int32) #-100
    idx = 0
    for (tokens, masks, orig_labels) in zip(input_ids, attention_masks, labels):

        padded_labels = []
        ty = 0
        orig_labels_i = 0 
        # print(tokens, masks, orig_labels)

        for token_id,mask_id in zip(tokens,masks):
          token_id = token_id.numpy().item()

          if mask_id.numpy().item() == 0:
            padded_labels.append(null_label_id)


          elif (token_id == tokenizer.pad_token_id) or \
              (token_id == tokenizer.cls_token_id) or \
              (token_id == tokenizer.sep_token_id):

              padded_labels.append(null_label_id)

          elif tokenizer.ids_to_tokens[token_id][0:2] == '##':

              padded_labels.append(null_label_id)

          else:
            # print(tokenizer.ids_to_tokens[token_id], orig_labels_i, len(orig_labels))
            if orig_labels_i >= len(orig_labels):
              ty+=1
              break
            else:

              padded_labels.append(to_bool_vec(orig_labels[orig_labels_i]))
            orig_labels_i += 1

        # assert(len(sen) == len(padded_labels))    
        if ty == 0:
          new_labels.append(padded_labels)
        else:
          to_remove_idx.append(idx)
        idx+=1
    filtered_attention_masks = [attention_masks[idx] for idx in range(len(attention_masks)) if idx not in to_remove_idx]
    filtered_input_ids = [input_ids[idx] for idx in range(len(input_ids)) if idx not in to_remove_idx]
    pt_input_ids = torch.stack(filtered_input_ids, dim=0)

    pt_attention_masks = torch.stack(filtered_attention_masks, dim=0)

    pt_labels = torch.tensor(new_labels, dtype=torch.float32)
    return pt_input_ids, pt_attention_masks, pt_labels

In [238]:
pt_input_ids, pt_attention_masks, pt_labels = get_test_vecs()

100%|███████████████████████████████████████| 278/278 [00:00<00:00, 1620.21it/s]


In [239]:
pt_input_ids.size(), pt_attention_masks.size(), pt_labels.size()

(torch.Size([278, 105]), torch.Size([278, 105]), torch.Size([278, 105, 227]))

In [240]:
from torch.utils.data import  SequentialSampler

pt_input_ids, pt_attention_masks, pt_labels = get_test_vecs()
batch_size = 256
prediction_data = TensorDataset(pt_input_ids, pt_attention_masks, pt_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


100%|███████████████████████████████████████| 278/278 [00:00<00:00, 1614.22it/s]


In [241]:
print('Predicting labels for {:,} test sentences...'.format(len(pt_input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 

  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]
  logits = m(logits)

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 278 test sentences...
    DONE.


In [242]:
all_predictions = np.concatenate(predictions, axis=0)
all_true_labels = np.concatenate(true_labels, axis=0)

In [243]:
all_predictions.shape, all_true_labels.shape

((278, 105, 227), (278, 105, 227))

In [244]:
sum_mat = np.sum(all_true_labels, axis=2)
# [sum_mat >0.2])
predicted_label_ids = all_predictions [sum_mat > 0.1][:]
all_true_labels = all_true_labels [sum_mat > 0.1][:]

In [139]:
# sum_mat = np.sum(all_true_labels, axis=2)


In [112]:
# predicted_label_ids = all_predictions [sum_mat > 0.1][:]
# all_true_labels = all_true_labels [sum_mat > 0.1][:]

In [245]:
xt = [i for i in range(all_true_labels.shape[0])if np.round(all_true_labels[i][226]) != 1]

In [246]:
predicted_label_ids.shape, all_true_labels.shape

((6944, 227), (6944, 227))

In [252]:
t = 0.65
pred = []
true = []
for p in predicted_label_ids:
    rt = [i for (i,x) in enumerate(p) if x >t]
    pred.append(rt)
for p in all_true_labels:
    rt = [i for (i,x) in enumerate(p) if round(x)  == 1]
    true.append(rt)
    
    

In [253]:
loose_macro(true,pred), loose_micro(true,pred)

((0.8716565860215054, 0.8762480798771122, 0.8739463023527214),
 (0.8965367965367965, 0.8425549227013832, 0.8687080536912751))

In [210]:
# for a,b in zip(pred, true):
#     print(a,b)

In [211]:
def f1(p, r):
            if r == 0.:
                return 0.
            return 2 * p * r / float( p + r )
def loose_macro(true, pred):
    num_entities = len(true)
    p = 0.
    r = 0.
    for true_labels, predicted_labels in zip(true, pred):
        if len(predicted_labels) > 0:
            p += len(set(predicted_labels).intersection(set(true_labels))) / float(len(predicted_labels))
        if len(true_labels):
            r += len(set(predicted_labels).intersection(set(true_labels))) / float(len(true_labels))
    precision = p / num_entities
    recall = r / num_entities
    return precision, recall, f1( precision, recall)
def loose_micro(true, pred):
    num_predicted_labels = 0.
    num_true_labels = 0.
    num_correct_labels = 0.
    for true_labels, predicted_labels in zip(true, pred):
        num_predicted_labels += len(predicted_labels)
        num_true_labels += len(true_labels)
        num_correct_labels += len(set(predicted_labels).intersection(set(true_labels))) 
    if num_predicted_labels > 0:
        precision = num_correct_labels / num_predicted_labels
    else:
        precision = 0.
    recall = num_correct_labels / num_true_labels
    return precision, recall, f1( precision, recall)

In [178]:
predicted_label_ids = predicted_label_ids[xt]
all_true_labels = all_true_labels[xt]

In [179]:
rt = []
yt = []
for i1,at in enumerate(all_true_labels):
    for i,x in enumerate(at):
        if np.round(x) == 1:
            rt.append(predicted_label_ids[i1][i])
        else :
            yt.append(predicted_label_ids[i1][i])
            
            
    

In [181]:
def f1(p, r):
    if r == 0.:
        return 0.
    return 2 * p * r / float( p + r )


In [201]:
t = 0.23
TP = len([x for x in rt if x >= t]) 
TN = len([x for x in yt if x < t]) 
FP = len([x for x in yt if x >= t])
FN = len([x for x in rt if x < t])
recall = TP/(TP + FN)
precision =  TP/(TP + FP)
print("recall: ", TP/(TP + FN))
print("precision: ", TP/(TP + FP))
print("f1: ", f1(precision, recall))
print("accuracy: ", (TP+ TN)*100/(TP + FP + FN + TN))

recall:  0.4797979797979798
precision:  0.36004331348132107
f1:  0.4113826167646149
accuracy:  99.12308996737508


In [183]:
print("recall: ", TP/(TP + FN))

619 214542 1084 767


In [184]:
print("precision: ", TP/(TP + FP))

precision:  0.3634762184380505


In [186]:
print("recall: ", TP/(TP + FN))

recall:  0.4466089466089466


In [190]:
print("accuracy: ", (TP+ FP)*100/(TP + FP + FN + TN))

accuracy:  0.7847492304573019


In [None]:
rt

In [100]:
for i,z in enumerate(rt):
    if z == True:
        print(i)
    

94
134


In [None]:
def f1(p, r):
            if r == 0.:
                return 0.
            return 2 * p * r / float( p + r )
        def loose_macro(true, pred):
            num_entities = len(true)
            p = 0.
            r = 0.
            for true_labels, predicted_labels in zip(true, pred):
                if len(predicted_labels) > 0:
                    p += len(set(predicted_labels).intersection(set(true_labels))) / float(len(predicted_labels))
                if len(true_labels):
                    r += len(set(predicted_labels).intersection(set(true_labels))) / float(len(true_labels))
            precision = p / num_entities
            recall = r / num_entities
            return precision, recall, f1( precision, recall)
        def loose_micro(true, pred):
            num_predicted_labels = 0.
            num_true_labels = 0.
            num_correct_labels = 0.
            for true_labels, predicted_labels in zip(true, pred):
                num_predicted_labels += len(predicted_labels)
                num_true_labels += len(true_labels)
                num_correct_labels += len(set(predicted_labels).intersection(set(true_labels))) 
            if num_predicted_labels > 0:
                precision = num_correct_labels / num_predicted_labels
            else:
                precision = 0.
            recall = num_correct_labels / num_true_labels
            return precision, recall, f1( precision, recall)

In [52]:
def f1(p, r):
    if r == 0.:
        return 0.
    return 2 * p * r / float( p + r )
def loose_macro(true, pred):
    num_entities = len(true)
    p = 0.
    r = 0.
    for true_labels, predicted_labels in zip(true, pred):
        if len(predicted_labels) > 0:
            p += len(set(predicted_labels).intersection(set(true_labels))) / float(len(predicted_labels))
        if len(true_labels):
            r += len(set(predicted_labels).intersection(set(true_labels))) / float(len(true_labels))
    precision = p / num_entities
    recall = r / num_entities
    return precision, recall, f1( precision, recall)
def loose_micro(true, pred):
    num_predicted_labels = 0.
    num_true_labels = 0.
    num_correct_labels = 0.
    for true_labels, predicted_labels in zip(true, pred):
        num_predicted_labels += len(predicted_labels)
        num_true_labels += len(true_labels)
        num_correct_labels += len(set(predicted_labels).intersection(set(true_labels))) 
    if num_predicted_labels > 0:
        precision = num_correct_labels / num_predicted_labels
    else:
        precision = 0.
    recall = num_correct_labels / num_true_labels
    return precision, recall, f1( precision, recall)

In [None]:
from sklearn.metrics import f1_score

# First, combine the results across the batches.
all_predictions = np.concatenate(predictions, axis=0)
all_true_labels = np.concatenate(true_labels, axis=0)

print("After flattening the batches, the predictions have shape:")
print("    ", all_predictions.shape)

# Next, let's remove the third dimension (axis 2), which has the scores
# for all 18 labels. 

# For each token, pick the label with the highest score.
# predicted_label_ids = np.argmax(all_predictions, axis=2)

# print("\nAfter choosing the highest scoring label for each token:")
# print("    ", predicted_label_ids.shape) 


# Eliminate axis 0, which corresponds to the sentences.
predicted_label_ids = np.concatenate(all_predictions, axis=0)
all_true_labels = np.concatenate(all_true_labels, axis=0)
sum_mat = np.sum(all_true_labels, axis=1)
# [sum_mat >0.2])
predicted_label_ids = predicted_label_ids [sum_mat > 0.1][:]
all_true_labels = all_true_labels [sum_mat > 0.1][:]

print("\nAfter flattening the sentences, we have predictions:")
print("    ", predicted_label_ids.shape)
print("and ground truth:")
print("    ", all_true_labels.shape)
# print(sum_mat.shape)
predicted_label_ids = np.concatenate(predicted_label_ids, axis=0)
all_true_labels = np.concatenate(all_true_labels, axis=0)


print("\nAfter flattening the sentences, we have predictions:")
print("    ", predicted_label_ids.shape)
print("and ground truth:")
print("    ", all_true_labels.shape)


In [None]:
# all_true_labels[:1000]

In [99]:
# all_true_labels

In [256]:
str(0.1)

'0.1'

In [97]:
predicted_label_ids

NameError: name 'predicted_label_ids' is not defined

In [None]:
from sklearn.metrics import f1_score

f1 = f1_score(all_true_labels, predicted_label_ids, average='micro') 

print ("F1 score: {:.2%}".format(f1))

In [None]:
d = {}
pt_input_ids, pt_attention_masks, pt_labels = get_test_vecs()
prediction_data = TensorDataset(pt_input_ids, pt_attention_masks, pt_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

for m1 in tqdm(range(0,8)):
    d[m1] = {}
    name = 'model_{}.h5'.format(m1)
    model = joblib.load(name)
    model.cuda()
    batch_size = 512
#     print('Predicting labels for {:,} test sentences...'.format(len(pt_input_ids)))
    # Put model in evaluation mode
    model.eval()

    # Tracking variables 
    predictions , true_labels = [], []

    # Predict 
    for batch in prediction_dataloader:
      # Add batch to GPU
      batch = tuple(t.to(device) for t in batch)

      # Unpack the inputs from our dataloader
      b_input_ids, b_input_mask, b_labels = batch

      # Telling the model not to compute or store gradients, saving memory and 

      with torch.no_grad():
          # Forward pass, calculate logit predictions
          outputs = model(b_input_ids, token_type_ids=None, 
                          attention_mask=b_input_mask)

      logits = outputs[0]
      logits = m(logits)

      # Move logits and labels to CPU
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()

      # Store predictions and true labels
      predictions.append(logits)
      true_labels.append(label_ids)

#     print('    DONE.')
    all_predictions = np.concatenate(predictions, axis=0)
    all_true_labels = np.concatenate(true_labels, axis=0)
    sum_mat = np.sum(all_true_labels, axis=2)
    # [sum_mat >0.2])
    predicted_label_ids = all_predictions [sum_mat > 0.1][:]
    all_true_labels = all_true_labels [sum_mat > 0.1][:]
    for thresh in np.arange(0.1,9.0,0.01):
        d[m1][str(thresh)] = {}
        t = thresh
        pred = []
        true = []
        for p in predicted_label_ids:
            rt = [i for (i,x) in enumerate(p) if x >t]
            pred.append(rt)
        for p in all_true_labels:
            rt = [i for (i,x) in enumerate(p) if round(x)  == 1]
            true.append(rt)
        d[m1][str(thresh)]["macro"] = loose_macro(true,pred)
        d[m1][str(thresh)]["micro"] = loose_micro(true,pred)   


100%|█████████████████████████████████████| 9956/9956 [00:07<00:00, 1406.99it/s]
  0%|                                                     | 0/8 [00:00<?, ?it/s]

In [None]:
d

In [None]:
# import transformers
# from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification
# import torch
# from transformers.modeling_outputs import SequenceClassifierOutput
# from transformers import BertConfig, BertModel
# class seq2SeqBERT(torch.nn.Module):
# 	def __init__(self):
# 		super(seq2SeqBERT, self).__init__()
# 		configuration = BertConfig()
# 		self.bert = BertModel(configuration)
# 		self.classifier = torch.nn.Linear(768, 5)
# 		self.criterion = torch.nn.BCEWithLogitsLoss()
# 	def forward(self, input_ids, attention_mask, labels = None):
# 		embeddings = self.bert(input_ids = input_ids, attention_mask = attention_mask)
# 		logits = self.classifier(embeddings['last_hidden_state'])
# 		loss_ = None
# 		flat_outputs = logits[labels!=-100]
# 		flat_labels  = labels[ labels!=-100]
# 		if labels is not None:
# 			loss_ = self.criterion(flat_outputs, flat_labels)
# 		return SequenceClassifierOutput(loss = loss_, logits = logits, attentions=embeddings