In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import pickle
import json
import wget
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device once; later cells move tensors to `device`.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(torch.cuda.device_count())
    print('Available:', torch.cuda.get_device_name(0))


4
Available: NVIDIA A100-SXM4-80GB


In [None]:
!pip install wget
!pip install transformers

In [None]:
# url_data = ''
# wget.download(url_data)


In [3]:
def word_shape_features(word):
    """Describe the surface shape of a single token.

    Returns a NumPy array with eight entries, in this order: title-case,
    all-lowercase, all-uppercase, character count, digits-only,
    letters-only, alphanumeric-only, numeric-only.
    """
    shape = [
        word.istitle(),
        word.islower(),
        word.isupper(),
        len(word),
        word.isdigit(),
        word.isalpha(),
        word.isalnum(),
        word.isnumeric(),
    ]
    return np.array(shape)

def get_word_features(word):
    """Return the feature vector of one token (currently only its shape features)."""
    features = word_shape_features(word)
    return features

def get_sent_features(sent):
    """Return a list holding one feature vector per token of `sent`, in order."""
    return [get_word_features(token) for token in sent]

In [4]:
# Load the tag inventory. The context manager closes the file handle, which
# the bare open() call used to leave dangling.
# NOTE(review): pickle.load can execute arbitrary code; only load tags.pickle
# from a trusted source.
with open("tags.pickle", "rb") as tags_file:
    TAGS = pickle.load(tags_file)

# "O" is not a regular tag: label2id encodes it with the dedicated id 2 * NUM_TAGS.
TAGS.remove("O")
NUM_TAGS = len(TAGS)

# Tag name -> position in TAGS. Built as a comprehension so that no global
# named `id` is left behind to shadow the builtin.
tag2id = {tag: index for index, tag in enumerate(TAGS)}

def label2id(labels):
    """Encode the per-token labels of one sentence as lists of integer ids.

    Each entry of `labels` is either the string "O" or an iterable of tag
    names (presumably a list of tags per token -- confirm against the data).

    * "O" becomes ``[2 * NUM_TAGS]``.
    * A label equal to the label of the previous token is treated as a
      continuation: every tag id is shifted by ``NUM_TAGS``.
    * Any other label maps each tag to its plain ``tag2id`` value.

    Returns a list with one list of ids per token.
    """
    encoded = []
    previous = ""
    for current in labels:
        if current == "O":
            ids = [2*NUM_TAGS]
        else:
            # Same label as the preceding token -> use the shifted id range.
            shift = NUM_TAGS if current == previous else 0
            ids = [tag2id[tag] + shift for tag in current]
        encoded.append(ids)
        previous = current
    return encoded

In [6]:
def get_label(label_id):
    """Translate one integer label id back to its tag.

    * ``2 * NUM_TAGS``            -> the string "O"
    * ids below ``NUM_TAGS``      -> the tag name itself
    * ids from ``NUM_TAGS`` up    -> a one-element list with the tag name

    NOTE(review): the shifted range returns a list while the plain range
    returns a string; confirm callers rely on that difference.
    """
    if label_id == (2*NUM_TAGS):
        return "O"
    if label_id < NUM_TAGS:
        return TAGS[label_id]
    return [TAGS[label_id-NUM_TAGS]]

def id2label(labels):
    """Decode the per-token id lists of one sentence back to tags.

    Every id is translated with `get_label`. A token whose only decoded
    entry is "O" is represented by the bare string "O"; every other token
    keeps the list of decoded entries.
    """
    decoded = []
    for ids in labels:
        names = list(map(get_label, ids))
        is_outside = len(names) == 1 and names[0] == "O"
        decoded.append("O" if is_outside else names)
    return decoded


In [5]:
import regex as re
def clean_text(sent):
    '''
    Strip every non-alphanumeric character from each token of a sentence.

    Args:
        sent: iterable of tokens; each token is converted with ``str``
            before cleaning.

    Returns:
        A list with exactly one cleaned string per input token, so the
        result stays aligned with per-token tags. A token made only of
        non-alphanumeric characters is replaced by its first character
        (e.g. "--" -> "-"). An empty token stays "" instead of raising
        IndexError.
    '''
    ret_sent = []
    for txt in sent:
        raw = str(txt)
        fil_txt = re.sub('[^A-Za-z0-9]+', '', raw)
        if not fil_txt:
            # Nothing alphanumeric survived: keep the first original character
            # as a placeholder. Slicing (not indexing) is safe on "".
            fil_txt = raw[:1]
        ret_sent.append(fil_txt)
    return ret_sent

In [7]:
# Sanity check: punctuation is stripped, and a token made only of punctuation
# keeps its first character.
clean_text(["---ABC", "--"])

['ABC', '-']

In [8]:
# Load the training split. The context manager closes the file even when
# json.load raises.
with open('data/train.json') as f:
    data = json.load(f)


In [9]:
# Collect the indices of sentences containing at least one empty token,
# then keep only the records that are not in that set.
a = [record["sent"] for record in data]
set_ = {
    position
    for position, tokens in enumerate(a)
    if any(len(token) < 1 for token in tokens)
}
data = [record for position, record in enumerate(data) if position not in set_]

In [10]:
import pandas as pd
# One row per remaining record; the record keys become the columns.
df = pd.DataFrame(data)

In [None]:
# Shallow backup of the filtered record list (the records themselves are shared).
data_cp =data.copy()

In [11]:
# Work on the first 2,500 rows only. The explicit copy detaches the subset
# from the full frame, so the column assignments in the following cells do not
# trigger pandas' SettingWithCopyWarning.
df = df[:2500].copy()

In [None]:
# data = data_cp.copy()

In [12]:
# Replace every sentence by its cleaned token list.
df["sent"] = df["sent"].map(clean_text)

In [13]:
# Per-token shape features for every (cleaned) sentence.
df["features"] = df["sent"].map(get_sent_features)

In [14]:
# Integer-encoded labels for every sentence.
df["labels"] = df["tags"].map(label2id)

In [15]:
# Persist the processed frame. The context manager flushes and closes the
# file even if pickling fails part-way.
with open('processed_dataframe', 'wb') as file:
    pickle.dump(df, file)

In [None]:
# Number of rows in the working frame.
len(df)

In [None]:
# for id, d in enumerate(data):
#     data[id]["sent"] = clean_text(data[id]["sent"])
# for id, d in enumerate(data):
#     data[id]["features"] = get_sent_features(d["sent"])
#     data[id]["labels"] = label2id(d["tags"])
    
    # import pandas as pd
# df = pd.DataFrame(data)

In [None]:
# df.head()

In [16]:
from transformers import BertTokenizer
import numpy as np
# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint used for the model below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [17]:
# Decode a hand-written id sequence back to text to inspect the vocabulary.
tokenizer.decode([101, 2265, 2033, 3152, 2007, 3881, 100, 2013, 1996, 3865, 102])


'[CLS] show me films with drew [UNK] from the 1980s [SEP]'

In [19]:
# Cleaned token lists, one per sentence.
sentences = df["sent"].tolist()


In [20]:
# Encoded label lists, aligned index-by-index with `sentences`.
labels = df["labels"].tolist()


In [None]:
# Show one sentence as the space-joined string that is later fed to the tokenizer.
' '.join(sentences[34])


In [None]:
# Report how many sentences are available at this point.
print("Number of training sentences: {:,}".format(len(sentences)))


In [None]:
# TokenLength=[len(tokenizer.encode(' '.join(i),add_special_tokens=True)) for i in sentences]


In [None]:
from tqdm import tqdm 
# Encoded length of every sentence, special tokens included.
mt = [
    len(tokenizer.encode(' '.join(tokens), add_special_tokens=True))
    for tokens in tqdm(sentences)
]


In [None]:
# Lengths of the sentences that exceed 100 tokens (despite the name, this is
# not the full length distribution).
# NOTE(review): the summary and histogram cells that read TokenLength therefore
# describe only these long sentences -- confirm that this is intended.
TokenLength = [x for x in mt if x >100]

In [None]:
# Indices of the sentences whose encoded length (special tokens included) is below 103.
to_retain = [i for i,x in enumerate(mt) if x <103]

In [None]:
# Keep only the retained sentences. Re-running this cell filters again with
# the same indices, so run it once per computation of `to_retain`.
sentences = [ sentences[i] for i in to_retain]


In [None]:
# Apply the same index filter to the labels so they stay aligned with `sentences`.
labels =  [ labels[i] for i in to_retain]

In [None]:
# (all measured sentences, sentences longer than 100 tokens, label lists currently kept)
len(mt), len(TokenLength), len(labels)

In [None]:
# TokenLength only holds lengths above 100, so it is empty when no sentence is
# that long. min()/max() raise ValueError on an empty list, hence the guard.
if TokenLength:
    print('Minimum  length: {:,} tokens'.format(min(TokenLength)))
    print('Maximum length: {:,} tokens'.format(max(TokenLength)))
    print('Median length: {:,} tokens'.format(int(np.median(TokenLength))))
else:
    print('No sentence is longer than 100 tokens.')


In [None]:
!pip install seaborn

In [22]:
from tqdm import tqdm

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's dark grid styling for the figure.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.figure(figsize=(24,24))
# NOTE(review): this rcParams default is set after the figure above has been
# created, so it applies to figures created later, not to this one.
plt.rcParams["figure.figsize"] = (10,5)

# Plot the distribution of the lengths stored in TokenLength.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# sns.histplot is the documented replacement -- confirm the installed version.
sns.distplot(TokenLength, kde=False, rug=False,color='plum')

plt.title('Sentence Lengths')
plt.xlabel('Sentence Length')


In [None]:
# Encode a tiny example (with special tokens) to inspect the structure the tokenizer returns.
SampleSentence=tokenizer.encode_plus("- abc", add_special_tokens = True,truncation = True,max_length = 100,padding = True,return_attention_mask = True, return_tensors = 'pt')
SampleSentence

In [None]:
# Look up the vocabulary entry behind a single token id.
tokenizer.ids_to_tokens[1011]

In [23]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

input_ids = []
attention_masks = []

# Encode every sentence to a fixed-length id vector plus its attention mask.
# No [CLS]/[SEP] are added, so tokens line up with the word-level labels.
for sent in tqdm(sentences):

    sent_str = ' '.join(sent)
    encoded_dict = tokenizer.encode_plus(
                        sent_str,
                        add_special_tokens = False,
                        truncation = True,
                        max_length = 105,
                        # `padding='max_length'` replaces the deprecated
                        # `pad_to_max_length=True` with the same effect.
                        padding = 'max_length',
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )

    # The tensors come back with a leading batch axis; keep the single row.
    input_ids.append(encoded_dict['input_ids'][0])

    # And its attention mask
    attention_masks.append(encoded_dict['attention_mask'][0])

# print('Original: ', sentences[24])
# print('Token IDs:', input_ids[24])
# print('Masks:', attention_masks[24])


100%|█████████████████████████████████████| 2500/2500 [00:01<00:00, 1375.30it/s]


In [None]:
# Shallow backups of the three parallel lists, taken before they are truncated.
i11 = input_ids.copy()
i12 = attention_masks.copy()
i13= labels.copy()

In [None]:
# Keep the first 2,000 examples of each parallel list.
# NOTE(review): presumably the remaining examples are meant as a held-out
# split -- nothing visible here uses them; confirm.
input_ids = input_ids[:2000]
attention_masks = attention_masks[:2000]
labels = labels[:2000]

In [25]:
def to_bool_vec(y_id):
    """Multi-hot encode a collection of label ids.

    Args:
        y_id: iterable of integer label ids, each in ``[0, 2 * NUM_TAGS]``.

    Returns:
        An int32 NumPy vector of length ``2 * NUM_TAGS + 1`` holding 1 at
        every id contained in `y_id` and 0 elsewhere.
    """
    y_bool = np.zeros(2*NUM_TAGS+1, np.int32)
    for label_id in y_id:
        y_bool[label_id] = 1
    return y_bool


In [26]:
# Align the word-level labels with the sub-word tokens of every sentence.
# `new_labels` receives one list of per-token multi-hot vectors for each
# sentence that could be aligned.
new_labels = []

# Indices of sentences that ran out of labels and are therefore dropped.
to_remove_idx = []
# All-zero vector given to tokens that carry no label (padding, special
# tokens, '##' continuation pieces).
null_label_id =  np.zeros(2*NUM_TAGS+1, np.int32) #-100
idx = 0
for (tokens, masks, orig_labels) in zip(input_ids, attention_masks, labels):

    padded_labels = []
    # Set to a non-zero value when the sentence has more word-initial tokens than labels.
    ty = 0
    # Position of the next unused word-level label.
    orig_labels_i = 0
    # print(tokens, masks, orig_labels)

    for token_id,mask_id in zip(tokens,masks):
      token_id = token_id.numpy().item()

      if mask_id.numpy().item() == 0:
        # Masked-out position (padding).
        padded_labels.append(null_label_id)


      elif (token_id == tokenizer.pad_token_id) or \
          (token_id == tokenizer.cls_token_id) or \
          (token_id == tokenizer.sep_token_id):

          # Special tokens never carry a label.
          padded_labels.append(null_label_id)

      elif tokenizer.ids_to_tokens[token_id][0:2] == '##':

          # Continuation piece of a word: only the first piece gets the label.
          padded_labels.append(null_label_id)

      else:
        # print(tokenizer.ids_to_tokens[token_id], orig_labels_i, len(orig_labels))
        if orig_labels_i >= len(orig_labels):
          # More word-initial tokens than labels: give up on this sentence.
          ty+=1
          break
        else:

          padded_labels.append(to_bool_vec(orig_labels[orig_labels_i]))
        orig_labels_i += 1

    # assert(len(sen) == len(padded_labels))
    if ty == 0:
      new_labels.append(padded_labels)
    else:
      to_remove_idx.append(idx)
    idx+=1

In [None]:
# NOTE: `ty` is reset for every sentence in the alignment loop, so this shows
# the flag of the last sentence only; len(to_remove_idx) is the number of
# dropped sentences.
print(ty)

In [None]:
# Drop the tensors of sentences whose labels could not be aligned.
# A set gives O(1) membership tests instead of scanning the list for every index.
_dropped_idx = set(to_remove_idx)
filtered_attention_masks = [mask for i, mask in enumerate(attention_masks) if i not in _dropped_idx]
filtered_input_ids = [ids for i, ids in enumerate(input_ids) if i not in _dropped_idx]


In [None]:
# (encoded sentences, sentences dropped during label alignment)
len(input_ids), len(to_remove_idx)

In [None]:
# Stack the per-sentence tensors into single batch tensors.
pt_input_ids = torch.stack(filtered_input_ids, dim=0)

pt_attention_masks = torch.stack(filtered_attention_masks, dim=0)

# Merge the nested list of NumPy vectors into one ndarray first: building a
# tensor straight from a list of ndarrays is very slow.
pt_labels = torch.tensor(np.array(new_labels), dtype=torch.float32)


In [None]:
# for x in pt_labels:
#   for y in x:
#     if torch.sum(y) == 0:
#       print(y)  
#       break

In [None]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset; each item is the triple
# (input ids, attention mask, multi-hot labels) of one sentence.
train_dataset = TensorDataset(pt_input_ids, pt_attention_masks, pt_labels)

print('{:>5,} training samples'.format(len(train_dataset)))



In [None]:
# Spot-check one example end to end.
# NOTE: `new_labels` skips the sentences listed in `to_remove_idx`, so
# new_labels[2] matches sentences[2] only if none of the first three sentences
# was dropped.
print('\nSentence:    ', sentences[2])
print('\nLabels:      ', labels[2])
print('\nBERT Tokens: ', tokenizer.tokenize(' '.join(sentences[2])))
print('\nToken IDs:   ', input_ids[2])
print('\nNew Labels:  ', new_labels[2])
print('\nMask:        ', attention_masks[2])

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


# Number of sentences per optimisation step.
batch_size = 256

# Batches are drawn in random order from the training set.
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = batch_size )

# No validation loader is built here (kept for reference):
# validation_dataloader = DataLoader(val_dataset, sampler = SequentialSampler(val_dataset), batch_size = batch_size   )

In [None]:
from transformers import BertForTokenClassification, AdamW, BertConfig


# One output logit per label slot: len(TAGS)*2 + 1 equals the length of the
# multi-hot vectors produced by to_bool_vec.
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels = len(TAGS)*2 + 1, output_attentions = False, output_hidden_states = False)


# model.cuda()

In [None]:
# Move the model to the device selected at the top of the notebook. Unlike
# model.cuda(), this also works on a machine without a GPU.
model.to(device)

In [None]:
# Use PyTorch's AdamW; the optimizer class shipped with transformers is deprecated.
# weight_decay=0.0 keeps the default of the transformers implementation
# (torch.optim.AdamW would otherwise apply 0.01).
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 5e-5, # args.learning_rate
                  eps = 1e-8, # args.adam_epsilon
                  weight_decay = 0.0
                )

In [None]:
from transformers import get_linear_schedule_with_warmup

# How many passes over the training set.
epochs = 4

# One scheduler step per batch, for every epoch.
total_steps = epochs * len(train_dataloader)

# Linear decay of the learning rate over all steps, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)


In [None]:
from torch import nn
from scipy.special import expit


# Sigmoid activation and binary cross-entropy on probabilities: together they
# score each label slot independently (multi-label setting).
m = nn.Sigmoid()
criterion = nn.BCELoss()

def compute_loss(pred,target):
    """Binary cross-entropy over the labelled token positions only.

    Args:
        pred: raw logits, indexed as (sentence, token, label slot).
        target: multi-hot float labels of the same shape; a position whose
            label vector is all zeros is treated as unlabelled and ignored.

    Returns:
        Scalar tensor with the mean loss over all label slots of the
        labelled positions. When no position is labelled the result is a
        zero that is still attached to the graph of `pred`, instead of NaN.
    """
    labelled = torch.sum(target, dim=2) > 0
    flat_pred = pred[labelled]
    flat_target = target[labelled]
    if flat_target.numel() == 0:
        return flat_pred.sum() * 0.0
    # Working on logits directly is numerically more stable than applying a
    # sigmoid first and feeding the probabilities to BCELoss.
    return torch.nn.functional.binary_cross_entropy_with_logits(flat_pred, flat_target)




In [None]:
# Quick check of scipy's expit (the logistic sigmoid) on a small array.
z = np.array([ 0, 0, 0.1])
g = expit(z)
g

In [None]:
import random

seed_val = 42

# Seed the Python, NumPy and PyTorch random number generators.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of every epoch, in order (plotted afterwards).
loss_values = []

for epoch_i in range(0, epochs):

    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Sum of the batch losses of this epoch.
    total_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 40 == 0 and not step == 0:

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear the gradients left over from the previous step.
        model.zero_grad()

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        loss = compute_loss(outputs.logits, b_labels)
        # No loss.requires_grad_() here: the loss already tracks gradients,
        # and forcing the flag would hide a detached loss (backward would
        # then succeed without updating any parameter).
        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 to avoid exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Advance the learning-rate schedule once per batch.
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print("  Average training loss: {0:.2f}".format(avg_train_loss))

In [None]:
import joblib
# Serialise the whole model object with joblib.
# NOTE(review): the '.h5' suffix is only a file name here; joblib does not
# write HDF5. Consider model.save_pretrained(...) for a portable checkpoint.
filename = 'model1.h5'
joblib.dump(model, filename)
 

In [None]:
# Number of batches per epoch.
len(train_dataloader)

In [None]:
import matplotlib.pyplot as plt
# % matplotlib inline

import seaborn as sns

# Seaborn styling: dark grid with enlarged fonts.
sns.set(style='darkgrid')
sns.set(font_scale=1.5)

# Default size for the figure created below.
plt.rcParams["figure.figsize"] = (12,6)

# Learning curve: average training loss of each epoch.
fig, ax = plt.subplots()
ax.plot(loss_values, 'b-o')
ax.set_title("Training loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")

plt.show()

In [None]:
def get_test_vecs(path='data/test.json', max_length=105):
    """Load the test split and turn it into model-ready tensors.

    The steps mirror the training pipeline: drop sentences containing an
    empty token, clean the tokens, encode the labels, tokenize to a fixed
    length and align the word-level labels with the sub-word tokens.
    Sentences with more word-initial tokens than labels are dropped.

    Args:
        path: JSON file holding a list of records with "sent" and "tags".
        max_length: fixed token length every sentence is padded/truncated to.

    Returns:
        Tuple ``(pt_input_ids, pt_attention_masks, pt_labels)``. The first two
        are the stacked per-sentence tokenizer tensors; the third is a float32
        tensor of per-token multi-hot label vectors. All three cover the
        same sentences in the same order.
    """
    # Imported locally so the function does not depend on another cell
    # having imported it first.
    from warnings import simplefilter

    with open(path) as f:
        test_data = json.load(f)

    # Drop every sentence that contains an empty token.
    test_data = [d for d in test_data if all(len(t) >= 1 for t in d["sent"])]

    df = pd.DataFrame(test_data)
    df["sent"] = df["sent"].map(clean_text)
    df["labels"] = df["tags"].map(label2id)
    sentences = list(df["sent"])
    labels = list(df["labels"])

    simplefilter(action='ignore', category=FutureWarning)

    input_ids = []
    attention_masks = []
    for sent in tqdm(sentences):
        encoded_dict = tokenizer.encode_plus(
            ' '.join(sent),
            add_special_tokens=False,
            truncation=True,
            max_length=max_length,
            # Replaces the deprecated pad_to_max_length=True.
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        # The tensors come back with a leading batch axis; keep the single row.
        input_ids.append(encoded_dict['input_ids'][0])
        attention_masks.append(encoded_dict['attention_mask'][0])

    # All-zero vector for tokens that carry no label (padding, special
    # tokens, '##' continuation pieces).
    null_label_id = np.zeros(2*NUM_TAGS+1, np.int32)
    special_ids = (tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id)

    new_labels = []
    kept_idx = []
    for idx, (tokens, masks, orig_labels) in enumerate(zip(input_ids, attention_masks, labels)):
        padded_labels = []
        orig_labels_i = 0  # position of the next unused word-level label
        aligned = True

        for token_id, mask_id in zip(tokens.tolist(), masks.tolist()):
            if (mask_id == 0
                    or token_id in special_ids
                    or tokenizer.ids_to_tokens[token_id][0:2] == '##'):
                padded_labels.append(null_label_id)
            elif orig_labels_i >= len(orig_labels):
                # More word-initial tokens than labels: drop this sentence.
                aligned = False
                break
            else:
                padded_labels.append(to_bool_vec(orig_labels[orig_labels_i]))
                orig_labels_i += 1

        if aligned:
            new_labels.append(padded_labels)
            kept_idx.append(idx)

    # Keep only the inputs of the sentences that were aligned.
    pt_input_ids = torch.stack([input_ids[i] for i in kept_idx], dim=0)
    pt_attention_masks = torch.stack([attention_masks[i] for i in kept_idx], dim=0)

    # One ndarray first: torch.tensor on a list of ndarrays is very slow.
    pt_labels = torch.tensor(np.array(new_labels), dtype=torch.float32)
    return pt_input_ids, pt_attention_masks, pt_labels

In [None]:
# Sizes of the three tensors currently bound to these names.
pt_input_ids.size(), pt_attention_masks.size(), pt_labels.size()

In [None]:
from torch.utils.data import  SequentialSampler

# NOTE: this rebinds pt_input_ids / pt_attention_masks / pt_labels, which
# previously held the training tensors, to the test tensors.
pt_input_ids, pt_attention_masks, pt_labels = get_test_vecs()
batch_size = 256
prediction_data = TensorDataset(pt_input_ids, pt_attention_masks, pt_labels)
# Sequential order keeps the predictions aligned with the test sentences.
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


In [None]:
print('Predicting labels for {:,} test sentences...'.format(len(pt_input_ids)))

# Put model in evaluation mode
model.eval()

# Per-batch logits and gold labels, collected as NumPy arrays.
predictions , true_labels = [], []

# Predict
for batch in prediction_dataloader:
  # Move every tensor of the batch to the selected device
  batch = tuple(t.to(device) for t in batch)

  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch

  # Tell the model not to compute or store gradients; this saves memory
  # during the forward pass.

  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

  # The first element of the model output holds the logits.
  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')

In [None]:
# Logits of the first prediction batch.
predictions[0]

In [None]:
from sklearn.metrics import f1_score

# First, combine the results across the batches.
all_predictions = np.concatenate(predictions, axis=0)
all_true_labels = np.concatenate(true_labels, axis=0)

print("After flattening the batches, the predictions have shape:")
print("    ", all_predictions.shape)

# Eliminate axis 0 (the sentences): every row is now one token position and
# every column one label slot. reshape also copes with zero rows, where
# concatenating the rows one by one would raise.
predicted_label_ids = all_predictions.reshape(-1, all_predictions.shape[-1])
all_true_labels = all_true_labels.reshape(-1, all_true_labels.shape[-1])

# Keep only the positions that carry at least one gold label; padding,
# special tokens and '##' pieces have an all-zero label vector.
sum_mat = np.sum(all_true_labels, axis=1)
labelled = sum_mat > 0.1
predicted_label_ids = predicted_label_ids[labelled]
all_true_labels = all_true_labels[labelled]

print("\nAfter flattening the sentences, we have predictions:")
print("    ", predicted_label_ids.shape)
print("and ground truth:")
print("    ", all_true_labels.shape)

# Flatten the label axis too and turn the logits into 0/1 decisions:
# sigmoid(x) > 0.5 is equivalent to x > 0. Classification metrics need
# discrete predictions and reject raw logits.
predicted_label_ids = (predicted_label_ids.reshape(-1) > 0).astype(np.int64)
all_true_labels = all_true_labels.reshape(-1).astype(np.int64)


print("\nAfter flattening the sentences, we have predictions:")
print("    ", predicted_label_ids.shape)
print("and ground truth:")
print("    ", all_true_labels.shape)


In [None]:
# all_true_labels[:1000]

In [None]:
# Inspect the flattened ground-truth vector.
all_true_labels

In [None]:
# Inspect the flattened prediction vector.
predicted_label_ids

In [None]:
from sklearn.metrics import f1_score

# Score the positive class of the flattened label slots. On a flat 0/1 vector
# this is the micro-averaged F1 over all label slots, whereas average='micro'
# would count the 0 class as well and simply report accuracy.
# Thresholding at 0 (sigmoid(x) > 0.5) turns logits into decisions and leaves
# values that are already 0/1 unchanged.
f1 = f1_score(np.asarray(all_true_labels).astype(int),
              (np.asarray(predicted_label_ids) > 0).astype(int),
              average='binary')

print ("F1 score: {:.2%}".format(f1))

In [None]:
# import transformers
# from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification
# import torch
# from transformers.modeling_outputs import SequenceClassifierOutput
# from transformers import BertConfig, BertModel
# class seq2SeqBERT(torch.nn.Module):
# 	def __init__(self):
# 		super(seq2SeqBERT, self).__init__()
# 		configuration = BertConfig()
# 		self.bert = BertModel(configuration)
# 		self.classifier = torch.nn.Linear(768, 5)
# 		self.criterion = torch.nn.BCEWithLogitsLoss()
# 	def forward(self, input_ids, attention_mask, labels = None):
# 		embeddings = self.bert(input_ids = input_ids, attention_mask = attention_mask)
# 		logits = self.classifier(embeddings['last_hidden_state'])
# 		loss_ = None
# 		flat_outputs = logits[labels!=-100]
# 		flat_labels  = labels[ labels!=-100]
# 		if labels is not None:
# 			loss_ = self.criterion(flat_outputs, flat_labels)
# 		return SequenceClassifierOutput(loss = loss_, logits = logits, attentions=embeddings