In [None]:
! pip install transformers
! pip install datasets
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import f1_score


In [2]:
from transformers import AutoTokenizer, AutoModel
from collections import defaultdict
from transformers import get_linear_schedule_with_warmup
import time

In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

from sklearn.metrics import f1_score
import datasets
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
from transformers import BertModel, BertTokenizer

In [4]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# project folder is reachable; the %cd magic right after this switches the
# working directory into it.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/finer-main/

Mounted at /content/drive
/content/drive/MyDrive/finer-main


In [62]:


# Training split: loaded from a dataset directory on disk (path is relative
# to the current working directory).
train = datasets.load_from_disk('finer_train')

# Test split: requested by name through `datasets.load_dataset`.
test = datasets.load_dataset(
    "nlpaueb/finer-139",
    split='test',
)

# Validation split: loaded from disk like the training split.
# Alternative source, kept for reference:
#   datasets.load_dataset("nlpaueb/finer-139", split='validation')
validation = datasets.load_from_disk('finer_validation')

Downloading builder script:   0%|          | 0.00/19.1k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/15.9k [00:00<?, ?B/s]

Downloading and preparing dataset finer-139/finer-139 (download: 98.42 MiB, generated: 824.09 MiB, post-processed: Unknown size, total: 922.51 MiB) to /root/.cache/huggingface/datasets/nlpaueb___finer-139/finer-139/1.0.0/5f5a8eb2a38e8b142bb8ca63f3f9600634cc6c8963e4c982926cf2b48e4e55ff...


Downloading data:   0%|          | 0.00/103M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/900384 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/112494 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/108378 [00:00<?, ? examples/s]

Dataset finer-139 downloaded and prepared to /root/.cache/huggingface/datasets/nlpaueb___finer-139/finer-139/1.0.0/5f5a8eb2a38e8b142bb8ca63f3f9600634cc6c8963e4c982926cf2b48e4e55ff. Subsequent calls will reuse this data.


In [61]:
import re
import random

def random_mask_number(probability, tokens_list):
    """Randomly replace numeric tokens with the literal ``'[MASK]'`` token.

    A token counts as a number when the whole token matches the regular
    expression used below (digits optionally followed by digits, commas or
    dots, or a comma/dot followed by digits). Non-numeric tokens are copied
    through unchanged.

    Args:
        probability: integer masking chance in percent; the draw is made from
            a pool holding ``probability`` ones and ``100 - probability``
            zeros.
        tokens_list: iterable of string tokens.

    Returns:
        A new list with the same number of tokens as ``tokens_list``.
    """
    # Pool to draw from: 1 means "mask this number", 0 means "keep it".
    pool = [1] * probability + [0] * (100 - probability)

    result = []
    for tok in tokens_list:
        looks_numeric = re.fullmatch(r"(\d+[\d,.]*)|([,.]\d+)", tok)
        if not looks_numeric:
            result.append(tok)
            continue
        # Only numeric tokens consume a random draw.
        draw = random.choice(pool)
        if draw == 1:
            result.append('[MASK]')
        elif draw == 0:
            result.append(tok)
    return result

In [63]:
def data_preprocess(data,max_len):
  """Truncate each example's tokens and NER tags to at most ``max_len`` items.

  This function only truncates; it does not pad. Shorter examples are
  returned at their natural length.

  Args:
    data: collection supporting ``len()`` and integer indexing whose items
      are mappings with ``'tokens'`` and ``'ner_tags'`` sequences.
    max_len: maximum number of items kept per example.

  Returns:
    ``(tokens, ner_tags)``: two lists with one entry per example, in the
    same order as ``data``. Every entry is a fresh slice, so later edits to
    it do not touch the source rows.
  """
  tokens = []
  ner_tags = []
  for i in range(len(data)):
    # Look the row up once; each lookup may be costly for dataset objects.
    example = data[i]
    # Slicing already copes with sequences shorter than max_len.
    tokens.append(example['tokens'][:max_len])
    ner_tags.append(example['ner_tags'][:max_len])
  return tokens,ner_tags

In [79]:
class AutoNerClassifier(nn.Module):
  """Token classifier: pretrained encoder -> dropout -> linear projection.

  The encoder is whatever ``AutoModel.from_pretrained`` returns for the given
  checkpoint name; the linear head maps ``hidden_size`` features to
  ``n_classes`` scores.
  """

  def __init__(self, n_classes,PRE_TRAINED_MODEL_NAME):
    """Build the encoder and the classification head.

    Args:
      n_classes: size of the output dimension of the linear head.
      PRE_TRAINED_MODEL_NAME: checkpoint identifier handed to
        ``AutoModel.from_pretrained``.
    """
    super().__init__()
    encoder = AutoModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.bert = encoder
    self.drop = nn.Dropout(p=0.1)
    self.out = nn.Linear(encoder.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Run the encoder and score every position.

    Returns the linear head applied to element 0 of the encoder output
    (presumably the per-token hidden states; the training code in this
    notebook treats the result as ``(batch, seq_len, n_classes)``).
    """
    encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    hidden = encoded[0]
    return self.out(self.drop(hidden))

In [None]:
class BertNerClassifier(nn.Module):
  """Token classifier on top of the ``'bert-base-uncased'`` BertModel.

  Layout: BertModel -> dropout (p=0.1) -> linear layer producing
  ``n_classes`` scores per position.
  """

  def __init__(self, n_classes):
    """Load the encoder and size the linear head from its config.

    Args:
      n_classes: size of the output dimension of the linear head.
    """
    super().__init__()
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    self.drop = nn.Dropout(p=0.1)
    hidden_size = self.bert.config.hidden_size
    self.out = nn.Linear(hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Encode the batch, apply dropout to element 0 of the encoder output
    and return the linear head's scores for it."""
    encoder_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
    )
    dropped = self.drop(encoder_output[0])
    scores = self.out(dropped)
    return scores

In [74]:
def produce_input_data(tokens,ner_tags,tokenizer,mask_number,probability,seq_len=202):
  """Turn word-level tokens and tags into fixed-length model inputs.

  Each token list is optionally passed through ``random_mask_number``,
  wrapped in ``'[CLS]'`` / ``'[SEP]'``, converted with
  ``tokenizer.convert_tokens_to_ids`` and then truncated or right-padded to
  ``seq_len``. Tag lists get a ``-1`` at both ends (matching the two special
  tokens) and are padded with ``-1``.

  Args:
    tokens: list of token lists (one per example).
    ner_tags: list of integer tag lists (one per example).
    tokenizer: object exposing ``convert_tokens_to_ids(list_of_tokens)``.
    mask_number: when truthy, numeric tokens are randomly replaced by
      ``'[MASK]'`` before conversion.
    probability: masking chance in percent, forwarded to
      ``random_mask_number``.
    seq_len: final length of every row, special tokens included. The
      default of 202 is the previously hard-coded value.

  Returns:
    dict with ``'input_ids'``, ``'attention_mask'`` and ``'targets'``, each
    a ``torch.long`` tensor of shape ``(n_examples, seq_len)``.
  """
  # Pre-allocate the padded arrays. Input ids are padded with 0, which
  # presumably is the tokenizer's [PAD] id (TODO confirm for new vocabularies).
  input_ids = np.zeros((len(tokens), seq_len), dtype=np.int64)
  attention_mask = np.zeros((len(tokens), seq_len), dtype=np.int64)
  for row, words in enumerate(tokens):
    if mask_number:
      words = random_mask_number(probability, words)
    text = ['[CLS]'] + list(words) + ['[SEP]']
    # Post-truncate, as before: keep the first seq_len positions.
    ids = tokenizer.convert_tokens_to_ids(text)[:seq_len]
    input_ids[row, :len(ids)] = ids
    # Integer mask built directly, rather than via a string array of '1'.
    attention_mask[row, :len(ids)] = 1

  # -1 marks every position without a real tag: [CLS], [SEP] and padding.
  targets = np.full((len(ner_tags), seq_len), -1, dtype=np.int64)
  for row, tags in enumerate(ner_tags):
    framed = ([-1] + list(tags) + [-1])[:seq_len]
    targets[row, :len(framed)] = framed

  return{
      'input_ids': torch.from_numpy(input_ids),
      'attention_mask': torch.from_numpy(attention_mask),
      'targets': torch.from_numpy(targets)
    }

In [12]:
class DataSequence(Dataset):
    """Map-style dataset over three parallel, index-aligned containers.

    ``data`` is read through the keys ``'input_ids'``, ``'attention_mask'``
    and ``'targets'``; item ``i`` is the triple of their ``i``-th entries.
    """

    def __init__(self, data):
        # Keep references only; nothing is copied or converted here.
        self.features, self.mask, self.labels = (
            data['input_ids'],
            data['attention_mask'],
            data['targets'],
        )

    def __len__(self):
        # The number of examples is taken from the label container.
        return len(self.labels)

    def __getitem__(self, index):
        # Order matters: consumers unpack (inputs, mask, labels) by position.
        sample = (self.features[index], self.mask[index], self.labels[index])
        return sample

In [75]:
def get_all_data(data,tokenizer,mask_number,max_len,probability,batch_size=4,shuffle=False,num_workers=10):
  """Build a ``DataLoader`` for one dataset split.

  Pipeline: ``data_preprocess`` (truncate tokens and tags to ``max_len``)
  -> ``produce_input_data`` (ids, attention mask and targets as tensors)
  -> ``DataSequence`` -> ``DataLoader``.

  Args:
    data: split to convert; see ``data_preprocess`` for the expected layout.
    tokenizer: tokenizer forwarded to ``produce_input_data``.
    mask_number: whether numeric tokens are randomly masked.
    max_len: per-example cap on the number of word tokens.
    probability: masking chance in percent (used when ``mask_number`` is on).
    batch_size: examples per batch; defaults to the previously hard-coded 4.
    shuffle: forwarded to ``DataLoader``. Defaults to ``False`` (the previous
      behaviour); pass ``True`` for a training split to reshuffle each epoch.
    num_workers: loader worker processes; defaults to the previously
      hard-coded 10.

  Returns:
    A ``torch.utils.data.DataLoader`` yielding
    ``(input_ids, attention_mask, targets)`` batches.
  """
  # Distinct names per stage instead of rebinding `data` to three types.
  tokens,ner_tags = data_preprocess(data,max_len)
  encoded = produce_input_data(tokens,ner_tags,tokenizer,mask_number,probability)
  dataset = DataSequence(encoded)
  dataloader = DataLoader(
    dataset,
    num_workers=num_workers,
    batch_size=batch_size,
    shuffle=shuffle,
  )

  return dataloader

In [16]:
# One tokenizer per candidate checkpoint; each is created with
# do_lower_case=True.
sec_tokenizer = AutoTokenizer.from_pretrained(
    "nlpaueb/sec-bert-base",
    do_lower_case=True,
)
finbert_tokenizer = BertTokenizer.from_pretrained(
    'yiyanghkust/finbert-tone',
    do_lower_case=True,
)
bert_tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

Downloading:   0%|          | 0.00/263 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/568 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/216k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/221k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/533 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [78]:
# Pre-processing switches shared by all three splits.
mask_number = False  # when True, numeric tokens are randomly swapped for [MASK]
max_len = 200        # cap on word tokens per example (before [CLS]/[SEP] are added)
probability = 50     # masking chance in percent; only consulted when mask_number is True

# Build the train / validation / test dataloaders with identical settings.
_loader_settings = (sec_tokenizer, mask_number, max_len, probability)
train_dataloader = get_all_data(train, *_loader_settings)
validation_dataloader = get_all_data(validation, *_loader_settings)
test_dataloader = get_all_data(test, *_loader_settings)

In [56]:
def _entity_labels_and_predictions(logits, targets):
  """Return ``(gold, predicted)`` label lists for positions whose gold label
  is neither 0 nor -1; predictions are the argmax over the last axis."""
  flat_targets = targets.reshape(-1)
  flat_predictions = logits.argmax(dim=-1).reshape(-1)
  keep = (flat_targets != 0) & (flat_targets != -1)
  return flat_targets[keep].tolist(), flat_predictions[keep].tolist()


def train_and_eval(model, 
  train_loader, 
  validation_loader,
  loss_fn, 
  optimizer,  
  device,
  epoch):
  """Train ``model`` for ``epoch`` epochs, validating after each one.

  For every epoch this prints the mean training and validation loss per
  batch plus a micro-averaged F1 score. The F1 score is computed only over
  positions whose gold label is neither 0 nor -1, so predictions made on
  label-0 positions do not influence it.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` and
      returning per-position class scores with classes on the last axis.
    train_loader: iterable of ``(input_ids, attention_mask, targets)`` batches.
    validation_loader: iterable of batches with the same layout.
    loss_fn: callable taking ``(scores[N, C], targets[N])``.
    optimizer: optimizer updating ``model``'s parameters.
    device: device the batches are moved to.
    epoch: number of epochs to run.

  Returns:
    None; metrics are printed.
  """
  for epo in range(epoch):
    # The validation pass below switches to eval(), so training mode must be
    # restored at the start of every epoch, not just once before the loop.
    model.train()

    total_loss_train = 0
    total_loss_validation = 0

    train_predict_list = []
    train_label_list = []

    validation_predict_list = []
    validation_label_list = []

    for data in train_loader:
      input_ids = data[0].to(device)
      attention_mask = data[1].to(device)
      targets = data[2].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )

      # Take the class count from the scores rather than hard-coding it.
      loss = loss_fn(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))

      labels, preds = _entity_labels_and_predictions(outputs.detach(), targets)
      train_label_list.extend(labels)
      train_predict_list.extend(preds)

      total_loss_train += loss.item()
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    model.eval()

    with torch.no_grad():
      for data in validation_loader:
        input_ids = data[0].to(device)
        attention_mask = data[1].to(device)
        targets = data[2].to(device)

        outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask
        )

        loss = loss_fn(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))
        total_loss_validation += loss.item()

        labels, preds = _entity_labels_and_predictions(outputs, targets)
        validation_label_list.extend(labels)
        validation_predict_list.extend(preds)

    train_f1 = f1_score(train_label_list,train_predict_list,average='micro')
    val_f1 = f1_score(validation_label_list,validation_predict_list,average='micro')
    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    val_loss = total_loss_validation / max(len(validation_loader), 1)
    train_loss = total_loss_train / max(len(train_loader), 1)
    print('Epoch [{}] Train Loss: {:.4f}, Train F1_SCORE: {:.4f}, Valid Loss: {:.4f}, Valid F1_SCORE: {:.4f},'
                      .format(epo+1,train_loss, train_f1, val_loss, val_f1))

In [80]:

# --- Run configuration ---
EPOCHS = 10
Batch_size = 4  # kept for reference; not passed to the dataloaders in the visible cells
PRE_TRAINED_MODEL_NAME = "nlpaueb/sec-bert-base"

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# 170 is the size of the classifier's output layer; presumably the number of
# tag ids in the dataset (TODO confirm against the dataset's label list).
model = AutoNerClassifier(170, PRE_TRAINED_MODEL_NAME)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# Positions whose target is -1 are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=-1)
loss_fn = loss_fn.to(device)

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Some weights of the model checkpoint at nlpaueb/sec-bert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Fine-tune on the training split and report validation metrics per epoch.
train_and_eval(
    model=model,
    train_loader=train_dataloader,
    validation_loader=validation_dataloader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    device=device,
    epoch=EPOCHS,
)

In [None]:
# Persist only the model's parameters (its state_dict) to a file in the
# current working directory.
torch.save(model.state_dict(), 'mask_number_state.bin')

In [None]:
# Absolute Google Drive location of the checkpoint file that the next cell
# loads back into `model`.
path = '/content/drive/MyDrive/finer-main/mask_number_state.bin'

In [None]:
# Restore the saved parameters. Map the checkpoint onto `device` (the device
# `model` was moved to) rather than a hard-coded "cuda:0", so the cell also
# works on a CPU-only runtime.
model.load_state_dict(torch.load(path, map_location=device))

In [81]:
def test_prediction(model, 
  test_loader, 
  loss_fn, 
  optimizer,
  device
  ):
  
  total_test_f1 = 0
  test_predict_list = []
  test_label_list = []
  

  model.eval()
  with torch.no_grad():
    for data in test_loader:
      input_ids = data[0].to(device)
      attention_mask = data[1].to(device)
      targets = data[2].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      prediction_result, prediction_position = torch.max(outputs, dim=2)
      targets = targets.flatten()

      clean_targets = np.take(targets.cpu(),np.where((targets.cpu()!=0)&(targets.cpu()!=-1))).squeeze(dim=0).tolist()
      pred = np.take(prediction_position.cpu(),np.where((targets.cpu()!=0)&(targets.cpu()!=-1))).squeeze(dim=0).tolist()
      
      
      
      
      for i in clean_targets:
        test_label_list.append(i)
      for j in pred:
        test_predict_list.append(j)

  test_f1 = f1_score(test_label_list,test_predict_list,average='micro')

  print('Test F1_SCORE: {:.4f}'.format(test_f1))

In [None]:
# Score the model on the held-out test split.
test_prediction(
    model=model,
    test_loader=test_dataloader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    device=device,
)