<a href="https://colab.research.google.com/github/abhi0618/BERT-CONLL03/blob/main/Conll_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers
!pip install -q simpletransformers
!pip install -q nervaluate

[K     |████████████████████████████████| 5.3 MB 15.2 MB/s 
[K     |████████████████████████████████| 163 kB 68.0 MB/s 
[K     |████████████████████████████████| 7.6 MB 56.4 MB/s 
[K     |████████████████████████████████| 250 kB 27.5 MB/s 
[K     |████████████████████████████████| 9.2 MB 43.0 MB/s 
[K     |████████████████████████████████| 43 kB 53 kB/s 
[K     |████████████████████████████████| 441 kB 67.2 MB/s 
[K     |████████████████████████████████| 1.9 MB 53.3 MB/s 
[K     |████████████████████████████████| 1.3 MB 68.9 MB/s 
[K     |████████████████████████████████| 182 kB 60.1 MB/s 
[K     |████████████████████████████████| 162 kB 60.6 MB/s 
[K     |████████████████████████████████| 63 kB 1.9 MB/s 
[K     |████████████████████████████████| 162 kB 45.2 MB/s 
[K     |████████████████████████████████| 158 kB 74.1 MB/s 
[K     |████████████████████████████████| 157 kB 79.4 MB/s 
[K     |████████████████████████████████| 157 kB 74.9 MB/s 
[K     |████████████████████

In [2]:
import urllib.request
from pathlib import Path
from simpletransformers.ner import NERModel
from transformers import AutoTokenizer
import pandas as pd
import logging
import numpy as np
from sklearn.model_selection import GroupShuffleSplit 
from nervaluate import Evaluator
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

def download_file(url, output_file):
  Path(output_file).parent.mkdir(parents=True, exist_ok=True)
  urllib.request.urlretrieve (url, output_file)

def read_conll(filename):
    df = pd.read_csv(filename,
                    sep = ' ', header = None, keep_default_na = False,
                    names = ['words', 'pos', 'chunk', 'labels'],
                    quoting = 3, skip_blank_lines = False)
    df = df[~df['words'].astype(str).str.startswith('-DOCSTART-')] 
    df['sentence_id'] = (df.words == '').cumsum()
    return df[df.words != '']

download_file('https://raw.githubusercontent.com/bhuvanakundumani/BERT-NER-TF2/master/data/train.txt', '/content/data/train.txt')
download_file('https://raw.githubusercontent.com/bhuvanakundumani/BERT-NER-TF2/master/data/test.txt', '/content/data/test.txt')
download_file('https://raw.githubusercontent.com/bhuvanakundumani/BERT-NER-TF2/master/data/valid.txt', '/content/data/valid.txt')

In [3]:
train_df = read_conll('/content/data/train.txt')
test_df = read_conll('/content/data/test.txt')
val_df = read_conll('/content/data/valid.txt')
train_df.head()

Unnamed: 0,words,pos,chunk,labels,sentence_id
2,EU,NNP,B-NP,B-ORG,1
3,rejects,VBZ,B-VP,O,1
4,German,JJ,B-NP,B-MISC,1
5,call,NN,I-NP,O,1
6,to,TO,B-VP,O,1


In [4]:
print("Total Number of Sentences in Train Set: ",len(set(train_df["sentence_id"].values)))
print("Total Number of Sentences in Test Set: ",len(set(test_df["sentence_id"].values)))

class config:
    TEST_SIZE = 0.20
    RANDOM_STATE = 12
    MAX_LEN = 256
    EPOCHS = 5
    BATCH_SIZE= 32

custom_labels = list(train_df['labels'].unique())
train_args = {
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'sliding_window': True,
    'max_seq_length': config.MAX_LEN,
    'num_train_epochs': config.EPOCHS,
    'train_batch_size': config.BATCH_SIZE,
    'fp16': True,
    'output_dir': '/outputs/',
    'best_model_dir': '/outputs/best_model/',
    'evaluate_during_training': True,
}

logging.basicConfig(level=logging.DEBUG)
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.WARNING)
model = NERModel( "bert", "bert-base-cased", labels=custom_labels, args=train_args)
model.train_model(train_df, eval_data= val_df)
result, model_outputs, preds_list = model.eval_model(test_df)

print(result)

Total Number of Sentences in Train Set:  14041
Total Number of Sentences in Test Set:  3453


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/439 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/407 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/439 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/407 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/439 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/407 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/439 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/407 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/439 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/407 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/407 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/432 [00:00<?, ?it/s]

{'eval_loss': 0.13353414574109795, 'precision': 0.9033379694019471, 'recall': 0.919971671388102, 'f1_score': 0.911578947368421}


In [9]:
sentence = "South Africa managed to avoid a fifth successive defeat in 1996 at the hands of the All Blacks with an emphatic 32-22 victory in front of an ecstatic Ellis Park crowd on Saturday .  They scored three tries in recording their highest total against New Zealand , salvaging some pride in a season in which the world champions have lost five out of eight tests .  It also ended a run of nine successive victories this year for New Zealand but arrived too late to prevent a 44563 series defeat and an historic first All Black series triumph on South African soil .  Springbok scrum-half Joost van der Westhuizen was his side 's inspiration , scoring their opening try and making the third for flanker Andre Venter from a quickly taken penalty to give his side a 44802 lead after 54 minutes .  Fullback Andre Joubert scored the other , scorching in from 40 metres at the start of the second half to add to his three long-range penalties ."
samples = [sentence]
predictions, _ = model.predict(samples)
print(sentence)
for idx, sample in enumerate(samples):
  print('{}: '.format(idx))
  for word in predictions[idx]:
    print('{}'.format(word))

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

South Africa managed to avoid a fifth successive defeat in 1996 at the hands of the All Blacks with an emphatic 32-22 victory in front of an ecstatic Ellis Park crowd on Saturday .  They scored three tries in recording their highest total against New Zealand , salvaging some pride in a season in which the world champions have lost five out of eight tests .  It also ended a run of nine successive victories this year for New Zealand but arrived too late to prevent a 44563 series defeat and an historic first All Black series triumph on South African soil .  Springbok scrum-half Joost van der Westhuizen was his side 's inspiration , scoring their opening try and making the third for flanker Andre Venter from a quickly taken penalty to give his side a 44802 lead after 54 minutes .  Fullback Andre Joubert scored the other , scorching in from 40 metres at the start of the second half to add to his three long-range penalties .
0: 
{'South': 'B-LOC'}
{'Africa': 'I-LOC'}
{'managed': 'O'}
{'to': 

In [10]:
preds = []
for p in preds_list:
    preds.extend(p)
preds = np.array(preds)
labels =test_df["labels"].values
 
assert len(preds) == len(labels)

In [7]:
from sklearn.metrics import classification_report
print(classification_report(preds, labels))

              precision    recall  f1-score   support

       B-LOC       0.94      0.94      0.94      1667
      B-MISC       0.84      0.83      0.84       710
       B-ORG       0.93      0.89      0.91      1727
       B-PER       0.95      0.97      0.96      1594
       I-LOC       0.91      0.89      0.90       263
      I-MISC       0.76      0.68      0.72       244
       I-ORG       0.94      0.89      0.91       877
       I-PER       0.99      0.99      0.99      1161
           O       0.99      1.00      1.00     38192

    accuracy                           0.98     46435
   macro avg       0.92      0.90      0.91     46435
weighted avg       0.98      0.98      0.98     46435



In [11]:
import spacy 
preds = predictions[0]
tag_words = [list(preds[i].keys())[0] for i in range(len(preds)) if list(preds[i].values())[0] == "B-THE" ]
start_pos = []
end_pos = []
for word in tag_words:
    start_pos.append(sentence.find(word))
    end_pos.append(sentence.find(word) + len(word))
    
ents = []
for i in range(len(start_pos)):
    ents.append({
        'start': int(start_pos[i]), 
        'end' : int(end_pos[i]),
        "label" : "Annotation"
    })
    
doc = {
    'text' : sentence,
    "ents" : ents
}
colors = {"Annotation" :"linear-gradient(90deg, #aa9cfc, #fc9ce7)" } 
options = {"colors": colors}
spacy.displacy.render(doc, style="ent", options = options , manual=True, jupyter=True);