In [None]:



!pip install -q transformers
!pip install -q simpletransformers
!pip install -q nervaluate


from simpletransformers.ner import NERModel
from transformers import AutoTokenizer
import pandas as pd
import logging
import numpy as np
from sklearn.model_selection import GroupShuffleSplit 
from nervaluate import Evaluator
import warnings
warnings.filterwarnings('ignore')


class config:
    """Run-wide settings for data loading, splitting and training.

    Declared ahead of the statements that read it so a fresh-kernel,
    top-to-bottom run does not fail with a NameError.
    """
    INPUT_FILE = "/content/drive/MyDrive/FR/NER/final2.csv" # input file 
    TEST_SIZE = 0.20
    RANDOM_STATE = 12
    MAX_LEN = 512
    EPOCHS = 10
    BATCH_SIZE= 16


df = pd.read_csv(config.INPUT_FILE)
print("Total Number of Unique Sentence: ",len(set(df["sentence_id"].values)))
# Reassign instead of calling fillna(inplace=True) on a column selection:
# the in-place form acts on an intermediate object, which newer pandas
# versions warn about and may not write back to `df`.
df["labels"] = df["labels"].fillna("O")


# Split by sentence_id so every token of a sentence ends up on the same side.
splitter = GroupShuffleSplit(test_size=config.TEST_SIZE, n_splits=1, random_state = config.RANDOM_STATE)
train_inds, test_inds = next(splitter.split(df, groups=df['sentence_id']))

# `assign` works on a copy of the selected rows, so casting `words` to str
# never writes into a slice of `df` (no chained-assignment warning).
train_df = (
    df.iloc[train_inds]
    .assign(words=lambda part: part["words"].astype("str"))
    .reset_index(drop=True)
)
test_df = (
    df.iloc[test_inds]
    .assign(words=lambda part: part["words"].astype("str"))
    .reset_index(drop=True)
)

print("Total Number of Sentences in Train Set: ",len(set(train_df["sentence_id"].values)))
print("Total Number of Sentences in Test Set: ",len(set(test_df["sentence_id"].values)))

# Collect the tag set from the whole corpus, not only the train split, so a
# tag that happens to fall entirely into the test split is still known to the
# model. Sorting keeps the label -> id mapping stable between runs.
custom_labels = sorted(df['labels'].unique())
train_args = {
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'sliding_window': True,
    'max_seq_length': config.MAX_LEN,
    'num_train_epochs': config.EPOCHS,
    'train_batch_size': config.BATCH_SIZE,
    'fp16': True,
    'output_dir': '/outputs/',
    'best_model_dir': '/outputs/best_model/',
    'evaluate_during_training': True,
    # Seed training with the same value used for the data split so that
    # repeated runs are comparable.
    'manual_seed': config.RANDOM_STATE,
}

logging.basicConfig(level=logging.DEBUG)
# Keep the transformers library itself quiet; only warnings and above.
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.WARNING)
model = NERModel( "bert", "allenai/scibert_scivocab_uncased", labels=custom_labels, args=train_args)
# NOTE(review): test_df doubles as the in-training evaluation set here, so
# whatever is written to best_model_dir was presumably selected on the same
# data the final metrics are reported on — consider a separate validation split.
model.train_model(train_df, eval_data= test_df)
result, model_outputs, preds_list = model.eval_model(test_df)

print(result)

[K     |████████████████████████████████| 4.9 MB 30.0 MB/s 
[K     |████████████████████████████████| 163 kB 66.6 MB/s 
[K     |████████████████████████████████| 6.6 MB 47.3 MB/s 
[K     |████████████████████████████████| 250 kB 13.0 MB/s 
[K     |████████████████████████████████| 9.2 MB 66.4 MB/s 
[K     |████████████████████████████████| 1.3 MB 48.7 MB/s 
[K     |████████████████████████████████| 432 kB 67.2 MB/s 
[K     |████████████████████████████████| 43 kB 1.9 MB/s 
[K     |████████████████████████████████| 1.8 MB 58.5 MB/s 
[K     |████████████████████████████████| 181 kB 68.7 MB/s 
[K     |████████████████████████████████| 162 kB 67.3 MB/s 
[K     |████████████████████████████████| 63 kB 1.6 MB/s 
[K     |████████████████████████████████| 162 kB 74.7 MB/s 
[K     |████████████████████████████████| 158 kB 62.4 MB/s 
[K     |████████████████████████████████| 157 kB 79.0 MB/s 
[K     |████████████████████████████████| 157 kB 73.2 MB/s 
[K     |███████████████████

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

Downloading:   0%|          | 0.00/228k [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 0.33352196257677863, 'precision': 0.6041666666666666, 'recall': 0.6497641509433962, 'f1_score': 0.6261363636363636}


# SAMPLE INFERENCE

In [None]:
# Free-text abstract used as a single-sample input for the trained tagger.
sentence = "This paper examines the shareholder primacy norm (SPN) as a widely acknowledged impediment to corporate social responsibility and explores the role of business schools in promoting the SPN but also potentially as an avenue for change by addressing misconceptions about shareholder primacy and the purpose of business. We start by explaining the SPN and then review its status under US and UK laws and show that it is not a likely legal requirement, at least under the guise of shareholder value maximization. This is in contrast to the common assertion that managers are legally constrained from addressing CSR issues if doing so is inconsistent with the economic interests of shareholders. Nonetheless, while the SPN might be muted as a legal norm, we show that it is certainly evident as a social norm among managers and in business schools—reflective, in part, of the sole voting rights of shareholders on corporate boards and of the dominance of shareholder theory—and justifiably so in the view of many managers and business academics. We argue that this view is misguided, not least when associated with claims of a purported legally enforceable requirement to maximize shareholder value. We propose two ways by which the influence of the SPN among managers might be attenuated: extending fiduciary duties of executives to non-shareholder stakeholders and changes in business school teaching such that it covers a plurality of conceptions of the purpose of the corporation. ©️ 2014, Springer Science+Business Media Dordrecht."
samples = [sentence]
# Only the first element of the returned pair is used; the second is discarded.
predictions, _ = model.predict(samples)
print(sentence)
# Print the sample index, then each entry of that sample's prediction on its own line.
for sample_no in range(len(samples)):
    print('{}: '.format(sample_no))
    tagged_tokens = predictions[sample_no]
    for token_tag in tagged_tokens:
        print('{}'.format(token_tag))

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

This paper examines the shareholder primacy norm (SPN) as a widely acknowledged impediment to corporate social responsibility and explores the role of business schools in promoting the SPN but also potentially as an avenue for change by addressing misconceptions about shareholder primacy and the purpose of business. We start by explaining the SPN and then review its status under US and UK laws and show that it is not a likely legal requirement, at least under the guise of shareholder value maximization. This is in contrast to the common assertion that managers are legally constrained from addressing CSR issues if doing so is inconsistent with the economic interests of shareholders. Nonetheless, while the SPN might be muted as a legal norm, we show that it is certainly evident as a social norm among managers and in business schools—reflective, in part, of the sole voting rights of shareholders on corporate boards and of the dominance of shareholder theory—and justifiably so in the view 

In [None]:
# Imported here because this cell is the only user of displaCy.
from spacy import displacy


def _collect_entity_spans(token_tags, label="Annotation"):
    """Turn per-token BIO tags into character spans for displaCy's manual mode.

    Parameters
    ----------
    token_tags : list of dict
        One single-entry ``{word: tag}`` dict per token, in text order.
    label : str
        Label attached to every emitted span.

    Returns
    -------
    list of dict
        ``{"start", "end", "label"}`` dicts in text order. A span opens at a
        ``B-THE`` token and extends over the directly following ``I-THE``
        tokens only. Offsets are computed arithmetically and therefore assume
        the tokens are separated by exactly one character in the text.
    """
    tokens = [next(iter(entry.items())) for entry in token_tags]
    spans = []
    cursor = 0  # character offset of the current token
    for i, (word, tag) in enumerate(tokens):
        word_end = cursor + len(word)
        # A token ending in "." never opens an entity (it may still continue
        # one). NOTE(review): kept from the earlier logic — confirm intended.
        if tag == "B-THE" and not word.endswith("."):
            span_end = word_end
            j = i + 1
            # Stop at the first token that is not I-THE; the bounds check
            # covers an entity that runs to the very last token.
            while j < len(tokens) and tokens[j][1] == "I-THE":
                span_end += 1 + len(tokens[j][0])
                j += 1
            # The span end excludes the separator after the last token.
            spans.append({
                'start': cursor,
                'end' : span_end,
                "label" : label})
        cursor = word_end + 1  # skip the single separator character
    return spans


# `predictions` holds one entry per input sample; the first one belongs to
# `sentence`. NOTE(review): assumes each entry is a list of {word: tag}
# dicts, one per whitespace-separated token — confirm against model.predict.
ents = _collect_entity_spans(predictions[0])
doc = {
    'text' : sentence,
    "ents" : ents
}
colors = {"Annotation" :"linear-gradient(90deg, #aa9cfc, #fc9ce7)" } 
options = {"colors": colors}
displacy.render(doc, style="ent", options = options , manual=True, jupyter=True);

# Classification Report 

In [None]:
# Flatten the per-sentence predictions into one token-level array.
preds = np.array([tag for sentence_tags in preds_list for tag in sentence_tags])

# Flatten the gold labels sentence by sentence, grouped by sentence_id, so the
# token order does not depend on how the rows happen to be ordered in test_df.
# NOTE(review): assumes preds_list follows the same sentence_id group order —
# confirm against the library's DataFrame handling.
labels = np.concatenate(
    [sentence_rows["labels"].values for _, sentence_rows in test_df.groupby("sentence_id")]
)

# A mismatch means at least one sentence was predicted with a different token
# count than it was labelled with, so token-level scores would be misaligned.
assert len(preds) == len(labels), (
    f"{len(preds)} predicted tags vs {len(labels)} gold labels"
)

In [None]:
from sklearn.metrics import classification_report

# classification_report takes (y_true, y_pred): gold labels first, predictions
# second. With the arguments reversed, precision and recall trade places and
# the support column counts predicted tags instead of gold ones.
print(classification_report(labels, preds))

              precision    recall  f1-score   support

       B-THE       0.72      0.68      0.70       750
       I-THE       0.73      0.78      0.75      1127
           O       0.97      0.97      0.97     12572

    accuracy                           0.94     14449
   macro avg       0.81      0.81      0.81     14449
weighted avg       0.94      0.94      0.94     14449



# NER Metrics

In [None]:
# One list of gold tags per sentence, in sentence_id group order.
# A comprehension keeps the loop variables out of the kernel namespace, so the
# builtin `id` is not overwritten for the rest of the session.
label_list = [
    sentence_rows["labels"].tolist()
    for _, sentence_rows in test_df.groupby("sentence_id")
]

# Entity-level evaluation of the predicted tag sequences against the gold
# ones, restricted to the "THE" entity type.
evaluator = Evaluator(label_list, preds_list, tags = ["THE"],loader="list")
results, results_per_tag = evaluator.evaluate()
results

{'ent_type': {'correct': 686,
  'incorrect': 0,
  'partial': 0,
  'missed': 162,
  'spurious': 226,
  'possible': 848,
  'actual': 912,
  'precision': 0.7521929824561403,
  'recall': 0.8089622641509434,
  'f1': 0.7795454545454545},
 'partial': {'correct': 551,
  'incorrect': 0,
  'partial': 135,
  'missed': 162,
  'spurious': 226,
  'possible': 848,
  'actual': 912,
  'precision': 0.6781798245614035,
  'recall': 0.7293632075471698,
  'f1': 0.7028409090909091},
 'strict': {'correct': 551,
  'incorrect': 135,
  'partial': 0,
  'missed': 162,
  'spurious': 226,
  'possible': 848,
  'actual': 912,
  'precision': 0.6041666666666666,
  'recall': 0.6497641509433962,
  'f1': 0.6261363636363636},
 'exact': {'correct': 551,
  'incorrect': 135,
  'partial': 0,
  'missed': 162,
  'spurious': 226,
  'possible': 848,
  'actual': 912,
  'precision': 0.6041666666666666,
  'recall': 0.6497641509433962,
  'f1': 0.6261363636363636}}

In [None]:
!mkdir /content/drive/MyDrive/FR/NER/models/
!cp -r /outputs/best_model  /content/drive/MyDrive/FR/NER/models/

mkdir: cannot create directory ‘/content/drive/MyDrive/FR/NER/models/’: File exists
^C
