In [4]:
!pip install -q transformers
!pip install -q simpletransformers
!pip install -q nervaluate



In [6]:
from google.colab import drive
drive.mount('/content/drive')

class config:
    INPUT_FILE = "/content/drive/MyDrive/NER Constructs Only/more_data.csv"
    TEST_SIZE = 0.20
    RANDOM_STATE = 12
    MAX_LEN = 512
    EPOCHS = 10
    BATCH_SIZE= 16

from simpletransformers.ner import NERModel
from transformers import AutoTokenizer
import pandas as pd
import logging
import numpy as np
from sklearn.model_selection import GroupShuffleSplit 
from nervaluate import Evaluator
import warnings
warnings.filterwarnings('ignore')


df = pd.read_csv(config.INPUT_FILE)
print("Total Number of Unique Sentence: ",len(set(df["sentence_id"].values)))
df["labels"].fillna("O", inplace = True)


splitter = GroupShuffleSplit(test_size=config.TEST_SIZE, n_splits=1, random_state = config.RANDOM_STATE)
split = splitter.split(df, groups=df['sentence_id'])
train_inds, test_inds = next(split)
train_df = df.iloc[train_inds]
test_df = df.iloc[test_inds]

train_df["words"] = train_df["words"].astype("str")
test_df["words"] = test_df["words"].astype("str")
train_df.reset_index(drop = True,inplace = True)
test_df.reset_index(drop = True,inplace = True)

print("Total Number of Sentences in Train Set: ",len(set(train_df["sentence_id"].values)))
print("Total Number of Sentences in Test Set: ",len(set(test_df["sentence_id"].values)))

custom_labels = list(train_df['labels'].unique())
train_args = {
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'sliding_window': True,
    'max_seq_length': config.MAX_LEN,
    'num_train_epochs': config.EPOCHS,
    'train_batch_size': config.BATCH_SIZE,
    'fp16': True,
    'output_dir': '/outputs/',
    'best_model_dir': '/outputs/best_model/',
    'evaluate_during_training': True,
    'no_save' : True
}

logging.basicConfig(level=logging.DEBUG)
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.WARNING)
model = NERModel( "bert", "bert-base-cased", labels=custom_labels, args=train_args)
model.train_model(train_df, eval_data= test_df)
result, model_outputs, preds_list = model.eval_model(test_df)

print(result)

Mounted at /content/drive
Total Number of Unique Sentence:  101
Total Number of Sentences in Train Set:  80
Total Number of Sentences in Test Set:  21


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.21539153158664703, 'precision': 0.6955645161290323, 'recall': 0.8023255813953488, 'f1_score': 0.7451403887688985}


# SAMPLE INFERENCE

In [None]:
sentence = "This paper examines the shareholder primacy norm (SPN) as a widely acknowledged impediment to corporate social responsibility and explores the role of business schools in promoting the SPN but also potentially as an avenue for change by addressing misconceptions about shareholder primacy and the purpose of business. We start by explaining the SPN and then review its status under US and UK laws and show that it is not a likely legal requirement, at least under the guise of shareholder value maximization. This is in contrast to the common assertion that managers are legally constrained from addressing CSR issues if doing so is inconsistent with the economic interests of shareholders. Nonetheless, while the SPN might be muted as a legal norm, we show that it is certainly evident as a social norm among managers and in business schools—reflective, in part, of the sole voting rights of shareholders on corporate boards and of the dominance of shareholder theory—and justifiably so in the view of many managers and business academics. We argue that this view is misguided, not least when associated with claims of a purported legally enforceable requirement to maximize shareholder value. We propose two ways by which the influence of the SPN among managers might be attenuated: extending fiduciary duties of executives to non-shareholder stakeholders and changes in business school teaching such that it covers a plurality of conceptions of the purpose of the corporation. ©️ 2014, Springer Science+Business Media Dordrecht."
samples = [sentence]
predictions, _ = model.predict(samples)
print(sentence)
for idx, sample in enumerate(samples):
  print('{}: '.format(idx))
  for word in predictions[idx]:
    print('{}'.format(word))

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

This paper examines the shareholder primacy norm (SPN) as a widely acknowledged impediment to corporate social responsibility and explores the role of business schools in promoting the SPN but also potentially as an avenue for change by addressing misconceptions about shareholder primacy and the purpose of business. We start by explaining the SPN and then review its status under US and UK laws and show that it is not a likely legal requirement, at least under the guise of shareholder value maximization. This is in contrast to the common assertion that managers are legally constrained from addressing CSR issues if doing so is inconsistent with the economic interests of shareholders. Nonetheless, while the SPN might be muted as a legal norm, we show that it is certainly evident as a social norm among managers and in business schools—reflective, in part, of the sole voting rights of shareholders on corporate boards and of the dominance of shareholder theory—and justifiably so in the view 

In [None]:
import spacy 
preds = predictions[0]
tag_words = [list(preds[i].keys())[0] for i in range(len(preds)) if list(preds[i].values())[0] == "B-THE" ]
start_pos = []
end_pos = []
for word in tag_words:
    start_pos.append(sentence.find(word))
    end_pos.append(sentence.find(word) + len(word))
    
ents = []
for i in range(len(start_pos)):
    ents.append({
        'start': int(start_pos[i]), 
        'end' : int(end_pos[i]),
        "label" : "Annotation"
    })
    
doc = {
    'text' : sentence,
    "ents" : ents
}
colors = {"Annotation" :"linear-gradient(90deg, #aa9cfc, #fc9ce7)" } 
options = {"colors": colors}
spacy.displacy.render(doc, style="ent", options = options , manual=True, jupyter=True);

# Classification Report 

In [None]:
preds = []
for p in preds_list:
    preds.extend(p)
preds = np.array(preds)
labels =test_df["labels"].values

assert len(preds) == len(labels)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(preds, labels))

              precision    recall  f1-score   support

       B-THE       0.89      0.80      0.84       465
       I-THE       0.79      0.87      0.83       486
           O       0.96      0.96      0.96      2951

    accuracy                           0.93      3902
   macro avg       0.88      0.88      0.88      3902
weighted avg       0.93      0.93      0.93      3902



# NER Metrics

In [None]:
label_list = []
for id, p in test_df.groupby("sentence_id"):
    label_list.append(list(p["labels"].values))
    
evaluator = Evaluator(label_list, preds_list, tags = ["THE"],loader="list")
results, results_per_tag = evaluator.evaluate()
results

{'ent_type': {'correct': 403,
  'incorrect': 0,
  'partial': 0,
  'missed': 27,
  'spurious': 84,
  'possible': 430,
  'actual': 487,
  'precision': 0.8275154004106776,
  'recall': 0.9372093023255814,
  'f1': 0.8789531079607416},
 'partial': {'correct': 337,
  'incorrect': 0,
  'partial': 66,
  'missed': 27,
  'spurious': 84,
  'possible': 430,
  'actual': 487,
  'precision': 0.7597535934291582,
  'recall': 0.8604651162790697,
  'f1': 0.806979280261723},
 'strict': {'correct': 337,
  'incorrect': 66,
  'partial': 0,
  'missed': 27,
  'spurious': 84,
  'possible': 430,
  'actual': 487,
  'precision': 0.6919917864476386,
  'recall': 0.7837209302325582,
  'f1': 0.7350054525627044},
 'exact': {'correct': 337,
  'incorrect': 66,
  'partial': 0,
  'missed': 27,
  'spurious': 84,
  'possible': 430,
  'actual': 487,
  'precision': 0.6919917864476386,
  'recall': 0.7837209302325582,
  'f1': 0.7350054525627044}}