In [4]:
# Mount Google Drive inside the Colab VM; the input CSV and the destination
# for the trained model both live under /content/drive/MyDrive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Installation 

In [5]:
!pip install -q transformers
!pip install -q simpletransformers


[K     |████████████████████████████████| 5.3 MB 26.0 MB/s 
[K     |████████████████████████████████| 7.6 MB 44.7 MB/s 
[K     |████████████████████████████████| 163 kB 73.5 MB/s 
[K     |████████████████████████████████| 250 kB 33.2 MB/s 
[K     |████████████████████████████████| 1.9 MB 47.3 MB/s 
[K     |████████████████████████████████| 9.2 MB 53.0 MB/s 
[K     |████████████████████████████████| 43 kB 2.1 MB/s 
[K     |████████████████████████████████| 1.3 MB 54.4 MB/s 
[K     |████████████████████████████████| 441 kB 71.3 MB/s 
[K     |████████████████████████████████| 162 kB 70.4 MB/s 
[K     |████████████████████████████████| 182 kB 67.9 MB/s 
[K     |████████████████████████████████| 63 kB 1.9 MB/s 
[K     |████████████████████████████████| 162 kB 74.3 MB/s 
[K     |████████████████████████████████| 158 kB 76.3 MB/s 
[K     |████████████████████████████████| 157 kB 78.6 MB/s 
[K     |████████████████████████████████| 157 kB 79.7 MB/s 
[K     |███████████████████

# Imports 

In [None]:
!pip install  nervaluate
from simpletransformers.ner import NERModel
import pandas as pd

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Data Loading, Train/Test Split, Training and Evaluation

In [None]:

class config:
    """Tunable settings for the NER experiment, grouped in one namespace.

    The attributes are read as plain class attributes (``config.NAME``); the
    class is never instantiated in this notebook.
    """

    # --- data -----------------------------------------------------------
    # CSV that is loaded with pandas.read_csv.
    INPUT_FILE = "/content/drive/MyDrive/NER Constructs Only/final2.csv"

    # --- train/test split -----------------------------------------------
    # Passed to GroupShuffleSplit as test_size / random_state.
    TEST_SIZE = 0.20
    RANDOM_STATE = 12

    # --- model / optimisation -------------------------------------------
    # Forwarded to simpletransformers as max_seq_length, num_train_epochs
    # and train_batch_size respectively.
    MAX_LEN = 512
    EPOCHS = 10
    BATCH_SIZE = 16

from simpletransformers.ner import NERModel
from transformers import AutoTokenizer
import pandas as pd
import logging
import numpy as np
from sklearn.model_selection import GroupShuffleSplit 
from nervaluate import Evaluator
import warnings
warnings.filterwarnings('ignore')


# --- Load the dataset ---------------------------------------------------------
# The CSV is expected to provide at least the columns used below:
# "sentence_id", "words" and "labels".
df = pd.read_csv(config.INPUT_FILE)
print("Total Number of Unique Sentence: ", df["sentence_id"].nunique())

# Rows with a missing label are tagged "O".
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection: an in-place call on `df["labels"]` operates on an
# intermediate object and is not guaranteed to update `df` itself.
df["labels"] = df["labels"].fillna("O")

# --- Sentence-level train/test split -----------------------------------------
# Grouping by sentence_id keeps all rows of a sentence on the same side of the
# split, so no sentence is shared between train and test.
splitter = GroupShuffleSplit(
    test_size=config.TEST_SIZE,
    n_splits=1,
    random_state=config.RANDOM_STATE,
)
train_inds, test_inds = next(splitter.split(df, groups=df["sentence_id"]))

# astype(...) returns a new frame, so the casts below never write into a slice
# of `df` (the original assigned into `df.iloc[...]` results, which triggers
# SettingWithCopyWarning and relies on pandas' copy heuristics).
# NOTE(review): missing values in "words" become the literal string "nan" after
# this cast — confirm that is acceptable for tokens pandas parsed as NaN.
train_df = df.iloc[train_inds].astype({"words": "str"}).reset_index(drop=True)
test_df = df.iloc[test_inds].astype({"words": "str"}).reset_index(drop=True)

print("Total Number of Sentences in Train Set: ", train_df["sentence_id"].nunique())
print("Total Number of Sentences in Test Set: ", test_df["sentence_id"].nunique())

# --- Model configuration ------------------------------------------------------
# The label inventory is taken from the full dataset, not just the training
# split: a tag that happens to occur only in test sentences would otherwise be
# unknown to the model when those sentences are evaluated.
custom_labels = list(df["labels"].unique())

train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "sliding_window": True,
    "max_seq_length": config.MAX_LEN,
    "num_train_epochs": config.EPOCHS,
    "train_batch_size": config.BATCH_SIZE,
    "fp16": True,
    "output_dir": '/outputs/',
    "best_model_dir": '/outputs/best_model/',
    "evaluate_during_training": True,
    # Seed the training run with the same value used for the split so that
    # repeated runs are comparable.
    "manual_seed": config.RANDOM_STATE,
}

# Verbose logging for this notebook, but keep the transformers library quiet.
logging.basicConfig(level=logging.DEBUG)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# --- Train and evaluate -------------------------------------------------------
model = NERModel("bert", "bert-base-cased", labels=custom_labels, args=train_args)

# NOTE(review): test_df is also used as the during-training evaluation set, so
# the final score below is not from data unseen during model selection.
# Consider carving out a separate validation split.
model.train_model(train_df, eval_data=test_df)
result, model_outputs, preds_list = model.eval_model(test_df)

print(result)

Total Number of Unique Sentence:  1962
Total Number of Sentences in Train Set:  1569
Total Number of Sentences in Test Set:  393


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 0.32086980833671985, 'precision': 0.5969827586206896, 'recall': 0.6533018867924528, 'f1_score': 0.6238738738738738}


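# Training Model

Training is launched by `model.train_model(...)` in the code cell above; this section contains no code of its own.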

# Evaluation Model 

In [None]:
# Re-evaluate the trained model on the held-out test split.
# The frame is named `test_df`; the previous reference to `test` was an
# undefined name and raised NameError on a fresh kernel.
result, model_outputs, preds_list = model.eval_model(test_df)
print(result)

In [None]:
import shutil

# Relocate the training output directory from the Colab VM to Google Drive so
# it outlives the session. The call is the cell's last expression, so the
# path returned by shutil.move is shown as the cell output.
TRAINING_OUTPUT_DIR = '/outputs'
DRIVE_TARGET_DIR = "/content/drive/MyDrive/BERT-NER"
shutil.move(TRAINING_OUTPUT_DIR, DRIVE_TARGET_DIR)

'/content/drive/MyDrive/NER/outputs'