In [3]:
# Mount Google Drive under /content/drive. Later cells load the pretrained
# checkpoint from /content/drive/MyDrive/NER and move the training outputs
# back there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Installation 

In [4]:
!pip install -q simpletransformers

[?25l[K     |█▎                              | 10 kB 27.5 MB/s eta 0:00:01[K     |██▋                             | 20 kB 29.1 MB/s eta 0:00:01[K     |████                            | 30 kB 19.3 MB/s eta 0:00:01[K     |█████▎                          | 40 kB 7.9 MB/s eta 0:00:01[K     |██████▋                         | 51 kB 7.9 MB/s eta 0:00:01[K     |███████▉                        | 61 kB 9.3 MB/s eta 0:00:01[K     |█████████▏                      | 71 kB 9.1 MB/s eta 0:00:01[K     |██████████▌                     | 81 kB 9.5 MB/s eta 0:00:01[K     |███████████▉                    | 92 kB 10.5 MB/s eta 0:00:01[K     |█████████████▏                  | 102 kB 8.7 MB/s eta 0:00:01[K     |██████████████▍                 | 112 kB 8.7 MB/s eta 0:00:01[K     |███████████████▊                | 122 kB 8.7 MB/s eta 0:00:01[K     |█████████████████               | 133 kB 8.7 MB/s eta 0:00:01[K     |██████████████████▍             | 143 kB 8.7 MB/s eta 0:00:01[K 

# Imports 

In [9]:
from simpletransformers.ner import NERModel
import pandas as pd

# Data Loading 

In [12]:
def read_conll_dataset(filename):
    """Load a space-separated CoNLL-style file into a token-level DataFrame.

    Each non-blank line of the file becomes one row with the columns
    ``words``, ``pos``, ``chunk`` and ``labels``. Rows whose first field
    starts with ``-DOCSTART-`` are discarded. Blank lines act as sentence
    separators: every remaining row gets a ``sentence_id`` equal to the
    number of blank lines seen so far, and the blank rows themselves are
    dropped from the result.

    Args:
        filename: Path (or anything ``pd.read_csv`` accepts) of the file.

    Returns:
        DataFrame with columns ``words``, ``pos``, ``chunk``, ``labels`` and
        ``sentence_id``; the index keeps the original row positions.
    """
    column_names = ['words', 'pos', 'chunk', 'labels']
    raw = pd.read_csv(
        filename,
        sep=' ',
        header=None,
        names=column_names,
        quoting=3,                # 3 == csv.QUOTE_NONE: quote characters are ordinary text
        keep_default_na=False,    # keep empty fields as '' instead of NaN
        skip_blank_lines=False,   # blank lines must survive: they delimit sentences
    )

    # Remove document markers before counting sentence boundaries.
    is_doc_marker = raw['words'].astype(str).str.startswith('-DOCSTART-')
    tokens = raw[~is_doc_marker]

    # A blank row has an empty 'words' field; its running count is the
    # sentence id of every row that follows it.
    tokens = tokens.assign(sentence_id=(tokens.words == '').cumsum())

    has_word = tokens.words != ''
    return tokens[has_word]

# SciIE corpus: train / test splits.
sciie_train, sciie_test = (
    read_conll_dataset("/content/sciie_train.txt"),
    read_conll_dataset("/content/sciie_test.txt"),
)

# Custom annotated corpus: train / test splits.
custom_train, custom_test = (
    read_conll_dataset("/content/train.txt"),
    read_conll_dataset("/content/test.txt"),
)

# Data Preprocessing

In [13]:
# Stack the SciIE and custom corpora into one train and one test frame,
# shifting the custom sentence ids past the SciIE ids so that no sentence id
# is shared between the two corpora.
#
# The offset is max + 1 rather than max: read_conll_dataset numbers a file's
# first sentence 0 when the file does not open with a blank line, and adding
# only the max would give that sentence the same id as the last SciIE
# sentence, silently merging the two into one.
train_id_offset = sciie_train["sentence_id"].max() + 1
test_id_offset = sciie_test["sentence_id"].max() + 1

# .assign builds shifted copies instead of writing into custom_train /
# custom_test, so re-running this cell does not shift the ids a second time.
train = pd.concat(
    [
        sciie_train,
        custom_train.assign(sentence_id=custom_train["sentence_id"] + train_id_offset),
    ],
    axis=0,
    ignore_index=True,
)
test = pd.concat(
    [
        sciie_test,
        custom_test.assign(sentence_id=custom_test["sentence_id"] + test_id_offset),
    ],
    axis=0,
    ignore_index=True,
)


# Translation table from the tag names found in the source corpora to the
# label set used for training. Tags mapped to "O" are treated as non-entities.
# NOTE(review): B-DAT / I-DAT are folded into METH rather than MAT - confirm
# this is intended.
mapping = {
    "B-Task": "O",
    "I-Task": "O",
    "B-Generic": "O",
    "I-Generic": "O",
    "B-OtherScientificTerm": "B-THE",
    "I-OtherScientificTerm": "I-THE",
    "I-OBJ": "O",
    "B-OBJ": "O",
    "I-Material": "I-MAT",
    "B-Material": "B-MAT",
    "B-Method": "B-METH",
    "I-Method": "I-METH",
    "B-DAT": "B-METH",
    "I-DAT": "I-METH",
    "B-Metric": "B-METR",
    "I-Metric": "I-METR",
    "B-RES": "O",
    "I-RES": "O",
}


def map_fun(label):
    """Return the training label for ``label``.

    Labels listed in ``mapping`` are translated; any other label is returned
    unchanged.
    """
    return mapping.get(label, label)
    
# Rewrite the label column of both splits through map_fun; labels without an
# entry in `mapping` are left as they are, so re-running this cell is harmless.
train["labels"] = train["labels"].map(map_fun)
test["labels"] = test["labels"].map(map_fun)

# Model Preparation

In [14]:
# Label inventory handed to the model, in order of first appearance in `train`.
# NOTE(review): this order depends on the data; presumably it has to match the
# label order the loaded checkpoint was trained with - confirm.
custom_labels = train['labels'].unique().tolist()
print("Custom Labels = ", custom_labels)

# One-row table with the number of distinct sentences in each split; it is the
# last expression of the cell so the notebook renders it.
data = [[split['sentence_id'].nunique() for split in (train, test)]]
pd.DataFrame(data, columns=["Train", "Test"])

Custom Labels =  ['I-MAT', 'O', 'I-THE', 'I-METH', 'I-METR', 'B-METH', 'B-THE']


Unnamed: 0,Train,Test
0,2391,616


In [15]:
# Options passed to NERModel through its `args` parameter.
# NOTE(review): evaluate_during_training is off, so best_model_dir (and the
# eval_data given to train_model) presumably have no effect - confirm against
# the simpletransformers documentation.
# NOTE(review): no random seed is set here, so runs are presumably not
# reproducible - confirm.
train_args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    sliding_window=True,
    max_seq_length=128,
    num_train_epochs=10,
    train_batch_size=64,
    fp16=True,
    output_dir='/outputs/',
    best_model_dir='/outputs/best_model/',
    evaluate_during_training=False,
)

# Model Training

In [18]:
# Build a BERT NER model from the checkpoint folder stored on Google Drive
# (the path below is that folder), using the label list and training options
# defined in the cells above.
model = NERModel(
    'bert',
    '/content/drive/MyDrive/NER/3.1 Trained NER Model- 280-10 epoch',
    labels=custom_labels,
    args=train_args,
)

# Last expression of the cell, so whatever train_model returns is displayed
# (the recorded run shows a (380, 0.0048...) tuple - presumably the global step
# count and the training loss; confirm).
model.train_model(train, eval_data=test)

  0%|          | 0/3 [00:00<?, ?it/s]



Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/38 [00:00<?, ?it/s]

Running Epoch 1 of 10:   0%|          | 0/38 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/38 [00:00<?, ?it/s]

(380, 0.004836551923500864)

# Model Evaluation

In [None]:
# Evaluate the trained model on the combined test split. eval_model returns
# three values; only the first (`result`) is printed - in the recorded run it
# is a dict with eval_loss, precision, recall and f1_score.
result, model_outputs, preds_list = model.eval_model(test)
print(result)

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 0.4333766686984084, 'precision': 0.554863813229572, 'recall': 0.599663582842725, 'f1_score': 0.5763945028294261}


In [None]:
import shutil

# Move the whole training output folder onto Google Drive. The destination is
# an existing directory, so /outputs ends up inside it as .../NER/outputs; the
# returned destination path is displayed as the cell result.
# NOTE(review): shutil.move raises shutil.Error when .../NER/outputs already
# exists, so this cell cannot be re-run without first clearing the old copy.
shutil.move(src='/outputs', dst="/content/drive/MyDrive/NER")

'/content/drive/MyDrive/NER/outputs'