In [None]:
!pip install seqeval

In [None]:
!pip install arabert

In [1]:
from preprocess import NERDataset
from cleaning import DataReader
import numpy as np
from utils import compute_metrics, get_label_map, get_inv_label_map, read_labels
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForTokenClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
import torch
from torch import nn
from config import Config
import dill

  from pandas.core import (
2024-06-12 12:12:15.522756: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-12 12:12:15.522904: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-12 12:12:15.525175: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-12 12:12:15.551022: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class NERTrainer:
    """Fine-tune a BERT token-classification model for NER and keep the best epoch.

    The trainer reads a training file and a held-out file through ``DataReader``,
    wraps both in ``NERDataset``/``DataLoader``, and trains a
    ``BertForTokenClassification`` model for ``Config.EPOCHS`` epochs. After every
    epoch the held-out set is scored; whenever its loss improves, the weights are
    snapshotted in ``self.best_model`` and written to ``self.model_save_path``.

    NOTE(review): the dataset passed as ``test_dataset_path`` is used both for
    reporting metrics and for choosing the checkpoint, so the reported numbers
    are validation scores rather than an untouched test estimate.
    """

    def __init__(self, test_dataset_path, train_dataset_path="TrainingData.txt",
                 labels_path="NewEntities.txt", model_save_path="JuneModel.pt",
                 learning_rate=5e-5):
        """Load data, build the model, optimizer and learning-rate schedule.

        Args:
            test_dataset_path: Path handed to ``DataReader`` for the held-out set.
            train_dataset_path: Path handed to ``DataReader`` for the training set.
            labels_path: Path handed to ``read_labels`` to obtain the label list.
            model_save_path: File the best ``state_dict`` is written to by ``train``.
            learning_rate: Learning rate passed to ``AdamW``.

        The defaults reproduce the previously hard-coded values, so
        ``NERTrainer(path)`` behaves as before.
        """
        self.cfg = Config()
        self.model_save_path = model_save_path

        # Only the first element of the triple returned by read_data_bert() is used;
        # each item is indexed below as (text, tags).
        self.data_reader = DataReader(train_dataset_path)
        self.data, _, _ = self.data_reader.read_data_bert()
        self.label_list = read_labels(labels_path)

        self.label_map = get_label_map(self.label_list)
        self.inv_label_map = get_inv_label_map(self.label_list)

        # Load the held-out dataset with the same reader as the training data.
        self.test_data_reader = DataReader(test_dataset_path)
        self.test_data, _, _ = self.test_data_reader.read_data_bert()

        self.TOKENIZER = AutoTokenizer.from_pretrained(self.cfg.MODEL_NAME)

        self.train_dataset = self._build_dataset(self.data)
        self.test_dataset = self._build_dataset(self.test_data)

        self.train_data_loader = DataLoader(dataset=self.train_dataset,
                                            batch_size=self.cfg.TRAIN_BATCH_SIZE,
                                            shuffle=True)
        self.test_data_loader = DataLoader(dataset=self.test_dataset,
                                           batch_size=self.cfg.VALID_BATCH_SIZE,
                                           shuffle=False)

        self.model = BertForTokenClassification.from_pretrained(
            self.cfg.MODEL_NAME,
            return_dict=True,
            num_labels=len(self.label_map),
            output_attentions=False,
            output_hidden_states=False,
        ).to(self.cfg.device)

        self.optimizer = AdamW(self.model.parameters(), lr=learning_rate, correct_bias=False)

        # The scheduler is stepped once per batch, so it needs the total batch count.
        total_steps = len(self.train_data_loader) * self.cfg.EPOCHS
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        self.best_eval_loss = float('inf')
        self.best_model = None

    def _build_dataset(self, samples):
        """Wrap ``samples`` (items indexed as ``(text, tags)``) in an ``NERDataset``."""
        return NERDataset(
            texts=[x[0] for x in samples],
            tags=[x[1] for x in samples],
            label_list=self.label_list,
            model_name=self.cfg.MODEL_NAME,
            max_length=self.cfg.MAX_LEN
        )

    def train_epoch(self):
        """Run one optimisation pass over the training loader.

        Returns:
            The mean of the per-batch losses for this epoch.
        """
        self.model.train()
        final_loss = 0

        for data in tqdm(self.train_data_loader, total=len(self.train_data_loader)):
            input_ids = data['input_ids'].to(self.cfg.device)
            attention_mask = data['attention_mask'].to(self.cfg.device)
            token_type_ids = data['token_type_ids'].to(self.cfg.device)
            labels = data['labels'].to(self.cfg.device)

            self.optimizer.zero_grad()
            outputs = self.model(input_ids=input_ids,
                                 token_type_ids=token_type_ids,
                                 attention_mask=attention_mask,
                                 labels=labels)

            loss = outputs.loss
            loss.backward()
            # Clip the global gradient norm to 1.0 before the optimizer update.
            nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()
            final_loss += loss.item()

        loss = final_loss / len(self.train_data_loader)
        print(f"Train loss: {loss}")

        return loss

    def eval_epoch(self):
        """Score the model on the held-out loader without updating weights.

        Returns:
            A ``(mean_loss, metrics)`` tuple, where ``metrics`` is whatever
            ``compute_metrics`` returns for the collected logits and labels.
        """
        self.model.eval()
        final_loss = 0
        all_labels = []
        all_preds = []

        with torch.no_grad():
            for data in tqdm(self.test_data_loader, total=len(self.test_data_loader)):
                input_ids = data['input_ids'].to(self.cfg.device)
                attention_mask = data['attention_mask'].to(self.cfg.device)
                token_type_ids = data['token_type_ids'].to(self.cfg.device)
                labels = data['labels'].to(self.cfg.device)

                outputs = self.model(input_ids=input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask,
                                     labels=labels)

                loss = outputs.loss
                final_loss += loss.item()

                logits = outputs.logits.detach().cpu().numpy()
                labels = labels.to('cpu').numpy()

                # extend() appends one entry per sequence in the batch.
                all_preds.extend(logits)
                all_labels.extend(labels)

        all_preds = np.array(all_preds)
        all_labels = np.asarray(all_labels)

        # NOTE(review): the meaning of the trailing positional False is defined by
        # utils.compute_metrics, which is not visible here - confirm before changing.
        metrics = compute_metrics(all_preds, all_labels, self.inv_label_map, False)
        final_loss = final_loss / len(self.test_data_loader)

        print(f"Eval loss: {final_loss}")
        print(f"Eval Metrics: {metrics}")

        return final_loss, metrics

    def train(self):
        """Train for ``cfg.EPOCHS`` epochs, checkpointing on the lowest held-out loss.

        Whenever the held-out loss improves, ``self.best_eval_loss`` is updated,
        a detached CPU copy of the weights is stored in ``self.best_model`` and
        that copy is saved to ``self.model_save_path``.
        """
        for epoch in range(self.cfg.EPOCHS):
            print(f"Training Epoch: {epoch + 1}")
            self.train_epoch()

            print(f"Evaluating Epoch: {epoch + 1}")
            eval_loss, _ = self.eval_epoch()

            if eval_loss < self.best_eval_loss:
                self.best_eval_loss = eval_loss
                # state_dict() hands back references to the live tensors, which
                # later optimizer steps overwrite in place. Copy them so that
                # best_model really holds the weights of the best epoch.
                self.best_model = {
                    name: tensor.detach().to("cpu", copy=True)
                    for name, tensor in self.model.state_dict().items()
                }
                torch.save(self.best_model, self.model_save_path)


# Notebook driver: build a trainer that evaluates on TestingData.txt and run
# the full training loop. Both names assigned here remain available as globals
# for later cells.
if __name__ == '__main__':
    # Held-out file scored after every epoch; it also drives checkpoint selection.
    test_dataset_path = "TestingData.txt"
    ner_trainer = NERTrainer(test_dataset_path)
    # Trains for Config.EPOCHS epochs and writes the best weights to disk.
    ner_trainer.train()


Data: 27268 , Sentences: 27268 , Tags: 27268
Data: 856 , Sentences: 856 , Tags: 856


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Epoch: 1




  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.25565278703166594
Evaluating Epoch: 1


  0%|          | 0/54 [00:00<?, ?it/s]



Eval loss: 0.6241424347584447
Eval Metrics: {'accuracy_score': 0.8629716981132075, 'precision': 0.6913229018492176, 'recall': 0.2109375, 'f1': 0.3232457598935817}
Training Epoch: 2


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.15984513775781453
Evaluating Epoch: 2


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.5225715895390345
Eval Metrics: {'accuracy_score': 0.882688679245283, 'precision': 0.636290967226219, 'recall': 0.3454861111111111, 'f1': 0.4478199718706048}
Training Epoch: 3


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.13180156476424237
Evaluating Epoch: 3


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.5988952554762363
Eval Metrics: {'accuracy_score': 0.8829716981132075, 'precision': 0.6262626262626263, 'recall': 0.3767361111111111, 'f1': 0.47046070460704603}
Training Epoch: 4


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.11361750074910804
Evaluating Epoch: 4


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.6375154807508268
Eval Metrics: {'accuracy_score': 0.8843867924528301, 'precision': 0.6308243727598566, 'recall': 0.3819444444444444, 'f1': 0.4758042714247093}
Training Epoch: 5


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.09921180142138886
Evaluating Epoch: 5


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.7659042167856738
Eval Metrics: {'accuracy_score': 0.8824056603773585, 'precision': 0.6526994359387591, 'recall': 0.3515625, 'f1': 0.456981664315938}
Training Epoch: 6


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.08974516779686874
Evaluating Epoch: 6


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.8623304644220129
Eval Metrics: {'accuracy_score': 0.8736320754716981, 'precision': 0.6412639405204461, 'recall': 0.2994791666666667, 'f1': 0.40828402366863903}
Training Epoch: 7


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.0805683888795395
Evaluating Epoch: 7


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.9009396765657045
Eval Metrics: {'accuracy_score': 0.877311320754717, 'precision': 0.6457627118644068, 'recall': 0.3307291666666667, 'f1': 0.4374282433983927}
Training Epoch: 8


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.07387912622748538
Evaluating Epoch: 8


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.8979252271216225
Eval Metrics: {'accuracy_score': 0.8793396226415094, 'precision': 0.6406639004149378, 'recall': 0.3350694444444444, 'f1': 0.4400113992590482}
Training Epoch: 9


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.06732628600196668
Evaluating Epoch: 9


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.0265820781113926
Eval Metrics: {'accuracy_score': 0.8785849056603774, 'precision': 0.6466165413533834, 'recall': 0.3359375, 'f1': 0.442159383033419}
Training Epoch: 10


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.06251011343230425
Evaluating Epoch: 10


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.0380753763709907
Eval Metrics: {'accuracy_score': 0.8798584905660377, 'precision': 0.6451349141455437, 'recall': 0.3424479166666667, 'f1': 0.4474057272469521}
Training Epoch: 11


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.058093534229743865
Evaluating Epoch: 11


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.0720968949436038
Eval Metrics: {'accuracy_score': 0.8760849056603773, 'precision': 0.6476014760147601, 'recall': 0.3046875, 'f1': 0.41440377804014167}
Training Epoch: 12


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.05480133936306205
Evaluating Epoch: 12


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.1104016184323915
Eval Metrics: {'accuracy_score': 0.8773584905660378, 'precision': 0.6615798922800719, 'recall': 0.3198784722222222, 'f1': 0.4312463428905793}
Training Epoch: 13


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.051467800753387756
Evaluating Epoch: 13


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.1733827760650053
Eval Metrics: {'accuracy_score': 0.8769811320754717, 'precision': 0.6393728222996515, 'recall': 0.3185763888888889, 'f1': 0.425260718424102}
Training Epoch: 14


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.04846560056444225
Evaluating Epoch: 14


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.1343377257386844
Eval Metrics: {'accuracy_score': 0.8814150943396226, 'precision': 0.6334106728538283, 'recall': 0.35546875, 'f1': 0.4553794829024187}
Training Epoch: 15


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.046630166362304666
Evaluating Epoch: 15


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.2256761291236788
Eval Metrics: {'accuracy_score': 0.8772641509433963, 'precision': 0.6532534246575342, 'recall': 0.3311631944444444, 'f1': 0.439516129032258}
Training Epoch: 16


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.04471928303121778
Evaluating Epoch: 16


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.2451434881874808
Eval Metrics: {'accuracy_score': 0.8767924528301887, 'precision': 0.6462346760070052, 'recall': 0.3203125, 'f1': 0.428322692977365}
Training Epoch: 17


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.04280979700844504
Evaluating Epoch: 17


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.2969638413439195
Eval Metrics: {'accuracy_score': 0.8777830188679245, 'precision': 0.6397941680960549, 'recall': 0.3237847222222222, 'f1': 0.429971181556196}
Training Epoch: 18


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.04138206051583361
Evaluating Epoch: 18


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.2718904809681353
Eval Metrics: {'accuracy_score': 0.8784433962264151, 'precision': 0.654639175257732, 'recall': 0.3307291666666667, 'f1': 0.439446366782007}
Training Epoch: 19


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.03971532835144429
Evaluating Epoch: 19


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.3284340656741902
Eval Metrics: {'accuracy_score': 0.8776415094339622, 'precision': 0.6581722319859402, 'recall': 0.3250868055555556, 'f1': 0.4352120859965137}
Training Epoch: 20


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.03827919815059293
Evaluating Epoch: 20


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.3608575689571876
Eval Metrics: {'accuracy_score': 0.8776415094339622, 'precision': 0.6572687224669603, 'recall': 0.3237847222222222, 'f1': 0.43384704856062806}
