In [None]:
!pip install seqeval

In [None]:
!pip install arabert

In [1]:
from preprocess import NERDataset
from cleaning import DataReader
import numpy as np
from utils import compute_metrics, get_label_map, get_inv_label_map, read_labels
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForTokenClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
import torch
from torch import nn
from config import Config
import dill

  from pandas.core import (
2024-06-12 12:00:08.461666: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-12 12:00:08.461766: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-12 12:00:08.464027: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-12 12:00:08.479684: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
class NERTrainer:
    """Fine-tune a BERT token-classification model for NER and evaluate it after every epoch.

    The constructor reads the training and evaluation files, builds the datasets,
    data loaders, model, optimizer and linear learning-rate schedule. ``train()``
    then runs ``cfg.EPOCHS`` epochs and checkpoints the weights with the lowest
    evaluation loss.

    NOTE(review): the split passed as ``test_dataset_path`` is used both for the
    per-epoch evaluation and for choosing which checkpoint to keep, so metrics
    reported on it are not from a held-out set. Consider a separate validation
    split for checkpoint selection.
    """

    def __init__(self, test_dataset_path, train_dataset_path="TrainingG_Data.txt",
                 labels_path="NewEntities.txt", model_save_path="JuneModel_G.pt",
                 learning_rate=5e-5):
        """Load data, tokenizer and model, and set up the optimizer and scheduler.

        Args:
            test_dataset_path: Path of the file evaluated after every epoch.
            train_dataset_path: Path of the training file.
            labels_path: Path of the file passed to ``read_labels``.
            model_save_path: Where ``train()`` writes the best state dict.
            learning_rate: Learning rate handed to the AdamW optimizer.

        All keyword defaults reproduce the values that used to be hard-coded.
        """
        self.cfg = Config()
        self.model_save_path = model_save_path

        self.data_reader = DataReader(train_dataset_path)
        self.data, _, _ = self.data_reader.read_data_bert()
        self.label_list = read_labels(labels_path)

        self.label_map = get_label_map(self.label_list)
        self.inv_label_map = get_inv_label_map(self.label_list)

        # Load the test dataset
        self.test_data_reader = DataReader(test_dataset_path)
        self.test_data, _, _ = self.test_data_reader.read_data_bert()

        self.TOKENIZER = AutoTokenizer.from_pretrained(self.cfg.MODEL_NAME)

        self.train_dataset = self._build_dataset(self.data)
        self.test_dataset = self._build_dataset(self.test_data)

        # Only the training loader shuffles; evaluation keeps the file order.
        self.train_data_loader = DataLoader(dataset=self.train_dataset, batch_size=self.cfg.TRAIN_BATCH_SIZE, shuffle=True)
        self.test_data_loader = DataLoader(dataset=self.test_dataset, batch_size=self.cfg.VALID_BATCH_SIZE, shuffle=False)

        self.model = BertForTokenClassification.from_pretrained(self.cfg.MODEL_NAME,
                                                                return_dict=True,
                                                                num_labels=len(self.label_map),
                                                                output_attentions=False,
                                                                output_hidden_states=False).to(self.cfg.device)

        # NOTE(review): transformers' AdamW has been deprecated upstream in favour of
        # torch.optim.AdamW; it is kept here because correct_bias=False has no direct
        # torch equivalent. Confirm against the pinned transformers version.
        self.optimizer = AdamW(self.model.parameters(), lr=learning_rate, correct_bias=False)

        # The scheduler is stepped once per batch, so the schedule spans every
        # optimizer step of the whole run.
        total_steps = len(self.train_data_loader) * self.cfg.EPOCHS

        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        self.best_eval_loss = float('inf')
        self.best_model = None

    def _build_dataset(self, samples):
        """Wrap a sequence of samples in an ``NERDataset``.

        Element 0 of each sample is used as the text and element 1 as its tags.
        """
        return NERDataset(
            texts=[sample[0] for sample in samples],
            tags=[sample[1] for sample in samples],
            label_list=self.label_list,
            model_name=self.cfg.MODEL_NAME,
            max_length=self.cfg.MAX_LEN
        )

    def _run_batch(self, batch):
        """Move one batch to ``cfg.device`` and run the model on it with labels.

        Returns:
            ``(outputs, labels)`` where ``labels`` is the on-device label tensor.
        """
        device = self.cfg.device
        labels = batch['labels'].to(device)
        outputs = self.model(input_ids=batch['input_ids'].to(device),
                             token_type_ids=batch['token_type_ids'].to(device),
                             attention_mask=batch['attention_mask'].to(device),
                             labels=labels)
        return outputs, labels

    def train_epoch(self):
        """Run one optimisation pass over the training loader.

        Returns:
            The mean per-batch training loss (also printed).
        """
        self.model.train()
        final_loss = 0

        for data in tqdm(self.train_data_loader, total=len(self.train_data_loader)):
            self.optimizer.zero_grad()
            outputs, _ = self._run_batch(data)

            loss = outputs.loss
            loss.backward()
            # Clip the global gradient norm before the parameter update.
            nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()
            final_loss += loss.item()

        loss = final_loss / len(self.train_data_loader)
        print(f"Train loss: {loss}")

        return loss

    def eval_epoch(self):
        """Evaluate the model on the test loader without gradient tracking.

        Returns:
            ``(mean_loss, metrics)`` where ``metrics`` is whatever
            ``compute_metrics`` returns for the collected logits and labels.
        """
        self.model.eval()
        final_loss = 0
        all_labels = []
        all_preds = []

        with torch.no_grad():
            for data in tqdm(self.test_data_loader, total=len(self.test_data_loader)):
                outputs, labels = self._run_batch(data)

                final_loss += outputs.loss.item()

                # Raw logits are collected; any decoding is left to compute_metrics.
                all_preds.extend(outputs.logits.detach().cpu().numpy())
                all_labels.extend(labels.to('cpu').numpy())

        all_preds = np.array(all_preds)
        all_labels = np.asarray(all_labels)

        metrics = compute_metrics(all_preds, all_labels, self.inv_label_map, False)
        final_loss = final_loss / len(self.test_data_loader)

        print(f"Eval loss: {final_loss}")
        print(f"Eval Metrics: {metrics}")

        return final_loss, metrics

    def train(self):
        """Train for ``cfg.EPOCHS`` epochs, checkpointing the lowest-eval-loss weights.

        Whenever the evaluation loss improves, a snapshot of the model weights is
        stored in ``self.best_model`` and written to ``self.model_save_path``.
        """
        # Local import keeps this block self-contained without touching the import cell.
        import copy

        for epoch in range(self.cfg.EPOCHS):
            print(f"Training Epoch: {epoch + 1}")
            self.train_epoch()

            print(f"Evaluating Epoch: {epoch + 1}")
            eval_loss, _ = self.eval_epoch()

            if eval_loss < self.best_eval_loss:
                self.best_eval_loss = eval_loss
                # state_dict() returns references to the live parameter tensors, so it
                # must be deep-copied; otherwise later epochs overwrite the "best" weights.
                self.best_model = copy.deepcopy(self.model.state_dict())
                torch.save(self.best_model, self.model_save_path)


# Entry point: build the trainer against the evaluation file and run the full
# training loop (the guard is also true when the cell runs inside a notebook kernel).
if __name__ == '__main__':
    test_dataset_path = "TestingData.txt"
    ner_trainer = NERTrainer(test_dataset_path=test_dataset_path)
    ner_trainer.train()


Data: 27268 , Sentences: 27268 , Tags: 27268
Data: 856 , Sentences: 856 , Tags: 856


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Epoch: 1




  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.2294567667188183
Evaluating Epoch: 1


  0%|          | 0/54 [00:00<?, ?it/s]



Eval loss: 0.518251573735917
Eval Metrics: {'accuracy_score': 0.8902358490566038, 'precision': 0.6436855670103093, 'recall': 0.43359375, 'f1': 0.5181535269709543}
Training Epoch: 2


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.13192784864226045
Evaluating Epoch: 2


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.5110966275204663
Eval Metrics: {'accuracy_score': 0.8921698113207547, 'precision': 0.6338924233661076, 'recall': 0.4756944444444444, 'f1': 0.5435159930572775}
Training Epoch: 3


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.10532371480771698
Evaluating Epoch: 3


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.5663646755533086
Eval Metrics: {'accuracy_score': 0.8937264150943396, 'precision': 0.6423957721667646, 'recall': 0.4748263888888889, 'f1': 0.5460444222610432}
Training Epoch: 4


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.08838365914414792
Evaluating Epoch: 4


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.5957603935083305
Eval Metrics: {'accuracy_score': 0.8925943396226416, 'precision': 0.6452941176470588, 'recall': 0.4761284722222222, 'f1': 0.547952047952048}
Training Epoch: 5


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.07663462626532026
Evaluating Epoch: 5


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.7130870754589085
Eval Metrics: {'accuracy_score': 0.890377358490566, 'precision': 0.6512210394489668, 'recall': 0.4513888888888889, 'f1': 0.533196616252243}
Training Epoch: 6


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.06761237015166591
Evaluating Epoch: 6


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.82756708158801
Eval Metrics: {'accuracy_score': 0.8881603773584905, 'precision': 0.6272335181762169, 'recall': 0.4418402777777778, 'f1': 0.5184619302266361}
Training Epoch: 7


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.060515024859418606
Evaluating Epoch: 7


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.8772957626454256
Eval Metrics: {'accuracy_score': 0.8919811320754717, 'precision': 0.6416009702850213, 'recall': 0.4592013888888889, 'f1': 0.5352896534277763}
Training Epoch: 8


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.05466273433113705
Evaluating Epoch: 8


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.9212848657604169
Eval Metrics: {'accuracy_score': 0.8918396226415094, 'precision': 0.6449451887941535, 'recall': 0.4596354166666667, 'f1': 0.5367460719716168}
Training Epoch: 9


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.05071124992185871
Evaluating Epoch: 9


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.9474132106880899
Eval Metrics: {'accuracy_score': 0.8946226415094339, 'precision': 0.6488095238095238, 'recall': 0.4730902777777778, 'f1': 0.5471887550200804}
Training Epoch: 10


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.046045837715224434
Evaluating Epoch: 10


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.9547588255394388
Eval Metrics: {'accuracy_score': 0.8919811320754717, 'precision': 0.6621287128712872, 'recall': 0.4644097222222222, 'f1': 0.5459183673469389}
Training Epoch: 11


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.04296648622943837
Evaluating Epoch: 11


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.9535812002227262
Eval Metrics: {'accuracy_score': 0.8938679245283019, 'precision': 0.6619018023617154, 'recall': 0.4622395833333333, 'f1': 0.5443393815486839}
Training Epoch: 12


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.04029287374332613
Evaluating Epoch: 12


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.0369078187892835
Eval Metrics: {'accuracy_score': 0.8934905660377358, 'precision': 0.6459701492537313, 'recall': 0.4696180555555556, 'f1': 0.5438552400100527}
Training Epoch: 13


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.03800945744557508
Evaluating Epoch: 13


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.003760863267989
Eval Metrics: {'accuracy_score': 0.8933490566037736, 'precision': 0.642434988179669, 'recall': 0.4717881944444444, 'f1': 0.5440440440440439}
Training Epoch: 14


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.03605194869701723
Evaluating Epoch: 14


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.0230379826906655
Eval Metrics: {'accuracy_score': 0.8937264150943396, 'precision': 0.6516587677725119, 'recall': 0.4774305555555556, 'f1': 0.5511022044088176}
Training Epoch: 15


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.03435237121019277
Evaluating Epoch: 15


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.0768399826354451
Eval Metrics: {'accuracy_score': 0.8925471698113208, 'precision': 0.6566153846153846, 'recall': 0.4631076388888889, 'f1': 0.5431407482820056}
Training Epoch: 16


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.03266017355594601
Evaluating Epoch: 16


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.1192223235136933
Eval Metrics: {'accuracy_score': 0.8928301886792452, 'precision': 0.6472727272727272, 'recall': 0.4635416666666667, 'f1': 0.5402124430955993}
Training Epoch: 17


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.031140081993085017
Evaluating Epoch: 17


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.1309639543846801
Eval Metrics: {'accuracy_score': 0.8946698113207547, 'precision': 0.6470588235294118, 'recall': 0.4774305555555556, 'f1': 0.5494505494505494}
Training Epoch: 18


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.03018947002512766
Evaluating Epoch: 18


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.1689056327083596
Eval Metrics: {'accuracy_score': 0.8932547169811321, 'precision': 0.6571428571428571, 'recall': 0.4691840277777778, 'f1': 0.5474803747784249}
Training Epoch: 19


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.028989540649095927
Evaluating Epoch: 19


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.2072874757426757
Eval Metrics: {'accuracy_score': 0.894245283018868, 'precision': 0.6566083283041642, 'recall': 0.4722222222222222, 'f1': 0.5493562231759656}
Training Epoch: 20


  0%|          | 0/1705 [00:00<?, ?it/s]

Train loss: 0.027854127639536253
Evaluating Epoch: 20


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.22627540477724
Eval Metrics: {'accuracy_score': 0.894245283018868, 'precision': 0.6590361445783133, 'recall': 0.4748263888888889, 'f1': 0.5519677093844603}
