In [1]:
from preprocess import NERDataset
from cleaning import DataReader
import numpy as np
from utils import compute_metrics, get_label_map, get_inv_label_map, read_labels
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForTokenClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
import torch
from torch import nn
from config import Config

from transformers import AutoTokenizer, BertForTokenClassification # Import AutoTokenizer and BertForTokenClassification from the transformers library for NLP tasks.
import torch # Import the PyTorch library for tensor computations and deep learning.
import numpy as np # Import NumPy for numerical operations and array manipulations.
import argparse # Import argparse for parsing command-line arguments.
from typing import List # Import List from the typing module for type annotations.
from config import Config # Import Config class from the config module, used for loading and accessing configuration settings.
# Import utility functions: read_labels (to read label data), get_label_map and get_inv_label_map (for mapping labels to indices and vice versa).
from utils import read_labels, get_label_map, get_inv_label_map
import argparse # Re-import argparse (duplicate import, not necessary).
import sys # Import sys for interacting with the Python interpreter (e.g., command-line arguments, system exit).

import os
import numpy as np
from fuzzywuzzy import fuzz
import re

import torch.nn.functional as F
from typing import Tuple

  from pandas.core import (
2024-06-13 03:48:23.202741: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-13 03:48:23.202878: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-13 03:48:23.204646: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-13 03:48:23.217892: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Parse the labelled corpus into parallel lists: sentences[k] holds the words
# of the k-th sentence and labels[k] the tag of each of those words.
# Every non-blank line is "<label> <word>"; a blank line ends a sentence.
sentences = []
labels = []

curr_sentence = []
curr_labels = []

# Decode explicitly as UTF-8: the corpus is Arabic text and the platform's
# default encoding is not guaranteed to be UTF-8.
with open("Labeled_SemiLabeledData.txt", "r", encoding="utf-8") as file:
    for line_number, line in enumerate(file, start=1):
        fields = line.split()

        if fields:
            if len(fields) < 2:
                # Fail with the offending location instead of a bare IndexError.
                raise ValueError(
                    f"Line {line_number}: expected '<label> <word>', got {line!r}"
                )
            label = fields[0]
            word = fields[1]

            curr_sentence.append(word)
            curr_labels.append(label)
        else:
            # Blank (or whitespace-only) line: close the current sentence.
            sentences.append(curr_sentence)
            labels.append(curr_labels)
            curr_sentence = []
            curr_labels = []

# Keep the last sentence when the file does not end with a blank line;
# without this flush it would be silently discarded.
if curr_sentence:
    sentences.append(curr_sentence)
    labels.append(curr_labels)

print("DONE!")

DONE!


In [3]:
# Sanity check: the two lists must be the same length, and the last sentence
# should line up token-for-token with its labels.
print(*map(len, (sentences, labels)))
print(sentences[-1], labels[-1], sep="\n")

21724 21724
['يعد', 'مرض', 'آلزهايمر', 'واحدا', 'من', 'أكثر', 'الأمراض', 'كلفة', 'مالية', 'في', 'الدول', 'المتقدمة', '.']
['OUTSIDE', 'OUTSIDE', 'B-Disease', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE']


In [4]:
def remove_diacritics(text):
    """Return ``text`` with Arabic diacritical marks deleted.

    Every code point in the ranges U+0617-U+061A and U+064B-U+0652 is
    removed; all other characters are returned unchanged and in order.
    """
    diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    return diacritics.sub("", text)

In [5]:
# Strip diacritics from every word. Slice assignment rewrites each sentence
# list in place, so the list objects held in `sentences` stay the same.
for tokens in sentences:
    tokens[:] = [remove_diacritics(token) for token in tokens]

In [6]:
class NERTrainer:
    """Fine-tune a BERT token-classification model and evaluate it every epoch.

    The constructor loads the training and evaluation files, builds the
    datasets and data loaders, instantiates the model named by
    ``Config().MODEL_NAME`` and prepares the optimizer and a linear
    learning-rate schedule. ``train()`` then runs ``Config().EPOCHS`` epochs
    and checkpoints the weights with the lowest evaluation loss.

    Attributes set by ``__init__`` and updated by ``train()``:
        best_eval_loss: lowest evaluation loss seen so far (starts at +inf).
        best_model: detached CPU copy of the state dict that achieved
            ``best_eval_loss`` (``None`` until the first epoch finishes).
        checkpoint_path: file the best state dict is written to.
    """

    def __init__(self, test_dataset_path,
                 train_dataset_path="Labeled_SemiLabeledData.txt",
                 labels_path='NewEntities.txt',
                 checkpoint_path="StudentModel.pt",
                 learning_rate=5e-5):
        """Load data, build loaders, and create the model, optimizer and scheduler.

        Args:
            test_dataset_path: file read with ``DataReader`` and evaluated
                after every epoch.
            train_dataset_path: file read with ``DataReader`` for training.
            labels_path: file passed to ``read_labels`` to obtain the label list.
            checkpoint_path: where ``train()`` saves the best state dict.
            learning_rate: learning rate handed to the optimizer.

        The defaults of the keyword parameters are the values that used to be
        hard-coded, so ``NERTrainer(path)`` behaves as before.
        """
        self.cfg = Config()
        self.checkpoint_path = checkpoint_path

        self.data_reader = DataReader(train_dataset_path)
        # read_data_bert() yields three values; only the first is used here.
        # It is indexed below as x[0] (texts) and x[1] (tags).
        self.data, _, _ = self.data_reader.read_data_bert()
        self.label_list = read_labels(labels_path)

        self.label_map = get_label_map(self.label_list)
        self.inv_label_map = get_inv_label_map(self.label_list)

        # Load the test dataset
        self.test_data_reader = DataReader(test_dataset_path)
        self.test_data, _, _ = self.test_data_reader.read_data_bert()

        self.TOKENIZER = AutoTokenizer.from_pretrained(self.cfg.MODEL_NAME)

        self.train_dataset = NERDataset(
            texts=[x[0] for x in self.data],
            tags=[x[1] for x in self.data],
            label_list=self.label_list,
            model_name=self.cfg.MODEL_NAME,
            max_length=self.cfg.MAX_LEN
        )

        self.test_dataset = NERDataset(
            texts=[x[0] for x in self.test_data],
            tags=[x[1] for x in self.test_data],
            label_list=self.label_list,
            model_name=self.cfg.MODEL_NAME,
            max_length=self.cfg.MAX_LEN
        )

        # Only the training loader is shuffled; evaluation order is kept fixed.
        self.train_data_loader = DataLoader(dataset=self.train_dataset, batch_size=self.cfg.TRAIN_BATCH_SIZE, shuffle=True)
        self.test_data_loader = DataLoader(dataset=self.test_dataset, batch_size=self.cfg.VALID_BATCH_SIZE, shuffle=False)

        self.model = BertForTokenClassification.from_pretrained(self.cfg.MODEL_NAME,
                                                                return_dict=True,
                                                                num_labels=len(self.label_map),
                                                                output_attentions=False,
                                                                output_hidden_states=False).to(self.cfg.device)

        # NOTE(review): transformers.AdamW is deprecated upstream. It is kept
        # because `correct_bias=False` has no torch.optim.AdamW equivalent and
        # switching optimizers would change training behaviour.
        self.optimizer = AdamW(self.model.parameters(), lr=learning_rate, correct_bias=False)
        # The scheduler is stepped once per batch, so its horizon is
        # batches-per-epoch times the number of epochs.
        total_steps = len(self.train_data_loader) * self.cfg.EPOCHS

        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        self.best_eval_loss = float('inf')
        self.best_model = None

    def train_epoch(self):
        """Run one pass over the training loader and return the mean batch loss."""
        self.model.train()
        final_loss = 0

        for data in tqdm(self.train_data_loader, total=len(self.train_data_loader)):
            input_ids = data['input_ids'].to(self.cfg.device)
            attention_mask = data['attention_mask'].to(self.cfg.device)
            token_type_ids = data['token_type_ids'].to(self.cfg.device)
            labels = data['labels'].to(self.cfg.device)

            self.optimizer.zero_grad()
            # Passing `labels` makes the model return the loss with the logits.
            outputs = self.model(input_ids=input_ids,
                                 token_type_ids=token_type_ids,
                                 attention_mask=attention_mask,
                                 labels=labels)

            loss = outputs.loss
            loss.backward()
            # Clip the global gradient norm to 1.0 before the optimizer step.
            nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()
            final_loss += loss.item()

        loss = final_loss / len(self.train_data_loader)
        print(f"Train loss: {loss}")

        return loss

    def eval_epoch(self):
        """Evaluate on the test loader without gradient tracking.

        Returns:
            A ``(mean_batch_loss, metrics)`` tuple, where ``metrics`` is
            whatever ``compute_metrics`` returns for the stacked logits and
            labels of the whole loader.
        """
        self.model.eval()
        final_loss = 0
        all_labels = []
        all_preds = []

        with torch.no_grad():
            for data in tqdm(self.test_data_loader, total=len(self.test_data_loader)):
                input_ids = data['input_ids'].to(self.cfg.device)
                attention_mask = data['attention_mask'].to(self.cfg.device)
                token_type_ids = data['token_type_ids'].to(self.cfg.device)
                labels = data['labels'].to(self.cfg.device)

                outputs = self.model(input_ids=input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask,
                                     labels=labels)

                loss = outputs.loss
                final_loss += loss.item()

                # Move results to host memory so they can be accumulated as
                # NumPy arrays across batches.
                logits = outputs.logits.detach().cpu().numpy()
                labels = labels.to('cpu').numpy()

                # extend() adds one entry per example in the batch.
                all_preds.extend(logits)
                all_labels.extend(labels)

        all_preds = np.array(all_preds)
        all_labels = np.asarray(all_labels)

        # NOTE(review): the meaning of the trailing `False` flag is defined in
        # utils.compute_metrics and is not visible from here.
        metrics = compute_metrics(all_preds, all_labels, self.inv_label_map, False)
        final_loss = final_loss / len(self.test_data_loader)

        print(f"Eval loss: {final_loss}")
        print(f"Eval Metrics: {metrics}")

        return final_loss, metrics

    def _snapshot_state_dict(self):
        """Return a detached CPU copy of the model's current state dict.

        ``state_dict()`` hands back tensors that share storage with the live
        parameters, so keeping that object would not preserve the weights
        once training continues. Cloning freezes them.
        """
        snapshot = self.model.state_dict()
        for name, tensor in list(snapshot.items()):
            snapshot[name] = tensor.detach().cpu().clone()
        return snapshot

    def train(self):
        """Train for ``cfg.EPOCHS`` epochs, checkpointing the lowest-eval-loss weights.

        After each epoch the model is evaluated; when the evaluation loss
        improves, a frozen copy of the weights is stored in ``self.best_model``
        and written to ``self.checkpoint_path``.
        """
        # NOTE(review): model selection uses the loader built from
        # `test_dataset_path`; if that file is the final test set, the
        # reported metrics are selection-biased. Confirm this is intended.
        for epoch in range(self.cfg.EPOCHS):
            print(f"Training Epoch: {epoch + 1}")
            self.train_epoch()

            print(f"Evaluating Epoch: {epoch + 1}")
            eval_loss, _ = self.eval_epoch()

            if eval_loss < self.best_eval_loss:
                self.best_eval_loss = eval_loss
                # Store a copy, not the live state dict: later epochs keep
                # updating the parameters in place.
                self.best_model = self._snapshot_state_dict()
                torch.save(self.best_model, self.checkpoint_path)


if __name__ == '__main__':
    # Build the trainer against the evaluation file, then run the full
    # training loop; the trainer checkpoints its best weights itself.
    test_dataset_path = "TestingData.txt"
    ner_trainer = NERTrainer(test_dataset_path=test_dataset_path)
    ner_trainer.train()
    print("DONE!")

Data: 21724 , Sentences: 21724 , Tags: 21724
Data: 856 , Sentences: 856 , Tags: 856


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Epoch: 1




  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.2241937559418962
Evaluating Epoch: 1


  0%|          | 0/54 [00:00<?, ?it/s]



Eval loss: 0.5785462433058355
Eval Metrics: {'accuracy_score': 0.8723885365973554, 'precision': 0.5720279720279721, 'recall': 0.3550347222222222, 'f1': 0.4381360471344403}
Training Epoch: 2


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.12284210735495821
Evaluating Epoch: 2


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.6368755266001379
Eval Metrics: {'accuracy_score': 0.8790247259499582, 'precision': 0.6093544137022397, 'recall': 0.4014756944444444, 'f1': 0.48403976975405544}
Training Epoch: 3


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.09502660423388296
Evaluating Epoch: 3


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.7409004374165777
Eval Metrics: {'accuracy_score': 0.8799587081551393, 'precision': 0.6554989075018208, 'recall': 0.390625, 'f1': 0.48952950775088383}
Training Epoch: 4


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.07904220175886847
Evaluating Epoch: 4


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.7106025096856885
Eval Metrics: {'accuracy_score': 0.876812662832424, 'precision': 0.5952063914780293, 'recall': 0.3880208333333333, 'f1': 0.46978455070940617}
Training Epoch: 5


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.06749599689064995
Evaluating Epoch: 5


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.7272277344363155
Eval Metrics: {'accuracy_score': 0.880843533402153, 'precision': 0.6119205298013245, 'recall': 0.4010416666666667, 'f1': 0.48453067645516523}
Training Epoch: 6


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.058933742754147644
Evaluating Epoch: 6


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.883101160362087
Eval Metrics: {'accuracy_score': 0.8786806272427862, 'precision': 0.6108815426997245, 'recall': 0.3849826388888889, 'f1': 0.472310969116081}
Training Epoch: 7


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.05286339055835949
Evaluating Epoch: 7


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.0821369019578453
Eval Metrics: {'accuracy_score': 0.8778449589539399, 'precision': 0.6071188717259905, 'recall': 0.3923611111111111, 'f1': 0.47666754547851303}
Training Epoch: 8


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.04733116663385468
Evaluating Epoch: 8


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.9450830361909337
Eval Metrics: {'accuracy_score': 0.8809418473184879, 'precision': 0.5907928388746803, 'recall': 0.4010416666666667, 'f1': 0.47776628748707345}
Training Epoch: 9


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.04331877847515535
Evaluating Epoch: 9


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.0522784943071504
Eval Metrics: {'accuracy_score': 0.876468564125252, 'precision': 0.6148308135349172, 'recall': 0.3706597222222222, 'f1': 0.46249661521797997}
Training Epoch: 10


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.03987832376661413
Evaluating Epoch: 10


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.1103051307576675
Eval Metrics: {'accuracy_score': 0.8785331563682839, 'precision': 0.5980392156862745, 'recall': 0.3971354166666667, 'f1': 0.47730829420970267}
Training Epoch: 11


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.03705516251910612
Evaluating Epoch: 11


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.114584366928924
Eval Metrics: {'accuracy_score': 0.8792705107407953, 'precision': 0.5916824196597353, 'recall': 0.4075520833333333, 'f1': 0.4826522744795682}
Training Epoch: 12


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.034745620163887964
Evaluating Epoch: 12


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.1650072342405717
Eval Metrics: {'accuracy_score': 0.8770092906650937, 'precision': 0.6054982817869415, 'recall': 0.3823784722222222, 'f1': 0.4687416866187816}
Training Epoch: 13


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.03254670706782997
Evaluating Epoch: 13


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.183531285918973
Eval Metrics: {'accuracy_score': 0.878778941159121, 'precision': 0.6064814814814815, 'recall': 0.3980034722222222, 'f1': 0.4806079664570231}
Training Epoch: 14


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.030693079641088673
Evaluating Epoch: 14


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.2150339333823434
Eval Metrics: {'accuracy_score': 0.8786314702846187, 'precision': 0.602496714848883, 'recall': 0.3980034722222222, 'f1': 0.4793518034500784}
Training Epoch: 15


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.029290905543617577
Evaluating Epoch: 15


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.2293675292835191
Eval Metrics: {'accuracy_score': 0.8773042324140982, 'precision': 0.6009421265141319, 'recall': 0.3875868055555556, 'f1': 0.4712401055408971}
Training Epoch: 16


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.02768696439200865
Evaluating Epoch: 16


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.3306724572623219
Eval Metrics: {'accuracy_score': 0.8769109767487588, 'precision': 0.6028708133971292, 'recall': 0.3828125, 'f1': 0.46827714361560924}
Training Epoch: 17


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.026501539913475904
Evaluating Epoch: 17


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.3562464725778058
Eval Metrics: {'accuracy_score': 0.8781399007029445, 'precision': 0.6, 'recall': 0.3880208333333333, 'f1': 0.4712704269899841}
Training Epoch: 18


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.025203276272526335
Evaluating Epoch: 18


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.3639080892834399
Eval Metrics: {'accuracy_score': 0.8775991741631027, 'precision': 0.601078167115903, 'recall': 0.3871527777777778, 'f1': 0.470960929250264}
Training Epoch: 19


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.024498248856496003
Evaluating Epoch: 19


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.3793820437319853
Eval Metrics: {'accuracy_score': 0.8775008602467679, 'precision': 0.6025469168900804, 'recall': 0.3901909722222222, 'f1': 0.47365648050579556}
Training Epoch: 20


  0%|          | 0/1358 [00:00<?, ?it/s]

Train loss: 0.0234110610485773
Evaluating Epoch: 20


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.409962546011364
Eval Metrics: {'accuracy_score': 0.8776483311212702, 'precision': 0.6018766756032171, 'recall': 0.3897569444444444, 'f1': 0.4731296101159115}
DONE!


In [7]:
# Persist the interpreter session (every picklable global of this notebook)
# so it can be restored later without re-running the cells above.
import dill

dill.dump_session(filename='StudentCompleted.db')