In [1]:
from preprocess import NERDataset
from cleaning import DataReader
import numpy as np
from utils import compute_metrics, get_label_map, get_inv_label_map, read_labels
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForTokenClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
import torch
from torch import nn
from config import Config

from transformers import AutoTokenizer, BertForTokenClassification # Import AutoTokenizer and BertForTokenClassification from the transformers library for NLP tasks.
import torch # Import the PyTorch library for tensor computations and deep learning.
import numpy as np # Import NumPy for numerical operations and array manipulations.
import argparse # Import argparse for parsing command-line arguments.
from typing import List # Import List from the typing module for type annotations.
from config import Config # Import Config class from the config module, used for loading and accessing configuration settings.
# Import utility functions: read_labels (to read label data), get_label_map and get_inv_label_map (for mapping labels to indices and vice versa).
from utils import read_labels, get_label_map, get_inv_label_map
import argparse # Re-import argparse (duplicate import, not necessary).
import sys # Import sys for interacting with the Python interpreter (e.g., command-line arguments, system exit).

import os
import numpy as np
from fuzzywuzzy import fuzz
import re

import torch.nn.functional as F
from typing import Tuple

  from pandas.core import (
2024-06-12 22:53:40.808019: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-12 22:53:40.808145: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-12 22:53:40.809912: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-12 22:53:40.833728: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
sentences = []
labels = []

curr_sentence = []
curr_labels = []

with open("LabeledArticles_G.txt", "r") as file:
    for line in file:
        if line != "\n":
            label = line.split()[0]
            word = line.split()[1]
            
            curr_sentence.append(word)
            curr_labels.append(label)
        else:
            sentences.append(curr_sentence)
            labels.append(curr_labels)
            curr_sentence = []
            curr_labels = []
            
print("DONE!")           

DONE!


In [3]:
print(len(sentences), len(labels))
print(sentences[-1])
print(labels[-1])

5545 5545
['ولهذا', 'السبب', 'يمكن', 'وصف', 'مثبطات', 'إعادة', 'امتصاص', 'السيروتونين', 'الانتقائية', 'لعلاج', 'سرعة', 'القذف', '.']
['OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE', 'OUTSIDE']


In [4]:
def remove_diacritics(text):
    return re.sub(re.compile(r'[\u0617-\u061A\u064B-\u0652]'),"", text)

In [5]:
for i in range(len(sentences)):
    for j in range(len(sentences[i])):
        sentences[i][j] = remove_diacritics(sentences[i][j])

In [6]:
class NERTrainer:
    def __init__(self, test_dataset_path):
        self.cfg = Config()
        self.data_reader = DataReader("LabeledArticles_G.txt")
        self.data, _, _ = self.data_reader.read_data_bert()
        self.label_list = read_labels('NewEntities.txt')

        self.label_map = get_label_map(self.label_list)
        self.inv_label_map = get_inv_label_map(self.label_list)

        # Load the test dataset
        self.test_data_reader = DataReader(test_dataset_path)
        self.test_data, _, _ = self.test_data_reader.read_data_bert()

        self.TOKENIZER = AutoTokenizer.from_pretrained(self.cfg.MODEL_NAME)

        self.train_dataset = NERDataset(
            texts=[x[0] for x in self.data],
            tags=[x[1] for x in self.data],
            label_list=self.label_list,
            model_name=self.cfg.MODEL_NAME,
            max_length=self.cfg.MAX_LEN
        )

        self.test_dataset = NERDataset(
            texts=[x[0] for x in self.test_data],
            tags=[x[1] for x in self.test_data],
            label_list=self.label_list,
            model_name=self.cfg.MODEL_NAME,
            max_length=self.cfg.MAX_LEN
        )

        self.train_data_loader = DataLoader(dataset=self.train_dataset, batch_size=self.cfg.TRAIN_BATCH_SIZE, shuffle=True)
        self.test_data_loader = DataLoader(dataset=self.test_dataset, batch_size=self.cfg.VALID_BATCH_SIZE, shuffle=False)

        self.model = BertForTokenClassification.from_pretrained(self.cfg.MODEL_NAME,
                                                                return_dict=True,
                                                                num_labels=len(self.label_map),
                                                                output_attentions=False,
                                                                output_hidden_states=False).to(self.cfg.device)

        self.optimizer = AdamW(self.model.parameters(), lr=5e-5, correct_bias=False)
        total_steps = len(self.train_data_loader) * self.cfg.EPOCHS

        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        self.best_eval_loss = float('inf')
        self.best_model = None

    def train_epoch(self):
        self.model.train()
        final_loss = 0

        for data in tqdm(self.train_data_loader, total=len(self.train_data_loader)):
            input_ids = data['input_ids'].to(self.cfg.device)
            attention_mask = data['attention_mask'].to(self.cfg.device)
            token_type_ids = data['token_type_ids'].to(self.cfg.device)
            labels = data['labels'].to(self.cfg.device)

            self.optimizer.zero_grad()
            outputs = self.model(input_ids=input_ids,
                                 token_type_ids=token_type_ids,
                                 attention_mask=attention_mask,
                                 labels=labels)

            loss = outputs.loss
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()
            final_loss += loss.item()

        loss = final_loss / len(self.train_data_loader)
        print(f"Train loss: {loss}")

        return loss

    def eval_epoch(self):
        self.model.eval()
        final_loss = 0
        all_labels = []
        all_preds = []

        with torch.no_grad():
            for data in tqdm(self.test_data_loader, total=len(self.test_data_loader)):
                input_ids = data['input_ids'].to(self.cfg.device)
                attention_mask = data['attention_mask'].to(self.cfg.device)
                token_type_ids = data['token_type_ids'].to(self.cfg.device)
                labels = data['labels'].to(self.cfg.device)

                outputs = self.model(input_ids=input_ids,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask,
                                     labels=labels)

                loss = outputs.loss
                final_loss += loss.item()

                logits = outputs.logits.detach().cpu().numpy()
                labels = labels.to('cpu').numpy()

                all_preds.extend(logits)
                all_labels.extend(labels)

        all_preds = np.array(all_preds)
        all_labels = np.asarray(all_labels)

        metrics = compute_metrics(all_preds, all_labels, self.inv_label_map, False)
        final_loss = final_loss / len(self.test_data_loader)

        print(f"Eval loss: {final_loss}")
        print(f"Eval Metrics: {metrics}")

        return final_loss, metrics

    def train(self):
        for epoch in range(self.cfg.EPOCHS):
            print(f"Training Epoch: {epoch + 1}")
            self.train_epoch()

            print(f"Evaluating Epoch: {epoch + 1}")
            eval_loss, _ = self.eval_epoch()

            if eval_loss < self.best_eval_loss:
                self.best_eval_loss = eval_loss
                self.best_model = self.model.state_dict()
                torch.save(self.best_model, "TeacherG_Model.pt")


if __name__ == '__main__':
    test_dataset_path = "TestingData.txt"  # Path to the custom test dataset
    ner_trainer = NERTrainer(test_dataset_path)
    ner_trainer.train()
    print("DONE!")

Data: 5545 , Sentences: 5545 , Tags: 5545
Data: 856 , Sentences: 856 , Tags: 856


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Epoch: 1




  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.4689902917881864
Evaluating Epoch: 1


  0%|          | 0/54 [00:00<?, ?it/s]



Eval loss: 0.6059350295475235
Eval Metrics: {'accuracy_score': 0.8731750479280342, 'precision': 0.592638036809816, 'recall': 0.4192708333333333, 'f1': 0.49110320284697506}
Training Epoch: 2


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.21957403593173289
Evaluating Epoch: 2


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.6039681938235406
Eval Metrics: {'accuracy_score': 0.8803519638204788, 'precision': 0.626733921815889, 'recall': 0.4314236111111111, 'f1': 0.5110539845758355}
Training Epoch: 3


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.15388356267289402
Evaluating Epoch: 3


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.6255987374870865
Eval Metrics: {'accuracy_score': 0.8830064395615199, 'precision': 0.5871866295264624, 'recall': 0.4574652777777778, 'f1': 0.5142717736033179}
Training Epoch: 4


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.11581500417281812
Evaluating Epoch: 4


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.7595524376741162
Eval Metrics: {'accuracy_score': 0.8839404217667011, 'precision': 0.6110134739308729, 'recall': 0.4526909722222222, 'f1': 0.5200698080279232}
Training Epoch: 5


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.09028154863876636
Evaluating Epoch: 5


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.809215217138882
Eval Metrics: {'accuracy_score': 0.8823182421471759, 'precision': 0.5975609756097561, 'recall': 0.4466145833333333, 'f1': 0.511177347242921}
Training Epoch: 6


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.07631284804472577
Evaluating Epoch: 6


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.7961839575458456
Eval Metrics: {'accuracy_score': 0.8846286191810451, 'precision': 0.5923248053392659, 'recall': 0.4622395833333333, 'f1': 0.5192588980984885}
Training Epoch: 7


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.06416562255955052
Evaluating Epoch: 7


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.8373791569912875
Eval Metrics: {'accuracy_score': 0.8842845204738731, 'precision': 0.5769230769230769, 'recall': 0.4752604166666667, 'f1': 0.5211803902903379}
Training Epoch: 8


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.05651189106105899
Evaluating Epoch: 8


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.8437369910103304
Eval Metrics: {'accuracy_score': 0.8840387356830359, 'precision': 0.5982192543127435, 'recall': 0.4665798611111111, 'f1': 0.5242623750304805}
Training Epoch: 9


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.051024890912734294
Evaluating Epoch: 9


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.854430982084186
Eval Metrics: {'accuracy_score': 0.8852676596372216, 'precision': 0.5929108485499462, 'recall': 0.4791666666666667, 'f1': 0.530004800768123}
Training Epoch: 10


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.04653405823532803
Evaluating Epoch: 10


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.9700526621330667
Eval Metrics: {'accuracy_score': 0.8850218748463845, 'precision': 0.6017997750281214, 'recall': 0.4644097222222222, 'f1': 0.5242528172464477}
Training Epoch: 11


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.042946181708265106
Evaluating Epoch: 11


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.9584150235686038
Eval Metrics: {'accuracy_score': 0.8841862065575382, 'precision': 0.6114318706697459, 'recall': 0.4596354166666667, 'f1': 0.5247770069375619}
Training Epoch: 12


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.03935922264699859
Evaluating Epoch: 12


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.9524180121995784
Eval Metrics: {'accuracy_score': 0.8839895787248685, 'precision': 0.5943600867678959, 'recall': 0.4756944444444444, 'f1': 0.5284474445515912}
Training Epoch: 13


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.03613666059168845
Evaluating Epoch: 13


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 0.9665944060241735
Eval Metrics: {'accuracy_score': 0.8841862065575382, 'precision': 0.5970982142857143, 'recall': 0.4644097222222222, 'f1': 0.5224609375}
Training Epoch: 14


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.034737942031093524
Evaluating Epoch: 14


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.0519982190043837
Eval Metrics: {'accuracy_score': 0.8831539104360222, 'precision': 0.6087209302325581, 'recall': 0.4544270833333333, 'f1': 0.5203777335984094}
Training Epoch: 15


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.03224588186555255
Evaluating Epoch: 15


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.048440913911219
Eval Metrics: {'accuracy_score': 0.8837437939340314, 'precision': 0.5962732919254659, 'recall': 0.4583333333333333, 'f1': 0.5182822085889571}
Training Epoch: 16


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.03060057525229788
Evaluating Epoch: 16


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.0977225838987916
Eval Metrics: {'accuracy_score': 0.8839895787248685, 'precision': 0.6069819819819819, 'recall': 0.4678819444444444, 'f1': 0.5284313725490195}
Training Epoch: 17


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.029367484422722665
Evaluating Epoch: 17


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.110070372345271
Eval Metrics: {'accuracy_score': 0.8830555965196873, 'precision': 0.60250569476082, 'recall': 0.4592013888888889, 'f1': 0.5211822660098523}
Training Epoch: 18


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.027367006767922288
Evaluating Epoch: 18


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.117373406611107
Eval Metrics: {'accuracy_score': 0.8830555965196873, 'precision': 0.5958751393534002, 'recall': 0.4639756944444444, 'f1': 0.5217179111761836}
Training Epoch: 19


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.02556927624575912
Evaluating Epoch: 19


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.122582860842899
Eval Metrics: {'accuracy_score': 0.8835471661013616, 'precision': 0.5973229224762967, 'recall': 0.46484375, 'f1': 0.5228215767634855}
Training Epoch: 20


  0%|          | 0/347 [00:00<?, ?it/s]

Train loss: 0.024970477223309074
Evaluating Epoch: 20


  0%|          | 0/54 [00:00<?, ?it/s]

Eval loss: 1.126682222717338
Eval Metrics: {'accuracy_score': 0.8836454800176965, 'precision': 0.5992196209587514, 'recall': 0.4665798611111111, 'f1': 0.52464616886286}
DONE!


In [10]:
sentences = []
labels = []

curr_sentence = []
curr_labels = []

with open("SemiLabeledArticles_G.txt", "r") as file:
    for line in file:
        if line != "\n":
            label = line.split()[0]
            word = line.split()[1]
            
            curr_sentence.append(word)
            curr_labels.append(label)
        else:
            sentences.append(curr_sentence)
            labels.append(curr_labels)
            curr_sentence = []
            curr_labels = []
            
print("DONE!")           

DONE!


In [11]:
print(len(sentences), len(labels))

21726 21726


In [12]:
def remove_diacritics(text):
    return re.sub(re.compile(r'[\u0617-\u061A\u064B-\u0652]'),"", text)

In [13]:
class NERPredictor:
    def __init__(self, model_path: str):
        self.cfg = Config() # Initialize and load configuration settings from the Config class.
        
        # Read the label list from the specified file path.
        self.label_list = read_labels('NewEntities.txt')
        # Create mappings from labels to indices and vice versa.
        self.label_map = get_label_map(self.label_list)
        self.inv_label_map = get_inv_label_map(self.label_list)

        # Load the pre-trained BERT model for token classification.
        self.model = BertForTokenClassification.from_pretrained(
            self.cfg.MODEL_NAME,
            return_dict=True,
            num_labels=len(self.label_map),
            output_attentions=False,
            output_hidden_states=False
        )

        # Load the saved model weights.
        self.model.load_state_dict(torch.load(model_path, map_location='cpu'))
        # Load the tokenizer associated with the pre-trained BERT model.
        self.tokenizer = AutoTokenizer.from_pretrained(self.cfg.MODEL_NAME)

    def predict(self, sentences: str) -> Tuple[List[str], List[float]]:

        
        foundDecimal = False
        
        tokenArray = sentences.split()
        
        for i in range(0, len(tokenArray)):
            tokenArray[i] = remove_diacritics(tokenArray[i])
            
        sentences = ' '.join(tokenArray)
        # print(sentences)
        
        # Tokenize the input sentence to get input IDs.
        input_ids = self.tokenizer.encode(sentences, return_tensors='pt')
        # print(len(input_ids[0]), input_ids)
        with torch.no_grad(): # Disable gradient calculations for inference.
            self.model.to('cpu') # Ensure the model is on CPU for inference.
            # Get model predictions for the input IDs.
            output = self.model(input_ids)

        logits = output.logits.to('cpu')  # Get logits from the model's output
        probabilities = F.softmax(logits, dim=-1)  # Apply softmax to get probabilities
       
        # Extract predicted labels and confidence scores
        label_indices = torch.argmax(logits, dim=-1).numpy()
        max_probabilities, predicted_labels = torch.max(probabilities, dim=-1)
        confidence_scores = max_probabilities.numpy()

    
        # Convert model logits to label indices.
        # label_indices = np.argmax(output.logits.to('cpu').numpy(), axis=2)
        # Convert input IDs back to tokens.
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])

        new_tokens, new_labels, new_confidence_scores = [], [], []
        merges = []
        
        new_confidence_scores = confidence_scores[0].copy()
        
        label_indices_0 = label_indices[0]
        for i in range(len(tokens)):
            token = tokens[i]
            label_idx = label_indices_0[i]
            # Merge subword tokens that start with "##".
            # print(token, label_idx, "DEBUG", i)
            # print(token, "DEBUG", i)

            if foundDecimal:
                # print("__________foundDecimal__________")
                token = "##" + tokens[i]
                foundDecimal = False
            if token.startswith(".") and tokens[i-1][-1].isdigit() and i != len(tokens)-2:
                # print("__________foundDecimal__________")
                foundDecimal = True
                token = "##" + tokens[i]
            # Disabled below as that means there's an error in the sentences itself
            if token == "%" or token == "٪" and i != 1:
                # print("________foundPercentage_________")
                token = "##" + tokens[i]
                # print(token)
            if (token == "٬" or token == "٫") and (any(char.isdigit() for char in prev_token) and any(char.isdigit() for char in tokens[i+1])):
                # print("___________foundComma___________")
                token = "##" + tokens[i]
                foundDecimal = True
                
            elif (token == "٬" or token == "٫") and not (any(char.isdigit() for char in prev_token) and any(char.isdigit() for char in tokens[i+1])):
                # print("___________foundComma___________")
                continue
                
            if token == "؟" or token == "?":
                # print("___________foundQuestionMark___________")
                token = "##" + tokens[i]                            
            if token.startswith("##"):
                merges.append(i)
                # print("_____________MERGE_______________")
                new_tokens[-1] = new_tokens[-1] + token[2:]

            else:
                if input_ids[0][i] == 2 or input_ids[0][i] == 3:
                    continue
                # Append the label for the token to new_labels.
                new_labels.append(self.inv_label_map[label_idx])
                # Append the token to new_tokens.
                new_tokens.append(token)


                
            prev_token = token   
            
        new_confidence_scores = confidence_scores[0].copy()

        # Iterate through confidence scores
        i = 0
        while i < len(new_confidence_scores):
            # Check if current index is a merge point
            if i in merges:
                # Determine which element to keep and delete the other
                if new_confidence_scores[i] >= new_confidence_scores[i - 1]:
                    new_confidence_scores = np.delete(new_confidence_scores, i - 1)
                    i -= 1  # Adjust index after deletion
                else:
                    new_confidence_scores = np.delete(new_confidence_scores, i)
                    i -= 1  # Adjust index after deletion
                # Adjust merge indices
                merges = [merge - 1 for merge in merges if merge > i]
            i += 1  # Move to the next element


            
            
        # Return the list of labels corresponding to each token in the input.
        return new_tokens, new_labels, new_confidence_scores[1:-1]
    
if __name__ == '__main__':
    predictor = NERPredictor(model_path='TeacherG_Model.pt')

    # index = 10
    # curr_tokens, curr_predicted_labels, curr_confidence_scores = predictor.predict(' '.join(sentences[index]))
    # # predictions[index] = curr_predicted_labels
    # print(len(curr_predicted_labels), len(curr_confidence_scores), len(sentences[index]))
    # print(curr_tokens, curr_predicted_labels, curr_confidence_scores)       
    
    
    predictions = []
    confidence_scores = []
    copySentences = []
    for i in range(0, len(sentences)):
        curr_tokens, curr_predicted_labels, curr_confidence_scores = predictor.predict(' '.join(sentences[i]))
        predictions.append(curr_predicted_labels)
        copySentences.append(curr_tokens)
        confidence_scores.append(curr_confidence_scores)
        if i % 100 == 0:
            print(round((i/len(sentences)) * 100, 2), "%")
        
    print("100 %")
    print("Done!")

    # index = 4
    # index = 4054
    # index = 5105
    # index = 6948
    # index = 9796
    # index = 12370
    # index = 14121
    # index = 18021
    # index = 20745
    # index = 253
    # index = 261
    # index = 268
    
#     curr_tokens, curr_predicted_labels, curr_confidence_scores = predictor.predict(' '.join(sentences[index]))

#     # Print titles
#     print(f"{'Original Token':<20} {'True Label':<20} {'Word':<20} {'Prediction':<20}")

#     # Print tokens and corresponding labels
#     for i in range(0, len(sentences[index])):
#         print(f"{sentences[index][i]:<20} {labels[index][i]:<20} {curr_tokens[i]:<20} {curr_predicted_labels[i]:<20}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.0 %
0.46 %
0.92 %
1.38 %
1.84 %
2.3 %
2.76 %
3.22 %
3.68 %
4.14 %
4.6 %
5.06 %
5.52 %
5.98 %
6.44 %
6.9 %
7.36 %
7.82 %
8.29 %
8.75 %
9.21 %
9.67 %
10.13 %
10.59 %
11.05 %
11.51 %
11.97 %
12.43 %
12.89 %
13.35 %
13.81 %
14.27 %
14.73 %
15.19 %
15.65 %
16.11 %
16.57 %
17.03 %
17.49 %
17.95 %
18.41 %
18.87 %
19.33 %
19.79 %
20.25 %
20.71 %
21.17 %
21.63 %
22.09 %
22.55 %
23.01 %
23.47 %
23.93 %
24.39 %
24.86 %
25.32 %
25.78 %
26.24 %
26.7 %
27.16 %
27.62 %
28.08 %
28.54 %
29.0 %
29.46 %
29.92 %
30.38 %
30.84 %
31.3 %
31.76 %
32.22 %
32.68 %
33.14 %
33.6 %
34.06 %
34.52 %
34.98 %
35.44 %
35.9 %
36.36 %
36.82 %
37.28 %
37.74 %
38.2 %
38.66 %
39.12 %
39.58 %
40.04 %
40.5 %
40.96 %
41.43 %
41.89 %
42.35 %
42.81 %
43.27 %
43.73 %
44.19 %
44.65 %
45.11 %
45.57 %
46.03 %
46.49 %
46.95 %
47.41 %
47.87 %
48.33 %
48.79 %
49.25 %
49.71 %
51.09 %
51.55 %
52.01 %
52.47 %
52.93 %
53.39 %
53.85 %
54.31 %
54.77 %
55.23 %
55.69 %
56.15 %
56.61 %
57.07 %
57.53 %
58.0 %
58.46 %
58.92 %
59.38 %
59.84 %
60

In [14]:
import dill
# dill.load_session('June9STeachingCompleted.db')
dill.dump_session('TeachingG_Completed.db')

In [23]:
index = 0
#     index = 4054
#     index = 5105
#     index = 6948
#     index = 9796
#     index = 12370
#     index = 14121
#     index = 18021
# index = 20745
# index = 253
# index = 6919
# index = 12612
# index = 12617
# index = 20745
curr_tokens, curr_predicted_labels, curr_confidence_scores = predictor.predict(' '.join(sentences[index]))

# print(f"{'Original Token':<20} {'True Label':<20} {'Word':<20} {'Prediction':<20} {'UpdatedLabels':<20}")
print(f"{'Original Token':<20} {'True Label':<20} {'Word':<20} {'Prediction':<20}")


for i in range(0, len(sentences[index])):
    print(f"{sentences[index][i]:<20} {labels[index][i]:<20} {curr_tokens[i]:<20} {curr_predicted_labels[i]:<20}")
    # print(f"{sentences[index][i]:<20} {labels[index][i]:<20} {curr_tokens[i]:<20} {curr_predicted_labels[i]:<20} {updated_labels[index][i]:<20}")


Original Token       True Label           Word                 Prediction          
إنجلترا              B-Country            إنجلترا              B-Country           
أو                   OUTSIDE              أو                   OUTSIDE             
إنكلترا              B-Country            إنكلترا              B-Country           
أو                   OUTSIDE              أو                   OUTSIDE             
إنكلترة              B-Country            إنكلترة              B-Country           
بالإنجليزية          B-Language           بالإنجليزية          B-Language          
إنگلاند              OUTSIDE              [UNK]                OUTSIDE             
هي                   OUTSIDE              هي                   OUTSIDE             
أكبر                 OUTSIDE              أكبر                 OUTSIDE             
دولة                 OUTSIDE              دولة                 OUTSIDE             
في                   OUTSIDE              في                   OUTSIDE      

In [19]:
print(len(sentences), len(predictions), len(labels))

21726 21726 21726


In [20]:
#Snippet to check if we have an error at some index
for i in range(0, len(sentences)):
    # print(len(sentences[i]), len(labels[i]), len(predictions[i]))
    if len(labels[i]) != len(predictions[i]):
        print(i)


In [21]:
updated_labels = []
# len(sentences)

for i in range(0, len(sentences)):
    sentence = sentences[i]
    sentence_labels = labels[i]
    label_confidence = confidence_scores[i]
    predictedLabels = predictions[i]
    nonLabeledCount = 0
    nonLabeledIndex = []
    averageConfidence = 0
    threshold = 0.80
    tmpCurrLabels = []
    
    for j in range(0, len(sentence_labels)):
        curr_label = sentence_labels[j]
        
        if curr_label == "OUTSIDE":
            nonLabeledCount += 1
            nonLabeledIndex.append(j)
            
    for k in range(0, len(label_confidence)):
        if k in nonLabeledIndex:
            averageConfidence += label_confidence[k]
    
    averageConfidence = averageConfidence / nonLabeledCount
    
    if nonLabeledCount > 0 and averageConfidence >= threshold:
        for c in range(0, len(sentence_labels)):
            if c not in nonLabeledIndex:
                tmpCurrLabels.append(sentence_labels[c])

            else:
                # print(c)
                tmpCurrLabels.append(predictedLabels[c])
    else:
        tmpCurrLabels = sentence_labels.copy()
        
    updated_labels.append(tmpCurrLabels)
            
print("DONE!")

DONE!


In [22]:
with open('Labeled_SemiLabeledData_G.txt', 'w') as file:
    for i in range(0, len(updated_labels)):
        for j in range(0, len(updated_labels[i])):
            file.write(f"{updated_labels[i][j]} {sentences[i][j]}\n")
        file.write('\n')
