In [3]:
# ! pip install seqeval transformers datasets tokenizers seqeval evaluate
! pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=c5f22c4f4cbcfae7ea07e871ed6db418418d020a3b7042586aed501bd9505010
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [4]:
import json
import re

import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from datasets import load_dataset, Dataset
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForTokenClassification, BertConfig, DataCollatorForTokenClassification, BertTokenizerFast, TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import BertForTokenClassification

2024-05-05 22:31:43.161648: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-05 22:31:43.161743: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-05 22:31:43.287533: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
def normalize_arabic(text):
    """Normalise common Arabic spelling variants and strip diacritics.

    The substitutions are applied one after another, in this order:
    alif maksura -> yeh, teh marbuta -> heh, hamza/madda alifs -> bare alif,
    Persian kaf -> Arabic kaf, and finally every character in the range
    U+064B..U+065F is removed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # (pattern, replacement) pairs; order is significant because each step
    # operates on the output of the previous one.
    substitutions = (
        (r'[يى]', 'ي'),
        (r'ة', 'ه'),
        (r'[أإآ]', 'ا'),
        (r'ک', 'ك'),
        (r'[\u064B-\u065F]', ''),
    )

    normalized = text
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized

In [5]:
def process_json_file_nested(file_path, tag_to_int):
    """Split a nested-NER JSON file into one flat dataset per nesting depth.

    The file must contain a list of sentences; each sentence has a ``'tokens'``
    list whose items carry a ``'token'`` string and a ``'tags'`` list of
    ``{'value': <tag>}`` dicts, where position ``d`` in ``'tags'`` is the tag at
    depth ``d``. A token with fewer tags than the deepest token in the file is
    labelled ``"O"`` at the missing depths.

    Args:
        file_path: Path of the JSON file to read.
        tag_to_int: Mapping from tag string to integer id. It must contain every
            tag in the file, and ``"O"`` whenever padding is needed.

    Returns:
        A list with one ``{'tokens': [...], 'labels': [...]}`` dict per depth
        (index 0 first). ``'tokens'`` holds one word list per sentence and
        ``'labels'`` the parallel lists of integer ids. The list is empty when
        no token in the file has any tag.

    Raises:
        KeyError: If a tag is missing from ``tag_to_int``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # encoding when reading Arabic text.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Deepest nesting level found anywhere in the file.
    max_depth = max(
        (len(token_info['tags']) for sentence in data for token_info in sentence['tokens']),
        default=0,
    )

    layers = [{'tokens': [], 'labels': []} for _ in range(max_depth)]

    for sentence in data:
        token_infos = sentence['tokens']
        for depth, layer in enumerate(layers):
            label_list = []
            for token_info in token_infos:
                tags = token_info['tags']
                value = tags[depth]['value'] if depth < len(tags) else "O"
                label_list.append(tag_to_int[value])
            # Each layer gets its own token list so layers never share mutable state.
            layer['tokens'].append([token_info['token'] for token_info in token_infos])
            layer['labels'].append(label_list)

    return layers

In [6]:
# Input JSON files for the three data splits.
train_path = '/kaggle/input/ner-nested/split70.json'
valid_path = '/kaggle/input/ner-nested/split10.json'
# NOTE(review): this is the same file as valid_path, so anything reported on the
# "test" split is computed on the validation data — confirm whether a separate
# held-out file was intended here.
test_path = '/kaggle/input/ner-nested/split10.json'

In [7]:
import json

def extract_unique_tags(file_paths):
    """Collect every distinct tag value that occurs in the given JSON files.

    For each token, the ``'value'`` of every entry in its ``'tags'`` list is
    collected, together with the ``'value'`` of every entry in that tag's own
    optional ``'tags'`` list (one level of nesting).

    Args:
        file_paths: Iterable of paths to JSON files. Each file holds a list of
            sentences with a ``'tokens'`` list.

    Returns:
        A ``set`` of tag strings.
    """
    unique_tags = set()
    for file_path in file_paths:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default encoding when reading Arabic text.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        # The file is fully parsed at this point, so the handle is already closed.
        for sentence in data:
            for token_info in sentence['tokens']:
                for tag_info in token_info['tags']:
                    unique_tags.add(tag_info['value'])
                    # 'tags' may be absent, None or empty; all mean "no nested tags".
                    for nested_tag in tag_info.get('tags') or ():
                        unique_tags.add(nested_tag['value'])
    return unique_tags

# Define file paths for your training, validation, and test datasets
file_paths = [train_path, valid_path, test_path]
unique_tags = extract_unique_tags(file_paths)
# Number the tags in sorted order. Iterating the raw set would assign ids in an order
# that can differ between interpreter runs (string hashing is randomised per process),
# so a saved model could be paired with a different tag<->id mapping after a restart.
tag_to_int = {tag: idx for idx, tag in enumerate(sorted(unique_tags))}


In [8]:
# Parse every split into its per-depth layer dictionaries (all nesting depths are loaded).
train_datasets, valid_datasets, test_datasets = (
    process_json_file_nested(split_path, tag_to_int)
    for split_path in (train_path, valid_path, test_path)
)
# Inverse of tag_to_int: integer id -> tag string.
label_ids = dict((idx, label) for label, idx in tag_to_int.items())

# Datasets for depth 0 only; these are the ones tokenized and trained on below.
train_ds, valid_ds, test_ds = (
    Dataset.from_dict(split_layers[0])
    for split_layers in (train_datasets, valid_datasets, test_datasets)
)

# One Dataset per nesting depth for each split.
train_ds_layers = list(map(Dataset.from_dict, train_datasets))
valid_ds_layers = list(map(Dataset.from_dict, valid_datasets))
test_ds_layers = list(map(Dataset.from_dict, test_datasets))

# Select the first layer (index 0)
# train_dataset = train_datasets[0]
# valid_dataset = valid_datasets[0]
# test_dataset = test_datasets[0]
# tag_to_int = {label: idx for idx, label in enumerate(set([lbl for sublist in train_dataset['labels'] for lbl in sublist]))}
# label_ids = {idx: label for label, idx in tag_to_int.items()}

# train_ds = Dataset.from_dict(train_dataset)
# valid_ds = Dataset.from_dict(valid_dataset)
# test_ds = Dataset.from_dict(test_dataset)


In [9]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split sentences and expand word-level labels to sub-word tokens.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch with ``"tokens"`` (one word list per sentence) and
            ``"labels"`` (the parallel lists of word-level label ids).
        label_all_tokens: When true, every sub-word piece of a word receives the
            word's label. When false, only the first piece does and the
            remaining pieces get ``-100``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Positions that map
        to no word (``word_ids`` yields ``None``) are labelled ``-100``.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples["labels"]):
        sequence_labels = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=batch_idx):
            if current_word is None:
                # Position belongs to no input word.
                sequence_labels.append(-100)
            elif current_word == last_word and not label_all_tokens:
                # Continuation piece of a word that is labelled on its first piece only.
                sequence_labels.append(-100)
            else:
                sequence_labels.append(word_labels[current_word])
            last_word = current_word
        aligned_labels.append(sequence_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [10]:
# Checkpoint identifier used for the tokenizer here and for the model later on.
model_path = "aubmindlab/bert-base-arabertv2"
tokenizer = BertTokenizerFast.from_pretrained(model_path)
# Collator built from the tokenizer; it is handed to the Trainer as data_collator.
data_collator = DataCollatorForTokenClassification(tokenizer) 
# NOTE(review): the recorded output of this cell warns that `trust_remote_code=True`
# will become mandatory for loading this metric in the next major `datasets` release —
# check the installed version before upgrading the library.
metric = datasets.load_metric("seqeval") 

tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/720k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

  metric = datasets.load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [11]:
def tokenize_and_prepare(dataset):
    """Run ``tokenize_and_align_labels`` over ``dataset`` in batched mode.

    Args:
        dataset: Object exposing ``map(function, batched=...)``.

    Returns:
        Whatever ``dataset.map`` returns.
    """
    tokenized = dataset.map(tokenize_and_align_labels, batched=True)
    return tokenized

In [12]:
# Tokenize each depth-0 split and align its word-level labels to sub-word tokens.
tokenized_train_ds, tokenized_valid_ds, tokenized_test_ds = (
    tokenize_and_prepare(split_ds) for split_ds in (train_ds, valid_ds, test_ds)
)

Map:   0%|          | 0/23125 [00:00<?, ? examples/s]

Map:   0%|          | 0/3304 [00:00<?, ? examples/s]

Map:   0%|          | 0/3304 [00:00<?, ? examples/s]

In [13]:
class ArabNERModel(BertForTokenClassification):
    """BERT token classifier trained with an optionally class-weighted cross-entropy loss.

    The class derives from the concrete ``BertForTokenClassification`` instead of the
    ``AutoModelForTokenClassification`` factory. With the factory as base class,
    ``from_pretrained`` built a plain ``BertForTokenClassification`` (the class named
    in the checkpoint-loading message recorded in this notebook), so the custom
    ``__init__`` and ``forward`` were never executed.

    Args:
        config: Model configuration; ``config.num_labels`` sets the classifier size.
        class_weights: Optional per-class loss weights, one entry per label id
            (tensor or sequence of floats). ``None`` (default) gives the unweighted
            loss. It can be supplied through
            ``ArabNERModel.from_pretrained(path, num_labels=..., class_weights=...)``;
            per the ``from_pretrained`` documentation, keyword arguments that are not
            configuration attributes are forwarded to ``__init__``.
    """

    def __init__(self, config, class_weights=None):
        super().__init__(config)
        if class_weights is not None and not torch.is_tensor(class_weights):
            class_weights = torch.tensor(class_weights, dtype=torch.float)
        # Kept as a plain attribute (not a parameter or buffer) so that it never shows
        # up as a missing or unexpected key when checkpoints are loaded or saved.
        self.class_weights = class_weights

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Compute logits and, when ``labels`` is given, the weighted loss.

        Returns:
            The parent model's output object when ``labels`` is ``None``; otherwise a
            tuple ``(loss, logits, ...)`` holding the loss followed by every non-None
            parent output.
        """
        # Labels are deliberately not forwarded: the parent returns logits only and
        # the loss is computed here.
        outputs = super().forward(input_ids, attention_mask=attention_mask, **kwargs)
        if labels is not None:
            logits = outputs.logits
            weight = self.class_weights
            if weight is not None:
                weight = weight.to(logits.device)
            # Positions labelled -100 are skipped (default ignore_index of cross_entropy).
            loss = nn.functional.cross_entropy(
                logits.view(-1, self.num_labels), labels.view(-1), weight=weight
            )
            # The parent's loss is None here, so its tuple form starts with the logits.
            # Prepend the loss to the whole tuple; slicing from index 1 would drop
            # the logits.
            outputs = (loss,) + outputs.to_tuple()
        return outputs

In [14]:
# Tag strings in the iteration order of label_ids. compute_metrics indexes this list with
# label ids, so the order must be ascending by id — presumably guaranteed by how
# label_ids is built from tag_to_int; verify if that construction changes.
label_list = list(label_ids.values())
def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall seqeval scores.

    Uses the module-level ``label_list`` (id -> tag string) and ``metric``.

    Args:
        eval_preds: Pair ``(logits, labels)``. The class axis of ``logits`` is
            axis 2. Positions whose label is ``-100`` are excluded from both the
            predictions and the references.

    Returns:
        Dict with ``"precision"``, ``"recall"``, ``"f1"`` and ``"accuracy"``,
        taken from the metric's ``overall_*`` entries.
    """
    logits, gold_ids = eval_preds
    predicted_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept_pairs = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        predictions.append([label_list[pred] for pred, _ in kept_pairs])
        true_labels.append([label_list[gold] for _, gold in kept_pairs])

    results = metric.compute(predictions=predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [15]:
def _refine_sentence(model, tokens, labels, tokenizer, outside_id):
    """Keep the words of one sentence whose predicted tag id differs from ``outside_id``."""
    if not tokens:
        return {'tokens': [], 'labels': []}

    encoding = tokenizer(tokens, is_split_into_words=True, return_tensors="pt",
                         padding=True, truncation=True, max_length=512)
    # word_ids maps each sub-word position to the index of its source word (None for
    # positions that belong to no word). Read it before the encoding becomes a plain dict.
    word_ids = encoding.word_ids(batch_index=0)
    inputs = {k: v.to(model.device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Index the single batch row explicitly; squeeze() would also collapse a
    # length-1 sequence axis.
    subword_predictions = outputs.logits.argmax(-1)[0].tolist()

    # A word's prediction is the prediction of its first sub-word piece.
    word_predictions = {}
    for position, word_idx in enumerate(word_ids):
        if word_idx is not None and word_idx not in word_predictions:
            word_predictions[word_idx] = subword_predictions[position]

    # Words cut off by truncation have no prediction and are treated as 'O'.
    kept = [
        (token, label)
        for idx, (token, label) in enumerate(zip(tokens, labels))
        if word_predictions.get(idx, outside_id) != outside_id
    ]
    return {'tokens': [token for token, _ in kept], 'labels': [label for _, label in kept]}


def refine_input_features(model, dataset, tokenizer, tag_to_int):
    """Filter a dataset down to the words the model does not predict as ``'O'``.

    The previous implementation indexed ``tag_to_int`` (keyed by tag strings) with
    integer labels and paired words with sub-word positions, so it raised
    ``KeyError`` on any non-empty input. This version aligns predictions to words
    through the tokenizer's ``word_ids`` and filters tokens and labels with the same
    predicate, so the two lists stay parallel.

    Args:
        model: Token-classification model exposing ``eval()``, ``device`` and
            returning an object with ``logits`` when called.
        dataset: Mapping with ``'tokens'`` and ``'labels'``. Either a single
            sentence (flat list of words and flat list of label ids) or a batch
            (lists of such lists).
        tokenizer: Fast tokenizer whose output provides ``word_ids``.
        tag_to_int: Mapping from tag string to integer id; must contain ``'O'``.

    Returns:
        A list with one ``{'tokens': [...], 'labels': [...]}`` dict per input
        sentence (a single dict for single-sentence input).
    """
    model.eval()  # Set the model to evaluation mode to disable training-specific behaviors
    outside_id = tag_to_int['O']

    tokens = dataset['tokens']
    labels = dataset['labels']

    # A batch is recognised by its first element being a word list rather than a word.
    is_batch = bool(tokens) and isinstance(tokens[0], (list, tuple))
    sentences = zip(tokens, labels) if is_batch else [(tokens, labels)]

    # Sentences are processed one at a time so a whole split is never pushed
    # through the model as one giant batch.
    return [
        _refine_sentence(model, sentence_tokens, sentence_labels, tokenizer, outside_id)
        for sentence_tokens, sentence_labels in sentences
    ]


In [18]:
# Set up training arguments: evaluate and checkpoint once per epoch, train for at most
# 30 epochs, and reload the best checkpoint when training ends.
# NOTE(review): no metric_for_best_model is given, so "best" is decided by the library's
# default criterion — confirm that this is the intended selection metric.
args = TrainingArguments(
    "Results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=30,
    weight_decay=0.01,
    report_to="tensorboard",
    load_best_model_at_end=True,
    save_total_limit=5,
    lr_scheduler_type='linear',
    warmup_ratio=0.1
)

# Initialize and train the model; the classifier gets one output per known tag.
model = ArabNERModel.from_pretrained(model_path, num_labels=len(tag_to_int))

# Train on the depth-0 training split and evaluate on the depth-0 validation split.
# Early stopping is configured with a patience of 8.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_valid_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=8)]
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
# NOTE(review): the model is moved only after the Trainer has been constructed with it;
# presumably the Trainer performs its own device placement — confirm whether this
# explicit move is still needed.
model = model.to(device)
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Using device: cuda


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.3264,0.488925,0.728722,0.671565,0.698977,0.902456
2,0.3476,0.218906,0.801759,0.843131,0.821924,0.940027
3,0.1922,0.176181,0.838428,0.878715,0.858099,0.948389
4,0.1227,0.170293,0.849036,0.890604,0.869323,0.951121


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

In [None]:
# Evaluate the model on the tokenized test split.
# NOTE(review): test_path and valid_path point at the same file in this notebook, so
# these numbers are presumably not from held-out data — confirm before reporting them.
metrics = trainer.evaluate(eval_dataset=tokenized_test_ds)
print(f"Evaluation metrics: {metrics}")

# Save the trained model and its tokenizer (written to two separate directories).
model.save_pretrained("outer_layer_model")
tokenizer.save_pretrained("outer_layer_tokenizer")

In [None]:
# from transformers import TrainingArguments, Trainer

# def train_sequential_layers(models, datasets, tokenizer, num_layers, tag_to_int, device):
#     for layer in range(1, num_layers):  # Start from the second layer
#         print(f"Training layer {layer}")
#         # Assume refine_input_features returns a list of refined texts
#         refined_dataset = refine_input_features(models[layer - 1], datasets[layer - 1], tokenizer, tag_to_int)

#         # Prepare the new training dataset
#         input_ids = tokenizer([data['tokens'] for data in refined_dataset], is_split_into_words=True, return_tensors="pt", padding=True, truncation=True)['input_ids']
#         labels = [data['labels'] for data in refined_dataset]  # Assuming labels need similar handling

#         # Create Dataset objects for training
#         train_dataset = Dataset.from_dict({'input_ids': input_ids, 'labels': labels})

#         # Train a new model for this layer
#         new_model = ArabNERModel.from_pretrained('aubmindlab/bert-base-arabertv2', num_labels=len(tag_to_int))
#         new_model.to(device)  # Ensure the model is on the correct device
        
#         training_args = TrainingArguments(
#             output_dir=f"results_layer_{layer}",
#             evaluation_strategy="epoch",
#             learning_rate=2e-5,
#             per_device_train_batch_size=8,
#             num_train_epochs=3,
#             save_strategy="no",
#             logging_dir=f"logs_layer_{layer}"  # Added logging directory for clarity
#         )
        
#         trainer = Trainer(
#             model=new_model,
#             args=training_args,
#             train_dataset=train_dataset,
#             tokenizer=tokenizer,
#             callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
#         )
        
#         # Train the model
#         trainer.train()
#         models.append(new_model)

#     return models

# # Assuming initial_model is pre-trained and ready
# models = [model]
# num_layers = 5  # Total number of layers including the initial pre-trained layer
# trained_models = train_sequential_layers(models, train_datasets[1:], tokenizer, num_layers, tag_to_int, device)

In [None]:
# from transformers import TrainingArguments, Trainer
# # args = TrainingArguments(
# #     "Results",
# #     evaluation_strategy="epoch",
# #     save_strategy="epoch",
# #     learning_rate=2e-5,
# #     per_device_train_batch_size=16,
# #     per_device_eval_batch_size=16,
# #     num_train_epochs=1,
# #     weight_decay=0.01,
# #     report_to="tensorboard",
# #     load_best_model_at_end=True,
# #     save_total_limit=5,
# #     lr_scheduler_type='linear',
# #     warmup_ratio=0.1
# # )

# # # Initialize and train the model
# # model = ArabNERModel.from_pretrained(model_path, num_labels=len(tag_to_int))

# # trainer = Trainer(
# #     model,
# #     args,
# #     train_dataset=tokenized_train_ds,
# #     eval_dataset=tokenized_valid_ds,
# #     data_collator=data_collator,
# #     tokenizer=tokenizer,
# #     compute_metrics=compute_metrics,
# #     callbacks=[EarlyStoppingCallback(early_stopping_patience=8)]
# # )

# def train_layer(model, tokenized_train_dataset, tokenized_valid_ds, tokenized_test_ds, tokenizer, layer_index, device):
#     training_args = TrainingArguments(
#         f"Results_{layer_index}",
#         evaluation_strategy="epoch",
#         save_strategy="epoch",
#         learning_rate=2e-5,
#         per_device_train_batch_size=16,
#         per_device_eval_batch_size=16,
#         num_train_epochs=1,
#         weight_decay=0.01,
#         report_to="tensorboard",
#         load_best_model_at_end=True,
#         save_total_limit=5,
#         lr_scheduler_type='linear',
#         warmup_ratio=0.1
#     )
    
#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=tokenized_train_dataset,
#         eval_dataset=tokenized_valid_ds,
#         tokenizer=tokenizer,
#         compute_metrics=compute_metrics,
#         callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
#     )
    
#     # Train the model
#     trainer.train()
#     # Evaluate the model
#     metrics = trainer.evaluate(eval_dataset=tokenized_test_ds)
#     print(f"Evaluation metrics: {metrics}")
    
#     # Save model to disk
#     model.save_pretrained(f"saved_model_layer_{layer_index}")
    
#     # Clear memory
#     del model
#     torch.cuda.empty_cache()

# def train_sequential_layers(model_paths, train_datasets, validation_datasets, test_datasets, tokenizer, num_layers, device):
#     models = []
#     for layer in range(1, num_layers):
#         print(f"Training layer {layer}")
#         # Load the model for the current layer
#         model = ArabNERModel.from_pretrained(model_paths[layer - 1])
        
#         tokenizer = BertTokenizerFast.from_pretrained(model_paths[layer - 1])
#         data_collator = DataCollatorForTokenClassification(tokenizer) 
#         model.to(device)
        
#         if not train_datasets[layer-1] or not validation_datasets[layer-1] or not test_datasets[layer-1]:
#             logging.warning(f"Empty dataset for layer {layer}, skipping training.")
#             continue
        
#         train_ds = Dataset.from_dict(train_datasets[layer-1])
#         valid_ds = Dataset.from_dict(validation_datasets[layer-1])
#         test_ds = Dataset.from_dict(test_datasets[layer-1])
#         tokenized_train_ds = train_ds.map(tokenize_and_align_labels, batched=True)
#         tokenized_valid_ds = valid_ds.map(tokenize_and_align_labels, batched=True)
#         tokenized_test_ds = test_ds.map(tokenize_and_align_labels, batched=True)
        
#         # Train the model
#         train_layer(model, tokenized_train_ds, tokenized_valid_ds, tokenized_test_ds, tokenizer, layer, device)
        
#         # Append model path for next layer initialization
#         models.append(f"saved_model_layer_{layer}")
    
#     return models

# # Initial model path
# # initial_model_path = "initial_model_directory"
# # initial_model_path = "/kaggle/working/Results/runs"
# num_layers = 5

# model_paths = [model_path] * (num_layers - 1)  # Paths for later initialized layers
# print(model_paths)
# trained_models = train_sequential_layers(model_paths, train_datasets[1:], valid_datasets[1:], test_datasets[1:], tokenizer, num_layers, device)


In [20]:
# NOTE(review): this cell is placed after the training and evaluation cells, so on a
# top-to-bottom run it executes after training rather than before it — move it above the
# training cell if the goal is to free cached GPU memory first.
torch.cuda.empty_cache()  # Clear cache before starting the training