In [1]:
# ! pip install seqeval transformers datasets tokenizers seqeval evaluate
! pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=7679254343dd6bca0a018461108649a4c4f38d0c9dd1122f3703a2ccb79a81dd
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [2]:
import re
import datasets
import json
import numpy as np 
import pandas as pd
import torch
import os
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_recall_fscore_support
from datasets import load_dataset, Dataset
from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForTokenClassification, BertConfig, DataCollatorForTokenClassification, BertTokenizerFast, TrainingArguments, Trainer, EarlyStoppingCallback
from tqdm import tqdm

2024-05-06 20:11:55.414583: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-06 20:11:55.414683: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-06 20:11:55.608303: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [35]:
def normalize_arabic(text):
    alif_maksura_to_yeh = re.sub(r'[يى]', 'ي', text)
    teh_marbuta_to_heh = re.sub(r'ة', 'ه', alif_maksura_to_yeh)
    alifs_normalized = re.sub(r'[أإآ]', 'ا', teh_marbuta_to_heh)
    kafs_normalized = re.sub(r'ک', 'ك', alifs_normalized)
    text_cleaned = re.sub(r'[\u064B-\u065F]', '', kafs_normalized)

    return text_cleaned

In [None]:
# def process_json_file_nested(file_path, tag_to_int):
#     with open(file_path, 'r') as file:
#         data = json.load(file)

#     token_data_layers = []
#     label_data_layers = []
#     max_depth = 0

#     for sentence in data:
#         for token_info in sentence['tokens']:
#             max_depth = max(max_depth, len(token_info['tags']))

#     for _ in range(max_depth):
#         token_data_layers.append([])
#         label_data_layers.append([])

#     for sentence in data:
#         for depth in range(max_depth):
#             token_list = []
#             label_list = []
#             for token_info in sentence['tokens']:
#                 token = token_info['token']
#                 if depth < len(token_info['tags']):
#                     tag_info = token_info['tags'][depth]
#                     value = tag_info['value']
#                 else:
#                     value = "O"
#                 token_list.append(token)
#                 label_list.append(tag_to_int[value])
#             token_data_layers[depth].append(token_list)
#             label_data_layers[depth].append(label_list)

#     datasets = []
#     for i in range(max_depth):
#         datasets.append({'tokens': token_data_layers[i], 'labels': label_data_layers[i]})

#     return datasets

In [3]:
def process_json_file_flat(file_path, tag_to_int):
    with open(file_path, 'r') as file:
        data = json.load(file)

    token_data = []
    label_data = []
    
    for sentence in data:
        tokens = []
        combined_labels = []
        for token_info in sentence['tokens']:
            token = token_info['token']
            tokens.append(token)
            
            # Create a combined tag from all layers available for this token
            tag_combination = '-'.join([tag['value'] for tag in token_info['tags']])
            if tag_combination not in tag_to_int:
                tag_to_int[tag_combination] = len(tag_to_int)  # Assign new unique integer if not in dict
            combined_labels.append(tag_to_int[tag_combination])
        
        token_data.append(tokens)
        label_data.append(combined_labels)
    
    return {'tokens': token_data, 'labels': label_data}, tag_to_int

# Use this function to prepare your data
# combined_dataset, updated_tag_to_int = process_json_file_flat('path_to_your_file.json', {})


In [4]:
train_path = '/kaggle/input/ner-nested/split70.json'
valid_path = '/kaggle/input/ner-nested/split10.json'
test_path = '/kaggle/input/nested-ner-test/split20-nested-unlabeled.json'

In [5]:
import json

def extract_unique_tags(file_paths):
    unique_tags = set()
    for file_path in file_paths:
        with open(file_path, 'r') as file:
            data = json.load(file)
            for sentence in data:
                for token_info in sentence['tokens']:
                    for tag_info in token_info['tags']:
                        unique_tags.add(tag_info['value'])
                        if tag_info.get('tags'):
                            for nested_tag in tag_info['tags']:
                                unique_tags.add(nested_tag['value'])
    return unique_tags

def calculate_tag_frequencies(labels, label_ids):
    tag_counts = {}
    for sublist in labels:
        for label in sublist:
            tag_name = label_ids[label]
            if tag_name in tag_counts:
                tag_counts[tag_name] += 1
            else:
                tag_counts[tag_name] = 1
    return tag_counts

# Define file paths for your training, validation, and test datasets
file_paths = [train_path, valid_path, test_path]
unique_tags = extract_unique_tags(file_paths)
tag_to_int = {tag: idx for idx, tag in enumerate(unique_tags)}


In [6]:
# Load datasets for the first layer only
train_datasets, tag_to_int = process_json_file_flat(train_path, tag_to_int)
valid_datasets, _ = process_json_file_flat(valid_path, tag_to_int)
test_datasets, _ = process_json_file_flat(test_path, tag_to_int)
label_ids = {idx: label for label, idx in tag_to_int.items()}
tag_counts = calculate_tag_frequencies(train_datasets['labels'], label_ids)



train_ds = Dataset.from_dict(train_datasets)
valid_ds = Dataset.from_dict(valid_datasets)
test_ds = Dataset.from_dict(test_datasets)

# Select the first layer (index 0)
# train_dataset = train_datasets[0]
# valid_dataset = valid_datasets[0]
# test_dataset = test_datasets[0]
# tag_to_int = {label: idx for idx, label in enumerate(set([lbl for sublist in train_dataset['labels'] for lbl in sublist]))}
# label_ids = {idx: label for label, idx in tag_to_int.items()}

# train_ds = Dataset.from_dict(train_dataset)
# valid_ds = Dataset.from_dict(valid_dataset)
# test_ds = Dataset.from_dict(test_dataset)


In [66]:
all_labels = [label for sublist in train_datasets['labels'] for label in sublist]

# Count the frequency of each unique label
from collections import Counter
label_frequencies = Counter(all_labels)

# Print the frequencies
for label_id, freq in label_frequencies.items():
    label_name = next(key for key, value in tag_to_int.items() if value == label_id)
    print(f'Label: {label_name}, Frequency: {freq}')

Label: O, Frequency: 254050
Label: B-CARDINAL, Frequency: 1291
Label: B-ORG, Frequency: 10572
Label: I-ORG, Frequency: 10070
Label: B-DATE, Frequency: 10705
Label: B-LANGUAGE, Frequency: 139
Label: B-NORP, Frequency: 3585
Label: B-PERS, Frequency: 4515
Label: I-PERS, Frequency: 4698
Label: B-OCC, Frequency: 3716
Label: I-DATE, Frequency: 39338
Label: B-GPE, Frequency: 8052
Label: B-EVENT, Frequency: 1845
Label: I-EVENT, Frequency: 1560
Label: I-OCC-B-PERS, Frequency: 6
Label: I-OCC-I-PERS, Frequency: 2
Label: I-CARDINAL, Frequency: 350
Label: B-FAC, Frequency: 560
Label: I-FAC, Frequency: 472
Label: B-LOC, Frequency: 747
Label: B-MONEY, Frequency: 148
Label: I-MONEY-B-CURR, Frequency: 137
Label: B-ORDINAL, Frequency: 2739
Label: I-LANGUAGE, Frequency: 4
Label: B-TIME, Frequency: 309
Label: I-TIME, Frequency: 250
Label: I-ORG-B-GPE, Frequency: 4482
Label: B-MONEY-B-CURR, Frequency: 24
Label: I-GPE, Frequency: 4751
Label: I-LOC, Frequency: 463
Label: I-NORP, Frequency: 1668
Label: B-ORG-

In [7]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [8]:
model_path = "aubmindlab/bert-base-arabertv2"
tokenizer = BertTokenizerFast.from_pretrained(model_path)
data_collator = DataCollatorForTokenClassification(tokenizer) 
metric = datasets.load_metric("seqeval") 

tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/720k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

  metric = datasets.load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
def tokenize_and_prepare(dataset):
    return dataset.map(tokenize_and_align_labels, batched=True)

In [9]:
tokenized_train_ds = train_ds.map(tokenize_and_align_labels, batched=True)
tokenized_valid_ds = valid_ds.map(tokenize_and_align_labels, batched=True)
tokenized_test_ds = test_ds.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/23125 [00:00<?, ? examples/s]

Map:   0%|          | 0/3304 [00:00<?, ? examples/s]

Map:   0%|          | 0/6606 [00:00<?, ? examples/s]

In [42]:
def calculate_weights(tag_counts):
    total_tags = sum(tag_counts.values())
    weights = {tag: total_tags / count for tag, count in tag_counts.items()}

    max_weight = max(weights.values())
    weights_normalized = {tag: weight / max_weight for tag, weight in weights.items()}

    return list(weights_normalized.values())

class_weights = torch.tensor(calculate_weights(tag_counts), dtype=torch.float32)

In [43]:
weighted_loss = nn.CrossEntropyLoss(weight=class_weights)

class ArabNERModelWithWeightedLoss(AutoModelForTokenClassification):
    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        outputs = super().forward(input_ids, attention_mask=attention_mask, **kwargs)
        if labels is not None:
            loss = weighted_loss(outputs.logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs[1:]
        return outputs

model = ArabNERModelWithWeightedLoss.from_pretrained(model_path, num_labels=len(label_ids.values()))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:


class ArabNERModel(AutoModelForTokenClassification):
    def __init__(self, config):
        super().__init__(config)
        self.loss = nn.CrossEntropyLoss(weight=class_weights)

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        outputs = super().forward(input_ids, attention_mask=attention_mask, **kwargs)
        if labels is not None:
            loss = self.loss(outputs.logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs[1:]
        return outputs

In [12]:
label_list = list(label_ids.values())
def compute_metrics(eval_preds): 
    pred_logits, labels = eval_preds 
    pred_logits = np.argmax(pred_logits, axis=2) 
    predictions = [ 
        [label_list[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100] 
        for prediction, label in zip(pred_logits, labels) 
    ] 
    
    true_labels = [ 
      [label_list[l] for (eval_preds, l) in zip(prediction, label) if l != -100] 
       for prediction, label in zip(pred_logits, labels) 
   ] 
    results = metric.compute(predictions=predictions, references=true_labels) 
    return { 
    "precision": results["overall_precision"], 
    "recall": results["overall_recall"], 
    "f1": results["overall_f1"], 
    "accuracy": results["overall_accuracy"], 
  } 

In [None]:
def refine_input_features(model, dataset, tokenizer, tag_to_int):
    model.eval()  # Set the model to evaluation mode to disable training-specific behaviors
    refined_datasets = []

#         print(data)
    tokens = dataset['tokens']  # Accessing tokens directly
    labels = dataset['labels']  # Accessing labels directly

    # Tokenizing the tokens for model input
    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    predictions = outputs.logits.argmax(-1).squeeze().tolist()

    # Select tokens and labels based on predictions not being 'O'
    refined_tokens = [token for token, pred in zip(tokens, predictions) if tag_to_int[labels[pred]] != tag_to_int['O']]
    refined_labels = [label for label, pred in zip(labels, predictions) if tag_to_int[label] != tag_to_int['O']]

    refined_datasets.append({'tokens': refined_tokens, 'labels': refined_labels})

    return refined_datasets


In [13]:
# Set up training arguments
args = TrainingArguments(
    "Results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    report_to="tensorboard",
    load_best_model_at_end=True,
    save_total_limit=5,
    lr_scheduler_type='linear',
    warmup_ratio=0.1
)

# Initialize and train the model
model = ArabNERModel.from_pretrained(model_path, num_labels=len(tag_to_int))

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_valid_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=8)]
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
model = model.to(device)
trainer.train()


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Using device: cuda


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.6491,0.313538,0.818655,0.779891,0.798803,0.93216
2,0.2785,0.224475,0.85059,0.856658,0.853613,0.946358
3,0.1886,0.21204,0.853284,0.869226,0.861181,0.948773


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=4338, training_loss=0.5607083990256212, metrics={'train_runtime': 800.0325, 'train_samples_per_second': 86.715, 'train_steps_per_second': 5.422, 'total_flos': 3288433713715920.0, 'train_loss': 0.5607083990256212, 'epoch': 3.0})

In [15]:
# Evaluate the model
metrics = trainer.evaluate(eval_dataset=tokenized_valid_ds)
print(f"Evaluation metrics: {metrics}")

# Save the trained model
model.save_pretrained("outer_layer_model")
tokenizer.save_pretrained("outer_layer_tokenizer")

Evaluation metrics: {'eval_loss': 0.21204015612602234, 'eval_precision': 0.8532844281427142, 'eval_recall': 0.8692255434782609, 'eval_f1': 0.8611812216052498, 'eval_accuracy': 0.9487732207751168, 'eval_runtime': 15.6393, 'eval_samples_per_second': 211.263, 'eval_steps_per_second': 13.236, 'epoch': 3.0}


('outer_layer_tokenizer/tokenizer_config.json',
 'outer_layer_tokenizer/special_tokens_map.json',
 'outer_layer_tokenizer/vocab.txt',
 'outer_layer_tokenizer/added_tokens.json',
 'outer_layer_tokenizer/tokenizer.json')

In [None]:
# from transformers import TrainingArguments, Trainer

# def train_sequential_layers(models, datasets, tokenizer, num_layers, tag_to_int, device):
#     for layer in range(1, num_layers):  # Start from the second layer
#         print(f"Training layer {layer}")
#         # Assume refine_input_features returns a list of refined texts
#         refined_dataset = refine_input_features(models[layer - 1], datasets[layer - 1], tokenizer, tag_to_int)

#         # Prepare the new training dataset
#         input_ids = tokenizer([data['tokens'] for data in refined_dataset], is_split_into_words=True, return_tensors="pt", padding=True, truncation=True)['input_ids']
#         labels = [data['labels'] for data in refined_dataset]  # Assuming labels need similar handling

#         # Create Dataset objects for training
#         train_dataset = Dataset.from_dict({'input_ids': input_ids, 'labels': labels})

#         # Train a new model for this layer
#         new_model = ArabNERModel.from_pretrained('aubmindlab/bert-base-arabertv2', num_labels=len(tag_to_int))
#         new_model.to(device)  # Ensure the model is on the correct device
        
#         training_args = TrainingArguments(
#             output_dir=f"results_layer_{layer}",
#             evaluation_strategy="epoch",
#             learning_rate=2e-5,
#             per_device_train_batch_size=8,
#             num_train_epochs=3,
#             save_strategy="no",
#             logging_dir=f"logs_layer_{layer}"  # Added logging directory for clarity
#         )
        
#         trainer = Trainer(
#             model=new_model,
#             args=training_args,
#             train_dataset=train_dataset,
#             tokenizer=tokenizer,
#             callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
#         )
        
#         # Train the model
#         trainer.train()
#         models.append(new_model)

#     return models

# # Assuming initial_model is pre-trained and ready
# models = [model]
# num_layers = 5  # Total number of layers including the initial pre-trained layer
# trained_models = train_sequential_layers(models, train_datasets[1:], tokenizer, num_layers, tag_to_int, device)

In [None]:
# from transformers import TrainingArguments, Trainer
# # args = TrainingArguments(
# #     "Results",
# #     evaluation_strategy="epoch",
# #     save_strategy="epoch",
# #     learning_rate=2e-5,
# #     per_device_train_batch_size=16,
# #     per_device_eval_batch_size=16,
# #     num_train_epochs=1,
# #     weight_decay=0.01,
# #     report_to="tensorboard",
# #     load_best_model_at_end=True,
# #     save_total_limit=5,
# #     lr_scheduler_type='linear',
# #     warmup_ratio=0.1
# # )

# # # Initialize and train the model
# # model = ArabNERModel.from_pretrained(model_path, num_labels=len(tag_to_int))

# # trainer = Trainer(
# #     model,
# #     args,
# #     train_dataset=tokenized_train_ds,
# #     eval_dataset=tokenized_valid_ds,
# #     data_collator=data_collator,
# #     tokenizer=tokenizer,
# #     compute_metrics=compute_metrics,
# #     callbacks=[EarlyStoppingCallback(early_stopping_patience=8)]
# # )

# def train_layer(model, tokenized_train_dataset, tokenized_valid_ds, tokenized_test_ds, tokenizer, layer_index, device):
#     training_args = TrainingArguments(
#         f"Results_{layer_index}",
#         evaluation_strategy="epoch",
#         save_strategy="epoch",
#         learning_rate=2e-5,
#         per_device_train_batch_size=16,
#         per_device_eval_batch_size=16,
#         num_train_epochs=1,
#         weight_decay=0.01,
#         report_to="tensorboard",
#         load_best_model_at_end=True,
#         save_total_limit=5,
#         lr_scheduler_type='linear',
#         warmup_ratio=0.1
#     )
    
#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=tokenized_train_dataset,
#         eval_dataset=tokenized_valid_ds,
#         tokenizer=tokenizer,
#         compute_metrics=compute_metrics,
#         callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
#     )
    
#     # Train the model
#     trainer.train()
#     # Evaluate the model
#     metrics = trainer.evaluate(eval_dataset=tokenized_test_ds)
#     print(f"Evaluation metrics: {metrics}")
    
#     # Save model to disk
#     model.save_pretrained(f"saved_model_layer_{layer_index}")
    
#     # Clear memory
#     del model
#     torch.cuda.empty_cache()

# def train_sequential_layers(model_paths, train_datasets, validation_datasets, test_datasets, tokenizer, num_layers, device):
#     models = []
#     for layer in range(1, num_layers):
#         print(f"Training layer {layer}")
#         # Load the model for the current layer
#         model = ArabNERModel.from_pretrained(model_paths[layer - 1])
        
#         tokenizer = BertTokenizerFast.from_pretrained(model_paths[layer - 1])
#         data_collator = DataCollatorForTokenClassification(tokenizer) 
#         model.to(device)
        
#         if not train_datasets[layer-1] or not validation_datasets[layer-1] or not test_datasets[layer-1]:
#             logging.warning(f"Empty dataset for layer {layer}, skipping training.")
#             continue
        
#         train_ds = Dataset.from_dict(train_datasets[layer-1])
#         valid_ds = Dataset.from_dict(validation_datasets[layer-1])
#         test_ds = Dataset.from_dict(test_datasets[layer-1])
#         tokenized_train_ds = train_ds.map(tokenize_and_align_labels, batched=True)
#         tokenized_valid_ds = valid_ds.map(tokenize_and_align_labels, batched=True)
#         tokenized_test_ds = test_ds.map(tokenize_and_align_labels, batched=True)
        
#         # Train the model
#         train_layer(model, tokenized_train_ds, tokenized_valid_ds, tokenized_test_ds, tokenizer, layer, device)
        
#         # Append model path for next layer initialization
#         models.append(f"saved_model_layer_{layer}")
    
#     return models

# # Initial model path
# # initial_model_path = "initial_model_directory"
# # initial_model_path = "/kaggle/working/Results/runs"
# num_layers = 5

# model_paths = [model_path] * (num_layers - 1)  # Paths for later initialized layers
# print(model_paths)
# trained_models = train_sequential_layers(model_paths, train_datasets[1:], valid_datasets[1:], test_datasets[1:], tokenizer, num_layers, device)


In [None]:
torch.cuda.empty_cache()  # Clear cache before starting the training

In [52]:
def find_last_created_folder(directory, prefix):
    # Ensure the directory exists
    if not os.path.exists(directory):
        print(f"The directory {directory} does not exist.")
        return None

    # List all items in the directory
    all_folders = [os.path.join(directory, f) for f in os.listdir(directory)]
    # Filter list to include only directories that start with the specified prefix
    folders = [folder for folder in all_folders if os.path.isdir(folder) and os.path.basename(folder).startswith(prefix)]
    
    # Check if the list is not empty
    if not folders:
        print(f"No folders found in the directory that start with '{prefix}'.")
        return None

    # Get the last created folder
    last_created_folder = max(folders, key=os.path.getctime)

    return last_created_folder

# Path to the directory where folders are to be checked
directory_path = '/kaggle/input/model-outputs-version-5/Results'
folder_prefix = 'checkpoint-18798'  # The prefix to look for in folder names
checkpoint_path = find_last_created_folder(directory_path, folder_prefix)
# checkpoint_path = '/kaggle/input/model-outputs-version-5/Results/checkpoint-18798'

In [53]:
# test_model = ArabNERModelWithHybridLoss.from_pretrained(checkpoint_path, num_labels=len(label_ids.values()))
# test_model = ArabNERModelWithWeightedLoss.from_pretrained(checkpoint_path, num_labels=len(tag_to_int))
# tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

In [55]:
inverse_label_ids = {v: k for k, v in label_ids.items()}


In [16]:
from tqdm import tqdm
import torch

def predict_in_batches(model, tokenized_inputs, batch_size=32):
    # Check for GPU availability
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Move model to the appropriate device
    model.eval()
    
    all_predictions = []
    input_ids = tokenized_inputs['input_ids']
    attention_mask = tokenized_inputs['attention_mask']

    # Wrap the range with tqdm for a progress bar
    for i in tqdm(range(0, input_ids.size(0), batch_size)):
        batch_input_ids = input_ids[i:i + batch_size].to(device)
        batch_attention_mask = attention_mask[i:i + batch_size].to(device)
        
        with torch.no_grad():
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            logits = outputs.logits
            predictions = logits.argmax(-1)
        
        all_predictions.append(predictions.cpu())  # Move predictions back to CPU if needed

    return torch.cat(all_predictions, dim=0)

def map_predictions_to_original_tokens(predictions, tokenized_inputs, label_ids):
    word_level_predictions = []
    offset_mappings = tokenized_inputs['offset_mapping']
    word_ids = [tokenized_inputs.word_ids(batch_index=i) for i in range(predictions.size(0))]

    for idx, (preds, length) in enumerate(zip(predictions, tokenized_inputs['attention_mask'].sum(1))):
        current_word_ids = word_ids[idx]
        word_predictions = []
        previous_word_idx = None
        for word_idx, pred in zip(current_word_ids, preds[:length]):
            if word_idx is not None and word_idx != previous_word_idx:
                word_predictions.append(label_ids[pred.item()])
            previous_word_idx = word_idx
        word_level_predictions.append(word_predictions)

    return word_level_predictions

tokenized_inputs = tokenizer(test_ds['tokens'], is_split_into_words=True, padding=True, truncation=True, return_tensors="pt", return_offsets_mapping=True)
# predictions = predict_in_batches(test_model, tokenized_inputs, batch_size=16)
predictions = predict_in_batches(model, tokenized_inputs, batch_size=16)
word_level_predictions = map_predictions_to_original_tokens(predictions, tokenized_inputs, label_ids)

100%|██████████| 413/413 [01:44<00:00,  3.95it/s]


In [19]:
# print(label_ids)

In [60]:
# 

In [17]:
def format_predictions_to_conll(tokens_list, word_level_predictions):
    """Format the predictions to the CoNLL output format."""
    output = []
    for tokens, predictions in zip(tokens_list, word_level_predictions):
        for token, tag in zip(tokens, predictions):
            output.append(f"{token} {tag}")
        output.append("")  # Add a blank line after each sentence for segment separation
    return "\n".join(output)

def write_to_file(content, filename):
    """Write the given content to a text file."""
    with open(filename, "w", encoding="utf-8") as file:
        file.write(content)
    print(f"Data written to {filename}")

In [18]:
conll_output = format_predictions_to_conll(test_ds['tokens'], word_level_predictions)
write_to_file(conll_output, "/kaggle/working/ArabNER_subtask2_valid_pred_2_new.txt")

Data written to /kaggle/working/ArabNER_subtask2_valid_pred_2_new.txt
