In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
from datasets import load_dataset

persian_dataset = load_dataset('AliFartout/PEYMA-ARMAN-Mixed')
english_dataset = load_dataset("conll2003", trust_remote_code=True)

README.md:   0%|          | 0.00/3.27k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.31M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/431k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/423k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/26384 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3296 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3296 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [2]:
tag_ids = {'O': 0, 'B_PER': 1, 'I_PER': 2, 'B_ORG': 3, 'I_ORG': 4, 'B_LOC': 5, 'I_LOC': 6, 'B_MISC': 7, 'I_MISC': 8}

<div dir='rtl'style="font-family: Vazir; width: 85%; font-size: 18px;">دو تا دیتاست از لحاظ خروجی با هم فرق دارند. دیتاست فارسی علاوه بر اینکه انواع موجودیت های بیشتری را تشخیص میدهد آیدی که برای تگ ها در نظر میگیرد با دیتاست انگلیسی متفاوت است. دیتاست که برای آموزش مدل استفاده میشود باید یکپارچه باشد بنابراین این یکسان سازی دو دیتاست و سپس ترکیب آن ها را در ادامه انجام میدهیم.</div>

In [3]:
def modify_label(sample):
    ner_tags = list(map(lambda x: tag_ids.get(x, 7 if x[0] == 'B' else 8), sample['ner_tags_names']))
    sample['ner_tags'] = ner_tags
    return sample

In [4]:
updated_persian_dataset = persian_dataset.map(modify_label)

Map:   0%|          | 0/26384 [00:00<?, ? examples/s]

Map:   0%|          | 0/3296 [00:00<?, ? examples/s]

Map:   0%|          | 0/3296 [00:00<?, ? examples/s]

In [5]:
updated_persian_dataset['train'][0]

{'tokens': ['یوکوویچ',
  'متولد',
  'دانمارک',
  'است',
  'اما',
  'والدین',
  'او',
  'صرب',
  'هستند',
  '.'],
 'ner_tags': [1, 0, 5, 0, 0, 0, 0, 5, 0, 0],
 'ner_tags_names': ['B_PER',
  'O',
  'B_LOC',
  'O',
  'O',
  'O',
  'O',
  'B_LOC',
  'O',
  'O']}

In [52]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from transformers.modeling_outputs import TokenClassifierOutput

class XLMRobertaForTokenClassification(nn.Module):
    def __init__(self, model_name, num_labels, dropout_prob=0.1):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_prob)
        self.num_labels = num_labels
        self.classifier = nn.Linear(self.backbone.config.hidden_size, num_labels)
        self.loss_fn = nn.CrossEntropyLoss()
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state  # Shape: (batch_size, seq_len, hidden_size)

        logits = self.classifier(self.dropout(last_hidden_state))  # Shape: (batch_size, seq_len, num_labels)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits.view(-1, self.num_labels), labels.view(-1))

        return {'loss': loss, 'logits': logits} if loss is not None else logits


In [8]:
def align_labels_with_tokens(labels, word_ids):
    aligned_labels = []
    previous_word_id = None
    for word_id in word_ids:
        if word_id is None:
            aligned_labels.append(-100)  # Special token or padding
        elif word_id != previous_word_id:
            aligned_labels.append(labels[word_id])  # Assign label to the first subword
        else:
            aligned_labels.append(-100)  # Ignore subsequent subwords
        previous_word_id = word_id
    return aligned_labels

In [9]:
english_dataset = english_dataset.remove_columns(['id','pos_tags', 'chunk_tags'])
updated_persian_dataset = updated_persian_dataset.remove_columns(['ner_tags_names'])

In [10]:
english_dataset['train'][0], updated_persian_dataset['train'][0]

({'tokens': ['EU',
   'rejects',
   'German',
   'call',
   'to',
   'boycott',
   'British',
   'lamb',
   '.'],
  'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]},
 {'tokens': ['یوکوویچ',
   'متولد',
   'دانمارک',
   'است',
   'اما',
   'والدین',
   'او',
   'صرب',
   'هستند',
   '.'],
  'ner_tags': [1, 0, 5, 0, 0, 0, 0, 5, 0, 0]})

In [28]:
combined_list[0]

{'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [11]:
from datasets import Dataset
list1 = [dict(item) for item in english_dataset["train"]]
list2 = [dict(item) for item in updated_persian_dataset["train"]]
combined_list = list1 + list2
data_dict = {key: [d[key] for d in combined_list] for key in combined_list[0].keys()}

final_train_dataset = Dataset.from_dict(data_dict)

In [12]:
from datasets import Dataset
list1 = [dict(item) for item in english_dataset["validation"]]
list2 = [dict(item) for item in updated_persian_dataset["validation"]]
combined_list = list1 + list2
data_dict = {key: [d[key] for d in combined_list] for key in combined_list[0].keys()}

final_validation_dataset = Dataset.from_dict(data_dict)

In [13]:
final_train_dataset

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 40425
})

In [14]:
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_data(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
        return_tensors="np"
    )

    word_ids = tokenized_inputs.word_ids(batch_index=0)
    aligned_labels = align_labels_with_tokens(examples["ner_tags"], word_ids)

    tokenized_inputs["labels"] = aligned_labels

    return tokenized_inputs


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

<div dir='rtl' style="font-family: Vazir; width: 85%; font-size: 18px;">برای اینکه توکنایزر ممکن است بعضی کلمات را در هنگام توکنایز کردن به subword تبدیل کند در این صورت دیگر خروجی مدل که یک لیست از تگ های موجودیت ها (به ازای هر کلمه) است دیگر هماهنگی نخواهند داشت برای همین باید این شرایط هندل کنیم.</div>

In [15]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [37]:
final_train_dataset[0]

{'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [46]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    sentences = [torch.tensor(item['input_ids']) for item in batch]
    labels = [torch.tensor(item['labels']) for item in batch]
    lengths = [len(sentence) for sentence in sentences]
    padded_sentences = pad_sequence(sentences, batch_first=True, padding_value=0)  # 0 for <pad>
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # -100 for ignore_index
    attention_mask = (padded_sentences != 0).long()  # Mask is 1 for non-padding tokens, 0 otherwise
    return {
        'input_ids': torch.tensor(padded_sentences),
        'attention_mask': attention_mask,
        'labels': torch.tensor(padded_labels),
    }

In [16]:
tokenized_train_dataset = final_train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_validation_dataset = final_validation_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/40425 [00:00<?, ? examples/s]

Map:   0%|          | 0/6546 [00:00<?, ? examples/s]

In [22]:
tokenized_train_dataset[0]

{'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'input_ids': [0,
  3747,
  456,
  75161,
  7,
  30839,
  11782,
  47,
  25299,
  47924,
  18,
  56101,
  21,
  6492,
  6,
  5,
  2],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100,
  3,
  0,
  -100,
  -100,
  7,
  0,
  0,
  0,
  -100,
  -100,
  7,
  0,
  -100,
  0,
  -100,
  -100]}

In [53]:
training_args = TrainingArguments(
    output_dir="Roberta-fa-en-ner",
    log_level="error",
    num_train_epochs=1,
    gradient_checkpointing=False,
    eval_accumulation_steps=10,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    seed=42,
    logging_strategy="steps",
    evaluation_strategy="steps",
)

print(tokenized_train_dataset)
print(tokenized_train_dataset.features["ner_tags"])
print(tokenized_train_dataset.features["ner_tags"].feature)
all_labels = set(tag for tags in tokenized_train_dataset["ner_tags"] for tag in tags)
print(all_labels)
num_labels = len(all_labels)
model = XLMRobertaForTokenClassification(model_name, num_labels=num_labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=tokenizer,
    data_collator=collate_fn
)

trainer.train()

results = trainer.evaluate()
print("Evaluation Results:", results)



Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 40425
})
Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)
Value(dtype='int64', id=None)
{0, 1, 2, 3, 4, 5, 6, 7, 8}


  trainer = Trainer(
  'input_ids': torch.tensor(padded_sentences),
  'labels': torch.tensor(padded_labels),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'loss': 0.2408, 'grad_norm': 2.1845710277557373, 'learning_rate': 2.0344009489916967e-05, 'epoch': 0.5931198102016607}
{'eval_loss': 0.1031973585486412, 'eval_runtime': 36.2539, 'eval_samples_per_second': 180.56, 'eval_steps_per_second': 3.779, 'epoch': 0.5931198102016607}


  'input_ids': torch.tensor(padded_sentences),
  'labels': torch.tensor(padded_labels),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'train_runtime': 804.3589, 'train_samples_per_second': 50.257, 'train_steps_per_second': 1.048, 'train_loss': 0.17864773253670507, 'epoch': 1.0}


  'input_ids': torch.tensor(padded_sentences),
  'labels': torch.tensor(padded_labels),
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_loss': 0.08001686632633209, 'eval_runtime': 36.4243, 'eval_samples_per_second': 179.715, 'eval_steps_per_second': 3.761, 'epoch': 1.0}
Evaluation Results: {'eval_loss': 0.08001686632633209, 'eval_runtime': 36.4243, 'eval_samples_per_second': 179.715, 'eval_steps_per_second': 3.761, 'epoch': 1.0}
