# Dataset

In [1]:
import torch
import transformers
import pandas as pd
import hazm
from tqdm import tqdm

In [2]:
file_path = 'All.conll'

dataset = pd.read_csv(file_path, delimiter='  ', skip_blank_lines=False)

  dataset = pd.read_csv(file_path, delimiter='  ', skip_blank_lines=False)


In [3]:
dataset.head()

Unnamed: 0,word,tag
0,۵,O
1,طبقه,O
2,۳,B-Attributes of the property (A)
3,واحدی,I-Attributes of the property (A)
4,طبقه,B-Attributes of the property (A)


In [4]:
sentences = []
sentence = []
list_tags = []
list_tag = []
for index, row in dataset.iterrows():
    if pd.isna(row['word']):
        sentences.append(sentence)
        sentence = []
        list_tags.append(list_tag)
        list_tag = []
        pass
    else:
        sentence.append(row['word'])
        list_tag.append(row['tag'])

In [5]:
print(len(sentences))

558


In [6]:
# Define the mapping from strings to numbers
mapping = {
    "O": 0,
    "B-Locality (L)": 1,
    "I-Locality (L)": 2,
    "B-Total Price (P)": 3,
    "I-Total Price (P)": 4,
    "B-Land Area (LA)": 5,
    "I-Land Area (LA)": 6,
    "B-Cost per land area (C)": 7,
    "I-Cost per land area (C)": 8,
    "B-Contact name (N)": 9,
    "I-Contact name (N)": 10,
    "B-Contact telephone (T)": 11,
    "I-Contact telephone (T)": 12,
    "B-Attributes of the property (A)": 13,
    "I-Attributes of the property (A)": 14
}

# Convert strings to numbers based on the mapping
list_tags = [[mapping[item] for item in list_tag] for list_tag in list_tags]

In [7]:
df = pd.DataFrame({'tokens': sentences, 'ner_tags': list_tags})
df.head()

Unnamed: 0,tokens,ner_tags
0,"[۵, طبقه, ۳, واحدی, طبقه, دوم, ۲, خوابه, ۲, سر...","[0, 0, 13, 14, 13, 14, 13, 14, 13, 14, 14, 13,..."
1,"[میدان, ولیعصر, چهار, پارکینگ, ۲۵۵, متر, زمین,...","[1, 2, 13, 14, 5, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0..."
2,"[بهترین, خرید, و, سرمایه, گذاری, سال, فرعی, نف...","[0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 2, 13, 14, 13, ..."
3,"[۴۳۰, مترمستغلات, سند, اداری, دسترسی, عالی, به...","[5, 6, 13, 14, 13, 14, 14, 14, 14, 14, 14, 13,..."
4,"[دید, و, نور, عالی, خیابان, اصلی, بسیار, خوش, ...","[13, 14, 14, 14, 1, 2, 0, 13, 14, 0, 0, 0, 1, ..."


In [8]:
from sklearn.model_selection import train_test_split

# Split the DataFrame into train, test, and validation sets
train_df, test_val_df = train_test_split(df, test_size=0.2, random_state=42)
test_df, val_df = train_test_split(test_val_df, test_size=0.5, random_state=42)

# Reset the index of the DataFrames
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

# Print the number of samples in each set
print("Train set size:", len(train_df))
print("Test set size:", len(test_df))
print("Validation set size:", len(val_df))

Train set size: 446
Test set size: 56
Validation set size: 56


In [9]:
from datasets import Dataset, DatasetDict

import datasets
import pandas as pd

datasets_train_test = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "test": Dataset.from_pandas(test_df),
    "validation": Dataset.from_pandas(val_df)
    })

In [10]:
datasets_train_test

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 446
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 56
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 56
    })
})

# preprocess

In [11]:
raw_datasets = datasets_train_test
raw_datasets["train"][0]["tokens"]

['بزرگ\u200cترین',
 'شبکه',
 'فروش',
 'و',
 'بروز\u200cترین',
 'سامانه',
 'فایلینگ',
 'ملک',
 'در',
 'تبریز',
 'با',
 'بیش',
 'از',
 '۶۰',
 'کارشناس',
 'فعال',
 'در',
 'سطح',
 'تبریز',
 'شخصی',
 'ساز',
 'تک',
 'واحده',
 'دو',
 'انباری',
 'برای',
 'هر',
 'واحد',
 'مستر',
 'دار',
 'نما',
 'و',
 'مشاعات',
 'شیک',
 'آسانسور',
 '۶',
 'نفره',
 'پارکینگ',
 'اختصاصی',
 'پوشش',
 'کف',
 'سرامیک',
 'آماده',
 'تحویل',
 'فروشنده',
 'واقعی',
 'جهت',
 'کسب',
 'اطلاعات',
 'بیشتر',
 'تماس',
 'بگیرید',
 'کارشناس',
 'منطقه',
 'اسعدی',
 'از',
 'دادن',
 'آدرس',
 'دقیق',
 'واحد',
 'به',
 'صورت',
 'تلفنی',
 'معذوریم',
 'آدرس',
 'دفتر',
 'چهارراه',
 'منصور',
 'پشت',
 'برج',
 'ابریشم']

In [12]:
raw_datasets["train"][0]["ner_tags"]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 13,
 14,
 13,
 14,
 14,
 14,
 13,
 0,
 0,
 13,
 0,
 0,
 13,
 14,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 11,
 12,
 9,
 10,
 10,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 2,
 2,
 2,
 2,
 2]

In [13]:
label_names = ["O", "B-Locality (L)", "I-Locality (L)",
    "B-Total Price (P)", "I-Total Price (P)", "B-Land Area (LA)", "I-Land Area (LA)",
    "B-Cost per land area (C)", "I-Cost per land area (C)", "B-Contact name (N)",
    "I-Contact name (N)", "B-Contact telephone (T)", "I-Contact telephone (T)", 
    "B-Attributes of the property (A)", "I-Attributes of the property (A)"]

In [14]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

بزرگ‌ترین شبکه فروش و بروز‌ترین سامانه فایلینگ ملک در تبریز          با بیش از ۶۰ کارشناس فعال در سطح تبریز          شخصی ساز تک واحده دو انباری برای هر واحد مستر                             دار                              نما                              و                                مشاعات                           شیک                              آسانسور                          ۶ نفره پارکینگ                          اختصاصی پوشش کف                               سرامیک                           آماده تحویل فروشنده واقعی جهت کسب اطلاعات بیشتر تماس                    بگیرید                  کارشناس            منطقه              اسعدی              از دادن آدرس دقیق واحد به صورت تلفنی معذوریم آدرس           دفتر           چهارراه        منصور          پشت            برج            ابریشم         
O         O    O    O O         O      O       O   O  B-Locality (L) O  O   O  O  O       O    O  O   B-Locality (L) O    O   O  O     O  O      O    O  O    B-Attributes of the property (

# model

In [15]:
from transformers import AutoTokenizer

model_checkpoint = "HooshvareLab/bert-fa-zwnj-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [16]:
tokenizer.is_fast

True

In [17]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'بزرگ',
 '[ZWNJ]',
 'ترین',
 'شبکه',
 'فروش',
 'و',
 'بروز',
 '[ZWNJ]',
 'ترین',
 'سامانه',
 'فایلی',
 '##نگ',
 'ملک',
 'در',
 'تبریز',
 'با',
 'بیش',
 'از',
 '۶۰',
 'کارشناس',
 'فعال',
 'در',
 'سطح',
 'تبریز',
 'شخصی',
 'ساز',
 'تک',
 'واحده',
 'دو',
 'انباری',
 'برای',
 'هر',
 'واحد',
 'مستر',
 'دار',
 'نما',
 'و',
 'مشاع',
 '##ات',
 'شیک',
 'آ',
 '##سانس',
 '##ور',
 '۶',
 'نفره',
 'پارکینگ',
 'اختصاصی',
 'پوشش',
 'کف',
 'سرامیک',
 'آ',
 '##ماده',
 'تحویل',
 'فروشنده',
 'واقعی',
 'جهت',
 'کسب',
 'اطلاعات',
 'بیشتر',
 'تماس',
 'بگیرید',
 'کارشناس',
 'منطقه',
 'اسعدی',
 'از',
 'دادن',
 'آ',
 '##در',
 '##س',
 'دقیق',
 'واحد',
 'به',
 'صورت',
 'تلفنی',
 'معذ',
 '##وریم',
 'آ',
 '##در',
 '##س',
 'دفتر',
 'چهارراه',
 'منصور',
 'پشت',
 'برج',
 'ابریشم',
 '[SEP]']

In [18]:
inputs.word_ids()

[None,
 0,
 0,
 0,
 1,
 2,
 3,
 4,
 4,
 4,
 5,
 6,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 32,
 33,
 34,
 34,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 57,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 63,
 64,
 64,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 None]

In [19]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [20]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 13, 14, 14, 14, 13, 0, 0, 13, 0, 0, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0, 11, 12, 9, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 13, 14, 14, 14, 14, 13, 14, 14, 0, 0, 13, 0, 0, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 12, 9, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, -100]


In [21]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [22]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/446 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

In [23]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [24]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    1,    0,    0,    0,    0,    0,    0,    0,    0,
            1,    0,    0,    0,    0,    0,    0,    0,    0,    0,   13,   14,
           13,   14,   14,   14,   14,   13,   14,   14,    0,    0,   13,    0,
            0,   13,   14,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           11,   12,    9,   10,   10,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    1,    2,    2,    2,    2,    2,    2,
            2,    2, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    1,    2,    2,    2,    2,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   13,   14,
           14,   14,   14,   14,   14,    0,    0,    0,   13,   14,   14,   14,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,


In [25]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 13, 14, 14, 14, 14, 13, 14, 14, 0, 0, 13, 0, 0, 13, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 12, 9, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, -100]
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 14, 14, 14, 14, 14, 14, 0, 0, 0, 13, 14, 14, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 10, 0, 0, 0, 0, 0, 0, 0, 11, 12, 12, 9, 10, 10, 1, 2, 0, 0, 0, 0, -100]


In [26]:
import evaluate

metric = evaluate.load("seqeval")

Using the latest cached version of the module from /home/user01/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Mon Jul  3 23:26:31 2023) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.


In [27]:
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-Locality (L)',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-Locality (L)',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-Attributes of the property (A)',
 'I-Attributes of the property (A)',
 'B-Attributes of the property (A)',
 'I-Attributes of the property (A)',
 'I-Attributes of the property (A)',
 'I-Attributes of the property (A)',
 'B-Attributes of the property (A)',
 'O',
 'O',
 'B-Attributes of the property (A)',
 'O',
 'O',
 'B-Attributes of the property (A)',
 'I-Attributes of the property (A)',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-Contact telephone (T)',
 'I-Contact telephone (T)',
 'B-Contact name (N)',
 'I-Contact name (N)',
 'I-Contact name (N)',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-Locality (L)',
 'I-Locality (L)',
 'I-Locality (L)',
 'I-Locality (L)',
 'I-Locality (L)',
 'I-Locality (L)',
 'I-Locality (L)']

In [29]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'Attributes of the property (A)': {'precision': 1.0,
  'recall': 1.0,
  'f1': 1.0,
  'number': 5},
 'Contact name (N)': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'Contact telephone (T)': {'precision': 1.0,
  'recall': 1.0,
  'f1': 1.0,
  'number': 1},
 'Locality (L)': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 3},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [30]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [32]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [33]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at HooshvareLab/bert-fa-zwnj-base were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-zwnj-

In [35]:
model.config.num_labels

15

In [36]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="ParsBERT_V3_ner_results",
    # evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=20,
    save_strategy="epoch",
    weight_decay=0.01,
    logging_steps=20,
    save_steps=50,
    logging_dir='ParsBERT_V3_ner_logs'
)

In [37]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [38]:
trainer.train()



Step,Training Loss
20,1.5391
40,0.8779
60,0.6654
80,0.5724
100,0.5114
120,0.4726
140,0.4069
160,0.3794
180,0.3302
200,0.3


TrainOutput(global_step=560, training_loss=0.31805915662220546, metrics={'train_runtime': 2189.194, 'train_samples_per_second': 4.075, 'train_steps_per_second': 0.256, 'total_flos': 837345267265500.0, 'train_loss': 0.31805915662220546, 'epoch': 20.0})

In [39]:
trainer.evaluate()

{'eval_loss': 0.8843041658401489,
 'eval_precision': 0.40850277264325324,
 'eval_recall': 0.4682203389830508,
 'eval_f1': 0.4363277393879565,
 'eval_accuracy': 0.8177975058127246,
 'eval_runtime': 5.1568,
 'eval_samples_per_second': 10.86,
 'eval_steps_per_second': 0.776,
 'epoch': 20.0}