# Preprosses and Dataset

In [None]:
!pip install transformers==4.28.0 datasets evaluate
!pip install hazm
!pip install tqdm

In [5]:
import torch
import transformers
import pandas as pd
import hazm
from tqdm import tqdm

In [3]:
dataset = pd.read_csv('normalized_raw_data.csv')
data = dataset['description'].tolist()
labels = dataset['cat1'].tolist()

In [6]:
from hazm import sent_tokenize, Normalizer
import re

print('orgi len', len(data))
normalized_description_list = []
normalizer = Normalizer()
for description in tqdm(data, desc = 'Normalization'):
    normalized_description = re.sub('[:,،.<>/!@#$%~{}();»«…“”"؛؟◊♦–\*\+_\^]', ' ', description)
    normalized_description = normalizer.normalize(normalized_description)
    normalized_description_list.append(normalized_description)
print('norm len: ', len(normalized_description_list))

orgi len 1428



Normalization:   0%|                                                                           | 0/1428 [00:00<?, ?it/s][A
Normalization: 100%|██████████████████████████████████████████████████████████████| 1428/1428 [00:00<00:00, 6366.03it/s][A

norm len:  1428





In [7]:
# Define the mapping from strings to numbers
mapping = {
    "electronic-devices": 0,
    "vehicles": 1,
    "real-estate": 2,
    "home-kitchen": 3
}

# Convert strings to numbers based on the mapping
labels = [mapping[item] for item in labels]

In [9]:
from sklearn.model_selection import train_test_split

# Create a DataFrame
df = pd.DataFrame({'data': normalized_description_list, 'labels': labels})

# Split the DataFrame into train, test, and validation sets
train_df, test_val_df = train_test_split(df, test_size=0.2, random_state=42)
test_df, val_df = train_test_split(test_val_df, test_size=0.5, random_state=42)

# Reset the index of the DataFrames
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

# Print the number of samples in each set
print("Train set size:", len(train_df))
print("Test set size:", len(test_df))
print("Validation set size:", len(val_df))

Train set size: 1142
Test set size: 143
Validation set size: 143


In [10]:
print(train_df)

                                                   data  labels
0     قیمت تمامی برندها را از ما استعلام کنید مینی و...       3
1     سلام به دوستان فریزر پارس ۷ کشو سلام وتمیز لطف...       3
2     مدل ۱۴۰۰ بدون تعمیر خانگی بدون خط و خش استارت ...       1
3     اقساط از یک ماه تا ۳۶ ماه فروش بدون پیش پرداخت...       0
4     ماشین لباسشویی سامسونگ شش کیلویی فروش بدلیل مه...       3
...                                                 ...     ...
1137  به قیمت ۱۴۰۱خرید کنید ۱۰۶متر دوخواب از دو طرف ...       2
1138  یخچال فریزر ۲۷ فوت رد جنرال دسته مخفی چراغ‌های...       3
1139  لپ تاپ Dell پردازنده core۲Dou رم ۴ هارد ۵۰۰ گی...       0
1140  ماشین فوق العاده تمیز در حد دارای بیمه بدنه ۶ ...       1
1141  فروش انواع یخچال فریزه و ساید بای ساید‌های است...       3

[1142 rows x 2 columns]


In [11]:
from datasets import Dataset, DatasetDict

import datasets
import pandas as pd

datasets_train_test = DatasetDict({
    "train": Dataset.from_pandas(train_df),
    "test": Dataset.from_pandas(test_df),
    "validation": Dataset.from_pandas(val_df)
    })

In [12]:
datasets_train_test

DatasetDict({
    train: Dataset({
        features: ['data', 'labels'],
        num_rows: 1142
    })
    test: Dataset({
        features: ['data', 'labels'],
        num_rows: 143
    })
    validation: Dataset({
        features: ['data', 'labels'],
        num_rows: 143
    })
})

In [13]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-fa-zwnj-base")


Downloading (…)okenizer_config.json: 100%|█████████████████████████████████████████████| 292/292 [00:00<00:00, 23.7kB/s][A

Downloading (…)lve/main/config.json: 100%|█████████████████████████████████████████████| 565/565 [00:00<00:00, 62.1kB/s][A

Downloading (…)solve/main/vocab.txt:   0%|                                                   | 0.00/426k [00:00<?, ?B/s][A
Downloading (…)solve/main/vocab.txt: 100%|███████████████████████████████████████████| 426k/426k [00:00<00:00, 1.01MB/s][A

Downloading (…)/main/tokenizer.json:   0%|                                                  | 0.00/1.11M [00:00<?, ?B/s][A
Downloading (…)/main/tokenizer.json: 100%|█████████████████████████████████████████| 1.11M/1.11M [00:00<00:00, 2.26MB/s][A

Downloading (…)cial_tokens_map.json: 100%|█████████████████████████████████████████████| 134/134 [00:00<00:00, 42.1kB/s][A


In [14]:
def preprocess_function(examples):
    return tokenizer(examples["data"], truncation=True)

tokenized_dataset = datasets_train_test.map(preprocess_function, batched=True)


Map:   0%|                                                                              | 0/1142 [00:00<?, ? examples/s][A
Map:  88%|████████████████████████████████████████████████████████▉        | 1000/1142 [00:00<00:00, 3499.90 examples/s][A
                                                                                                                        [A
Map:   0%|                                                                               | 0/143 [00:00<?, ? examples/s][A
                                                                                                                        [A
Map:   0%|                                                                               | 0/143 [00:00<?, ? examples/s][A
                                                                                                                        [A

In [15]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['data', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1142
    })
    test: Dataset({
        features: ['data', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 143
    })
    validation: Dataset({
        features: ['data', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 143
    })
})

In [16]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
import numpy as np

# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     return accuracy.compute(predictions=predictions, references=labels)

from datasets import load_metric
metric1 = load_metric("precision")
metric2 = load_metric("recall")
metric3 = load_metric("f1")
metric = load_metric('accuracy')


def compute_metrics(eval_pred):
    metric1 = load_metric("precision")
    metric2 = load_metric("recall")
    metric3 = load_metric("f1")
    metric4 = load_metric("accuracy")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    precision = metric1.compute(predictions=predictions, references=labels, average="micro")["precision"]
    recall = metric2.compute(predictions=predictions, references=labels, average="micro")["recall"]
    f1 = metric3.compute(predictions=predictions, references=labels, average="micro")["f1"]
    accuracy = metric4.compute(predictions=predictions, references=labels)["accuracy"]

    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}

  if __name__ == "__main__":

Downloading builder script: 7.55kB [00:00, 2.73MB/s]                                                                    [A

Downloading builder script: 7.38kB [00:00, 3.81MB/s]                                                                    [A

Downloading builder script: 6.50kB [00:00, 3.36MB/s]                                                                    [A

Downloading builder script: 4.21kB [00:00, 2.32MB/s]                                                                    [A


In [18]:
from datasets import load_metric
import numpy as np

metric1 = load_metric("precision")
metric2 = load_metric("recall")
metric3 = load_metric("f1")
metric4 = load_metric("accuracy")
metric5 = evaluate.load("BucketHeadP65/confusion_matrix")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    precision = metric1.compute(predictions=predictions, references=labels, average="micro")["precision"]
    recall = metric2.compute(predictions=predictions, references=labels, average="micro")["recall"]
    f1_micro = metric3.compute(predictions=predictions, references=labels, average="micro")["f1"]
    f1_macro = metric3.compute(predictions=predictions, references=labels, average="macro")["f1"]
    accuracy = metric4.compute(predictions=predictions, references=labels)["accuracy"]
    confusion_matrix = metric5.compute(predictions=predictions, references=labels)["confusion_matrix"]

    return {"precision": precision, "recall": recall, "f1_micro": f1_micro, "f1_macro": f1_macro, "accuracy": accuracy, "confusion_matrix": confusion_matrix}


NameError: name 'evaluate' is not defined

In [20]:
id2label = {0: "electronic-devices", 1: "vehicles", 2: "real-estate",
            3: "home-kitchen"}
label2id = {"electronic-devices": 0, "vehicles": 1, "real-estate": 2,
            "home-kitchen": 3}

# ParsBert V3

In [21]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "HooshvareLab/bert-fa-zwnj-base", num_labels=4, id2label=id2label, label2id=label2id)

Downloading pytorch_model.bin: 100%|█████████████████████████████████████████████████| 473M/473M [00:55<00:00, 8.59MB/s]
Some weights of the model checkpoint at HooshvareLab/bert-fa-zwnj-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassificati

In [22]:
from huggingface_hub import notebook_login

notebook_login()

In [26]:
training_args = TrainingArguments(
    output_dir="ParsBERT_V3_results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    # load_best_model_at_end=True,
    logging_steps=20,
    save_steps=50,
    logging_dir='./ParsBERT_V3_logs'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [27]:
trainer.train()



Step,Training Loss
20,0.0396
40,0.0138
60,0.0533
80,0.0111
100,0.0015
120,0.0116
140,0.0178
160,0.0014
180,0.0045
200,0.0129


TrainOutput(global_step=216, training_loss=0.015668955967865057, metrics={'train_runtime': 52.7772, 'train_samples_per_second': 64.914, 'train_steps_per_second': 4.093, 'total_flos': 315873577393296.0, 'train_loss': 0.015668955967865057, 'epoch': 3.0})

In [5]:
# trainer.evaluate()
from transformers import AutoModelForSequenceClassification, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-zwnj-base')
model = AutoModelForSequenceClassification.from_pretrained(r'Phase 1/ParsBERT_V3_results/checkpoint-216')
model.evaluate

OSError: Can't load the configuration of 'Phase 1/ParsBERT_V3_results/checkpoint-216'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'Phase 1/ParsBERT_V3_results/checkpoint-216' is the correct path to a directory containing a config.json file