In [1]:
# 🧹 حذف کامل پکیج‌های ناسازگار
!pip uninstall -y transformers peft accelerate
!rm -rf /usr/local/lib/python3.11/dist-packages/transformers
!rm -rf /usr/local/lib/python3.11/dist-packages/peft
!rm -rf /usr/local/lib/python3.11/dist-packages/accelerate

# ✅ نصب نسخه‌های سالم و سازگار
!pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu \
    numpy==1.26.2 pandas==2.2.2 torch torchvision torchaudio \
    datasets==2.14.0 pyarrow==14.0.2 \
    scikit-learn fastapi uvicorn python-multipart \
    transformers==4.41.0 peft==0.10.0 accelerate hazm

Found existing installation: transformers 4.41.0
Uninstalling transformers-4.41.0:
  Successfully uninstalled transformers-4.41.0
Found existing installation: peft 0.10.0
Uninstalling peft-0.10.0:
  Successfully uninstalled peft-0.10.0
Found existing installation: accelerate 1.9.0
Uninstalling accelerate-1.9.0:
  Successfully uninstalled accelerate-1.9.0
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
Collecting transformers==4.41.0
  Downloading transformers-4.41.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.10.0
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting accelerate
  Downloading accelerate-1.9.0-py3-none-any.whl.metadata (19 kB)
Downloading transformers-4.41.0-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m82.5 MB/s[0m eta [36m

In [2]:
from transformers import Trainer
from peft import PeftModel

print("✅ همه پکیج‌ها بدون مشکل ایمپورت شدند.")

✅ همه پکیج‌ها بدون مشکل ایمپورت شدند.


In [3]:
# 📦 Import core libraries
import pandas as pd
import numpy as np
import os
import torch

# 🔤 Text preprocessing
from hazm import Normalizer
from sklearn.preprocessing import LabelEncoder

# 🤗 Transformers and Datasets
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    logging
)

# ✅ Set transformers logging level
logging.set_verbosity_info()

# ✅ Check GPU availability (optional, for Colab/Notebook)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"✅ Torch is using: {device.upper()}")

✅ Torch is using: CPU


In [4]:
# --- 1. Configuration ---
MODEL_NAME = "HooshvareLab/bert-base-parsbert-uncased"
DATASET_URLS = [
    "https://raw.githubusercontent.com/davardoust/PHICAD/main/PHICAD-part1.csv",
    "https://raw.githubusercontent.com/davardoust/PHICAD/main/PHICAD-part2.csv"
]
OUTPUT_DIR = "./phicad_model"
LOGGING_DIR = './phicad_logs'

In [5]:
def download_and_prepare_dataset():
    print("--- Downloading and preparing dataset ---")
    try:
        df1 = pd.read_csv(DATASET_URLS[0], sep="\\t", header=0, on_bad_lines='warn')
        df2 = pd.read_csv(DATASET_URLS[1], sep="\\t", header=0, on_bad_lines='warn')
        df = pd.concat([df1, df2], ignore_index=True)
        print(f"Initial number of rows: {len(df)}")

        expected_columns = ['comment_normalized', 'class']
        df = df[expected_columns]
        df.dropna(subset=['class', 'comment_normalized'], inplace=True)
        df = df[df['comment_normalized'].str.strip() != '']

        print(f"Number of rows after cleaning: {len(df)}")

        label_encoder = LabelEncoder()
        df['label'] = label_encoder.fit_transform(df['class'])
        label_map = {i: label for i, label in enumerate(label_encoder.classes_)}
        print("Class mapping:", label_map)

        dataset = Dataset.from_pandas(df[['comment_normalized', 'label']])

        if len(dataset) > 2000:
            dataset = dataset.select(range(2000))
        print(f"✅ Dataset prepared and subsetted to {len(dataset)} rows.")
        return dataset, label_map
    except Exception as e:
        print(f"❌ Error: {e}")
        raise

In [6]:
dataset, label_map = download_and_prepare_dataset()

--- Downloading and preparing dataset ---


  df1 = pd.read_csv(DATASET_URLS[0], sep="\\t", header=0, on_bad_lines='warn')
  df2 = pd.read_csv(DATASET_URLS[1], sep="\\t", header=0, on_bad_lines='warn')


Initial number of rows: 301460
Number of rows after cleaning: 131958
Class mapping: {0: 'clean', 1: 'hate', 2: 'hateobscene', 3: 'obscene', 4: 'spam', 5: 'spamobscene'}
✅ Dataset prepared and subsetted to 2000 rows.


In [7]:
print("\n--- Initializing tokenizer and normalizer ---")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
normalizer = Normalizer()

def preprocess_function(examples):
    normalized_texts = [normalizer.normalize(text) for text in examples['comment_normalized']]
    return tokenizer(normalized_texts, truncation=True, padding='max_length', max_length=128)

print("Tokenizing dataset...")
tokenized_dataset = dataset.map(preprocess_function, batched=True)


--- Initializing tokenizer and normalizer ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--HooshvareLab--bert-base-parsbert-uncased/snapshots/d73a0e2c7492c33bd5819bcdb23eba207404dd19/config.json
Model config BertConfig {
  "_name_or_path": "HooshvareLab/bert-base-parsbert-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range

Tokenizing dataset...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [8]:
splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = splits['train']
eval_dataset = splits['test']
print(f"Train dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")

Train dataset size: 1800
Evaluation dataset size: 200


In [9]:
print("\n--- Loading pre-trained model ---")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_map),
    id2label={i: label for i, label in label_map.items()},
    label2id={label: i for i, label in label_map.items()}
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--HooshvareLab--bert-base-parsbert-uncased/snapshots/d73a0e2c7492c33bd5819bcdb23eba207404dd19/config.json
Model config BertConfig {
  "_name_or_path": "HooshvareLab/bert-base-parsbert-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "clean",
    "1": "hate",
    "2": "hateobscene",
    "3": "obscene",
    "4": "spam",
    "5": "spamobscene"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "clean": 0,
    "hate": 1,
    "hateobscene": 2,
    "obscene": 3,
    "spam": 4,
    "spamobscene": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolut


--- Loading pre-trained model ---


Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassifica

In [10]:
print("\n--- Setting up training ---")

# غیرفعال‌سازی W&B logging
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./phicad_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_steps=100,
    logging_dir=LOGGING_DIR,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



--- Setting up training ---


In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics  # فعال‌سازی در صورت نیاز
)

In [16]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer.compute_metrics = compute_metrics

print("🚀 Starting training...")
trainer.train()
print("✅ Training complete!")

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, comment_normalized. If __index_level_0__, comment_normalized are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1,800
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 225
  Number of trainable parameters = 162,845,958


🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6179,0.498688,0.815,0.809339,0.805814,0.815


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, comment_normalized. If __index_level_0__, comment_normalized are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8


Epoch,Training Loss,Validation Loss


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Saving model checkpoint to ./phicad_results/checkpoint-225
Configuration saved in ./phicad_results/checkpoint-225/config.json
Model weights saved in ./phicad_results/checkpoint-225/model.safetensors
tokenizer config file saved in ./phicad_results/checkpoint-225/tokenizer_config.json
Special tokens file saved in ./phicad_results/checkpoint-225/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./phicad_results/checkpoint-225 (score: 0.815).


✅ Training complete!


In [17]:
print("\n--- Evaluating model ---")
metrics = trainer.evaluate()
print("Evaluation metrics:", metrics)

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, comment_normalized. If __index_level_0__, comment_normalized are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8



--- Evaluating model ---


Evaluation metrics: {'eval_loss': 0.49868765473365784, 'eval_accuracy': 0.815, 'eval_f1': 0.8093390095792842, 'eval_precision': 0.8058141025641027, 'eval_recall': 0.815, 'eval_runtime': 83.3041, 'eval_samples_per_second': 2.401, 'eval_steps_per_second': 0.3, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
print(f"\n--- Saving model and tokenizer to {OUTPUT_DIR} ---")
os.makedirs(OUTPUT_DIR, exist_ok=True)
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"✅ Model and tokenizer saved successfully to {OUTPUT_DIR}")

Saving model checkpoint to ./phicad_model
Configuration saved in ./phicad_model/config.json



--- Saving model and tokenizer to ./phicad_model ---


Model weights saved in ./phicad_model/model.safetensors
tokenizer config file saved in ./phicad_model/tokenizer_config.json
Special tokens file saved in ./phicad_model/special_tokens_map.json
tokenizer config file saved in ./phicad_model/tokenizer_config.json
Special tokens file saved in ./phicad_model/special_tokens_map.json


✅ Model and tokenizer saved successfully to ./phicad_model


In [21]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# --- Load saved model and tokenizer ---
model_path = OUTPUT_DIR  # "./phicad_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()  # Set to evaluation mode

# --- Sample inputs ---
test_texts = [
    "این محصول خیلی عالی بود و من کاملاً راضی‌ام",
    "افتضاحه، اصلاً پیشنهاد نمی‌کنم",
    "قیمتش مناسبه ولی کیفیتش پایینه",
    "کیرم توی کسی که سازندشو ساخت",
    "اسرائیل توی جنگ با ایران پیروز شد",
]

# --- Prediction ---
for text in test_texts:
    # Normalize if needed
    normalized_text = normalizer.normalize(text)

    # Tokenize
    inputs = tokenizer(normalized_text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=1).item()
        predicted_label = label_map[predicted_class_id]

    print(f"📝 متن: {text}")
    print(f"🔎 پیش‌بینی: {predicted_label}")
    print("-" * 50)

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./phicad_model/config.json
Model config BertConfig {
  "_name_or_path": "./phicad_model",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "clean",
    "1": "hate",
    "2": "hateobscene",
    "3": "obscene",
    "4": "spam",
    "5": "spamobscene"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "clean": 0,
    "hate": 1,
    "hateobscene": 2,
    "obscene": 3,
    "spam": 4,
    "spamobscene": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "abso

📝 متن: این محصول خیلی عالی بود و من کاملاً راضی‌ام
🔎 پیش‌بینی: clean
--------------------------------------------------
📝 متن: افتضاحه، اصلاً پیشنهاد نمی‌کنم
🔎 پیش‌بینی: hate
--------------------------------------------------
📝 متن: قیمتش مناسبه ولی کیفیتش پایینه
🔎 پیش‌بینی: obscene
--------------------------------------------------
📝 متن: کیرم توی کسی که سازندشو ساخت
🔎 پیش‌بینی: obscene
--------------------------------------------------
📝 متن: اسرائیل توی جنگ با ایران پیروز شد
🔎 پیش‌بینی: hate
--------------------------------------------------
