In [1]:
# If you are running inside Jupyter and want to install the packages from here:
# NOTE(review): `hazm` is imported by a later cell but is not part of this
# install list — install it separately if it is not already available.
!pip install --extra-index-url https://download.pytorch.org/whl/cpu \
    numpy==1.26.2 pandas==2.2.2 torch torchvision torchaudio \
    datasets==2.14.0 transformers==4.41.0 pyarrow==14.0.2 \
    accelerate scikit-learn fastapi uvicorn python-multipart

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu
Collecting numpy==1.26.2
  Downloading numpy-1.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting datasets==2.14.0
  Downloading datasets-2.14.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers==4.41.0
  Downloading transformers-4.41.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyarrow==14.0.2
  Downloading pyarrow-14.0.2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.14.0)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.0)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_6

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, logging
from hazm import Normalizer
import torch
import os
import numpy as np

# `logging` here is the transformers logging helper imported above (not the
# stdlib module); switch it to INFO level.
logging.set_verbosity_info()

In [None]:
# --- 1. Configuration ---
# Pretrained checkpoint that the tokenizer and classifier are loaded from.
MODEL_NAME = 'HooshvareLab/bert-base-parsbert-uncased'

# Remote CSV parts of the PHICAD data; they are read and concatenated in order.
DATASET_URLS = [
    'https://raw.githubusercontent.com/davardoust/PHICAD/main/PHICAD-part1.csv',
    'https://raw.githubusercontent.com/davardoust/PHICAD/main/PHICAD-part2.csv',
]

# Where the final model/tokenizer are saved, and where training logs go.
OUTPUT_DIR, LOGGING_DIR = './phicad_model', './phicad_logs'

In [None]:
def download_and_prepare_dataset(max_rows=2000):
    """Download the PHICAD CSV parts and turn them into a labelled ``Dataset``.

    Every URL in ``DATASET_URLS`` is read as tab-separated text and the parts
    are concatenated. Only the ``comment_normalized`` and ``class`` columns are
    kept; rows where either is missing or the comment is blank are dropped.
    ``class`` is integer-encoded into a ``label`` column and the result is
    wrapped in a ``datasets.Dataset``.

    Args:
        max_rows: Upper bound on the number of rows returned. When the cleaned
            data is larger, only the first ``max_rows`` rows (file order) are
            kept. ``None`` disables the cap. Defaults to 2000.

    Returns:
        tuple: ``(dataset, label_map)``. ``dataset`` has the columns
        ``comment_normalized`` and ``label``; ``label_map`` maps each integer
        label id to the original class value (as a plain Python object).

    Raises:
        KeyError: If an expected column is missing from the downloaded data.
        ValueError: If no usable row is left after cleaning.
        Exception: Any other download/parsing error is printed and re-raised.
    """
    print("--- Downloading and preparing dataset ---")
    try:
        # A literal tab (rather than the two-character regex backslash-t) keeps
        # pandas on its default C parser instead of the slower python engine.
        parts = [
            pd.read_csv(url, sep="\t", header=0, on_bad_lines='warn')
            for url in DATASET_URLS
        ]
        raw_df = pd.concat(parts, ignore_index=True)
        print(f"Initial number of rows: {len(raw_df)}")

        expected_columns = ['comment_normalized', 'class']
        missing = [col for col in expected_columns if col not in raw_df.columns]
        if missing:
            raise KeyError(f"Missing expected column(s): {missing}")

        # Every step builds a new frame (no in-place edits on a slice), which
        # avoids SettingWithCopyWarning and keeps the function re-runnable.
        clean_df = raw_df[expected_columns].dropna(subset=expected_columns)
        # Cast to str so a cell parsed as a number cannot slip past the
        # blank-text filter and break text processing later on.
        comments = clean_df['comment_normalized'].astype(str)
        clean_df = (
            clean_df.assign(comment_normalized=comments)
            .loc[comments.str.strip() != '']
            .reset_index(drop=True)
        )

        print(f"Number of rows after cleaning: {len(clean_df)}")
        if clean_df.empty:
            raise ValueError("No usable rows left after cleaning the dataset.")

        label_encoder = LabelEncoder()
        clean_df = clean_df.assign(
            label=label_encoder.fit_transform(clean_df['class'])
        )
        # classes_ holds numpy scalars; convert them to plain Python values so
        # the mapping can be serialised (e.g. as part of a model config).
        label_map = {
            idx: (label.item() if hasattr(label, 'item') else label)
            for idx, label in enumerate(label_encoder.classes_)
        }
        print("Class mapping:", label_map)

        # preserve_index=False stops the pandas index from being stored as an
        # extra "__index_level_0__" column in the Dataset.
        dataset = Dataset.from_pandas(
            clean_df[['comment_normalized', 'label']], preserve_index=False
        )

        # NOTE(review): the cap keeps rows in file order, it does not sample.
        # If the source files are ordered by class the subset may be skewed —
        # confirm, and shuffle before selecting if so.
        if max_rows is not None and len(dataset) > max_rows:
            dataset = dataset.select(range(max_rows))
            print(f"✅ Dataset prepared and subsetted to {len(dataset)} rows.")
        else:
            print(f"✅ Dataset prepared with {len(dataset)} rows.")
        return dataset, label_map
    except Exception as e:
        print(f"❌ Error: {e}")
        raise

In [None]:
# Fetch and clean the data once; `label_map` (label id -> class value) is
# reused below when the classification model is configured.
dataset, label_map = download_and_prepare_dataset()

In [None]:
# Load the tokenizer belonging to MODEL_NAME and create a hazm Normalizer;
# both are used by preprocess_function below.
print("\n--- Initializing tokenizer and normalizer ---")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
normalizer = Normalizer()

def preprocess_function(examples):
    """Normalize and tokenize one batch of comments.

    Each entry of ``examples['comment_normalized']`` is passed through the
    module-level ``normalizer``; the resulting list is handed to the
    module-level ``tokenizer`` with truncation enabled and padding to a fixed
    length of 128. The tokenizer's return value is returned unchanged.
    """
    cleaned = list(map(normalizer.normalize, examples['comment_normalized']))
    encode_options = dict(truncation=True, padding='max_length', max_length=128)
    return tokenizer(cleaned, **encode_options)

# batched=True makes `map` call preprocess_function with a batch of rows
# (a dict of lists) instead of one row at a time.
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = splits['train']
eval_dataset = splits['test']
print(f"Train dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")

In [None]:
print("\n--- Loading pre-trained model ---")
# The classification head is sized from label_map, and both directions of the
# id <-> class-name mapping are handed to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_map),
    id2label=dict(label_map),
    label2id={name: idx for idx, name in label_map.items()},
)

In [None]:
# Training is deliberately NOT started in this cell: `trainer` is only created
# further down, so calling `trainer.train()` here raises NameError on a fresh
# "Restart Kernel -> Run All" (and trains a second time when cells are run out
# of order). The actual run happens in the later "Starting training" cell.

In [None]:
print("\n--- Setting up training ---")
training_args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir="./phicad_results",
    logging_dir=LOGGING_DIR,
    logging_steps=100,
    # Optimisation settings.
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best "accuracy" when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [None]:
# Bundle the model, training arguments, data splits and tokenizer into a
# Trainer. `compute_metrics` is not passed here because the function is only
# defined in a later cell, which assigns it to `trainer.compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics  # enable if needed
)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The class prediction for each
            row is the argmax over the last axis of ``logits``.

    Returns:
        dict: Scores under the keys ``'accuracy'``, ``'f1'``, ``'precision'``
        and ``'recall'``.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    predictions = np.argmax(logits, axis=-1)

    # 'binary' averaging raises ValueError as soon as there are more than two
    # classes, so it is used only for two-class heads; otherwise every class is
    # weighted equally with 'macro' averaging.
    average = 'binary' if logits.shape[-1] <= 2 else 'macro'

    # zero_division=0 yields a score of 0 (without a warning) for a class that
    # never appears among the predictions.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average=average, zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Attach the metric function to the already-constructed Trainer; the training
# arguments select the best checkpoint by "accuracy", one of the keys that
# compute_metrics returns.
trainer.compute_metrics = compute_metrics

In [None]:
# Run fine-tuning with the Trainer, arguments and metric function set up in
# the preceding cells.
print("🚀 Starting training...")
trainer.train()
print("✅ Training complete!")

In [None]:
# Evaluate with the Trainer's eval dataset and print the returned metrics.
print("\n--- Evaluating model ---")
metrics = trainer.evaluate()
print("Evaluation metrics:", metrics)

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in OUTPUT_DIR;
# the directory is created first if it does not exist yet.
print(f"\n--- Saving model and tokenizer to {OUTPUT_DIR} ---")
os.makedirs(OUTPUT_DIR, exist_ok=True)
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"✅ Model and tokenizer saved successfully to {OUTPUT_DIR}")