In [5]:
# 📥 نصب کتابخانه‌های مورد نیاز
!pip install transformers datasets pandas hazm

# 📦 import ها
import inspect

import pandas as pd
import torch
from hazm import Normalizer
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset




ImportError: cannot import name 'PreTrainedModel' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)

In [None]:
# Direct links to the two raw CSV parts of the dataset
url1 = "https://raw.githubusercontent.com/davardoust/PHICAD/main/PHICAD-part1.csv"
url2 = "https://raw.githubusercontent.com/davardoust/PHICAD/main/PHICAD-part2.csv"

# Download both parts (in order) and stack them into one frame with a fresh 0..n-1 index
df1, df2 = (pd.read_csv(csv_url) for csv_url in (url1, url2))
df = pd.concat([df1, df2], ignore_index=True)

# Report the total row count and preview the first three rows
print(f"تعداد داده کل: {len(df)}")
print(df.head(3))

In [None]:
# Assumes the data has a `label_category` column whose values include
# "hate", "obscene" and "spam" — TODO confirm against the CSV header; adjust the
# column name here if it differs.
# Rows tagged "hate" or "obscene" become class 1 (inappropriate); the remaining kept
# rows (i.e. "spam") become class 0.
kept_categories = ['hate', 'obscene', 'spam']
inappropriate_categories = ['hate', 'obscene']

# .copy() detaches the filtered rows from the original frame, so adding the `label`
# column below does not trigger pandas' SettingWithCopyWarning.
df = df[df['label_category'].isin(kept_categories)].copy()

# Vectorized membership test instead of a row-by-row lambda; True/False -> 1/0
df['label'] = df['label_category'].isin(inappropriate_categories).astype(int)

# Keep only the model inputs, drop rows with a missing value, and renumber the index
df = df[['text', 'label']].dropna().reset_index(drop=True)
print(df['label'].value_counts())

In [None]:
# Checkpoint identifier shared by the tokenizer and the model loaded later
model_name = "HooshvareLab/bert-base-parsbert-uncased"

# Tokenizer matching the checkpoint above
tokenizer = AutoTokenizer.from_pretrained(model_name)

# hazm normalizer applied to every text before tokenization
normalizer = Normalizer()

def preprocess_data(texts):
    """Normalize a batch of texts and tokenize them in a single call.

    Args:
        texts: Iterable of strings to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the list of
        normalized texts, called with truncation and padding enabled and a
        maximum length of 128.
    """
    cleaned = list(map(normalizer.normalize, texts))
    return tokenizer(
        cleaned,
        max_length=128,
        padding=True,
        truncation=True,
    )

# Plain Python list of the integer class labels, in row order
labels = df['label'].to_list()

# Normalize + tokenize every text in one batch
all_texts = df['text'].to_list()
encodings = preprocess_data(all_texts)

In [None]:
# Collect the tokenizer outputs the model consumes, then attach the labels
columns = {key: encodings[key] for key in ('input_ids', 'attention_mask')}
columns['labels'] = labels
dataset = Dataset.from_dict(columns)

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible
splits = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = splits['train'], splits['test']

In [None]:
# Load the pretrained checkpoint as a sequence classifier with two output labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

In [None]:
# Newer transformers releases renamed the `evaluation_strategy` argument of
# TrainingArguments to `eval_strategy` and reject the old spelling. Because the
# library is installed unpinned, look at the installed signature and use whichever
# keyword it actually accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./phicad_results",      # checkpoints are written here
    save_strategy="epoch",              # save a checkpoint after every epoch
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_steps=100,                  # log every 100 optimizer steps
    logging_dir='./phicad_logs',
    **{_eval_strategy_kwarg: "epoch"},  # evaluate after every epoch
)

# Trainer ties together the model, the hyper-parameters and the train/eval splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Fine-tune the model with the Trainer configured earlier in the notebook.
# Left as a bare expression so the value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Evaluate on the held-out split and show the resulting metrics
metrics = trainer.evaluate()
print(metrics)

# Persist the model first, then the tokenizer, into the same directory
save_dir = "phicad_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

In [None]:
def is_inappropriate(text):
    """Classify a single comment with the module-level fine-tuned model.

    The text is normalized with ``normalizer``, tokenized with ``tokenizer``
    (truncated to at most 128 tokens) and passed through ``model``.

    Args:
        text: The comment to classify.

    Returns:
        bool: True when the highest-scoring class is 1 (inappropriate),
        False otherwise.

    Side effects:
        Puts ``model`` into evaluation mode.
    """
    normalized = normalizer.normalize(text)
    inputs = tokenizer(normalized, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # The tokenizer returns CPU tensors, but after training the model may live on an
    # accelerator; move the inputs to the model's device to avoid a device-mismatch error.
    inputs = inputs.to(model.device)

    # Evaluation mode disables dropout so repeated calls give the same prediction.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.argmax(logits, dim=1).item() == 1

# Two sample comments used as a quick manual check of the classifier
comments = [
    "لعنت به این وضعیت خرابات!",
    "خیلی ممنون از خدمات شما"
]

# Print each comment next to its predicted verdict
for comment in comments:
    if is_inappropriate(comment):
        verdict = "نامناسب"
    else:
        verdict = "مناسب"
    print(comment, "→", verdict)