In [None]:
!pip install -U datasets huggingface_hub transformers[torch] evaluate --quiet

from datasets import load_dataset, concatenate_datasets, Dataset, ClassLabel, load_from_disk, load_metric
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt
import transformers
import pandas as pd

In [None]:
def clean(batch):
  """Collapse whitespace in every entry of ``batch['text']``.

  Each text is split on whitespace and re-joined with single spaces, which also
  strips leading/trailing whitespace. Falsy entries (``None`` or ``''``) become
  the empty string. The batch is modified in place and returned.
  """
  normalized = []
  for raw in batch['text']:
    if raw:
      normalized.append(' '.join(raw.split()))
    else:
      normalized.append('')
  batch['text'] = normalized
  return batch

def tokenize(batch):
  """Run the notebook-level ``tokenizer`` over ``batch["text"]``.

  Truncation is enabled with a 512 ``max_length``; the tokenizer's output is
  returned unchanged. ``tokenizer`` must already be defined in the notebook
  namespace when this function is called.
  """
  texts = batch["text"]
  encoded = tokenizer(texts, max_length=512, truncation=True)
  return encoded

def adjust_labels(batch):
  """Add a binary ``label_`` column derived from the string ``label`` column.

  An entry equal to ``'spam'`` maps to 1; every other value maps to 0. The
  original ``label`` column is left untouched. The batch is modified in place
  and returned.
  """
  encoded = []
  for raw_label in batch['label']:
    if raw_label == 'spam':
      encoded.append(1)
    else:
      encoded.append(0)
  batch['label_'] = encoded
  return batch

def str_int_labels(batch):
  """Store ``int(batch['label'])`` under the ``label_`` key.

  Despite the parameter name, ``int()`` is applied to the ``label`` value as a
  whole, so this works on a single example (one label value), not on a list of
  labels. The mapping is modified in place and returned.
  """
  numeric_label = int(batch['label'])
  batch['label_'] = numeric_label
  return batch

def all_ham(batch):
  """Overwrite ``batch['label']`` with 0 and return the same mapping.

  A scalar is assigned, so this is meant for per-example (non-batched) use.
  """
  ham_label = 0
  batch['label'] = ham_label
  return batch

def all_spam(batch):
  """Overwrite ``batch['label']`` with 1 and return the same mapping.

  A scalar is assigned, so this is meant for per-example (non-batched) use.
  """
  spam_label = 1
  batch['label'] = spam_label
  return batch

# Upper bound on the number of whitespace-separated words per chunk.
MAX_WORDS = 250

def chunk_dataset(example):
  """Replace ``example['text']`` with a list of word-window chunks.

  The text is split on whitespace and cut into consecutive windows of at most
  ``MAX_WORDS`` words. Windows holding 3 words or fewer are discarded; the
  remaining windows are re-joined with single spaces. The example is modified
  in place and returned, with ``text`` now a list of strings (possibly empty).
  """
  words = example['text'].split()
  pieces = []
  for start in range(0, len(words), MAX_WORDS):
    window = words[start:start + MAX_WORDS]
    # Keep only windows with more than three words.
    if len(window) > 3:
      pieces.append(' '.join(window))
  example['text'] = pieces
  return example

import re

# Body of a regex character class: ranges and single codepoints between U+0621
# and U+06CC (the Arabic-script block).
persian_alpha_codepoints = '\u0621-\u0628\u062A-\u063A\u0641-\u0642\u0644-\u0648\u064E-\u0651\u0655\u067E\u0686\u0698\u06A9\u06AF\u06BE\u06CC'

# Matches any single character listed above.
PERSIAN_PATTERN = re.compile(f'[{persian_alpha_codepoints}]')

def is_persian(example):
  """Flag whether ``example['text']`` contains a character from ``PERSIAN_PATTERN``.

  Sets ``example['is_persian']`` to ``True`` when at least one character of the
  text matches and ``False`` otherwise, then returns the same mapping.
  """
  first_hit = PERSIAN_PATTERN.search(example['text'])
  example['is_persian'] = first_hit is not None
  return example

In [None]:
# Load both corpora from the Hub and stamp every example with label 0 via
# `all_ham` (applied one example at a time).
persian_blog = (
    load_dataset("RohanAiLab/persian_blog")
    .map(all_ham, batched=False)
)

persian_daily_news = (
    load_dataset("RohanAiLab/persian_daily_news")
    .map(all_ham, batched=False)
)

# Stack the `train` splits of the two corpora into a single dataset.
all_datasets = concatenate_datasets([
  persian_blog['train'],
  persian_daily_news['train']
])

# Keep only rows whose `text` is truthy. The chunking step at the end of the
# line is currently commented out, so `chunk_dataset` is not applied here.
chunked_ = all_datasets.filter(lambda x: x['text'], batched=False)#.map(chunk_dataset, batched=False)
# Round-trip through pandas so that list-valued `text` cells are exploded into
# one row per element.
# NOTE(review): with chunking disabled, `text` is presumably a plain string and
# the explode is a no-op — confirm before relying on this step.
dataset = Dataset.from_pandas(chunked_.to_pandas().explode('text'), preserve_index=False)
# Shuffle with a fixed seed, then filter out falsy `text` values once more.
dataset = dataset.shuffle(seed=49).filter(lambda x: x['text'], batched=False)
#dataset.save_to_disk('/content/drive/MyDrive/Spam detection/PersianBlog')

In [None]:
# The tokenizer and the two-label classification model are both read from the
# same saved-model directory, so the path is spelled out only once.
model_dir = '/content/drive/MyDrive/Spam detection/Model'

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=2)

In [None]:
import evaluate

# Load the metric implementations once, when this cell runs, rather than on
# every evaluation pass. Besides avoiding repeated loads, a metric that cannot
# be loaded now fails here, before training starts, instead of part-way through.
_accuracy_metric = evaluate.load("accuracy")
_f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
  """Compute accuracy and F1 for one evaluation pass.

  Args:
    eval_pred: a ``(logits, labels)`` pair. The predicted class for each row is
      the argmax of ``logits`` over the last axis.

  Returns:
    ``{"accuracy": ..., "f1": ...}`` as computed by the ``evaluate`` library's
    ``accuracy`` and ``f1`` metrics with their default settings.
  """
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)

  accuracy = _accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
  f1 = _f1_metric.compute(predictions=predictions, references=labels)["f1"]

  return {"accuracy": accuracy, "f1": f1}

In [None]:
# Normalise whitespace, tokenize, then hold out 30% of the rows for evaluation.
# The split is seeded (same value as the shuffle seed used when building
# `dataset`) so that Restart & Run All reproduces the same train/test partition;
# without a seed the partition differs on every run.
# NOTE: this rebinds `dataset` to the split result, so re-running this cell
# requires re-running the cell that builds `dataset` first.
dataset = (
    dataset
    .map(clean, batched=True)
    .map(tokenize, batched=True)
    .train_test_split(test_size=0.3, seed=49)
)

train = dataset['train']
test = dataset['test']

In [None]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./Model",
    report_to='none',

    # Optimisation
    learning_rate=5e-6,
    num_train_epochs=1,
    weight_decay=0.001,
    warmup_ratio=0.3,
    label_smoothing_factor=0.2,

    # Throughput
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    dataloader_num_workers=2,
    fp16=True,

    # Evaluation and checkpointing. Step values below 1 are given as a fraction
    # of the total number of training steps.
    # `eval_strategy` is the current name of this option; the former
    # `evaluation_strategy` spelling is deprecated and not accepted by recent
    # transformers releases, which the notebook's `pip install -U` pulls in.
    eval_strategy='steps',
    eval_steps=1/3,
    save_steps=0.1,
    save_total_limit=2,

    # NOTE(review): per the transformers documentation this field is not acted
    # on by Trainer itself; it is meant to be read by the calling script.
    resume_from_checkpoint=True,
)

# `data_collator` was referenced without ever being defined in the notebook,
# which raises NameError on a fresh kernel. Build it explicitly: the collator
# pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    # `processing_class` is the current name of the keyword formerly called
    # `tokenizer`, which is deprecated in recent transformers releases.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
import os

# `TrainingArguments.resume_from_checkpoint` is only a flag for user code — the
# Trainer does not act on it — so resuming has to be requested through `train()`.
# When the flag is set and the output directory exists, pick up its newest
# checkpoint; otherwise (or when no checkpoint is found) start a fresh run.
last_checkpoint = None
if training_args.resume_from_checkpoint and os.path.isdir(training_args.output_dir):
  last_checkpoint = transformers.trainer_utils.get_last_checkpoint(training_args.output_dir)

trainer.train(resume_from_checkpoint=last_checkpoint)