In [None]:
# @title Load libraries
!pip install -U transformers datasets huggingface_hub accelerate transformers[torch] evaluate --quiet

from datasets import load_dataset, concatenate_datasets, Dataset, ClassLabel, load_from_disk, load_metric
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, AutoTokenizer
import numpy as np
import matplotlib.pyplot as plt
import accelerate
import transformers

In [None]:
# Authenticate against the Hugging Face Hub via `huggingface_hub.login()`.
from huggingface_hub import login
login()

In [None]:
# Mount Google Drive at /content/drive; the dataset cache and model output
# paths configured further down both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# @title Dataset Loader

# When True, the prepared dataset is read back from `local_cache_path` with
# `load_from_disk`; when False, it is rebuilt from the Hub sources, cleaned,
# tokenized and written to `local_cache_path`.
load_from_cache = True # @param {type: "boolean"}
local_cache_path = '/content/drive/MyDrive/Spam detection/Dataset'  # @param {type: "string"}

# Placeholder; filled by either the cache-loading cell or the dataset-building cell.
all_datasets = None

In [None]:
def clean(batch):
  """Collapse whitespace in every entry of ``batch['text']`` (batched map function).

  Each text has its runs of whitespace replaced by a single space and its
  leading/trailing whitespace removed. Falsy entries (``None`` or ``''``)
  become the empty string. The batch is modified in place and returned.
  """
  def _squash(raw):
    if not raw:
      return ''
    return ' '.join(raw.split())

  batch['text'] = list(map(_squash, batch['text']))
  return batch

def tokenize(batch):
    """Tokenize ``batch["text"]`` with the notebook-level ``tokenizer``.

    Truncation is enabled; padding is not applied here. Returns whatever the
    tokenizer call returns.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

def _label_to_int(label):
  """Map one raw label to 0 (ham) or 1 (spam).

  Strings: exactly ``'ham'`` is ham, anything else is spam.
  Non-strings (e.g. integer class ids): 0 is ham, anything else is spam.
  """
  if isinstance(label, str):
    return 0 if label == 'ham' else 1
  # Integer-encoded labels never compare equal to the string 'ham'; without
  # this branch every already-encoded ham row (0) would be turned into spam.
  return 0 if label == 0 else 1


def adjust_labels(batch):
  """Write integer labels into ``batch['label_']`` (batched map function).

  Args:
    batch: mapping with a ``'label'`` list whose items are either the strings
      ``'ham'``/other, or integer class ids where 0 means ham.

  Returns:
    The same ``batch`` with a new ``'label_'`` list of 0 (ham) / 1 (spam).
    The original ``'label'`` column is left untouched.
  """
  batch['label_'] = [_label_to_int(label) for label in batch['label']]
  return batch

def str_int_labels(batch):
  """Convert string-encoded labels to integers and store them in ``'label_'``.

  Works both as a per-example map function (``batch['label']`` is a single
  value) and as a batched one (``batch['label']`` is a list or tuple of
  values); the original only supported the per-example form and raised
  ``TypeError`` when given a list.

  Returns:
    The same ``batch`` with ``'label_'`` set to an ``int`` or a list of ``int``.
  """
  raw = batch['label']
  if isinstance(raw, (list, tuple)):
    batch['label_'] = [int(label) for label in raw]
  else:
    batch['label_'] = int(raw)
  return batch

def all_ham(batch):
  """Per-example map function: set ``label`` to 0 (ham) and return the example."""
  ham_id = 0
  batch['label'] = ham_id
  return batch

def all_spam(batch):
  """Per-example map function: set ``label`` to 1 (spam) and return the example."""
  spam_id = 1
  batch['label'] = spam_id
  return batch

In [None]:
if load_from_cache:
  # The training-config cell further down assigns a boolean to the name
  # `load_from_disk`, which hides the `datasets` function of the same name.
  # Importing it here under a private alias keeps this cell re-runnable after
  # that cell has executed (otherwise: "'bool' object is not callable").
  from datasets import load_from_disk as _load_dataset_from_disk

  all_datasets = _load_dataset_from_disk(local_cache_path)

In [None]:
if not load_from_cache:
  # Build the combined corpus from the Hub. Every source is reduced to the
  # columns it shares with the others before concatenation.

  # Single seed used for both the shuffle and the train/test split, so a
  # rebuilt cache contains the same partition as the previous one.
  split_seed = 49

  # Spam & Ham mixed
  scam_spam = (
      load_dataset("FredZhang7/all-scam-spam")["train"]
      .rename_column("is_spam", "label")
  )
  sms_spam = (
      load_dataset("sms_spam")["train"]
      .rename_column("sms", "text")
      .map(adjust_labels, batched=True)
      .remove_columns(['label'])
      .rename_column('label_', 'label')
  )
  spam_messages = (
      load_dataset(
          "mshenoda/spam-messages",
          data_files=[
              "spam_messages_test.csv",
              "spam_messages_val.csv",
              "spam_messages_train.csv",
          ],
      )["train"]
      .map(adjust_labels, batched=True)
      .remove_columns(['label'])
      .rename_column('label_', 'label')
  )
  email_spam = (
      load_dataset("NotShrirang/email-spam-filter")["train"]
      .remove_columns(["Unnamed: 0", "label"])
      .rename_column("label_num", "label")
  )
  enron_spam = (
      load_dataset("SetFit/enron_spam")
      .remove_columns(["message_id", "label_text", "subject", "message", "date"])
  )
  enron_spam = concatenate_datasets([enron_spam["train"], enron_spam["test"]])

  misinformation = (
      load_dataset("daviddaubner/misinformation-detection")
  )

  misinformation = concatenate_datasets([misinformation["train"], misinformation["test"], misinformation["validation"]])

  # All spam: every row of the following sources is labelled 1 via `all_spam`.
  Health_Misinfo = (
      load_dataset("TheoTsio/Health_Misinfo")["train"]
      .remove_columns(
          ["Timestamp", "Url", "Domain", "Num_Emoji", "Num_Bad_Words", "Credibility"]
      )
      .rename_column("Document", "text")
      .map(all_spam)
  )

  political_news_justifications = (
      load_dataset("od21wk/political_news_justifications")['train']
      .remove_columns(['completion'])
      .rename_column('prompt', 'text')
      .map(all_spam)
  )

  advertisementText = (
      load_dataset("Chinxian1121/advertisementText")['train']
      .map(all_spam)
  )

  advertisement_copy = (
      load_dataset("jaykin01/advertisement-copy")['train']
      .remove_columns(
          ["product", "description", "Unnamed: 3"]
      )
      .rename_column('ad', 'text')
      .map(all_spam)
  )

  # Disabled sources (kept for reference, not part of the concatenation below):
  # persian_blog = (
  #     load_dataset("RohanAiLab/persian_blog", split="train[:30%]")
  #     .map(all_ham)
  # )
  # custom_dataset = (
  #     load_dataset("csv", data_files="/content/drive/MyDrive/Datasets/spam_persian.csv")['train']
  #     .rename_column(' label', 'label')
  #     .map(all_spam)
  # )

  # Load the clickbait dataset once and merge its three splits.
  clickbait_splits = load_dataset("christinacdl/clickbait_notclickbait_dataset")
  clickbait_notclickbait_dataset = concatenate_datasets([
      clickbait_splits['train'],
      clickbait_splits['test'],
      clickbait_splits['validation'],
  ])

  twitter_misinformation = load_dataset("roupenminassian/twitter-misinformation")
  twitter_misinformation = (
      concatenate_datasets([twitter_misinformation['train'], twitter_misinformation['test']])
      .remove_columns(['Unnamed: 0.1', 'Unnamed: 0'])
  )

  all_datasets = (
      concatenate_datasets([
          scam_spam,
          sms_spam,
          spam_messages,
          email_spam,
          enron_spam,
          Health_Misinfo,
          political_news_justifications,
          misinformation,
          advertisementText,
          advertisement_copy,
          clickbait_notclickbait_dataset,
          twitter_misinformation,
      ])
      .shuffle(seed=split_seed)
      # train_test_split shuffles again by default; without an explicit seed
      # the 70/30 partition would change on every rebuild.
      .train_test_split(test_size=0.3, seed=split_seed)
  )

In [None]:
import evaluate

# Metric objects keyed by name, filled on first use so that repeated
# evaluations during training do not reload them every time.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only once."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and F1 for a `Trainer` evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class of each
            row is the argmax over the last axis of ``logits``.

    Returns:
        ``{"accuracy": ..., "f1": ...}`` as reported by the ``accuracy`` and
        ``f1`` metrics of the `evaluate` package.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)["accuracy"]
    f1 = _get_metric("f1").compute(predictions=predictions, references=labels)["f1"]

    return {"accuracy": accuracy, "f1": f1}

In [None]:
# Convenience handles for the two splits of the combined dataset.
train, test = all_datasets['train'], all_datasets['test']

In [None]:
# Row counts of both splits.
print(f"Size of train dataset: {len(train)}")
print(f"Size of test dataset: {len(test)}")

# Dataset summaries, train first.
for split in (train, test):
  print(split)

# First record of each split as a quick sanity check.
for split in (train, test):
  print(split[0])

In [None]:
# One bar chart per split showing how many examples carry each label.
for dataset, name in zip((train, test), ('Train', 'Test')):
  plt.figure()
  df = dataset.to_pandas()

  # value_counts() orders rows by frequency, so the first bar is whichever
  # class is larger. Sorting by label id pins bar 0 to label 0 and bar 1 to
  # label 1, which is what the hard-coded tick names below assume.
  label_counts = df['label'].value_counts().sort_index()
  label_counts.plot(kind='bar', label=name)
  plt.grid()
  plt.legend()
  plt.xlabel('Class')
  plt.ylabel('Count')
  plt.xticks([0, 1], ['Ham', 'Spam'], rotation=0)  # label 0 is 'Ham', label 1 is 'Spam'
  plt.show()

In [None]:
# @title Training and evaluation

# Switches for the training cell and the evaluation / spot-check cells.
train_model = True # @param {type: "boolean"}
evaluate_model = True # @param {type: "boolean"}
# When True, tokenizer and model are loaded from `output_path` instead of
# `model_name`.
# NOTE(review): this assignment rebinds the name `load_from_disk`, which the
# first cell imports from `datasets` as a function. After this cell has run,
# the imported function is no longer reachable under that name until the
# import cell is re-run.
load_from_disk = True # @param {type: "boolean"}

# When True, every parameter name is printed with its requires_grad flag.
print_layers = True # @param {type: "boolean"}


# Hub checkpoint used when `load_from_disk` is False.
model_name = "xlm-roberta-base" # @param {type: "string"}
# Passed to TrainingArguments as output_dir; also the load location when
# `load_from_disk` is True.
output_path = "/content/drive/MyDrive/Spam detection/Model" # @param {type: "string"}
# Parameters are frozen in named_parameters() order up to (not including) the
# first one whose name starts with this prefix.
freeze_layers_until = "roberta.encoder.layer.11" # @param {type: "string"}
learning_rate = 1e-4 # @param {type: "number"}
num_train_epochs = 1 # @param {type: "number"}
weight_decay = 0.001 # @param {type: "number"}
# Used for both per_device_train_batch_size and per_device_eval_batch_size.
per_device_train_batch_size = 128 # @param {type: "number"}
max_steps = -1 # @param {type: "number"}
fp16 = True # @param {type: "boolean"}

In [None]:
# Pick the checkpoint source once: the saved model directory when
# `load_from_disk` is truthy, otherwise the base checkpoint named in the config.
checkpoint = output_path if load_from_disk else model_name

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
if not load_from_cache:
  # Freshly built corpus: normalise whitespace, tokenize, refresh the split
  # handles so they point at the tokenized data, then persist to the cache.
  all_datasets = (
      all_datasets
      .map(clean, batched=True)
      .map(tokenize, batched=True)
  )

  train, test = all_datasets['train'], all_datasets['test']

  all_datasets.save_to_disk(local_cache_path)

In [None]:
# Freeze every parameter that comes before the first one whose name starts
# with `freeze_layers_until`; that parameter and all later ones are trainable.
# requires_grad is assigned for every parameter (not only the frozen ones) so
# that re-running this cell with a different prefix gives the expected result
# instead of leaving earlier freezes in place.
reached_trainable = False
for name, param in model.named_parameters():
  if not reached_trainable and name.startswith(freeze_layers_until):
    reached_trainable = True
  param.requires_grad = reached_trainable

if not reached_trainable:
  # The prefix never matched, so the loop froze the whole model.
  print(f"WARNING: no parameter name starts with {freeze_layers_until!r}; every parameter is frozen.")

if print_layers:
  for name, param in model.named_parameters():
    print(name, param.requires_grad)

In [None]:
import inspect

# The first cell installs the newest transformers release, and two keyword
# names used here were renamed across releases:
#   TrainingArguments: evaluation_strategy -> eval_strategy
#   Trainer:           tokenizer           -> processing_class
# Choose whichever name the installed version accepts so this cell works on
# both older and newer releases.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

training_args = TrainingArguments(
    output_dir=output_path,
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    save_strategy="epoch",
    eval_steps=600,
    fp16=fp16,
    logging_steps=200,
    per_device_train_batch_size=per_device_train_batch_size,
    # Evaluation uses the same batch size as training.
    per_device_eval_batch_size=per_device_train_batch_size,
    dataloader_num_workers=2,
    max_steps=max_steps,
    **{_eval_strategy_key: "steps"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

In [None]:
if train_model:
  trainer.train()
  # Epoch checkpoints are written to sub-directories of the output directory,
  # but the loading cell reads tokenizer and model from `output_path` itself.
  # Save the final model there so a later run with `load_from_disk` enabled
  # finds it.
  trainer.save_model(output_path)

In [None]:
if evaluate_model:
  # Run the Trainer's evaluation loop on the test split and show the metrics.
  eval_metrics = trainer.evaluate()
  print(eval_metrics)

In [None]:
if evaluate_model:
  import torch

  # Hand-written Persian and English samples with their expected label
  # (0 = ham, 1 = spam), used as a qualitative spot check of the model.
  examples = [
      {'text': 'این تست متن فارسی غیر اسپم است.', 'label': 0},
      {'text': 'خرید هاست لینوکس ایران', 'label': 1},
      {'text': 'ترجمه تخصصی و فنی به آلمانی: رویکردها و استراتژی‌ها', 'label': 0},
      {'text': 'فایلهای پرکاربرد و آموزشی و با کیفیت برای استفاده دانشجویان وتحقیقات علمی و پژوهشی', 'label': 1},
      {'text': 'فرصت‌های تحصیل رایگان در اروپا: بورس‌های تحصیلی به عنوان پلی به دسترسی به تعلیمات برتر', 'label': 1},
      {'text': 'امروز ساعت 6:45 از خواب بیدار شدم و از همون لحظه حوصله هیچکس رو ندارم :|', 'label': 0},
      {'text': '''اَلا یا اَیُّهَا السّاقی اَدِرْ کَأسَاً و ناوِلْها که عشق آسان نمود اوّل ولی افتاد مشکل‌ه''', 'label': 0},
      {'text': 'سخنگوی شورای امنیت ملی کاخ سفید گفت که این کشور به ایران پیام داده که نمی‌خواهد شاهد گسترش درگیری در منطقه باشد.', 'label': 0},
      {'text': 'سوالات استخدامی علوم پزشکی و بیمارستانها 1402 ,سوالات استخدامی وزارت بهداشت+سوالات استخدامی بیمارستان 1402- نمونه سوالات استخدامی رایگان,', 'label': 1},
      {'text': ' روند رانندگی بی صدا، بدون سر و صدای غیر عادی، با ظاهری بسیارشیک و جمع و جور است و دید راننده را محدود نمی کند. توربین این دستگاه در طول روز برای پخش عود می چرخد ترکیبی قوی از آلیاژ با مقاومت بالا و سرامیک طبیعی، بدون ترس از قرار گرفتن در معرض آفتاب و در تابستان بسیار قوی کار میکند.قیمت این محصول...تومان', 'label': 1},
      {'text': 'DeciLM-7B: The Fastest and Most Accurate 7B-Parameter LLM to Date', 'label': 0},
      {'text': 'Telecom Industry Is Mad Because the FCC Might Examine High Broadband Prices', 'label': 0},
      {'text': 'Well, here is hope that this will be a first step in bringing US internet access to at least something comparable to Balkans. ', 'label': 0},
      {'text': 'Agree to notifications to allow news feed', 'label': 1},
      {'text': '6 Ways to Boost Your Coffee with Vitamins and Antioxidants', 'label': 1}
  ]

  label_names = {0: 'Ham', 1: 'Spam'}

  # Send inputs to wherever the model currently lives instead of assuming a
  # GPU, so the cell also runs on a CPU-only runtime.
  device = model.device
  # Inference mode (disables dropout) regardless of what ran before this cell.
  model.eval()

  for example in examples:
    print('*' * 40)
    print('Text:', example['text'], '\n')

    # Truncate like the training-time tokenization so an over-long text cannot
    # exceed the model's maximum sequence length.
    inputs = tokenizer(example['text'], truncation=True, return_tensors="pt")
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
      logits = model(**inputs).logits

    # Single example per forward pass, so row 0 holds its class probabilities.
    probabilities = logits.softmax(-1)[0]

    print('Ref:', label_names[example['label']])
    print('Model:', label_names[logits.argmax().item()])
    print('Ham confidence:', probabilities[0].item())
    print('Spam confidence:', probabilities[1].item())

In [None]:
# Register the TensorBoard notebook extension so the `%tensorboard` magic is available.
%load_ext tensorboard

In [None]:
import tensorflow as tf
import datetime, os

In [None]:
%tensorboard --logdir logs