In [None]:
# Install the notebook's dependencies into the *kernel's* environment.
# `%pip` is used instead of `!pip` so the packages land in the interpreter that
# runs this notebook, and the `transformers[torch]` extra is quoted so the shell
# cannot treat the square brackets as a glob pattern.
# NOTE(review): versions are unpinned, so results may change between runs.
%pip install -U transformers datasets huggingface_hub accelerate "transformers[torch]" evaluate trl peft bitsandbytes hazm --quiet

In [None]:
# Replace the PyPI release of transformers with the current development
# version from GitHub. `%pip` (not `!pip`) targets the kernel's environment.
# NOTE(review): this tracks an unpinned branch head — pin a commit/tag if the
# notebook needs to be reproducible.
%pip install git+https://github.com/huggingface/transformers --quiet

In [None]:
from datasets import load_dataset, Dataset, load_from_disk, concatenate_datasets
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [None]:
# Load the raw e-mail corpus from CSV. Later cells read its 'text' and 'label'
# columns, so the file is expected to provide both.
dataset = Dataset.from_csv('/content/emails.csv')

In [None]:
# Show the freshly loaded dataset as the cell output.
dataset

In [None]:
def adjust_labels(batch):
  """Add an integer 'label_' column derived from the string 'label' column.

  Intended for `Dataset.map(..., batched=True)`: `batch['label']` is a list of
  label strings. 'ham' is encoded as 0 and every other value as 1. The batch is
  updated in place and also returned.
  """
  encoded = []
  for raw_label in batch['label']:
    # Anything that is not exactly 'ham' counts as the positive (1) class.
    encoded.append(int(raw_label != 'ham'))
  batch['label_'] = encoded
  return batch

# Encode the labels in batches, then drop the original string 'label' column
# and rename the temporary integer 'label_' column to take its place.
dataset = dataset.map(adjust_labels, batched=True).remove_columns(['label']).rename_column('label_', 'label')

In [None]:
import string

# Every character removed from a message before further processing:
# ASCII digits and punctuation, Persian/Arabic punctuation, and both families
# of non-ASCII digits that occur in Persian text.
chars = (
    string.digits
    + string.punctuation
    + '؟،٪×÷»«'
    + '٠١٢٣٤٥٦٧٨٩'  # Arabic-Indic digits (the zero was previously missing)
    + '۰۱۲۳۴۵۶۷۸۹'  # Persian (extended Arabic-Indic) digits
)

# Deletion table built once, so each message is cleaned in a single pass
# instead of one `str.replace` call per character.
_STRIP_TABLE = str.maketrans('', '', chars)


def clean_text(example):
  """Strip digits/punctuation from `example['text']` and normalise whitespace.

  Args:
    example: a single dataset row (dict-like) with a 'text' entry. A missing
      text (`None`, e.g. an empty CSV cell) is treated as an empty string
      instead of raising.

  Returns:
    The same `example` object, with 'text' replaced by the cleaned string:
    all characters in `chars` removed and runs of whitespace collapsed to a
    single space (leading/trailing whitespace dropped).
  """
  text = example['text'] or ''
  example['text'] = ' '.join(text.translate(_STRIP_TABLE).split())

  return example

In [None]:
# Clean every row's text one example at a time (non-batched map).
dataset = dataset.map(clean_text)

In [None]:
def chunk_dataset(example, *, chunk_size=20, min_words=3):
  """Split `example['text']` into consecutive fixed-size word chunks.

  Args:
    example: a single dataset row (dict-like) with a 'text' string. A missing
      text (`None`) is treated as empty.
    chunk_size: maximum number of whitespace-separated words per chunk
      (keyword-only; default 20).
    min_words: chunks with fewer words than this are discarded, which in
      practice only affects the short tail chunk (keyword-only; default 3).

  Returns:
    The same `example` object, with 'text' replaced by a list of chunk
    strings. The list is empty when no chunk reaches `min_words`.
  """
  words = (example['text'] or '').split()
  chunks = []
  for start in range(0, len(words), chunk_size):
    piece = words[start:start + chunk_size]
    if len(piece) >= min_words:
      chunks.append(' '.join(piece))
  example['text'] = chunks
  return example

In [None]:
# Turn each message into a list of word chunks, then give every chunk its own
# row (other columns are repeated for each chunk).
dataset = dataset.map(chunk_dataset)
# A message with no surviving chunk explodes to a single row with a missing
# text; drop those rows so they are not saved or trained on.
dataset = Dataset.from_pandas(
    dataset.to_pandas().explode('text').dropna(subset=['text']),
    preserve_index=False,
)

In [None]:
# Shuffle the rows with a fixed seed so the order is repeatable.
dataset = dataset.shuffle(seed=42)

In [None]:
# Show the dataset after chunking and shuffling.
dataset

In [None]:
# Peek at the first ten rows (returned as a dict of column -> list).
dataset[:10]

In [None]:
# Persist the processed e-mail dataset.
# NOTE(review): the target looks like a Google Drive path, but no cell in this
# notebook mounts Drive — presumably done manually beforehand; confirm.
dataset.save_to_disk('/content/drive/MyDrive/Spam detection/Persian Email')

In [None]:
# Load the spam corpus. From here on `dataset` refers to the spam data, not the
# e-mail data saved above. The constructor is called on the `Dataset` class
# rather than on the previous dataset instance, so this cell does not depend on
# the earlier cells having been run.
dataset = Dataset.from_csv('/content/spam.csv')

In [None]:
# Peek at the first twenty rows of the spam corpus.
dataset[:20]

In [None]:
# Apply the same cleaning, chunking and shuffling steps to the spam corpus.
dataset = dataset.map(clean_text)
dataset = dataset.map(chunk_dataset)
# One row per chunk; messages that produced no chunk explode to a row with a
# missing text, which is dropped here instead of being carried forward.
dataset = Dataset.from_pandas(
    dataset.to_pandas().explode('text').dropna(subset=['text']),
    preserve_index=False,
)
dataset = dataset.shuffle(seed=42)

In [None]:
# Show the spam dataset after preprocessing.
dataset

In [None]:
# Build the fill-mask pipeline that `augment_data` calls through the global
# name `pipe`. The checkpoint is loaded in bfloat16 and `device_map="auto"`
# leaves device placement to the library.
# NOTE(review): `AutoTokenizer` and `AutoModelForMaskedLM` are imported but not
# used in this cell, and `pipe` is rebound to a text-generation pipeline further
# down — re-running the augmentation after that cell would use the wrong pipeline.
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
pipe = pipeline("fill-mask", model="HooshvareLab/roberta-fa-zwnj-base", torch_dtype=torch.bfloat16, device_map="auto")

In [None]:
from random import randint

K_PREDS = 2    # fill-mask predictions kept for each masked sentence
K_REPLACE = 2  # independently masked variants generated per input sentence


def augment_data(examples, *, fill_mask=None):
    """Augment a batch of sentences by masking one word and refilling it.

    For every non-empty sentence, `K_REPLACE` variants are created by replacing
    one randomly chosen word (never the first) with `<mask>`. Each variant is
    sent to the fill-mask model and contributes one group to the output:
    the sentence it was derived from, followed by the model's top `K_PREDS`
    completed sequences.

    Args:
        examples: a batch as passed by `Dataset.map(..., batched=True)`; only
            `examples['text']` (a list of strings, possibly containing
            empty/None entries) is read.
        fill_mask: callable taking `(list_of_masked_sentences, top_k=...)` and
            returning, per sentence, a list of dicts with a "sequence" key.
            Defaults to the notebook-level fill-mask pipeline `pipe`.

    Returns:
        `{"text": [...]}` — a flat list of strings. Its length generally
        differs from the input batch, so the caller must drop the original
        columns when mapping.

    Notes:
        * Empty/None sentences are skipped.
        * One-word sentences cannot be masked at a non-initial position; they
          are passed through unchanged instead of raising.
    """
    if fill_mask is None:
        fill_mask = pipe

    outputs = []
    sources = []            # sources[i] is the sentence masked_sentences[i] came from
    masked_sentences = []

    for sentence in examples['text']:
        if not sentence:
            continue
        words = sentence.split(' ')

        if len(words) < 2:
            # randint(1, 0) would raise; keep the sentence without augmenting it.
            outputs.append(sentence)
            continue

        for _ in range(K_REPLACE):
            position = randint(1, len(words) - 1)
            masked_sentences.append(
                " ".join(words[:position] + ['<mask>'] + words[position + 1:])
            )
            sources.append(sentence)

    if not masked_sentences:
        return {"text": outputs}

    results = fill_mask(masked_sentences, top_k=K_PREDS)
    # For a single input the pipeline may return the prediction list directly
    # instead of a list of prediction lists; normalise to the nested form.
    if results and isinstance(results[0], dict):
        results = [results]

    # Pair every prediction group with the sentence it was generated from
    # (rather than whatever sentence the first loop happened to end on).
    for source, predictions in zip(sources, results):
        outputs.append(source)
        outputs.extend(prediction["sequence"] for prediction in predictions[:K_PREDS])

    return {"text": outputs}

In [None]:
# `augment_data` returns more rows than it receives, so the input columns must
# be removed: a batched map cannot combine old columns of one length with a new
# 'text' column of another. The result is already one string per row, so no
# pandas explode round-trip is needed afterwards.
dataset = dataset.map(
    augment_data,
    batched=True,
    remove_columns=dataset.column_names,
)

In [None]:
# Shuffle the augmented rows with a fixed seed.
dataset = dataset.shuffle(seed=49)

In [None]:
# Persist the augmented spam dataset.
# NOTE(review): the target looks like a Google Drive path, but no cell in this
# notebook mounts Drive — presumably done manually beforehand; confirm.
dataset.save_to_disk('/content/drive/MyDrive/Spam detection/Persian Spam')

In [None]:
# Peek at the first forty augmented rows.
dataset[:40]

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments
from trl import SFTTrainer
import torch
from peft import LoraConfig

# LoRA adapter settings for causal-LM fine-tuning: rank 64, scaling alpha 32,
# 5% dropout on the adapter path, and bias='lora_only'.
peft_config = LoraConfig(
    r=64,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='lora_only',
    task_type="CAUSAL_LM",
)

# Base causal language model, loaded in bfloat16 with automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    "HooshvareLab/gpt2-fa",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Optimisation settings; checkpoints/logs go under ./gpt-spam and the loss is
# logged every 10 steps.
training_args = TrainingArguments(
    output_dir="gpt-spam",
    learning_rate=5e-4,
    per_device_train_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=10,
)

# Supervised fine-tuning on the 'text' column of whatever `dataset` currently
# refers to (presumably the augmented spam data from the cells above — confirm).
# `model` and `training_args` are passed positionally.
# NOTE(review): no tokenizer is passed, so the trainer has to obtain one itself.
# NOTE(review): `dataset_text_field`, `max_seq_length` and `packing` are given
# to the trainer directly; some TRL releases expect them on the config object
# instead — confirm against the installed (unpinned) TRL version.
trainer = SFTTrainer(
    model,
    training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=512,
    peft_config=peft_config,
    packing=True
)

In [None]:
# Run the fine-tuning loop; the returned training summary is shown as the cell output.
trainer.train()

In [None]:
# Load the tokenizer that matches the fine-tuned base checkpoint (the model
# itself was loaded earlier; only the tokenizer is created here).
# NOTE(review): `AutoModelForCausalLM` is re-imported but not used in this cell.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/gpt2-fa")

In [None]:
# Move the model to the CUDA device (requires a GPU runtime).
# NOTE(review): the model was loaded with device_map="auto", so it may already
# be on the GPU — confirm this call is still needed.
model.cuda()

In [None]:
# Wrap the fine-tuned model and its tokenizer in a text-generation pipeline.
# NOTE(review): this rebinds the global `pipe`, which `augment_data` relies on
# being the fill-mask pipeline — re-running the augmentation cell after this
# one will call the wrong pipeline. Consider a distinct name.
from transformers import pipeline
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer)

In [None]:
# Generate ten continuations of the Persian prompt "دانلود" ("download").
# Sampling is enabled explicitly: several distinct return sequences require
# sampling (or beam search), and relying on the checkpoint's default generation
# settings for that is fragile.
pipe("دانلود", num_return_sequences=10, do_sample=True)

In [None]:
# Show the first twenty training texts. Slicing the rows first avoids decoding
# the entire 'text' column just to look at its head.
dataset[:20]['text']