In [1]:
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling,BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, TaskType, PeftConfig,PeftModel
import pandas as pd
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
DATA_PATH = "data/spam_or_not_spam.csv"
MAX_LENGTH = 512
OUTPUT_DIR = "./llama3-1b-binary-classifier"
PRE_TRAINED = True
# Set your Hugging Face token here
hf_token = 

In [3]:
df = pd.read_csv("data/spam_or_not_spam.csv").dropna(subset=["email", "label"])
df = df.dropna(subset=['email', 'label'])

train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=1)

train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=1)

train = Dataset.from_pandas(train_df.reset_index(drop=True))
val = Dataset.from_pandas(val_df.reset_index(drop=True))
test = Dataset.from_pandas(test_df.reset_index(drop=True))

In [4]:
if not PRE_TRAINED:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True,token = hf_token )
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        torch_dtype=torch.float16,
        token = hf_token
    )
    peft_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        target_modules=["q_proj", "v_proj"]
    )
    model = get_peft_model(model, peft_config)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [5]:
if PRE_TRAINED:
    peft_config = PeftConfig.from_pretrained("spam_llama_model")
    model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path,device_map="auto")
    model = PeftModel.from_pretrained(model,"spam_llama_model")
    tokenizer = AutoTokenizer.from_pretrained("spam_llama_tokenizer",use_fast=True)

In [6]:
def tokenize(example):
    return tokenizer(example["email"], truncation=True, padding="max_length", max_length=MAX_LENGTH)

In [7]:
train = train.map(tokenize, remove_columns=train.column_names)
val = val.map(tokenize)
test = test.map(tokenize)

Map: 100%|██████████| 1919/1919 [00:01<00:00, 1202.36 examples/s]
Map: 100%|██████████| 480/480 [00:00<00:00, 1241.19 examples/s]
Map: 100%|██████████| 600/600 [00:00<00:00, 1408.29 examples/s]


In [8]:
if not PRE_TRAINED:
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=4,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=1e-5,
        logging_steps=10,
        bf16=True,
        save_total_limit=2,
        optim="paged_adamw_8bit"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=val,
        data_collator=data_collator
    )

    trainer.train()

In [9]:
model.eval()
predictions, references = [], []

for example in val:
    input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to("cuda")
    attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to("cuda")

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=2,
            pad_token_id=tokenizer.pad_token_id
        )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True).strip()
    pred = 1 if "1" in decoded[-3:] else 0
    true = int(example["label"])

    predictions.append(pred)
    references.append(true)

print("== Classification Report on Validation Set ==")
print(classification_report(references, predictions, digits=4))

== Classification Report on Validation Set ==
              precision    recall  f1-score   support

           0     0.8351    1.0000    0.9101       400
           1     1.0000    0.0125    0.0247        80

    accuracy                         0.8354       480
   macro avg     0.9175    0.5062    0.4674       480
weighted avg     0.8626    0.8354    0.7626       480



In [10]:
model.eval()
predictions, references = [], []

for example in test:
    input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to("cuda")
    attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to("cuda")

    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=2,
            pad_token_id=tokenizer.pad_token_id
        )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True).strip()
    pred = 1 if "1" in decoded[-3:] else 0
    true = int(example["label"])

    predictions.append(pred)
    references.append(true)

print("== Classification Report on Test Set ==")
print(classification_report(references, predictions, digits=4))

== Classification Report on Test Set ==
              precision    recall  f1-score   support

           0     0.8202    0.9919    0.8979       492
           1     0.2000    0.0093    0.0177       108

    accuracy                         0.8150       600
   macro avg     0.5101    0.5006    0.4578       600
weighted avg     0.7085    0.8150    0.7395       600

