In [None]:
!pip install transformers datasets torch scikit-learn

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from sklearn.metrics import classification_report
import random

In [7]:
# Load a pre-trained spam filter model
# The tokenizer and the classification model are both loaded from the same
# MODEL_NAME checkpoint so that the vocabulary matches the model weights.
MODEL_NAME = "mariagrandury/distilbert-base-uncased-finetuned-sms-spam-detection"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE: this `model` object is the one later handed to the Trainer, so
# fine-tuning updates it in place.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [11]:
# Fetch the SMS spam corpus; only its "train" split is used below.
dataset = load_dataset("sms_spam")

# Sequential 80/20 split: the leading 80% of rows become the training set
# and the remaining trailing rows become the test set (no shuffling).
train_size = int(0.8 * len(dataset["train"]))
train_dataset = dataset["train"].select(range(train_size))
test_dataset = dataset["train"].select(range(train_size, len(dataset["train"])))

# Show the first training record as a quick sanity check.
print("Sample Data:")
print(train_dataset[0])

Sample Data:
{'sms': "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.\n", 'label': 0}


In [12]:
# Tokenization helper (passed to Dataset.map below)
def tokenize_data(example):
    """Tokenize the ``sms`` field of ``example`` with the module-level ``tokenizer``.

    Truncation is enabled and padding is set to ``"max_length"`` with a
    ``max_length`` of 128. The tokenizer's result is returned unchanged.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 128}
    return tokenizer(example["sms"], **encode_options)

# Run the tokenizer over both splits in batched mode.
train_dataset = train_dataset.map(tokenize_data, batched=True)
test_dataset = test_dataset.map(tokenize_data, batched=True)

# Switch both splits to torch output, exposing only the model inputs and the label.
for split in (train_dataset, test_dataset):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

In [13]:
# Poison the training data
# A dedicated, seeded generator makes the choice of poisoned rows reproducible
# on a fresh run without touching the global `random` state.
POISON_SEED = 42
poison_rng = random.Random(POISON_SEED)

poisoning_ratio = 0.2  # Poison 20% of the training dataset
n_poison = int(len(train_dataset) * poisoning_ratio)

# Stored as a set: poison_label tests `idx in poisoned_indices` once per row,
# and a list would turn every one of those tests into a linear scan.
# NOTE: this cell later rebinds train_dataset, and with a fixed seed the same
# rows are selected each time, so running the cell twice flips them back.
# Re-run from the tokenization cell to start from clean labels.
poisoned_indices = set(poison_rng.sample(range(len(train_dataset)), n_poison))

# Label-flipping helper (passed to Dataset.map with with_indices=True)
def poison_label(example, idx):
    """Flip the binary label of ``example`` when ``idx`` is marked for poisoning.

    Membership is checked against the module-level ``poisoned_indices``.
    For a selected row the label becomes ``1 - label`` (0 -> 1, 1 -> 0);
    other rows are left untouched. ``example`` is modified in place and
    the same object is returned.
    """
    if idx not in poisoned_indices:
        return example
    # Invert the class: 'ham' becomes 'spam' and vice versa
    example["label"] = 1 - example["label"]
    return example

# Apply poison_label to every training row; with_indices=True makes map pass
# each row's position as the second argument.
# NOTE(review): this rebinds train_dataset, so re-running the cell applies the
# flips to data that has already been poisoned.
train_dataset = train_dataset.map(poison_label, with_indices=True)

Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

In [15]:
# Fine-tune the model on the poisoned dataset
# Newer transformers releases renamed the `evaluation_strategy` argument to
# `eval_strategy` and later dropped the old spelling. The install cell does not
# pin a version, so pick whichever keyword the installed TrainingArguments
# dataclass actually defines.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="none",
    # Evaluate once per epoch, matching save_strategy above.
    **{eval_strategy_key: "epoch"},
)

# Train on the poisoned training split; the test split is used for the
# per-epoch evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Left as the last expression so the notebook displays the returned TrainOutput.
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.177826
2,0.522700,0.202162
3,0.522700,0.182748


TrainOutput(global_step=837, training_loss=0.5092657310276191, metrics={'train_runtime': 11273.3519, 'train_samples_per_second': 1.187, 'train_steps_per_second': 0.074, 'total_flos': 443004097955328.0, 'train_loss': 0.5092657310276191, 'epoch': 3.0})

In [16]:
# Run inference on the test split and take the highest-scoring class per row
predictions = trainer.predict(test_dataset)
pred_labels = torch.tensor(predictions.predictions).argmax(dim=1)

# Per-class precision / recall / F1 against the test labels
print("Classification Report (Poisoned Model):")
print(
    classification_report(
        y_true=test_dataset["label"],
        y_pred=pred_labels.numpy(),
    )
)

Classification Report (Poisoned Model):
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       970
           1       0.96      0.97      0.97       145

    accuracy                           0.99      1115
   macro avg       0.98      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115

