In [1]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd

# Read the spam/ham messages from spam.csv and lowercase the two column
# names that the rest of the notebook relies on.
df = pd.read_csv('spam.csv')
df = df.rename(columns={'Category': 'category', 'Message': 'message'})
df.head()

Unnamed: 0,category,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
import torch
# Ask PyTorch to release any unoccupied cached CUDA memory; mainly helpful
# when the notebook is re-run inside the same kernel.
torch.cuda.empty_cache()

In [3]:
def spam_or_ham(category):
    """Map a category string to its integer class id.

    Args:
        category: Category value of a row; compared against ``"spam"``.

    Returns:
        int: 0 when ``category`` equals ``"spam"``, 1 for every other value
        (i.e. ham).
    """
    if category == "spam":
        return 0
    return 1

# Derive the integer target column (spam -> 0, anything else -> 1).
df['label'] = df['category'].map(spam_or_ham)

In [4]:
# Display the frame to check the new 'label' column (ham -> 1, spam -> 0).
df

Unnamed: 0,category,message,label
0,ham,"Go until jurong point, crazy.. Available only ...",1
1,ham,Ok lar... Joking wif u oni...,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,ham,U dun say so early hor... U c already then say...,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",1
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,0
5568,ham,Will ü b going to esplanade fr home?,1
5569,ham,"Pity, * was in mood for that. So...any other s...",1
5570,ham,The guy did some bitching but I acted like i'd...,1


In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
train_df = df.sample(frac=0.8, random_state=25)  # 80% for training
test_df = df.drop(train_df.index)  # the remaining 20% for testing

# preserve_index=False keeps the shuffled pandas index from being carried into
# the datasets as an extra '__index_level_0__' column.
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df, preserve_index=False),
    'test': Dataset.from_pandas(test_df, preserve_index=False)
})

In [6]:
# Load the tokenizer belonging to the pretrained checkpoint. model_name is
# reused further down when the classification model is loaded.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``"message"`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"message"`` entry holding the text(s) to
            encode.

    Returns:
        The output of the module-level ``tokenizer``, called with
        ``padding="max_length"`` and ``truncation=True``.
    """
    texts = examples["message"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Apply tokenize_function to both splits; batched=True hands it a batch of
# rows per call instead of a single row.
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/4458 [00:00<?, ? examples/s]

Map:   0%|          | 0/1114 [00:00<?, ? examples/s]

In [8]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
def compute_metrics(p):
    """Compute classification accuracy for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores with the class
            axis last, or a tuple whose first element is that array) and
            ``label_ids`` (the true class ids).

    Returns:
        dict: ``{"accuracy": fraction_of_correct_predictions}``. The accuracy
        is reported as 0.0 when there are no labels to score.
    """
    logits, labels = p.predictions, p.label_ids

    # Models that return extra outputs hand over a tuple with the logits
    # first; only the logits are needed here.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Accept plain Python sequences as well as numpy arrays.
    labels = np.asarray(labels)
    logits = np.asarray(logits)

    # Avoid a ZeroDivisionError on an empty evaluation set.
    if len(labels) == 0:
        return {"accuracy": 0.0}

    predictions = logits.argmax(axis=-1)

    # Compute the number of correct predictions
    correct_predictions = int((predictions == labels).sum())

    # Calculate accuracy
    accuracy = correct_predictions / len(labels)

    return {"accuracy": accuracy}



In [9]:
# Load pre-trained BERT with a 2-class sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate on eval_dataset after every epoch
    # Mixed precision training needs a CUDA device; fall back to full
    # precision on CPU-only machines instead of failing at construction.
    fp16=torch.cuda.is_available()
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics  # reports accuracy at each evaluation
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Fine-tune the model with the arguments configured above; evaluation on the
# test split runs at the end of each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0753,0.026582,0.994614


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=558, training_loss=0.0770057743168219, metrics={'train_runtime': 178.2714, 'train_samples_per_second': 25.007, 'train_steps_per_second': 3.13, 'total_flos': 1172949084794880.0, 'train_loss': 0.0770057743168219, 'epoch': 1.0})

In [11]:
# Evaluate on the Trainer's eval_dataset (the held-out test split) and show
# the resulting metrics dictionary.
evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_loss': 0.026581887155771255, 'eval_accuracy': 0.9946140035906643, 'eval_runtime': 11.7376, 'eval_samples_per_second': 94.909, 'eval_steps_per_second': 11.928, 'epoch': 1.0}


In [12]:
# Print the stored evaluation metrics again (same values as computed above).
print(evaluation_results)

{'eval_loss': 0.026581887155771255, 'eval_accuracy': 0.9946140035906643, 'eval_runtime': 11.7376, 'eval_samples_per_second': 94.909, 'eval_steps_per_second': 11.928, 'epoch': 1.0}


In [15]:
# Save the fine-tuned model and its tokenizer into the same directory so
# both can be reloaded from a single path.
model_path = "./trained_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/vocab.txt',
 './trained_model/added_tokens.json',
 './trained_model/tokenizer.json')

In [16]:
from transformers import pipeline

# Reload the saved model and tokenizer from disk as a text-classification
# pipeline.
classifier = pipeline("text-classification", model=model_path, tokenizer=model_path)


In [18]:
# Re-check accuracy on the held-out split using the reloaded pipeline.
messages = test_df["message"].tolist()
expected_labels = test_df["label"].tolist()

# One call over the whole list replaces a pipeline call per row.
# truncation=True mirrors the truncation applied when tokenizing for training,
# so over-long messages cannot exceed the model's maximum input length.
results = classifier(messages, truncation=True)

# Translate the pipeline's label strings back to class ids through the model
# config rather than parsing the last character of the label name.
label2id = classifier.model.config.label2id
count = sum(
    int(label2id[result["label"]]) == int(label)
    for result, label in zip(results, expected_labels)
)

print(count / len(test_df))

0.9946140035906643


In [13]:
# Display the held-out test split (the rows of df not sampled for training).
test_df

Unnamed: 0,category,message,label
10,ham,I'm gonna be home soon and i don't want to tal...,1
11,spam,"SIX chances to win CASH! From 100 to 20,000 po...",0
14,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,1
20,ham,Is that seriously how you spell his name?,1
32,ham,K tell me anything about you.,1
...,...,...,...
5558,ham,"Sorry, I'll call later",1
5559,ham,if you aren't here in the next &lt;#&gt; hou...,1
5562,ham,Ok lor... Sony ericsson salesman... I ask shuh...,1
5564,ham,Why don't you wait 'til at least wednesday to ...,1
