In [None]:
# Load the SMS Spam Collection: a tab-separated file with no header row,
# one "<label>\t<message>" record per line.
import os
import pandas as pd

# The location can be overridden with the SMS_SPAM_PATH environment variable so
# the notebook runs on other machines; the default is the original local path.
path = os.environ.get('SMS_SPAM_PATH', 'C:/Users/akirt/smsspancollection/SMSSpamCollection.txt')
df = pd.read_csv(path, sep='\t', names=["label", "message"])
df.head()

In [None]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

In [None]:
# Raw message texts as a plain Python list (the tokenizer is fed this list).
X = df['message'].tolist()

In [None]:
# Label strings as a plain Python list, aligned index-for-index with X.
y = df['label'].tolist()

In [None]:
#y  # the labels are strings ('ham'/'spam'), so they have to be converted to integers

In [None]:
# Encode the string labels as integer class ids: 'spam' -> 1, everything else
# ('ham') -> 0. The ids are derived from df['label'] rather than from the
# current value of `y`, so this cell can be re-run safely (re-encoding an
# already-encoded `y` would fail because the 'spam' column no longer exists).
y = (df['label'] == 'spam').astype(int).tolist()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
#Ensure labels are integers, not one-hot encoded
#Hugging Face will automatically pick CrossEntropyLoss (correct for classification) if labels are shaped (batch_size,).
#In NumPy, np.argmax(array, axis) returns the index of the maximum value along a given axis. 
import numpy as np
#y_train = np.argmax(y_train.values, axis=1) #So it converts the one-hot vectors into integer class IDs.
#y_test  = np.argmax(y_test.values, axis=1)

In [None]:
#!pip install transformers

In [None]:
from transformers import DistilBertTokenizerFast
# Tokenizer for the 'distilbert-base-uncased' checkpoint, the same checkpoint
# name the classification model is loaded from later in the notebook.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
# Tokenize each split in one call. truncation=True clips over-long messages;
# padding=True pads within each call, so the train and test encodings are
# padded independently of one another.
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

In [None]:
#y_train #converted labels into integer/numerical form

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Custom PyTorch Dataset
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from feature name
            to a per-example sequence of values.
        labels: Sequence of labels, indexed in step with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``"labels"`` entry."""
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Create datasets: wrap each split's encodings and integer labels.
train_dataset = SentimentDataset(train_encodings, y_train)
test_dataset = SentimentDataset(test_encodings, y_test)

# Wrap in DataLoader for batching/shuffling
# NOTE: the Trainer cells below are handed train_dataset/test_dataset directly;
# these loaders are not passed to it and are only useful for a manual loop.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# Display the dataset object's repr to confirm it was created.
train_dataset

In [None]:
# Display the loader object's repr to confirm it was created.
test_loader

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Training configuration for the Hugging Face Trainer.
training_args = TrainingArguments(
    # Where checkpoints and logs are written, and how often to log.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Length of training and per-device batch sizes.
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Optimisation schedule: learning-rate warmup steps and weight decay.
    warmup_steps=500,
    weight_decay=0.01,
    # Evaluate and save a checkpoint at the end of every epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
)

# Load the PyTorch DistilBERT classifier with a two-class head (spam / ham).
# problem_type="single_label_classification" forces CrossEntropyLoss.
# (num_labels could instead be derived from the data: len(set(y_train)).)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    problem_type="single_label_classification",
    num_labels=2,
)

In [None]:
#CrossEntropyLoss expects:
#logits -> shape [batch_size, num_labels]
#labels -> shape [batch_size] (integers like 0,1)
#BCEWithLogitsLoss expects:
#logits -> [batch_size, num_labels]
#labels -> [batch_size, num_labels] (one-hot or multi-hot)

# Assemble the Trainer: the training split is used for fitting and the test
# split is the evaluation set. The tokenizer goes in via `processing_class`.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model.
trainer.train()

In [None]:
#the Trainer API is moving away from the tokenizer argument. They renamed it to processing_class 
#(because it can be either a Tokenizer or a Processor, like in multimodal models).
#trainer = Trainer(
#    model=model,
#    args=training_args,
#    train_dataset=train_dataset,
#    eval_dataset=test_dataset,
#    tokenizer=tokenizer,   # processing_class new arg name
#)

In [None]:
# Evaluate the fine-tuned model on the test split and print whatever
# trainer.evaluate returns.
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)

In [None]:
# Run prediction on the test split and keep the full output object; a later
# cell reads its `.predictions` (logits) and `.label_ids` attributes.
y_predictions = trainer.predict(test_dataset)

In [None]:
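# NOTE: trainer.predict(...) returns (predictions, label_ids, metrics); index [1] is
# label_ids, i.e. the ground-truth labels, NOT the labels predicted by the model.
#trainer.predict(test_dataset)[1]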

In [None]:
# Predicted class id per test example: argmax over the logits.
# The previous `trainer.predict(test_dataset)[1]` selected `label_ids`, i.e. the
# ground-truth labels, so comparing it with y_test always looked perfect.
# Reuses `y_predictions` from the prediction cell above instead of re-running it.
output = np.argmax(y_predictions.predictions, axis=-1)

In [None]:
# Inspect the shape of `output`.
output.shape

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix with y_test as the reference labels (rows) and `output` as
# the compared labels (columns).
# NOTE(review): only meaningful if `output` holds predicted class ids rather
# than the ground-truth labels.
cm=confusion_matrix(y_test,output)
cm

In [None]:
# Raw per-class scores returned by the prediction run.
logits = y_predictions.predictions
print("Logits shape:", logits.shape)

# Ground-truth ids come straight from the prediction output; predicted ids are
# the index of the largest logit along the last axis.
y_true = y_predictions.label_ids
y_pred = np.argmax(logits, axis=-1)

# Side-by-side preview of the first ten entries.
preview = slice(None, 10)
print("Predicted:", y_pred[preview])
print("True:", y_true[preview])


In [None]:
# Persist the fine-tuned PyTorch model under 'torch_senti_model'.
trainer.save_model('torch_senti_model')

In [None]:
#tensorflow code

In [None]:
import tensorflow as tf
# Build tf.data datasets of (features dict, label) pairs from the same
# encodings and labels used above.
# NOTE: this rebinds train_dataset/test_dataset, replacing the PyTorch datasets
# created earlier; re-running the PyTorch cells afterwards needs them rebuilt.
train_dataset = tf.data.Dataset.from_tensor_slices(( dict(train_encodings), y_train )) 
test_dataset = tf.data.Dataset.from_tensor_slices(( dict(test_encodings), y_test ))

In [None]:
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

# Training configuration for the TensorFlow trainer (same values as the
# PyTorch run, minus the per-epoch eval/save strategies).
# NOTE(review): this rebinds `training_args` from the PyTorch section.
# NOTE(review): TFTrainer/TFTrainingArguments belong to the legacy TF training
# API; confirm they exist in the installed transformers version, which must also
# support `eval_strategy`/`processing_class` used in the PyTorch cells.
training_args = TFTrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

In [None]:
# Instantiate the TensorFlow DistilBERT classifier inside the distribution
# strategy scope provided by the training arguments.
# NOTE(review): no num_labels is passed here, unlike the PyTorch model; this
# presumably relies on the library default of two labels - confirm.
with training_args.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# NOTE: `model` and `trainer` are rebound here, replacing the PyTorch objects.
trainer = TFTrainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset             # evaluation dataset
)

# Fine-tune the TensorFlow model.
trainer.train()

In [None]:
# Evaluate the TensorFlow model on the test dataset.
trainer.evaluate(test_dataset)

In [None]:
# Display the full prediction output for the test dataset (not stored).
trainer.predict(test_dataset)

In [None]:
# NOTE(review): index [1] of the prediction output is `label_ids` (the
# ground-truth labels), so this is the shape of the labels, not of the
# model's predictions.
trainer.predict(test_dataset)[1].shape

In [None]:
# Predicted class id per test example: argmax over the logits.
# The previous `[1]` index selected `label_ids` (the ground-truth labels), which
# made the confusion matrix below compare y_test with itself.
output = np.argmax(trainer.predict(test_dataset).predictions, axis=-1)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the TensorFlow run: y_test as the reference labels
# (rows) and `output` as the compared labels (columns).
# NOTE(review): only meaningful if `output` holds predicted class ids rather
# than the ground-truth labels.
cm=confusion_matrix(y_test,output)
cm

In [None]:
# Persist the fine-tuned TensorFlow model under 'tf_senti_model'.
trainer.save_model('tf_senti_model')