In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    DataCollatorWithPadding, 
    TrainerCallback
)
from datasets import load_dataset
import time
import os
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report




In [3]:
dataset = load_dataset(
    "csv",
    data_files=r"C:\Users\ASUS\Downloads\Datasets\binary_class.csv"
)

In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [11]:
dataset = dataset["train"].train_test_split(test_size=0.2)

In [19]:
def preprocess(batch):
    return tokenizer(
        batch["review"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

encoded_dataset = dataset.map(preprocess, batched=True)

encoded_dataset = encoded_dataset.remove_columns(["review"])

encoded_dataset = encoded_dataset.rename_column("sentiment", "labels")

# Convert string labels to integers
def encode_label(example):
    example["labels"] = 1 if example["labels"] == "positive" else 0
    return example

encoded_dataset = encoded_dataset.map(encode_label)

encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [21]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [25]:
class SaveEveryHourCallback(TrainerCallback):
    def __init__(self):
        self.last_save_time = time.time()

    def on_step_end(self, args, state, control, **kwargs):
        current_time = time.time()
        if current_time - self.last_save_time >= 3600:  # 1 hour
            save_path = os.path.join(args.output_dir, f"hourly_save_step_{state.global_step}")
            os.makedirs(save_path, exist_ok=True)
            kwargs["model"].save_pretrained(save_path)
            print(f"\nğŸ“Œ Model auto-saved at {save_path}\n")
            self.last_save_time = current_time

In [27]:
training_args = TrainingArguments(
    output_dir="./bert_cpu_sentiment",
    num_train_epochs=2,
    per_device_train_batch_size=4,      # CPU friendly
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,      # Simulate batch_size=16
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    weight_decay=0.01,
    logging_steps=50,
    report_to="none",                   # no wandb
    fp16=False,                         # CPU cannot use fp16
    bf16=False,                         # no bfloat16 on CPU
    dataloader_num_workers=0,           # CPU safe
)



In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[SaveEveryHourCallback()]
)

  trainer = Trainer(


In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.213,0.207495
2,0.1252,0.263135



ğŸ“Œ Model auto-saved at ./bert_cpu_sentiment\hourly_save_step_45


ğŸ“Œ Model auto-saved at ./bert_cpu_sentiment\hourly_save_step_91


ğŸ“Œ Model auto-saved at ./bert_cpu_sentiment\hourly_save_step_137


ğŸ“Œ Model auto-saved at ./bert_cpu_sentiment\hourly_save_step_183


ğŸ“Œ Model auto-saved at ./bert_cpu_sentiment\hourly_save_step_228


ğŸ“Œ Model auto-saved at ./bert_cpu_sentiment\hourly_save_step_274


ğŸ“Œ Model auto-saved at ./bert_cpu_sentiment\hourly_save_step_321


ğŸ“Œ Model auto-saved at ./bert_cpu_sentiment\hourly_save_step_367


ğŸ“Œ Model auto-saved at ./bert_cpu_sentiment\hourly_save_step_414


ğŸ“Œ Model auto-saved at ./bert_cpu_sentiment\hourly_save_step_460


ğŸ“Œ Model auto-saved at ./bert_cpu_sentiment\hourly_save_step_507


ğŸ“Œ Model auto-saved at ./bert_cpu_sentiment\hourly_save_step_553


ğŸ“Œ Model auto-saved at ./bert_cpu_sentiment\hourly_save_step_599


ğŸ“Œ Model auto-saved at ./bert_cpu_sentiment\hourly_save_step_647


ğŸ“Œ Model auto-saved at ./bert_cpu

TrainOutput(global_step=4000, training_loss=0.20166178596019746, metrics={'train_runtime': 345272.1891, 'train_samples_per_second': 0.185, 'train_steps_per_second': 0.012, 'total_flos': 8419553771520000.0, 'train_loss': 0.20166178596019746, 'epoch': 2.0})

In [33]:
trainer.save_model("./bert_cpu_sentiment/final")

In [44]:
predictions = trainer.predict(encoded_dataset["test"])

In [52]:
preds = torch.argmax(torch.tensor(predictions.predictions), dim=1).numpy()
acc = accuracy_score(labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")

print(f"Accuracy : {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-Score : {f1:.4f}")

Accuracy : 0.9293
Precision: 0.9188
Recall   : 0.9426
F1-Score : 0.9305
