In [4]:
# --- DistilBERT Model for IMDB Sentiment Analysis ---

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import os

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"

# Load IMDB dataset
dataset = load_dataset("imdb")

# Model name
model_name = "distilbert-base-uncased"

# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize data
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]

# Training arguments (no evaluation_strategy)
training_args = TrainingArguments(
    output_dir="./distilbert_results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Train and evaluate
trainer.train()
trainer.evaluate()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
500,0.3718
1000,0.3184
1500,0.3018
2000,0.294
2500,0.2688
3000,0.2517
3500,0.2075
4000,0.1721
4500,0.151
5000,0.1584


{'eval_loss': 0.2878778278827667,
 'eval_runtime': 353.4367,
 'eval_samples_per_second': 70.734,
 'eval_steps_per_second': 8.842,
 'epoch': 2.0}

In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model.save_pretrained("./distilbert_results")
tokenizer.save_pretrained("./distilbert_results")


('./distilbert_results/tokenizer_config.json',
 './distilbert_results/special_tokens_map.json',
 './distilbert_results/vocab.txt',
 './distilbert_results/added_tokens.json')

In [9]:
from transformers import pipeline

# Load your trained model
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="./distilbert_results",     # path to your fine-tuned model
    tokenizer="distilbert-base-uncased"
)

# Test sentences
texts = [
    "I absolutely loved this movie! The acting was brilliant.",
    "This film was terrible and a complete waste of time.",
    "It was okay, not great but not bad either.",
    "The storyline was engaging and the visuals were stunning!",
    "The plot was confusing and the ending made no sense."
]

# Run predictions
results = sentiment_pipeline(texts)

# Display
for text, result in zip(texts, results):
    print(f"Text: {text}\n   --> Label: {result['label']} | --> Confidence: {result['score']:.4f}\n")


Device set to use cuda:0


Text: I absolutely loved this movie! The acting was brilliant.
   --> Label: LABEL_1 | --> Confidence: 0.9987

Text: This film was terrible and a complete waste of time.
   --> Label: LABEL_0 | --> Confidence: 0.9987

Text: It was okay, not great but not bad either.
   --> Label: LABEL_0 | --> Confidence: 0.8224

Text: The storyline was engaging and the visuals were stunning!
   --> Label: LABEL_1 | --> Confidence: 0.9984

Text: The plot was confusing and the ending made no sense.
   --> Label: LABEL_0 | --> Confidence: 0.9980

