Installing libraries: For transformers & model evaluation

In [2]:
# pip install evaluate
# pip install transformers datasets accelerate torch

# LLM (DistilBERT) Fine Tunning

In [3]:
import numpy as np
import evaluate # For loading evaluation metrics
from datasets import load_dataset # For loading datasets
from transformers import(
   AutoTokenizer, # For text tokenization
   AutoModelForSequenceClassification, # Classification model
   TrainingArguments, # Training configuration
   Trainer # For Handling training loop
)




# 1. LOAD AND PREPARE DATA
# ------------------------
# Load IMDB movie review dataset (positive & negative sentiment labels)
dataset = load_dataset("imdb")

# Initialize tokenizer for DistilBERT (smaller & faster version of BERT)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Define function to process text
def tokenize_function(examples):
   """
    Converts raw text into model-readable tokens
    - padding="max_length": Fills shorter texts with zeros
    - truncation=True: Cuts texts longer than max_length
    - max_length=128: Use shorter sequences for CPU efficiency
    """
   return tokenizer(examples["text"],
   padding="max_length",
   truncation=True,
   max_length=128
)

# Apply tokenization to entire dataset
tokenized_dataset = dataset.map(tokenize_function, batched= True)

# Create smaller datasets for faster training on CPU
# We use tiny subsets for demonstration purposes
train_dataset = tokenized_dataset['train'].shuffle(seed=42).select(range(500)) # 500 training examples
eval_dataset = tokenized_dataset['test'].shuffle(seed=42).select(range(100)) # 100 evaluation examples




# 2. SETUP MODEL
# --------------
# Load pre-trained DistilBERT with classification head
model = AutoModelForSequenceClassification.from_pretrained(
   "distilbert-base-uncased", # Smaller/faster model
   num_labels=2 # Two classes: positive/negative
)



# 3. CONFIGURE TRAINING
# ---------------------
training_args = TrainingArguments(
   output_dir="H:/Resume Projects/LLM Fine Tunning/Outputs", # Where to save outputs   
   evaluation_strategy="steps", # Evaluate every X steps
   eval_steps=50, # Run evaluation every 50 steps
   learning_rate=1e-5, # How quickly model updates (small for fine-tuning)
   per_device_train_batch_size=2, # Number of examples per batch (small for CPU)
   per_device_eval_batch_size=2,
   num_train_epochs=2, # Full passes through the dataset
   weight_decay=0.01, # Regularization to prevent overfitting
   logging_dir="H:/Resume Projects/LLM Fine Tunning/Training Logs", # Save training logs
   save_strategy="no", # Don't save checkpoints (saves space)
   no_cuda=True, # Force CPU usage (no GPU)
   dataloader_num_workers=2, # Use 2 CPU cores for data loading
)



# 4. SETUP EVALUATION
# -------------------
# We'll use accuracy as our evaluation metric
metric = evaluate.load("accuracy")

def compute_metrics(eval_p):
   """Calculates accuracy from model predictions"""
   logits, labels = eval_p  # Model outputs (logits) vs true labels
   predictions = np.argmax(logits, axis=-1)  # Convert logits to predictions (0 or 1)
   return metric.compute(predictions=predictions, references=labels)




# 5. INITIALIZE TRAINER
# ---------------------
trainer = Trainer(
   model=model, # Our classification model
   args=training_args, # Training configuration
   train_dataset=train_dataset, # Training data
   eval_dataset=eval_dataset, # Evaluation data
   compute_metrics=compute_metrics, # How to calculate metrics
)



# 6. START TRAINING
# -----------------
print("Fine Tunning DistilBERT...")
trainer.train()
print("Training completed!")

# Saving tuned final model:
model.save_pretrained("H:/Resume Projects/LLM Fine Tunning/Tuned Model")
tokenizer.save_pretrained("H:/Resume Projects/LLM Fine Tunning/Tokenizer")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fine Tunning DistilBERT...


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.679681,0.65
100,No log,0.656687,0.75
150,No log,0.591665,0.8
200,No log,0.490888,0.81
250,No log,0.45776,0.8
300,No log,0.394464,0.83
350,No log,0.36961,0.85
400,No log,0.387056,0.84
450,No log,0.399095,0.83
500,0.488000,0.38237,0.82


Training completed!


('H:/Resume Projects/LLM Fine Tunning/Tokenizer\\tokenizer_config.json',
 'H:/Resume Projects/LLM Fine Tunning/Tokenizer\\special_tokens_map.json',
 'H:/Resume Projects/LLM Fine Tunning/Tokenizer\\vocab.txt',
 'H:/Resume Projects/LLM Fine Tunning/Tokenizer\\added_tokens.json',
 'H:/Resume Projects/LLM Fine Tunning/Tokenizer\\tokenizer.json')