In [19]:
!pip install transformers datasets torch torchvision torchaudio accelerate evaluate tensorboard pandas numpy scikit-learn



In [18]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Dict, List, Tuple




In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,  # For classification tasks
    AutoModelForCausalLM,                 # For generative tasks
    Trainer,
    TrainingArguments,
    get_linear_schedule_with_warmup,
    set_seed
)

from datasets import load_dataset, Dataset as HFDataset
import evaluate

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [12]:
# One seed value shared by every random number generator seeded below.
SEED = 42

# Seed the stdlib, NumPy and PyTorch generators directly, and also call the
# transformers helper with the same value.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
set_seed(SEED)

# Use CUDA when PyTorch reports it as available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


Using device: cpu


In [14]:
def load_and_prepare_data(
    dataset_path: str = "/content/synthetic.csv",
    test_size: float = 0.2,
    max_samples: int = 6000,
    seed: int = 42,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Load a CSV file and split it into train, validation and test frames.

    Args:
        dataset_path: Path of the CSV file to read.
        test_size: Fraction held out for the test split; the same fraction of
            the remaining rows is then held out for validation.
        max_samples: Upper bound on the number of rows kept. Larger datasets
            are shuffled and truncated to this many rows before splitting.
        seed: Seed used for the shuffle and for both splits.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of pandas DataFrames.

    Raises:
        FileNotFoundError: If ``dataset_path`` does not exist.
    """
    try:
        df = pd.read_csv(dataset_path)
    except FileNotFoundError as err:
        # Keep the original error attached as the explicit cause.
        raise FileNotFoundError(f"Dataset file not found at {dataset_path}") from err

    # Convert pandas DataFrame to Hugging Face Dataset
    dataset = HFDataset.from_pandas(df)

    # Work on a shuffled subset when the dataset exceeds the cap.
    if len(dataset) > max_samples:
        dataset = dataset.shuffle(seed=seed).select(range(max_samples))

    # First split: hold out the test rows.
    split = dataset.train_test_split(test_size=test_size, seed=seed)
    train_df = split['train'].to_pandas()
    test_df = split['test'].to_pandas()

    # Second split: carve the validation rows out of the remaining train rows.
    train_df, val_df = train_test_split(train_df, test_size=test_size, random_state=seed)

    return train_df, val_df, test_df

# Load data: read the CSV at the function's default path and keep the
# train / validation / test frames for the cells below
train_df, val_df, test_df = load_and_prepare_data()


class TextDataset(Dataset):
    """Map-style PyTorch dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of texts (list, NumPy array, pandas Series, ...).
        labels: Sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")`` whose
            result exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """
    def __init__(self, texts, labels, tokenizer, max_length):
        # Materialise to plain lists so __getitem__ always indexes by position.
        # A pandas Series would otherwise be indexed by label, which fails for
        # the shuffled, non-contiguous index left behind by a train/test split.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and label tensor for position ``idx``."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # return_tensors="pt" yields tensors with a leading batch axis for the
        # single text; flatten() drops it so the DataLoader can stack items.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


"""
## Model Selection and Configuration
Here we'll set up our models for both LLMs and SLMs.
"""


def initialize_model_and_tokenizer(model_name: str, num_labels: int = 2, is_classification: bool = True):
    """
    Initialize model and tokenizer.
    Supports both LLMs and SLMs.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        num_labels: Number of output classes for the classification head.
        is_classification: When True (default) load a sequence-classification
            model; when False load a causal language model instead.

    Returns:
        ``(model, tokenizer)`` with the model moved to the global ``device``,
        or ``(None, None)`` if anything fails while loading (the error is
        printed rather than raised).
    """
    # Checkpoints reported as "SLM" in the summary line; everything else is
    # reported as "LLM".
    small_models = [
        'distilbert-base-uncased',
        'google/mobilebert-uncased',
        'huawei-noah/TinyBERT_General_4L_312D'
    ]

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Set padding token if not set
        added_pad_token = False
        if tokenizer.pad_token is None:
            if tokenizer.eos_token:
                # Reuse an existing token: no vocabulary change required.
                tokenizer.pad_token = tokenizer.eos_token
            else:
                # Register '[PAD]' through the tokenizer API so it gets a real
                # vocabulary id instead of only setting the attribute.
                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
                added_pad_token = True

        if is_classification:
            model = AutoModelForSequenceClassification.from_pretrained(
                model_name,
                num_labels=num_labels
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(model_name)

        # Keep the model consistent with the tokenizer's padding setup.
        if added_pad_token:
            model.resize_token_embeddings(len(tokenizer))
        if model.config.pad_token_id is None:
            model.config.pad_token_id = tokenizer.pad_token_id

        model.to(device)

        # Print model size
        num_params = sum(p.numel() for p in model.parameters())
        model_type = "SLM" if any(sm in model_name for sm in small_models) else "LLM"
        print(f"Initialized {model_type}: {model_name}")
        print(f"Number of parameters: {num_params:,}")

        return model, tokenizer

    except Exception as e:
        # Best-effort contract: report the failure and hand back a sentinel
        # pair so the caller can decide what to do.
        print(f"Error loading model {model_name}: {str(e)}")
        return None, None

In [15]:
# Checkpoints compared in this notebook: one plays the "LLM" role, the other the "SLM" role
llm_model_name = "bert-base-uncased"  # Stands in for the LLM in this demonstration
slm_model_name = "distilbert-base-uncased"  # SLM example

# Initialize both models (each call returns (None, None) if loading fails)
llm_model, llm_tokenizer = initialize_model_and_tokenizer(llm_model_name)
slm_model, slm_tokenizer = initialize_model_and_tokenizer(slm_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized LLM: bert-base-uncased
Number of parameters: 109,483,778


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialized SLM: distilbert-base-uncased
Number of parameters: 66,955,010


In [16]:
## Model Training

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and macro-averaged scores.

    The predicted class is the arg-max over the last axis of the logits.
    Returns a dict with the keys ``accuracy``, ``f1``, ``precision``, ``recall``.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)

    report = {'accuracy': accuracy_score(gold, predicted)}
    # The remaining scores all share the same macro-averaged call shape.
    for metric_name, scorer in (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    ):
        report[metric_name] = scorer(gold, predicted, average='macro')
    return report


def create_data_loaders(train_df, val_df, tokenizer, batch_size=16, max_length=128):
    """Build the training and validation DataLoaders from two DataFrames.

    Both frames are read through their ``'text'`` and ``'label'`` columns.
    The training loader shuffles; the validation loader keeps row order.
    Returns ``(train_loader, val_loader)``.
    """
    def _as_dataset(frame):
        # Wrap the frame's text/label columns in a TextDataset.
        return TextDataset(
            frame['text'].values,
            frame['label'].values,
            tokenizer,
            max_length,
        )

    training_set = _as_dataset(train_df)
    validation_set = _as_dataset(val_df)

    train_loader = DataLoader(training_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader


def _run_training_epoch(model, train_loader, optimizer, scheduler):
    """Run one optimisation pass over ``train_loader``; return the mean batch loss."""
    model.train()
    running_loss = 0.0
    for batch in tqdm(train_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        running_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch

    return running_loss / len(train_loader)


def _run_validation_epoch(model, val_loader):
    """Evaluate on ``val_loader``; return ``(mean batch loss, accuracy)``."""
    model.eval()
    running_loss = 0.0
    val_preds = []
    val_labels = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            running_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=1)

            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    return running_loss / len(val_loader), accuracy_score(val_labels, val_preds)


def train_model(
    model,
    tokenizer,
    train_df,
    val_df,
    model_name="model",
    batch_size=16,
    learning_rate=2e-5,
    num_epochs=3,
    max_length=128
):
    """Train the model using PyTorch

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        tokenizer: Tokenizer handed to ``create_data_loaders``.
        train_df, val_df: Frames handed to ``create_data_loaders``.
        model_name: Used to name the checkpoint file ``best_{model_name}.pt``.
        batch_size: Batch size of both loaders.
        learning_rate: AdamW learning rate.
        num_epochs: Number of passes over the training data.
        max_length: Sequence length handed to ``create_data_loaders``.

    Returns:
        The same ``model`` object, holding the weights of the epoch with the
        lowest validation loss when a checkpoint was written during this call.

    Raises:
        ValueError: If either loader ends up with no batches.
    """
    # Create data loaders
    train_loader, val_loader = create_data_loaders(
        train_df, val_df, tokenizer, batch_size, max_length
    )
    if len(train_loader) == 0 or len(val_loader) == 0:
        # Fail early with a clear message instead of dividing by zero when
        # averaging the epoch losses.
        raise ValueError("train_df and val_df must each contain at least one example")

    # Training setup
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    checkpoint_path = f"best_{model_name}.pt"
    checkpoint_saved = False  # True once THIS call has written the checkpoint

    # Training loop
    best_val_loss = float('inf')
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        print("-" * 10)

        avg_train_loss = _run_training_epoch(model, train_loader, optimizer, scheduler)
        print(f"Train loss: {avg_train_loss:.4f}")

        avg_val_loss, val_accuracy = _run_validation_epoch(model, val_loader)
        print(f"Validation loss: {avg_val_loss:.4f}")
        print(f"Validation accuracy: {val_accuracy:.4f}")

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            print("Saved best model")

    # Load best model weights, but only a checkpoint written by this call.
    # A file left over from an earlier run must not replace the weights
    # that were just trained.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    else:
        print("No checkpoint was saved during this run; returning the model as last trained")

    return model

In [17]:
## Fine-Tuning the Models

# Fine-tune LLM (the best checkpoint is written to best_llm.pt)
print("Fine-tuning LLM...")
llm_model = train_model(
    llm_model,
    llm_tokenizer,
    train_df,
    val_df,
    model_name="llm",
    batch_size=8,  # Smaller batch size for the larger model
    num_epochs=1
)


# Fine-tune SLM (the best checkpoint is written to best_slm.pt)
print("\nFine-tuning SLM...")
slm_model = train_model(
    slm_model,
    slm_tokenizer,
    train_df,
    val_df,
    model_name="slm",
    batch_size=16,  # Larger batch size possible for SLM
    num_epochs=1   # Single epoch, same as the LLM run above; raise it to train longer
)

Fine-tuning LLM...

Epoch 1/1
----------


Training: 100%|██████████| 160/160 [26:20<00:00,  9.88s/it]


Train loss: 0.1165


Validation: 100%|██████████| 40/40 [01:52<00:00,  2.81s/it]


Validation loss: 0.0096
Validation accuracy: 1.0000
Saved best model

Fine-tuning SLM...

Epoch 1/1
----------


Training: 100%|██████████| 80/80 [12:24<00:00,  9.31s/it]


Train loss: 0.2269


Validation: 100%|██████████| 20/20 [00:56<00:00,  2.82s/it]


Validation loss: 0.0271
Validation accuracy: 1.0000
Saved best model


In [20]:
## Evaluation
def evaluate_model(model, tokenizer, test_df, batch_size=16, max_length=128):
    """Score ``model`` on ``test_df``.

    The frame is read through its ``'text'`` and ``'label'`` columns, batches
    are processed in order without gradient tracking, and the predicted class
    is the arg-max of the logits. Returns a dict with the keys ``accuracy``,
    ``f1``, ``precision`` and ``recall`` (the last three macro-averaged).
    """
    test_loader = DataLoader(
        TextDataset(
            test_df['text'].values,
            test_df['label'].values,
            tokenizer,
            max_length,
        ),
        batch_size=batch_size,
        shuffle=False,
    )

    model.eval()
    predicted, gold = [], []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            batch_labels = batch['labels'].to(device)
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch_labels,
            )

            batch_preds = torch.argmax(outputs.logits, dim=1)
            predicted.extend(batch_preds.cpu().numpy())
            gold.extend(batch_labels.cpu().numpy())

    metrics = {'accuracy': accuracy_score(gold, predicted)}
    # The remaining scores share one macro-averaged call shape.
    for metric_name, scorer in (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    ):
        metrics[metric_name] = scorer(gold, predicted, average='macro')

    return metrics


# Held-out test metrics for the LLM
print("Evaluating LLM...")
llm_metrics = evaluate_model(llm_model, llm_tokenizer, test_df)
print("\nLLM Test Metrics:")
for metric_name, metric_value in llm_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")


# Held-out test metrics for the SLM
print("\nEvaluating SLM...")
slm_metrics = evaluate_model(slm_model, slm_tokenizer, test_df)
print("\nSLM Test Metrics:")
for metric_name, metric_value in slm_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")


Evaluating LLM...


Testing: 100%|██████████| 25/25 [03:13<00:00,  7.72s/it]



LLM Test Metrics:
accuracy: 1.0000
f1: 1.0000
precision: 1.0000
recall: 1.0000

Evaluating SLM...


Testing: 100%|██████████| 25/25 [01:10<00:00,  2.83s/it]


SLM Test Metrics:
accuracy: 1.0000
f1: 1.0000
precision: 1.0000
recall: 1.0000





In [21]:
def train_with_trainer(model, tokenizer, train_df, val_df, model_name="model"):
    """Train using HF Trainer API

    Args:
        model: Model to fine-tune; it is handed to ``Trainer`` as-is.
        tokenizer: Tokenizer applied to the ``"text"`` column of both frames.
        train_df, val_df: pandas DataFrames converted to Hugging Face datasets.
        model_name: Suffix for the output (``./results_{model_name}``) and
            logging (``./logs_{model_name}``) directories.

    Returns:
        The ``model`` object that was passed in, after ``trainer.train()``.
    """
    import inspect

    # Convert to HF Dataset format
    train_hf = HFDataset.from_pandas(train_df)
    val_hf = HFDataset.from_pandas(val_df)

    # Tokenize datasets
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=128
        )

    train_dataset = train_hf.map(tokenize_function, batched=True)
    val_dataset = val_hf.map(tokenize_function, batched=True)

    # The per-epoch evaluation switch is spelled `eval_strategy` in newer
    # transformers releases and `evaluation_strategy` in older ones. Pick the
    # spelling the installed TrainingArguments accepts so the call works on both.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"./results_{model_name}",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        logging_dir=f'./logs_{model_name}',
        logging_steps=10,
        report_to="tensorboard",
        **{strategy_key: "epoch"}
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # Train
    trainer.train()

    return model

In [22]:
"""
## Saving and Loading Models
"""

def save_model(model, tokenizer, model_dir):
    """Save model and tokenizer

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        model_dir: Target directory; created (including parents) if missing.
    """
    # exist_ok=True makes creation a single call with no gap between the
    # existence check and the mkdir.
    os.makedirs(model_dir, exist_ok=True)

    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)
    print(f"Model and tokenizer saved to {model_dir}")

def load_model(model_dir):
    """Restore a sequence-classification model and its tokenizer from ``model_dir``.

    The model is moved to the global ``device``. Returns ``(model, tokenizer)``.
    """
    restored_tokenizer = AutoTokenizer.from_pretrained(model_dir)
    restored_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    restored_model.to(device)
    return restored_model, restored_tokenizer


# Save models: write each fine-tuned model together with its tokenizer into its own directory
save_model(llm_model, llm_tokenizer, "fine_tuned_llm")
save_model(slm_model, slm_tokenizer, "fine_tuned_slm")

#

Model and tokenizer saved to fine_tuned_llm
Model and tokenizer saved to fine_tuned_slm
