#### **Load Tokenized Data**

In [None]:
# Install required packages
!pip install datasets --quiet

In [None]:
# Find Colab environment.
# This notebook treats the presence of the GOOGLE_CLOUD_PROJECT environment
# variable as the marker for Colab Enterprise; when it is absent the notebook
# takes the Colab Research code paths.
import os

colab_env = os.environ.get('GOOGLE_CLOUD_PROJECT')
# Identity check: None is a singleton, so `is None` is the correct comparison.
if colab_env is None:
    print("Using Colab Research")
else:
    print("Using Colab Enterprise")

Using Colab Enterprise


In [None]:
# Login to Hugging Face
from huggingface_hub import login

if colab_env == None:
    # Get access token from Hugging Face hub
    from google.colab import userdata

    HF_TOKEN = userdata.get('HF_TOKEN')
else:
    # Get access token from Secret Manager
    !pip install google-cloud-secret-manager --quiet
    from google.cloud import secretmanager

    client = secretmanager.SecretManagerServiceClient()
    project_id = !gcloud config get-value project
    secret_name = f"projects/{project_id[0]}/secrets/HF_TOKEN/versions/latest"
    response = client.access_secret_version(request={"name": secret_name})
    HF_TOKEN = response.payload.data.decode("UTF-8")

if HF_TOKEN:
    login(HF_TOKEN)
    print("Successfully logged in to Hugging Face!")
else:
    print("Token is not set. Please set the token first.")

Successfully logged in to Hugging Face!


In [None]:
# Fetch the tokenized dataset dict anjan-k/Sentiment-Analysis-Tokenized from
# the Hugging Face Hub and print its splits
from datasets import load_dataset

sentiment_analysis_tokenized = load_dataset(
    path="anjan-k/Sentiment-Analysis-Tokenized"
)
print(
    "Tokenized sentiment analysis dataset dict:",
    sentiment_analysis_tokenized,
    sep="\n",
)

Tokenized sentiment analysis dataset dict:
DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 31232
    })
    validation: Dataset({
        features: ['id', 'text', 'label', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 5205
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 5206
    })
})


In [None]:
# Pull the train and validation splits out of the tokenized dataset dict
tokenized_train, tokenized_val = (
    sentiment_analysis_tokenized[split_name]
    for split_name in ("train", "validation")
)

#### **Training Prerequisites**

In [None]:
# Instantiate the tokenizer and the sequence-classification model
# (configured with 3 output labels) from the same pretrained checkpoint
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

In [None]:
from transformers import DataCollatorWithPadding

# Collator that turns a list of training samples into a batch of PyTorch
# tensors, built around the tokenizer loaded above
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Wrap the tokenized splits in DataLoaders for batched iteration
import torch
from torch.utils.data import DataLoader

# Number of samples per batch
batch_size = 16

# Expose only the model inputs and the label, as PyTorch tensors
for _split in (tokenized_train, tokenized_val):
    _split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Training batches are reshuffled every epoch; validation keeps dataset order
dataloader_train = DataLoader(
    tokenized_train,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
dataloader_val = DataLoader(
    tokenized_val,
    batch_size=batch_size,
    collate_fn=data_collator,
)

#### **Train Model**

In [None]:
# Fine-tune the model with Adam and cross-entropy loss.
# Each epoch runs one pass over dataloader_train, then evaluates on
# dataloader_val. The weights with the lowest validation loss are saved to
# 'best_model.pth', and training stops early after `patience` consecutive
# epochs without improvement.
import time

import torch
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from tqdm.auto import tqdm

# Timer to check how long this code cell runs
start_time = time.time()

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define optimizer and loss function.
# Optional regularization (disabled): pass weight_decay=<l2_lambda> to Adam for
# L2, or add l1_lambda * sum(p.abs().sum() for p in model.parameters()) to the
# loss before backward() for L1.
optimizer = Adam(model.parameters(), lr=2e-5)
criterion = CrossEntropyLoss()

# Set parameters
num_epochs = 25
best_val_loss = float('inf')
patience = 3         # Number of epochs to wait for improvement
epochs_no_improve = 0

# Show progress bar: one tick per training batch across all planned epochs
progress_bar = tqdm(total=num_epochs * len(dataloader_train))

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0      # Sum of per-sample losses over the epoch
    train_samples = 0
    for batch in dataloader_train:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so a
        # smaller final batch does not skew the epoch average
        batch_count = labels.size(0)
        train_loss += loss.item() * batch_count
        train_samples += batch_count
        progress_bar.update(1)

    # Validation
    model.eval()
    val_loss = 0.0        # Sum of per-sample losses over the validation set
    correct = 0
    total = 0
    with torch.no_grad():     # No gradients for validation
        for batch in dataloader_val:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)

            batch_count = labels.size(0)
            val_loss += loss.item() * batch_count

            # Predicted class is the index of the largest logit
            _, predicted = torch.max(logits, 1)
            correct += (predicted == labels).sum().item()
            total += batch_count

    # Per-sample averages over the whole epoch
    avg_train_loss = train_loss / train_samples
    avg_val_loss = val_loss / total
    val_accuracy = correct / total
    print(f'Epoch {epoch+1} Training loss: {avg_train_loss:.4f} '
          f'Validation loss: {avg_val_loss:.4f} '
          f'Validation accuracy: {val_accuracy:.4f}')

    # Early stopping and save best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"Best model saved at epoch {epoch+1} with val loss {best_val_loss:.4f}")
    else:
        epochs_no_improve += 1

    if epochs_no_improve >= patience:
        print("Early stopping triggered!")
        break

# Release the progress bar; after early stopping it never reaches its total
progress_bar.close()

# Calculate execution time
end_time = time.time()
duration = end_time - start_time
print(f"Execution time: {duration:.4f} seconds")

  0%|          | 0/48800 [00:00<?, ?it/s]

Epoch 1 Training loss: 0.6333 Validation loss: 0.5781 Validation accuracy: 0.7581
Best model saved at epoch 1 with val loss 0.5781
Epoch 2 Training loss: 0.4885 Validation loss: 0.5658 Validation accuracy: 0.7625
Best model saved at epoch 2 with val loss 0.5658
Epoch 3 Training loss: 0.3601 Validation loss: 0.6533 Validation accuracy: 0.7564
Epoch 4 Training loss: 0.2454 Validation loss: 0.7981 Validation accuracy: 0.7531
Epoch 5 Training loss: 0.1647 Validation loss: 0.9790 Validation accuracy: 0.7445
Early stopping triggered!
Execution time: 2691.9550 seconds


In [None]:
# Load the best model (the state dict saved to 'best_model.pth' during training).
# map_location='cpu' lets the checkpoint deserialize even when it was saved
# from GPU tensors and the current runtime has no GPU; load_state_dict then
# copies the values into the model's existing parameters.
model.load_state_dict(torch.load('best_model.pth', map_location='cpu'))

<All keys matched successfully>

In [None]:
# Publish the trained model, then the tokenizer, to the same Hugging Face Hub
# repository
repo_id = "anjan-k/Sentiment-Analysis-FineTune-Torch"
model.push_to_hub(repo_id=repo_id)
tokenizer.push_to_hub(repo_id=repo_id)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmpzbifyz53/model.safetensors    :   1%|1         | 3.26MB /  268MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/anjan-k/Sentiment-Analysis-FineTune-Torch/commit/4257ff6e0e5402ac03c416925ccca2b9308ace78', commit_message='Upload tokenizer', commit_description='', oid='4257ff6e0e5402ac03c416925ccca2b9308ace78', pr_url=None, repo_url=RepoUrl('https://huggingface.co/anjan-k/Sentiment-Analysis-FineTune-Torch', endpoint='https://huggingface.co', repo_type='model', repo_id='anjan-k/Sentiment-Analysis-FineTune-Torch'), pr_revision=None, pr_num=None)