##Downloading the needed dependencies

In [2]:
! pip install datasets transformers accelerate evaluate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7

## Defining the task dataset, model, and batch_size

In [3]:
task = "sst2"
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

## Loading the dataset and metric

In [4]:
from datasets import load_dataset
import evaluate


dataset = load_dataset("glue", task)
metric = evaluate.load('glue', task)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

## Example preview from the training dataset

In [6]:
dataset["train"][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

## Preprocessing the data

### Filter Data, dn't run this cell if you don't want to filter it.
Filtering is done based on sentiment strength, this code filters out data examples of sentiment strength less than 0.3.

In [7]:
from datasets import load_dataset, DatasetDict, Dataset
from textblob import TextBlob

def filter_neutral_sentences(dataset):
    filtered_splits = {}  # Store filtered data for each split
    for split in dataset.keys():  # Iterate through splits
        filtered_data = []
        for example in dataset[split]:
            text = example['sentence']
            sentiment = TextBlob(text).sentiment.polarity
            if abs(sentiment) > 0.3:
                filtered_data.append(example)

        # Create a Dataset for the current split
        filtered_splits[split] = Dataset.from_list(filtered_data) #Dataset class is now accessible
        filtered_splits[split] = filtered_splits[split].cast(dataset[split].features)

    # Create a DatasetDict from the filtered splits
    filtered_dataset = DatasetDict(filtered_splits)
    return filtered_dataset

# Filter the dataset while maintaining splits
filtered_dataset = filter_neutral_sentences(dataset)

Casting the dataset:   0%|          | 0/20521 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/300 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/555 [00:00<?, ? examples/s]

### Tokenize

In [8]:
from transformers import AutoTokenizer
import pandas as pd
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def preprocess_function(examples):
    return tokenizer(examples['sentence'], padding = 'max_length', truncation=True)

encoded_dataset = filtered_dataset.map(preprocess_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/20521 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/555 [00:00<?, ? examples/s]

## Fine-tuning the model

In [9]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

num_labels =  2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
metric_name = "accuracy"
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    #logging_steps = 500,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    report_to = 'none'
)



In [11]:
import numpy as np
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)

### Here, we can shard the dtaset into our desired number of shrads by modifying the num_shards parameter

In [12]:
validation_key = "validation"
train_dataset_sharded = dataset["train"].shard(index=0, num_shards=3)
eval_dataset_sharded = dataset[validation_key].shard(index=0, num_shards=3)

# Apply preprocessing to the sharded datasets
train_dataset_sharded = train_dataset_sharded.map(preprocess_function, batched=True)
eval_dataset_sharded = eval_dataset_sharded.map(preprocess_function, batched=True)

# Check if the sharded datasets have data
print(f"Train dataset sharded sample: {train_dataset_sharded[0]}")
print(f"Eval dataset sharded sample: {eval_dataset_sharded[0]}")

trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset_sharded,
    eval_dataset=eval_dataset_sharded,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Map:   0%|          | 0/22450 [00:00<?, ? examples/s]

Map:   0%|          | 0/291 [00:00<?, ? examples/s]

Train dataset sharded sample: {'sentence': 'hide new secretions from the parental units ', 'label': 0, 'idx': 0, 'input_ids': [101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

  trainer = Trainer(


In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## Hyperparameter search

In [14]:
! pip install optuna
! pip install ray[tune]

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [15]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

train_dataset = encoded_dataset["train"].shard(index=1, num_shards=10)

trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")

[I 2024-12-09 08:59:22,446] A new study created in memory with name: no-name-6a5155d0-75f9-43d6-9478-da31ad7b0fca
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2337,0.27429,0.885321


[I 2024-12-09 09:02:50,900] Trial 0 finished with value: 0.8853211009174312 and parameters: {'learning_rate': 3.872649465447299e-06, 'num_train_epochs': 1, 'seed': 31, 'per_device_train_batch_size': 16}. Best is trial 0 with value: 0.8853211009174312.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1586,0.283018,0.891055


[I 2024-12-09 09:05:21,936] Trial 1 finished with value: 0.8910550458715596 and parameters: {'learning_rate': 7.587344970176734e-05, 'num_train_epochs': 1, 'seed': 20, 'per_device_train_batch_size': 32}. Best is trial 1 with value: 0.8910550458715596.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1895,0.258636,0.901376
2,0.1344,0.31318,0.904817
3,0.1012,0.349481,0.902523
4,0.078,0.39344,0.90367
5,0.0691,0.425537,0.902523


[I 2024-12-09 09:17:25,695] Trial 2 finished with value: 0.9025229357798165 and parameters: {'learning_rate': 1.1908234839777863e-05, 'num_train_epochs': 5, 'seed': 32, 'per_device_train_batch_size': 32}. Best is trial 2 with value: 0.9025229357798165.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2422,0.367405,0.897936
2,0.127,0.442749,0.893349
3,0.0722,0.514033,0.892202


[I 2024-12-09 09:35:40,524] Trial 3 finished with value: 0.8922018348623854 and parameters: {'learning_rate': 3.628425515193177e-05, 'num_train_epochs': 3, 'seed': 15, 'per_device_train_batch_size': 8}. Best is trial 2 with value: 0.9025229357798165.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1793,0.268372,0.901376
2,0.1162,0.358341,0.891055
3,0.0861,0.366871,0.899083
4,0.0625,0.432443,0.896789


[I 2024-12-09 09:45:21,505] Trial 4 finished with value: 0.8967889908256881 and parameters: {'learning_rate': 1.953007796775984e-05, 'num_train_epochs': 4, 'seed': 8, 'per_device_train_batch_size': 32}. Best is trial 2 with value: 0.9025229357798165.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1913,0.291488,0.896789
2,0.1228,0.35011,0.900229
3,0.0961,0.38025,0.901376
4,0.056,0.425238,0.905963
5,0.0339,0.488593,0.909404


[I 2024-12-09 10:02:17,968] Trial 5 finished with value: 0.9094036697247706 and parameters: {'learning_rate': 3.0654538289975414e-05, 'num_train_epochs': 5, 'seed': 34, 'per_device_train_batch_size': 16}. Best is trial 5 with value: 0.9094036697247706.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2376,0.484413,0.896789
2,0.1913,0.567473,0.887615
3,0.1543,0.604569,0.899083
4,0.0938,0.663185,0.899083
5,0.0508,0.708671,0.899083


[I 2024-12-09 11:02:19,093] Trial 6 finished with value: 0.8990825688073395 and parameters: {'learning_rate': 7.9254658787976e-06, 'num_train_epochs': 5, 'seed': 25, 'per_device_train_batch_size': 4}. Best is trial 5 with value: 0.9094036697247706.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1751,0.269137,0.887615


[I 2024-12-09 11:04:41,933] Trial 7 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2362,0.432643,0.872706


[I 2024-12-09 11:10:45,396] Trial 8 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2731,0.288537,0.877294


[I 2024-12-09 11:13:09,758] Trial 9 pruned. 


In [29]:
best_run

BestRun(run_id='5', objective=0.9094036697247706, hyperparameters={'learning_rate': 3.0654538289975414e-05, 'num_train_epochs': 5, 'seed': 34, 'per_device_train_batch_size': 16}, run_summary=None)