DistillBERT

In [1]:
!pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:0

In [None]:
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import torch

# Load your dataset
dataset = load_dataset('csv', data_files='/content/data.csv')

# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Create a train-test split
train_test_split = dataset['train'].train_test_split(test_size=0.1)

# Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(examples['extracted_sentence'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = train_test_split.map(tokenize_function, batched=True)

# Convert string labels to numeric labels using vectorization
def convert_labels(examples):
    # Use numpy to convert labels
    examples['labels'] = np.where(np.array(examples['Label']) == 'positive', 1, 0)  # Vectorized operation
    return examples

# Apply the conversion to the tokenized datasets
tokenized_datasets = tokenized_datasets.map(convert_labels)

# Set the format for PyTorch
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the model with 2 labels (0 and 1)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Define a compute_metrics function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        'accuracy': (preds == p.label_ids).mean(),
    }

# Train the model using the Trainer API
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

# Output the shapes to verify
print(f"Input IDs shape: {tokenized_datasets['train']['input_ids'].shape}")
print(f"Labels shape: {tokenized_datasets['train']['labels'].shape}")


Map:   0%|          | 0/729 [00:00<?, ? examples/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Map:   0%|          | 0/729 [00:00<?, ? examples/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.461272,0.678555
2,No log,0.357351,0.608139
3,No log,0.366236,0.603109


Input IDs shape: torch.Size([729, 128])
Labels shape: torch.Size([729, 1])


In [None]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision = precision_score(p.label_ids, preds)
    recall = recall_score(p.label_ids, preds)
    f1 = f1_score(p.label_ids, preds)
    return {
        'accuracy': (preds == p.label_ids).mean(),
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Initialize the Trainer for evaluation
trainer = Trainer(
    model=model,
)

# Evaluate the model on the test dataset
results = trainer.evaluate(tokenized_datasets)

# Print evaluation results
print("Evaluation results:", results)

Evaluation results: {'eval_train_loss': 0.2157624214887619, 'eval_train_model_preparation_time': 0.0181, 'eval_train_runtime': 168.2061, 'eval_train_samples_per_second': 4.334, 'eval_train_steps_per_second': 0.547, 'eval_test_loss': 0.36623579263687134, 'eval_test_model_preparation_time': 0.0181, 'eval_test_runtime': 17.4859, 'eval_test_samples_per_second': 4.632, 'eval_test_steps_per_second': 0.629}


RobertaBERT

In [2]:
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import torch
from sklearn.metrics import precision_score, recall_score, f1_score

# Load your dataset
dataset = load_dataset('csv', data_files='/content/data.csv')

# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Create a train-test split
train_test_split = dataset['train'].train_test_split(test_size=0.1)



Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



In [3]:
# Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(examples['extracted_sentence'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = train_test_split.map(tokenize_function, batched=True)

# Convert string labels to numeric labels using vectorization
def convert_labels(examples):
    examples['labels'] = np.where(np.array(examples['Label']) == 'positive', 1, 0)
    return examples

# Apply the conversion to the tokenized datasets
tokenized_datasets = tokenized_datasets.map(convert_labels)

Map:   0%|          | 0/729 [00:00<?, ? examples/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Map:   0%|          | 0/729 [00:00<?, ? examples/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

In [4]:
# Set the format for PyTorch
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the model with 2 labels (0 and 1)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)




model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Define a compute_metrics function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision = precision_score(p.label_ids, preds)
    recall = recall_score(p.label_ids, preds)
    f1 = f1_score(p.label_ids, preds)
    accuracy = (preds == p.label_ids).mean()
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [6]:
# Train the model using the Trainer API
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)



In [7]:
# Start training
trainer.train()

# Evaluate the model on the test dataset
results = trainer.evaluate(tokenized_datasets['test'])

# Print evaluation results
print("Evaluation results:", results)

# Output the shapes to verify
print(f"Input IDs shape: {tokenized_datasets['train']['input_ids'].shape}")
print(f"Labels shape: {tokenized_datasets['train']['labels'].shape}")

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.395929,0.814815,0.814815,1.0,0.897959
2,No log,0.322306,0.667124,0.951613,0.893939,0.921875
3,No log,0.359457,0.68267,0.9375,0.909091,0.923077


Evaluation results: {'eval_loss': 0.3594568967819214, 'eval_accuracy': 0.6826703246456333, 'eval_precision': 0.9375, 'eval_recall': 0.9090909090909091, 'eval_f1': 0.9230769230769231, 'eval_runtime': 34.2382, 'eval_samples_per_second': 2.366, 'eval_steps_per_second': 0.321, 'epoch': 3.0}
Input IDs shape: torch.Size([729, 128])
Labels shape: torch.Size([729, 1])
