## DistilBERT - Fine-Tuning

In [17]:
! pip install transformers datasets evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m546.7 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook: it ends up in the saved file
# and in version control. Read it from the HF_TOKEN environment variable
# instead; when the variable is unset, `token=None` makes `login` ask for the
# token interactively.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [31]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
import evaluate
import numpy as np

In [10]:
# Load the complaints CSV; all rows end up in the "train" split
dataset = load_dataset("csv", data_files="train.csv")

# Rename columns to the names the rest of the notebook works with
dataset = dataset.rename_column("Product", "label")
dataset = dataset.rename_column("Consumer_complaint", "text")

# Keep only usable rows:
#  - rows with a missing label or text are dropped (a missing label would make
#    the comma test raise TypeError, a missing text cannot be tokenized later);
#  - rows whose label contains a comma are dropped as before.
dataset = dataset.filter(
    lambda example: example['label'] is not None
    and example['text'] is not None
    and ',' not in example['label']
)

# Get the unique labels and create a mapping to numeric values
unique_labels = sorted(dataset['train'].unique('label'))  # Sort labels for consistent mapping
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
# Convert the string label of a row into its numeric id
def encode_labels(example):
    """Replace ``example['label']`` with its integer id looked up in ``label2id``.

    The row is modified in place and the same object is returned.
    Raises ``KeyError`` when the label is not a key of ``label2id``.
    """
    label_name = example['label']
    example['label'] = label2id[label_name]
    return example

# Apply the label -> id encoding to every row
dataset = dataset.map(encode_labels)

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
train_test_split = dataset["train"].train_test_split(seed=42, test_size=0.2)
dataset = train_test_split  # `dataset` now holds the "train" and "test" splits

# Sanity-check the processed data
print("Label mapping (label2id):", label2id)
print("ID mapping (id2label):", id2label)
print(dataset)
print("Training set size:", dataset['train'].num_rows)
print("Test set size:", dataset['test'].num_rows)

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/16000 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Label mapping (label2id): {'Bank account or service': 0, 'Checking or savings account': 1, 'Consumer Loan': 2, 'Credit card': 3, 'Credit card or prepaid card': 4, 'Credit reporting': 5, 'Credit reporting or other personal consumer reports': 6, 'Debt collection': 7, 'Mortgage': 8, 'Prepaid card': 9, 'Student loan': 10, 'Vehicle loan or lease': 11}
ID mapping (id2label): {0: 'Bank account or service', 1: 'Checking or savings account', 2: 'Consumer Loan', 3: 'Credit card', 4: 'Credit card or prepaid card', 5: 'Credit reporting', 6: 'Credit reporting or other personal consumer reports', 7: 'Debt collection', 8: 'Mortgage', 9: 'Prepaid card', 10: 'Student loan', 11: 'Vehicle loan or lease'}
DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 9600
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 2400
    })
})
Training set size: 9600
Test set size: 2400


In [12]:
# Tokenizer that matches the checkpoint being fine-tuned
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased"
)

def preprocess_function(examples):
    """Tokenize ``examples["text"]`` with the notebook-level ``tokenizer``.

    Truncation is enabled; no padding is requested here.
    Returns whatever the tokenizer call returns.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [25]:
# Tokenize every split; batched=True hands preprocess_function many rows per call
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/9600 [00:00<?, ? examples/s]

In [26]:
# Collator built from the tokenizer; it is handed to the Trainer for batching
data_collator = DataCollatorWithPadding(tokenizer)

In [19]:
# Accuracy metric used by compute_metrics
accuracy = evaluate.load(path="accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [21]:
def compute_metrics(eval_pred):
    """Score a ``(predictions, labels)`` pair with the notebook-level ``accuracy`` metric.

    The predictions are reduced with an argmax over axis 1 before being
    compared against the labels. Returns the result of ``accuracy.compute``.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(references=references, predictions=predicted_ids)

In [24]:
# Load the pretrained checkpoint with a classification head sized to our label set;
# the label maps are passed along so they are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(id2label),
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Settings for the fine-tuning run
training_args = TrainingArguments(
    output_dir="distilled_bert_classification",
    # Optimisation
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint on the same per-epoch schedule, and reload the
    # best checkpoint once training finishes
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Publishing / logging
    push_to_hub=True,
    report_to=[],  # empty list: no reporting integrations requested
)

# Wire model, data, tokenizer, collator and metric together
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,1.6568,1.095052,0.600417
2,0.9848,0.956998,0.66
3,0.7826,0.881576,0.6925
4,0.6365,0.866538,0.705417
5,0.476,0.856204,0.71


No files have been modified since last commit. Skipping to prevent empty commit.


TrainOutput(global_step=3000, training_loss=0.8445596466064453, metrics={'train_runtime': 6731.3736, 'train_samples_per_second': 7.131, 'train_steps_per_second': 0.446, 'total_flos': 6226394892958080.0, 'train_loss': 0.8445596466064453, 'epoch': 5.0})

In [None]:
# Push the trainer's model to the Hugging Face Hub (needs the login from the top of the notebook)
trainer.push_to_hub()

In [30]:
# Sample input passed to the classifier in the next cell as a quick smoke test
text = "this is related to Vehicle loan"

In [32]:
# Build an inference pipeline from the model published on the Hub and classify the sample text
classifier = pipeline(
    task="text-classification",
    model="abhinavkk/distilled_bert_classification",
)
classifier(text)

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[{'label': 'Vehicle loan or lease', 'score': 0.8307367563247681}]