# Finetune BERT for text classification

### Install dependencies

In [32]:
%pip install transformers torch datasets pandas scikit-learn accelerate -q

Note: you may need to restart the kernel to use updated packages.


### Label transcript dataset

In [33]:
from enum import StrEnum

class CompanyPolicy(StrEnum):
    FREE_RC_TRANSFER = "free_rc_transfer"
    FIVE_DAY_MONEY_BACK_GUARANTEE = "5_day_money_back_guarantee"
    FREE_RSA_FOR_ONE_YEAR = "free_rsa_for_one_year"
    RETURN_POLICY = "return_policy"

In [34]:
transcript_labels = {
    1: [CompanyPolicy.FIVE_DAY_MONEY_BACK_GUARANTEE,],
    2: [CompanyPolicy.FIVE_DAY_MONEY_BACK_GUARANTEE, CompanyPolicy.RETURN_POLICY],
    3: [CompanyPolicy.FREE_RC_TRANSFER, CompanyPolicy.RETURN_POLICY],
    4: [],
    6: [CompanyPolicy.FREE_RC_TRANSFER,],
    7: [CompanyPolicy.FREE_RC_TRANSFER, CompanyPolicy.RETURN_POLICY],
    8: [CompanyPolicy.FREE_RC_TRANSFER,],
    9: [CompanyPolicy.FREE_RC_TRANSFER,],
    10: [],
    11: [CompanyPolicy.FREE_RC_TRANSFER,],
}

### Create dataframe

In [35]:
import os
import pandas as pd

# Function to read transcript text files
def read_transcript(file_path):
    with open(file_path, 'r') as file:
        return file.read()

# Update DataFrame to include the text data
transcript_folder = 'transcripts'

data = {
    'Conversation_ID': [],
    'Text': [],
    'Labels': []
}

for conv_id, labels in transcript_labels.items():
    file_path = os.path.join(transcript_folder, f'conv{conv_id}.txt')
    text = read_transcript(file_path)
    data['Conversation_ID'].append(conv_id)
    data['Text'].append(text)
    data['Labels'].append(', '.join([label.value for label in labels]))

df = pd.DataFrame(data)

df

Unnamed: 0,Conversation_ID,Text,Labels
0,1,Salesperson: We have a few options in terms of...,5_day_money_back_guarantee
1,2,"Salesperson: Good afternoon, sir. How can I as...","5_day_money_back_guarantee, return_policy"
2,3,Salesperson: Welcome! Let me show you our sele...,"free_rc_transfer, return_policy"
3,4,"Salesperson: Good afternoon, sir! How can I as...",
4,6,"Salesperson: Good afternoon, sir. Can I offer ...",free_rc_transfer
5,7,"Salesperson: Good afternoon, sir. Here we have...","free_rc_transfer, return_policy"
6,8,Salesperson: The price is 10.72 lakhs for the ...,free_rc_transfer
7,9,"Salesperson: Good afternoon, sir. Have you boo...",free_rc_transfer
8,10,"Salesperson: Hello sir, how are you today? \nC...",
9,11,"Salesperson: Hello sir, welcome to YoCars Park...",free_rc_transfer


### Create dataset from dataframe

In [36]:
# Create a label mapping
all_labels = list(set(label for sublist in df['Labels'].str.split(', ') for label in sublist))
label_to_id = {label: i for i, label in enumerate(all_labels)}
id_to_label = {i: label for label, i in label_to_id.items()}

In [37]:
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Convert the DataFrame to a Dataset
def create_dataset(df):
    df['Labels'] = df['Labels'].str.split(', ').apply(lambda x: [label_to_id[label] for label in x])
    df = df.explode('Labels')
    dataset = Dataset.from_pandas(df)
    return dataset, label_to_id, id_to_label

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset, label_to_id, id_to_label = create_dataset(train_df)
val_dataset, _, _ = create_dataset(val_df)

train_dataset

Dataset({
    features: ['Conversation_ID', 'Text', 'Labels', '__index_level_0__'],
    num_rows: 10
})

### Load the model and tokenizer

In [38]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_to_id))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenize and prepare data

In [39]:
def tokenize_function(examples):
    # Tokenize the text
    return tokenizer(examples['Text'], padding="max_length", truncation=True)

# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Ensure that labels are formatted correctly
def format_labels(examples):
    # Ensure labels are correctly formatted as tensor of integers
    labels = examples['Labels']
    return {'labels': labels}

train_dataset = train_dataset.map(format_labels, batched=True)
val_dataset = val_dataset.map(format_labels, batched=True)

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


Map: 100%|██████████| 10/10 [00:00<00:00, 136.31 examples/s]
Map: 100%|██████████| 3/3 [00:00<00:00, 117.10 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 1563.46 examples/s]
Map: 100%|██████████| 3/3 [00:00<00:00, 515.48 examples/s]


### Finetune the model

In [40]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Save the model
model.save_pretrained('./fine-tuned-bert')
tokenizer.save_pretrained('./fine-tuned-bert')


Step,Training Loss


('./fine-tuned-bert/tokenizer_config.json',
 './fine-tuned-bert/special_tokens_map.json',
 './fine-tuned-bert/vocab.txt',
 './fine-tuned-bert/added_tokens.json')