In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:0

**Downloading the dataset from https://huggingface.co/datasets/bitext/Bitext-retail-banking-llm-chatbot-training-dataset**

In [2]:
from datasets import load_dataset

# Load the Bitext Retail Banking dataset
dataset = load_dataset('bitext/Bitext-retail-banking-llm-chatbot-training-dataset')

# Check the first few rows to understand the structure
dataset


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/11.7k [00:00<?, ?B/s]

(…)ing-llm-chatbot-training-dataset.parquet:   0%|          | 0.00/7.87M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25545 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tags', 'instruction', 'category', 'intent', 'response'],
        num_rows: 25545
    })
})

***Pre-processing:***

Step 1 : Filter out the intents I want to keep for my dataset



In [3]:
import pandas as pd

# List of intents to keep
intents_to_keep = [
    'close_account', 'create_account', 'customer_service', 'human_agent',
    'check_fees', 'find_ATM', 'find_branch', 'apply_for_loan', 'cancel_loan',
    'check_loan_payments', 'get_password', 'set_up_password',
    'cancel_transfer', 'make_transfer'
]

# Filter the dataset to keep only the specified intents
df = dataset['train'].to_pandas()
df = df[df['intent'].isin(intents_to_keep)]

# Display the filtered dataframe
print(df.head())


       tags                                        instruction category  \
1997  BIPQZ                      how could i apply for an loan     LOAN   
1998  BCIPQ        i got to apply for a loan how could i do it     LOAN   
1999    BQZ                               im lookingfor a loan     LOAN   
2000    BCL                 I want a loan, help me take out it     LOAN   
2001    BCP  I would like to apply for a loan, I need assis...     LOAN   

              intent                                           response  
1997  apply_for_loan  I'm here to guide you through the process of a...  
1998  apply_for_loan  I can guide you through the process of applyin...  
1999  apply_for_loan  Yes, of course! I'm here to assist you in find...  
2000  apply_for_loan  I'm happy to help! I'm here to assist you with...  
2001  apply_for_loan  I'm here to guide you through the loan applica...  


***Pre-processing:***

Step 2 : Check for class imbalanced dataset

Result: There is slight class imbalance for get_password class. I will need to use resampling method to only sample 700 from each intent.


In [4]:

# Count instances per intent to check for imbalance
class_counts = df['intent'].value_counts()
print("Class Distribution:\n", class_counts)


Class Distribution:
 intent
find_branch            1000
close_account           999
human_agent             999
customer_service        998
find_ATM                998
make_transfer           997
create_account          996
cancel_transfer         995
apply_for_loan          992
cancel_loan             990
check_loan_payments     989
check_fees              947
set_up_password         901
get_password            799
Name: count, dtype: int64


**Resolve class imbalance through undersampling of majority classes**

In [5]:
from sklearn.utils import resample

# List to store downsampled data for each intent
downsampled_list = []

# Get the unique intents
intents = df['intent'].unique()

# Loop through each intent and downsample to 799 samples
for intent in intents:
    intent_df = df[df['intent'] == intent]  # Subset the data for the current intent

    if len(intent_df) > 799:
        # Downsample the majority class
        downsampled_intent = resample(intent_df,
                                      replace=False,   # sample without replacement
                                      n_samples=799,   # downsample to match the minority class
                                      random_state=42) # for reproducibility
    else:
        # If already less than or equal to 799, keep it as is
        downsampled_intent = intent_df

    downsampled_list.append(downsampled_intent)  # Add the downsampled data to the list

# Concatenate all the downsampled dataframes
df_downsampled = pd.concat(downsampled_list)

# Shuffle the dataframe
df = df_downsampled.sample(frac=1).reset_index(drop=True)

# Display the downsampled dataframe
print(df['intent'].value_counts())


intent
cancel_transfer        799
close_account          799
customer_service       799
check_fees             799
check_loan_payments    799
get_password           799
find_branch            799
cancel_loan            799
apply_for_loan         799
create_account         799
make_transfer          799
set_up_password        799
find_ATM               799
human_agent            799
Name: count, dtype: int64


Since this dataset only has train dataset available to download, I will need to split the downloaded train dataset into my own original train and test split for model training

In [6]:
from sklearn.model_selection import train_test_split

# Assuming your DataFrame has 'text' and 'intent' columns (replace with actual names)
X = df['instruction']  # Replace 'text' with the actual column name for instructions
y = df['intent']  # Replace 'intent' with the actual column name for target labels

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Display the first few rows of X_train and y_train
y_test


Unnamed: 0,intent
2811,cancel_transfer
6023,find_branch
6548,find_ATM
2635,set_up_password
4309,find_ATM
...,...
1667,human_agent
1834,cancel_loan
6331,human_agent
858,apply_for_loan


**Creating the DistilBERT model**

Overview of steps
1. Initialise Tokenizeer
2.

In [7]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch

# Initialize the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(set(y_train)))

# Encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Tokenize the input texts (X_train and X_test)
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)

# Convert the data into PyTorch tensors
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, y_train_encoded)
test_dataset = Dataset(test_encodings, y_test_encoded)

# Define the compute_metrics function to calculate accuracy, precision, recall, and f1 score
def compute_metrics(p):
    predictions, labels = p
    preds = predictions.argmax(-1)

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')  # Using weighted average for multiclass
    recall = recall_score(labels, preds, average='weighted')        # Weighted average for multiclass
    f1 = f1_score(labels, preds, average='weighted')                # Weighted average for multiclass

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",     # Evaluate after every epoch
    save_total_limit=1,
)

# Trainer to handle training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,            # Add the test dataset for evaluation
    compute_metrics=compute_metrics       # Include the compute_metrics function for accuracy, precision, recall, and f1
)

# Train the model
trainer.train()

# Evaluate the model on the test set
results = trainer.evaluate()

# Print the evaluation metrics
print(f"DistilBERT Accuracy: {results['eval_accuracy']:.2%}")
print(f"Precision: {results['eval_precision']:.2%}")
print(f"Recall: {results['eval_recall']:.2%}")
print(f"F1 Score: {results['eval_f1']:.2%}")


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2674,0.006462,0.99866,0.998673,0.99866,0.998659
2,0.0048,0.007933,0.998213,0.998224,0.998213,0.998213
3,0.002,0.00789,0.99866,0.998673,0.99866,0.998659


DistilBERT Accuracy: 99.87%
Precision: 99.87%
Recall: 99.87%
F1 Score: 99.87%


**BiLSTM with Word2Vec**

In [8]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from sklearn.metrics import classification_report

# Step 1: Tokenizing the input texts (X_train and X_test)
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=128, padding='post')

X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=128, padding='post')

# Step 2: Train a Word2Vec model on the X_train data
sentences = [text.split() for text in X_train]  # Tokenize each sentence into words
word2vec = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Get the word vectors from the trained Word2Vec model
word_vectors = word2vec.wv

# Step 3: Create an embedding matrix using the trained Word2Vec model
embedding_dim = 100
word_index = tokenizer.word_index
embedding_matrix = np.zeros((10000, embedding_dim))
for word, i in word_index.items():
    if i < 10000:
        if word in word_vectors:
            embedding_matrix[i] = word_vectors[word]
        else:
            embedding_matrix[i] = np.random.normal(size=(embedding_dim,))  # If not found, use random vector

# Step 4: Encoding the labels (y_train and y_test)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Step 5: Build the BiLSTM model with the trained Word2Vec embeddings
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000,
                              output_dim=100,
                              input_length=128,
                              weights=[embedding_matrix],
                              trainable=False),  # Use the trained Word2Vec embeddings, don't train them
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(len(set(y_train_encoded)), activation='softmax')
])

# Step 6: Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Step 7: Train the BiLSTM model
model.fit(X_train_padded, y_train_encoded, epochs=3, batch_size=16)

# Step 8: Evaluate the BiLSTM model
test_loss, test_acc = model.evaluate(X_test_padded, y_test_encoded)
print(f"BiLSTM Accuracy: {test_acc}")

# Step 9: Predictions and Metrics Calculation
y_pred = model.predict(X_test_padded)
y_pred_classes = np.argmax(y_pred, axis=1)

# Step 10: Generate classification report
report = classification_report(y_test_encoded, y_pred_classes, target_names=label_encoder.classes_)
print(report)




Epoch 1/3
[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.6196 - loss: 1.1297
Epoch 2/3
[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.9276 - loss: 0.1953
Epoch 3/3
[1m560/560[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.9540 - loss: 0.1246
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9619 - loss: 0.0986
BiLSTM Accuracy: 0.9606791734695435
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
                     precision    recall  f1-score   support

     apply_for_loan       1.00      0.96      0.98       160
        cancel_loan       0.96      0.99      0.98       160
    cancel_transfer       0.96      1.00      0.98       160
         check_fees       0.99      1.00      1.00       160
check_loan_payments       1.00      0.99      1.00       160
      close_account       0.88      0.86      0.87     