### Imports and Loading Data:

In [None]:
# !pip install hdbscan
# !pip install umap-learn
# !pip install mplcursors

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, BertTokenizer, BertModel
from hdbscan import HDBSCAN
from sklearn.cluster import DBSCAN
from sklearn import metrics
from tqdm.notebook import trange
import umap
import mplcursors

import warnings
warnings.filterwarnings("ignore")
import os

import itertools

In [87]:
from sklearn.metrics import normalized_mutual_info_score

In [None]:
# Read the train and test splits from CSV files in the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

### Create Bert Base Embeddings on Utterance Level:

In [6]:
# Load the encoder and its matching tokenizer from the 'bert-base-uncased' checkpoint
BERT_base_model = BertModel.from_pretrained('bert-base-uncased')
BERT_base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [7]:
# Function that gets the BERT base embeddings of each utterance:
def get_bert_base_embeddings(sentence):
    """Return the mean-pooled BERT base embedding of one utterance.

    The sentence is tokenized (truncated to at most 512 tokens), passed through
    ``BERT_base_model``, and ``last_hidden_state`` is averaged over dim 1 to
    give a single vector for the utterance.

    Parameters
    ----------
    sentence : str
        The utterance to embed.

    Returns
    -------
    numpy.ndarray
        The pooled embedding with singleton dimensions squeezed out.
    """
    # Local import: torch is not imported in the notebook's import cell.
    import torch

    inputs = BERT_base_tokenizer(sentence,
                                 return_tensors="pt",  # Tokenizer will return PyTorch tensors
                                 truncation=True,      # Truncate the input to fit the max sequence length
                                 max_length=512)       # The maximum sequence length allowed
    # Inference only: disabling autograd avoids building a computation graph
    # for every utterance, which saves memory and time across the whole set.
    with torch.no_grad():
        outputs = BERT_base_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [8]:
# Embed every utterance of the test set; each cell of the new column holds one embedding array
test['BERT_base_embeddings'] = test['text'].map(get_bert_base_embeddings)

In [9]:
# Sanity check: inspect the length of the first embedding vector
first_base_embedding = test['BERT_base_embeddings'].iloc[0]
print("Number of elements in each BERT base embedding:", len(first_base_embedding))

Number of elements in each BERT base embedding: 768


### Create Bert Banking Embeddings on Utterance Level:

In [10]:
# Load the encoder and its matching tokenizer from the 'philschmid/BERT-Banking77' checkpoint
BERT_bank_model = BertModel.from_pretrained('philschmid/BERT-Banking77')
BERT_bank_tokenizer = BertTokenizer.from_pretrained('philschmid/BERT-Banking77')

tokenizer_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.95k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [11]:
# Function that gets the BERT banking embeddings of each utterance:
def get_bert_bank_embeddings(sentence):
    """Return the mean-pooled BERT banking embedding of one utterance.

    The sentence is tokenized (truncated to at most 512 tokens), passed through
    ``BERT_bank_model``, and ``last_hidden_state`` is averaged over dim 1 to
    give a single vector for the utterance.

    Parameters
    ----------
    sentence : str
        The utterance to embed.

    Returns
    -------
    numpy.ndarray
        The pooled embedding with singleton dimensions squeezed out.
    """
    # Local import: torch is not imported in the notebook's import cell.
    import torch

    inputs = BERT_bank_tokenizer(sentence,
                                 return_tensors="pt",  # Tokenizer will return PyTorch tensors
                                 truncation=True,      # Truncate the input to fit the max sequence length
                                 max_length=512)       # The maximum sequence length allowed
    # Inference only: disabling autograd avoids building a computation graph
    # for every utterance, which saves memory and time across the whole set.
    with torch.no_grad():
        outputs = BERT_bank_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [12]:
# Embed every utterance of the test set; each cell of the new column holds one embedding array
test['BERT_bank_embeddings'] = test['text'].map(get_bert_bank_embeddings)

In [13]:
# Sanity check: inspect the length of the first embedding vector
first_bank_embedding = test['BERT_bank_embeddings'].iloc[0]
print("Number of elements in each BERT banking embedding:", len(first_bank_embedding))

Number of elements in each BERT banking embedding: 768


## Finding Best Hyperparameters for BERT Banking:

In [14]:
# Run a grid search for BERT Banking by checking the silhouette score
# Use the best parameters for both BERT Banking and BERT base

def perform_umap(embeddings, n_neighbors, n_components, metric):
    """
    Project the embeddings into a lower-dimensional space with UMAP.

    The reducer is built from the given n_neighbors, n_components and metric
    with random_state fixed to 42, and the fitted projection of `embeddings`
    is returned.
    """
    umap_settings = dict(n_neighbors=n_neighbors, n_components=n_components,
                         metric=metric, random_state=42)
    return umap.UMAP(**umap_settings).fit_transform(embeddings)

def perform_hdbscan(umap_embeddings, min_cluster_size, min_samples, gen_min_span_tree, cluster_selection_method):
    """
    Cluster the dimension-reduced embeddings with HDBSCAN.

    Returns whatever `HDBSCAN.fit` returns for `umap_embeddings`; callers in
    this notebook read `.labels_` from that result.
    """
    hdbscan_settings = dict(min_cluster_size=min_cluster_size,
                            min_samples=min_samples,
                            gen_min_span_tree=gen_min_span_tree,
                            cluster_selection_method=cluster_selection_method)
    return HDBSCAN(**hdbscan_settings).fit(umap_embeddings)

def grid_search(df, umap_params, hdbscan_params, embedding_type = "BERT_bank_embeddings"):
    """
    Perform a grid search over the specified ranges of UMAP and HDBSCAN parameters.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose `embedding_type` column holds one embedding array per row.
    umap_params : dict
        Maps each `perform_umap` argument name to the values to try. The key
        order must match `perform_umap`'s positional arguments, because the
        combinations are passed positionally.
    hdbscan_params : dict
        Maps each `perform_hdbscan` argument name to the values to try, in
        `perform_hdbscan`'s positional order.
    embedding_type : str
        Name of the column holding the embeddings.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination, with the parameter values and a
        "silhouette_score" column. The score is -1 when fewer than two
        clusters (ignoring noise) were found.
    """
    results = []
    embeddings = np.stack(df[embedding_type].values)

    for umap_param_set in itertools.product(*umap_params.values()):
        # The UMAP projection depends only on the UMAP parameters, so compute
        # it once and reuse it for every HDBSCAN combination.
        umap_embeddings = perform_umap(embeddings, *umap_param_set)
        umap_settings = dict(zip(umap_params.keys(), umap_param_set))

        for hdbscan_param_set in itertools.product(*hdbscan_params.values()):
            clusterer = perform_hdbscan(umap_embeddings, *hdbscan_param_set)
            labels = np.asarray(clusterer.labels_)

            # HDBSCAN labels noise points -1. They are not a cluster, so they
            # are left out of both the cluster count and the silhouette score;
            # scoring them as a cluster would distort the comparison.
            clustered = labels != -1
            n_clusters = len(np.unique(labels[clustered]))

            # Calculate silhouette score only if more than one cluster is found
            if n_clusters > 1:
                score = silhouette_score(umap_embeddings[clustered], labels[clustered])
            else:
                score = -1

            results.append({**umap_settings,
                            **dict(zip(hdbscan_params.keys(), hdbscan_param_set)),
                            "silhouette_score": score})

    return pd.DataFrame(results)

In [16]:
# # Define your UMAP and HDBSCAN parameter grids
# umap_params = {
#     "n_neighbors": range(15, 51, 10),  # Exploring from 15 to 50 in steps of 10
#     "n_components": range(2, 21, 6),  # Exploring from 2 to 20 in steps of 6
#     "metric": ["cosine"]
# }

# hdbscan_params = {
#     "min_cluster_size": range(15, 61, 15),  # Exploring from 15 to 60 in steps of 15
#     "min_samples": [None] + list(range(1, 31, 10)),  # None, then exploring from 1 to 30 in steps of 10
#     "gen_min_span_tree": [True, False],
#     "cluster_selection_method": ["eom", "leaf"]
# }

# # Save results into a dataframe
# results_df = grid_search(test, umap_params, hdbscan_params)

### Using BERT Banking Best parameters on BERT Base embeddings:

In [26]:
# Stack the per-utterance vectors into one 2-D array, then project it with the shared UMAP helper
bert_base_embeddings = np.vstack(test['BERT_base_embeddings'].to_numpy())
umap_bert_base = perform_umap(bert_base_embeddings, n_neighbors=35, n_components=2, metric='cosine')

In [27]:
# Fit HDBSCAN on the UMAP projection through the shared helper and store each utterance's cluster id
hdbscan_bert_base = perform_hdbscan(umap_bert_base, min_cluster_size=15, min_samples=21,
                                    gen_min_span_tree=True, cluster_selection_method='eom')

test["topic_BERT_base"] = hdbscan_bert_base.labels_
np.unique(hdbscan_bert_base.labels_)

array([0, 1])

In [29]:
# HDBSCAN marks outliers with the label -1; count them and the real clusters
# from the data rather than hard-coding the totals in the message.
num_outliers = (test["topic_BERT_base"] == -1).sum()
num_clusters = test.loc[test["topic_BERT_base"] != -1, "topic_BERT_base"].nunique()
print("We found that out of", len(test), "utterance there were", num_outliers,
      "outliers. However, there were only", num_clusters, "cluster")

We found that out of 3080 utterance there were 0 outliers. However, there were only 2 cluster


### Using BERT Banking Best parameters on BERT Banking embeddings:

In [22]:
# Stack the per-utterance vectors into one 2-D array, then project it with the shared UMAP helper
bert_bank_embeddings = np.vstack(test['BERT_bank_embeddings'].to_numpy())
umap_bert_bank = perform_umap(bert_bank_embeddings, n_neighbors=35, n_components=2, metric='cosine')

In [23]:
# Fit HDBSCAN on the UMAP projection through the shared helper and store each utterance's cluster id
hdbscan_bert_bank = perform_hdbscan(umap_bert_bank, min_cluster_size=15, min_samples=21,
                                    gen_min_span_tree=True, cluster_selection_method='eom')

test["topic_BERT_bank"] = hdbscan_bert_bank.labels_
np.unique(hdbscan_bert_bank.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
       67, 68, 69, 70, 71, 72, 73, 74, 75, 76])

In [30]:
# HDBSCAN marks outliers with the label -1; report them against the actual
# number of utterances instead of a hard-coded total.
num_outliers = (test["topic_BERT_bank"] == -1).sum()
print("We found that out of", len(test), "utterance there were", num_outliers, "outliers.")

We found that out of 3080 utterance there were 11 outliers.


## Creating Dataframe of BERT Banking (with its best parameters):

In [33]:
# Work on an independent copy so the edits below leave `test` untouched
BERT_bank_df = test.copy(deep=True)

In [35]:
# Renaming certain columns:
# Give the columns analysis-friendly names, then discard the embedding columns and the BERT base topics
BERT_bank_df = (
    BERT_bank_df
    .rename(columns={'text': 'utterance', 'category': 'true label', 'topic_BERT_bank': 'cluster number'})
    .drop(columns=['BERT_base_embeddings', 'BERT_bank_embeddings', 'topic_BERT_base'])
)

In [37]:
# Preview the first five rows
BERT_bank_df.head(5)

Unnamed: 0,utterance,true label,cluster number
0,How do I locate my card?,card_arrival,54
1,"I still have not received my new card, I order...",card_arrival,10
2,I ordered a card but it has not arrived. Help ...,card_arrival,10
3,Is there a way to know when my card will arrive?,card_arrival,11
4,My card has not arrived yet.,card_arrival,10


#### For each cluster, output the unique categories in that cluster, and their respective count:

In [38]:
# For every HDBSCAN label, record the true categories that occur more than
# 5 times in that cluster together with their counts.
cluster_info = {}

for i in np.unique(hdbscan_bert_bank.labels_):
    in_cluster = BERT_bank_df["cluster number"] == i
    label_counts = BERT_bank_df.loc[in_cluster, "true label"].value_counts()
    cluster_info[i] = {category: n for category, n in label_counts.items() if n > 5}

cluster_info

{-1: {},
 0: {'lost_or_stolen_phone': 39},
 1: {'disposable_card_limits': 34},
 2: {'extra_charge_on_statement': 38},
 3: {'card_swallowed': 36},
 4: {'verify_top_up': 40},
 5: {'fiat_currency_support': 35},
 6: {'exchange_via_app': 37},
 7: {'age_limit': 40},
 8: {'cancel_transfer': 38},
 9: {'request_refund': 39},
 10: {'card_arrival': 33},
 11: {'card_delivery_estimate': 35},
 12: {'pending_card_payment': 38},
 13: {'pin_blocked': 33},
 14: {'get_physical_card': 39},
 15: {'change_pin': 40},
 16: {'receiving_money': 37},
 17: {'passcode_forgotten': 40},
 18: {'pending_cash_withdrawal': 39},
 19: {'pending_top_up': 38},
 20: {'terminate_account': 40},
 21: {'exchange_charge': 37},
 22: {'exchange_rate': 39},
 23: {'card_not_working': 38},
 24: {'top_up_limits': 39},
 25: {'direct_debit_payment_not_recognised': 33},
 26: {'card_payment_wrong_exchange_rate': 38},
 27: {'wrong_exchange_rate_for_cash_withdrawal': 33},
 28: {'wrong_amount_of_cash_received': 36},
 29: {'declined_transfer':

#### Adding necessary columns to dataframe:

In [42]:
# Name each cluster after its most frequent category. Clusters with no
# category recorded in cluster_info (e.g. the -1 outlier label) get None.
# The maximum is taken explicitly instead of relying on dict insertion order.
dominant_category = {
    cluster: (max(counts, key=counts.get) if counts else None)
    for cluster, counts in cluster_info.items()
}
BERT_bank_df['cluster label'] = BERT_bank_df['cluster number'].map(dominant_category)

reorder_col = ["utterance","true label","cluster label", "cluster number"]
# .copy() makes the reordered frame independent, so later cells can add
# columns to it without chained-assignment (SettingWithCopy) problems.
BERT_bank_df = BERT_bank_df[reorder_col].copy()
sorted_BERT_bank = BERT_bank_df.sort_values(by='cluster number')

#### Calculating Normalized Mutual Info Score on cluster labels:

In [93]:
# Keep only rows that have both a true label and a cluster label
# (rows without a cluster label, such as outliers, are dropped)
no_outliers = BERT_bank_df.dropna(subset=['true label', 'cluster label'])

# Compare the true labels against the cluster labels with normalized mutual information
labels_true, labels_pred = no_outliers['true label'], no_outliers['cluster label']
nmi_score = normalized_mutual_info_score(labels_true, labels_pred)

print("BERT Bank Normalized Mutual Information Score on labels:", nmi_score)

BERT Bank Normalized Mutual Information Score on labels: 0.9347217431865582


#### Manually creating groupings on 77 categories:

In [53]:
# Hand-made mapping from a coarse group name to the list of category labels
# assigned to that group. The label strings are compared against the
# 'true label' / 'cluster label' columns, so their exact spelling matters
# (including 'Refund_not_showing_up' and 'reverted_card_payment?').
manual_groupings_dict = {

    "top_up": ['top_up_by_bank_transfer_charge','top_up_by_card_charge',
          'top_up_by_cash_or_cheque','top_up_failed','top_up_limits',
          'top_up_reverted', 'topping_up_by_card', 'pending_top_up',
          'verify_top_up', 'automatic_top_up'],

    "transfers": ['cancel_transfer', 'balance_not_updated_after_bank_transfer',
             'declined_transfer', 'failed_transfer','pending_transfer',
             'transfer_fee_charged','transfer_into_account', 'transfer_not_received_by_recipient',
             'transfer_timing', 'beneficiary_not_allowed'],

    "exchange_rate_currency": ['card_payment_wrong_exchange_rate',   'exchange_charge',   'exchange_rate',
                          'exchange_via_app',   'wrong_exchange_rate_for_cash_withdrawal',   'country_support',
                           'fiat_currency_support',   'supported_cards_and_currencies',   'card_acceptance'],

    "account_management": ['terminate_account',   'edit_personal_details',   'pin_blocked',
                      'change_pin',   'passcode_forgotten',   'age_limit'],

    "identity_verification": ['verify_my_identity',   'why_verify_identity',   'unable_to_verify_identity'],

    "virtual_disposable_cards": ['virtual_card_not_working',   'get_disposable_virtual_card',
                                 'getting_virtual_card',   'disposable_card_limits',   'getting_spare_card'],

    "atm_cash_withdrawal": ['cash_withdrawal_charge','cash_withdrawal_not_recognised', 'declined_cash_withdrawal',
                            'pending_cash_withdrawal', 'atm_support', 'wrong_amount_of_cash_received', 'card_swallowed'],

    "card_malfunction": ['contactless_not_working',  'declined_card_payment',  'card_not_working'],

    "refunds": ['Refund_not_showing_up',   'request_refund'],

    "card_payment": ['card_payment_fee_charged',   'card_payment_not_recognised', 'direct_debit_payment_not_recognised',
                     'reverted_card_payment?',   'pending_card_payment'],

    "new_card": ['card_about_to_expire', 'get_physical_card', 'order_physical_card', 'card_arrival', 'card_delivery_estimate',
                 'card_linking', 'activate_my_card', 'compromised_card', 'lost_or_stolen_card'],

    # Catch-all group for categories that were not placed in any group above.
    "unknown": ['balance_not_updated_after_cheque_or_cash_deposit', 'extra_charge_on_statement', 'lost_or_stolen_phone', 'receiving_money',
                'transaction_charged_twice',   'verify_source_of_funds',   'visa_or_mastercard',   'apple_pay_or_google_pay',]

}

In [58]:
# Flip the group -> [categories] mapping into category -> group for direct lookups
manual_groupings_dict_inverted = {
    topic: group
    for group, topics in manual_groupings_dict.items()
    for topic in topics
}

manual_groupings_dict_inverted

{'top_up_by_bank_transfer_charge': 'top_up',
 'top_up_by_card_charge': 'top_up',
 'top_up_by_cash_or_cheque': 'top_up',
 'top_up_failed': 'top_up',
 'top_up_limits': 'top_up',
 'top_up_reverted': 'top_up',
 'topping_up_by_card': 'top_up',
 'pending_top_up': 'top_up',
 'verify_top_up': 'top_up',
 'automatic_top_up': 'top_up',
 'cancel_transfer': 'transfers',
 'balance_not_updated_after_bank_transfer': 'transfers',
 'declined_transfer': 'transfers',
 'failed_transfer': 'transfers',
 'pending_transfer': 'transfers',
 'transfer_fee_charged': 'transfers',
 'transfer_into_account': 'transfers',
 'transfer_not_received_by_recipient': 'transfers',
 'transfer_timing': 'transfers',
 'beneficiary_not_allowed': 'transfers',
 'card_payment_wrong_exchange_rate': 'exchange_rate_currency',
 'exchange_charge': 'exchange_rate_currency',
 'exchange_rate': 'exchange_rate_currency',
 'exchange_via_app': 'exchange_rate_currency',
 'wrong_exchange_rate_for_cash_withdrawal': 'exchange_rate_currency',
 'countr

#### Adding my manual groups to dataframe:

In [61]:
# 1 where the cluster label equals the utterance's true label, 0 otherwise
BERT_bank_df['correct_clust_label'] = BERT_bank_df['true label'].eq(BERT_bank_df['cluster label']).astype(int)

In [68]:
# Function to map labels to their groupings
def get_label_group(label):
    """Return the manual group of `label`, or 'unknown' if the label is not mapped.

    NOTE(review): 'unknown' is also the name of one group in
    manual_groupings_dict, so an unmapped input (for example a missing cluster
    label) cannot be told apart from a label that belongs to that group.
    """
    if label in manual_groupings_dict_inverted:
        return manual_groupings_dict_inverted[label]
    return 'unknown'

# Map the true label and the cluster label to their manual groups, then flag
# whether the two groups agree (1) or not (0)
for source_col, group_col in (('true label', 'true_label_group'),
                              ('cluster label', 'cluster_label_group')):
    BERT_bank_df[group_col] = BERT_bank_df[source_col].apply(get_label_group)

BERT_bank_df['correct_cluster_group'] = (
    BERT_bank_df['true_label_group'].eq(BERT_bank_df['cluster_label_group']).astype(int)
)
BERT_bank_df


Unnamed: 0,utterance,true label,cluster label,cluster number,correct_clust_label,true_label_group,cluster_label_group,correct_cluster_group
0,How do I locate my card?,card_arrival,lost_or_stolen_card,54,0,new_card,new_card,1
1,"I still have not received my new card, I order...",card_arrival,card_arrival,10,1,new_card,new_card,1
2,I ordered a card but it has not arrived. Help ...,card_arrival,card_arrival,10,1,new_card,new_card,1
3,Is there a way to know when my card will arrive?,card_arrival,card_delivery_estimate,11,0,new_card,new_card,1
4,My card has not arrived yet.,card_arrival,card_arrival,10,1,new_card,new_card,1
...,...,...,...,...,...,...,...,...
3075,"If i'm not in the UK, can I still get a card?",country_support,country_support,62,1,exchange_rate_currency,exchange_rate_currency,1
3076,How many countries do you support?,country_support,country_support,62,1,exchange_rate_currency,exchange_rate_currency,1
3077,What countries do you do business in?,country_support,country_support,62,1,exchange_rate_currency,exchange_rate_currency,1
3078,What are the countries you operate in.,country_support,country_support,62,1,exchange_rate_currency,exchange_rate_currency,1


In [104]:
# (rows whose group does not match, rows whose exact label does not match)
tuple(BERT_bank_df[flag_col].eq(0).sum() for flag_col in ("correct_cluster_group", "correct_clust_label"))

(94, 223)

In [101]:
# Keep only rows that have both a true label and a cluster label
# (rows without a cluster label, such as outliers, are dropped)
no_outliers2 = BERT_bank_df.dropna(subset=['true label', 'cluster label'])

# Compare the true groups against the cluster groups with normalized mutual information
groups_true2, groups_pred2 = no_outliers2['true_label_group'], no_outliers2['cluster_label_group']
nmi_score2 = normalized_mutual_info_score(groups_true2, groups_pred2)

print("BERT Bank Normalized Mutual Information Score on groups:", nmi_score2)

BERT Bank Normalized Mutual Information Score on groups: 0.9347363410012195


#### Create a separate dataframe of all the misclassifications that occur outside of the cluster groupings:

In [85]:
# Rows whose cluster group disagrees with the true group. The bookkeeping
# columns are dropped with a non-inplace call so the filtered slice is never
# mutated in place, and the columns are passed as a list rather than a set.
incorrect_groups = (
    BERT_bank_df[BERT_bank_df["correct_cluster_group"] == 0]
    .drop(columns=["cluster number", "correct_clust_label", "correct_cluster_group"])
)

sorted_incorrect_groups = incorrect_groups.sort_values(by='cluster label')
sorted_incorrect_groups

Unnamed: 0,utterance,true label,cluster label,true_label_group,cluster_label_group
2091,I'm not sure why my account has been refunded ...,reverted_card_payment?,Refund_not_showing_up,card_payment,refunds
2100,I was contacted by a seller with a message tha...,reverted_card_payment?,Refund_not_showing_up,card_payment,refunds
2468,how come i can't find anywhere to load using cash,top_up_by_cash_or_cheque,atm_support,top_up,atm_cash_withdrawal
2469,What can I pay with? Does cash work?,top_up_by_cash_or_cheque,atm_support,top_up,atm_cash_withdrawal
1055,The balance on my account wasn't updated after...,balance_not_updated_after_cheque_or_cash_deposit,balance_not_updated_after_bank_transfer,unknown,transfers
...,...,...,...,...,...
2446,Need to deposit a cheque into my account,top_up_by_cash_or_cheque,,top_up,unknown
2531,Can I make multiple online transactions with m...,virtual_card_not_working,,virtual_disposable_cards,unknown
2617,how do i get a virtual card for one time use,get_disposable_virtual_card,,virtual_disposable_cards,unknown
2642,I have a top up that didn't go through. Why?,top_up_failed,,top_up,unknown


In [86]:
# Saving this dataframe to csv file:
# sorted_incorrect_groups.to_csv('incorrect_groups.csv')

#### Creating a new dataframe with predicted label using pretrained Banking77:

In [39]:
# Build a text-classification pipeline from the Banking77 checkpoint.
# NOTE(review): return_all_scores may be deprecated in newer transformers
# releases; the cell that reads output[0] depends on the nesting of the result,
# so verify that before changing this argument.
model_id = 'philschmid/BERT-Banking77'
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, return_all_scores=True)

In [40]:
# Run the classifier on each utterance, one call per utterance, keeping the raw results in order
outputs = [classifier(text) for text in BERT_bank_df["utterance"].tolist()]

In [41]:
# For each utterance, keep the label with the highest score and that score
predicted_output = []
predicted_output_score = []

for output in outputs:
    # output[0] is the list of {'label', 'score'} dicts for this utterance
    best = max(output[0], key=lambda d: d['score'])
    predicted_output.append(best['label'])
    predicted_output_score.append(best['score'])

In [None]:
# Attach the classifier's predictions to a copy of the clustering frame.
BERT77_predicted_df = BERT_bank_df.copy()
BERT77_predicted_df['predicted label'] = predicted_output
BERT77_predicted_df['predicted label score'] = predicted_output_score
# The comparison must read from BERT77_predicted_df: 'predicted label' exists
# only on this copy, so indexing BERT_bank_df for it raises a KeyError.
BERT77_predicted_df['true=predicted'] = (
    BERT77_predicted_df['true label'] == BERT77_predicted_df['predicted label']
).astype(int)