### Imports and Loading Data:

In [None]:
# !pip install hdbscan
# !pip install umap-learn
# !pip install mplcursors

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score, normalized_mutual_info_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, BertTokenizer, BertModel
from hdbscan import HDBSCAN
from sklearn.cluster import DBSCAN
from sklearn import metrics
from tqdm.notebook import trange
import umap
import mplcursors

import warnings
warnings.filterwarnings("ignore")
import os

import itertools

In [5]:
# Read the train and test splits from the working directory (train first, then test).
train, test = map(pd.read_csv, ("train.csv", "test.csv"))

## Creating BERT Embeddings and Hyperparameter Tuning:

### Create Bert Banking Embeddings on Utterance Level:

In [6]:
# Load the tokenizer and the BertModel from the same Banking77 checkpoint
# (tokenizer first, then model).
BERT_bank_tokenizer, BERT_bank_model = (
    loader.from_pretrained('philschmid/BERT-Banking77')
    for loader in (BertTokenizer, BertModel)
)

tokenizer_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.95k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [7]:
# Function that gets the BERT banking embeddings of each utterance:
def get_bert_bank_embeddings(sentence):
    """
    Embed one utterance with the BERT Banking77 model.

    The sentence is tokenized (truncated to at most 512 tokens), passed through
    ``BERT_bank_model``, and the last hidden state is averaged over the token
    dimension (``dim=1``) and squeezed.

    Parameters
    ----------
    sentence : str
        The utterance to embed.

    Returns
    -------
    numpy.ndarray
        The mean-pooled last hidden state. For a single sentence this is
        presumably a 1-D vector of the model's hidden size — confirm against
        the checkpoint config.
    """
    # Local import: torch is not imported at the top of the notebook.
    import torch

    inputs = BERT_bank_tokenizer(sentence,
                            return_tensors="pt", # Tokenizer will return PyTorch tensors
                            truncation=True, # Tokenizer will truncate the input sentence to fit max sequence length
                            max_length=512) # The maximum sequence length allowed

    # Inference only: skip autograd bookkeeping so no graph is built (and kept
    # alive) for each of the per-utterance forward passes.
    with torch.no_grad():
        outputs = BERT_bank_model(**inputs)

    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [8]:
# Embed every utterance of the test set, one forward pass per row
utterances = test['text']
test['BERT_bank_embeddings'] = utterances.apply(get_bert_bank_embeddings)

In [9]:
# Size of the first stored embedding
embedding_size = len(test['BERT_bank_embeddings'].iloc[0])
print("Number of elements in each BERT banking embedding:", embedding_size)

Number of elements in each BERT banking embedding: 768


### Finding Best Hyperparameters for BERT Banking:

In [10]:
# Run a grid search for BERT Banking by checking the silhouette score
# Use the best parameters for both BERT Banking and BERT base

def perform_umap(embeddings, n_neighbors, n_components, metric):
    """
    Reduce the dimensionality of `embeddings` with UMAP.

    `n_neighbors`, `n_components` and `metric` are forwarded to `umap.UMAP`;
    the random state is pinned to 42. Returns the output of `fit_transform`.
    """
    umap_settings = {
        "n_neighbors": n_neighbors,
        "n_components": n_components,
        "metric": metric,
        "random_state": 42,
    }
    return umap.UMAP(**umap_settings).fit_transform(embeddings)

def perform_hdbscan(umap_embeddings, min_cluster_size, min_samples, gen_min_span_tree, cluster_selection_method):
    """
    Cluster the dimension-reduced data with HDBSCAN.

    The four tuning arguments are forwarded to `HDBSCAN` by keyword. Returns
    whatever `HDBSCAN.fit` returns for `umap_embeddings` (callers read
    `.labels_` from it).
    """
    hdbscan_settings = dict(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        gen_min_span_tree=gen_min_span_tree,
        cluster_selection_method=cluster_selection_method,
    )
    model = HDBSCAN(**hdbscan_settings)
    return model.fit(umap_embeddings)

def grid_search(df, umap_params, hdbscan_params, embedding_type = "BERT_bank_embeddings"):
    """
    Score every combination of UMAP and HDBSCAN settings by silhouette score.

    `umap_params` and `hdbscan_params` map a parameter name to the values to try.
    Each combination is passed POSITIONALLY to `perform_umap` / `perform_hdbscan`,
    so the key order of each dict must follow those functions' signatures.

    Returns a DataFrame with one row per combination: the parameter values plus
    a "silhouette_score" column, which is -1 when the clustering did not yield
    more than one cluster (the -1 label is not counted as a cluster).
    """
    embedding_matrix = np.stack(df[embedding_type].values)
    umap_names = list(umap_params.keys())
    hdbscan_names = list(hdbscan_params.keys())

    rows = []
    for umap_combo in itertools.product(*umap_params.values()):
        # UMAP is fitted once per UMAP combination and reused for all HDBSCAN settings
        reduced = perform_umap(embedding_matrix, *umap_combo)

        for hdbscan_combo in itertools.product(*hdbscan_params.values()):
            labels = perform_hdbscan(reduced, *hdbscan_combo).labels_

            # Number of clusters, not counting the -1 label
            n_clusters = len(set(labels)) - int(-1 in labels)

            # NOTE(review): points labelled -1 are still passed to silhouette_score
            # below as if they formed a cluster of their own — confirm this is intended.
            score = silhouette_score(reduced, labels) if n_clusters > 1 else -1

            row = dict(zip(umap_names, umap_combo))
            row.update(zip(hdbscan_names, hdbscan_combo))
            row["silhouette_score"] = score
            rows.append(row)

    return pd.DataFrame(rows)

In [11]:
# # Define your UMAP and HDBSCAN parameter grids
# umap_params = {
#     "n_neighbors": range(15, 51, 10),  # Exploring from 15 to 50 in steps of 10
#     "n_components": range(2, 21, 6),  # Exploring from 2 to 20 in steps of 6
#     "metric": ["cosine"]
# }

# hdbscan_params = {
#     "min_cluster_size": range(15, 61, 15),  # Exploring from 15 to 60 in steps of 15
#     "min_samples": [None] + list(range(1, 31, 10)),  # None, then exploring from 1 to 30 in steps of 10
#     "gen_min_span_tree": [True, False],
#     "cluster_selection_method": ["eom", "leaf"]
# }

# # Save results into a dataframe
# results_df = grid_search(test, umap_params, hdbscan_params)

The best parameters of UMAP and HDBSCAN for BERT Banking after running a grid search with silhouette score are:

- n_neighbors: 35
- n_components: 2
- metric: cosine
- min_cluster_size: 15
- min_samples: 21
- gen_min_span_tree: True
- cluster_selection_method: eom

With a silhouette score of 0.947

## HDBSCAN on BERT Banking Best Parameters:

In [12]:
# Stack the per-utterance vectors into one matrix, then project it with the
# tuned UMAP settings via the perform_umap helper defined earlier (random_state=42).
bert_bank_embeddings = np.vstack(test['BERT_bank_embeddings'].to_numpy())
umap_bert_bank = perform_umap(bert_bank_embeddings, n_neighbors=35, n_components=2, metric='cosine')

In [13]:
# Fit HDBSCAN with the tuned settings via the perform_hdbscan helper defined earlier,
# then store each utterance's cluster label (-1 is counted as an outlier below).
hdbscan_bert_bank = perform_hdbscan(umap_bert_bank, min_cluster_size=15, min_samples=21,
                                    gen_min_span_tree=True, cluster_selection_method='eom')

test["topic_BERT_bank"] = hdbscan_bert_bank.labels_
np.unique(hdbscan_bert_bank.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
       67, 68, 69, 70, 71, 72, 73, 74, 75, 76])

In [14]:
# Count the utterances labelled -1 and report them against the actual number
# of rows in the test set instead of a hard-coded total.
num_outliers = (test["topic_BERT_bank"] == -1).sum()
print(f"We found that out of {len(test)} utterance there were {num_outliers} outliers.")

We found that out of 3080 utterance there were 11 outliers.


## Creating Dataframe of BERT Banking (with its best parameters):

In [15]:
# Work on an independent (deep) copy so the following cells do not modify `test`
BERT_bank_df = test.copy(deep=True)

In [16]:
# Renaming certain columns and dropping the raw embedding column.
# Written without inplace=True so the cell can be re-run safely.
BERT_bank_df = (
    BERT_bank_df
    .rename(columns={'text': 'Utterance', 'category': 'Assigned label', 'topic_BERT_bank': 'Cluster'})
    # errors='ignore': on a re-run the embedding column is already gone
    .drop(columns=['BERT_bank_embeddings'], errors='ignore')
)

#### For each cluster, output the categories that appear more than 6 times in that cluster, and their respective count:

In [17]:
# For every cluster id found by HDBSCAN, keep the assigned labels that occur
# more than 6 times in that cluster, in value_counts() order.
cluster_info = {
    cluster_id: {
        label: count
        for label, count in (
            BERT_bank_df.loc[BERT_bank_df["Cluster"] == cluster_id, "Assigned label"]
            .value_counts()
            .items()
        )
        if count > 6
    }
    for cluster_id in np.unique(hdbscan_bert_bank.labels_)
}

cluster_info

{-1: {},
 0: {'lost_or_stolen_phone': 39},
 1: {'disposable_card_limits': 34},
 2: {'extra_charge_on_statement': 38},
 3: {'card_swallowed': 36},
 4: {'verify_top_up': 40},
 5: {'fiat_currency_support': 35},
 6: {'exchange_via_app': 37},
 7: {'age_limit': 40},
 8: {'cancel_transfer': 38},
 9: {'request_refund': 39},
 10: {'card_arrival': 33},
 11: {'card_delivery_estimate': 35},
 12: {'pending_card_payment': 38},
 13: {'pin_blocked': 33},
 14: {'get_physical_card': 39},
 15: {'change_pin': 40},
 16: {'receiving_money': 37},
 17: {'passcode_forgotten': 40},
 18: {'pending_cash_withdrawal': 39},
 19: {'pending_top_up': 38},
 20: {'terminate_account': 40},
 21: {'exchange_charge': 37},
 22: {'exchange_rate': 39},
 23: {'card_not_working': 38},
 24: {'top_up_limits': 39},
 25: {'direct_debit_payment_not_recognised': 33},
 26: {'card_payment_wrong_exchange_rate': 38},
 27: {'wrong_exchange_rate_for_cash_withdrawal': 33},
 28: {'wrong_amount_of_cash_received': 36},
 29: {'declined_transfer':

#### Adding Cluster Name and Cluster Confidence to dataframe:

In [18]:
# Look up each row's cluster entry in cluster_info and name the cluster after the
# first label stored there; clusters with an empty entry get None.
cluster_label_counts = BERT_bank_df['Cluster'].map(cluster_info)
BERT_bank_df['Cluster name'] = cluster_label_counts.apply(lambda labels: next(iter(labels), None))

# Per-utterance membership strength reported by the fitted HDBSCAN model
BERT_bank_df['Cluster confidence'] = hdbscan_bert_bank.probabilities_

#### Adding Predicted Label and Score Column using Pretrained Banking77:

In [19]:
# Build a text-classification pipeline on the Banking77 checkpoint.
# return_all_scores=True is kept on purpose: the cells below index each result
# with output[0] and pick the highest-scoring entry themselves.
model_id = 'philschmid/BERT-Banking77'
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
classifier = pipeline(
    task='text-classification',
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

In [22]:
# Classify the utterances one at a time, keeping the raw pipeline result for each
outputs = [classifier(text) for text in BERT_bank_df["Utterance"].tolist()]

In [23]:
# For each utterance take the entry with the highest score from the first
# element of its pipeline result (ties resolve to the earliest entry).
top_predictions = [max(output[0], key=lambda entry: entry['score']) for output in outputs]

predicted_output = [entry['label'] for entry in top_predictions]
predicted_output_score = [entry['score'] for entry in top_predictions]

In [24]:
# Attach the classifier's top label and score, then keep only the columns
# listed in sorted_column, in that order.
sorted_column = [
    "Utterance",
    "Assigned label",
    "Predicted label",
    "Predicted confidence",
    "Cluster confidence",
    "Cluster",
    "Cluster name",
]
BERT_bank_df = BERT_bank_df.assign(**{
    'Predicted label': predicted_output,
    'Predicted confidence': predicted_output_score,
})[sorted_column]
BERT_bank_df.head()

Unnamed: 0,Utterance,Assigned label,Predicted label,Predicted confidence,Cluster confidence,Cluster,Cluster name
0,How do I locate my card?,card_arrival,card_arrival,0.466479,0.274837,54,lost_or_stolen_card
1,"I still have not received my new card, I order...",card_arrival,card_arrival,0.977467,0.308988,10,card_arrival
2,I ordered a card but it has not arrived. Help ...,card_arrival,card_arrival,0.925289,0.630991,10,card_arrival
3,Is there a way to know when my card will arrive?,card_arrival,card_delivery_estimate,0.944849,0.723598,11,card_delivery_estimate
4,My card has not arrived yet.,card_arrival,card_arrival,0.980883,0.993433,10,card_arrival


#### Manually creating groupings on 77 categories:

In [25]:
# Hand-made grouping of the 77 intent labels into 12 broader topics.
# Label spellings are kept exactly as they appear in the data
# (e.g. 'Refund_not_showing_up', 'reverted_card_payment?').
manual_groupings_dict = {
    "top_up": [
        'top_up_by_bank_transfer_charge',
        'top_up_by_card_charge',
        'top_up_by_cash_or_cheque',
        'top_up_failed',
        'top_up_limits',
        'top_up_reverted',
        'topping_up_by_card',
        'pending_top_up',
        'verify_top_up',
        'automatic_top_up',
    ],
    "transfers": [
        'cancel_transfer',
        'balance_not_updated_after_bank_transfer',
        'declined_transfer',
        'failed_transfer',
        'pending_transfer',
        'transfer_fee_charged',
        'transfer_into_account',
        'transfer_not_received_by_recipient',
        'transfer_timing',
        'beneficiary_not_allowed',
    ],
    "exchange_rate_currency": [
        'card_payment_wrong_exchange_rate',
        'exchange_charge',
        'exchange_rate',
        'exchange_via_app',
        'wrong_exchange_rate_for_cash_withdrawal',
        'country_support',
        'fiat_currency_support',
        'supported_cards_and_currencies',
        'card_acceptance',
    ],
    "account_management": [
        'terminate_account',
        'edit_personal_details',
        'pin_blocked',
        'change_pin',
        'passcode_forgotten',
        'age_limit',
        'lost_or_stolen_phone',
    ],
    "identity_verification": [
        'verify_my_identity',
        'why_verify_identity',
        'unable_to_verify_identity',
    ],
    "virtual_disposable_cards": [
        'virtual_card_not_working',
        'get_disposable_virtual_card',
        'getting_virtual_card',
        'disposable_card_limits',
        'getting_spare_card',
    ],
    "atm_cash_withdrawal": [
        'cash_withdrawal_charge',
        'cash_withdrawal_not_recognised',
        'declined_cash_withdrawal',
        'pending_cash_withdrawal',
        'atm_support',
        'wrong_amount_of_cash_received',
        'card_swallowed',
    ],
    "card_malfunction": [
        'contactless_not_working',
        'declined_card_payment',
        'card_not_working',
    ],
    "transactions": [
        'Refund_not_showing_up',
        'request_refund',
        'balance_not_updated_after_cheque_or_cash_deposit',
        'extra_charge_on_statement',
        'receiving_money',
        'transaction_charged_twice',
        'verify_source_of_funds',
    ],
    "card_payment": [
        'card_payment_fee_charged',
        'card_payment_not_recognised',
        'direct_debit_payment_not_recognised',
        'reverted_card_payment?',
        'pending_card_payment',
    ],
    "new_card": [
        'card_about_to_expire',
        'get_physical_card',
        'order_physical_card',
        'card_arrival',
        'card_delivery_estimate',
        'card_linking',
        'activate_my_card',
        'compromised_card',
        'lost_or_stolen_card',
    ],
    "card_info": [
        'visa_or_mastercard',
        'apple_pay_or_google_pay',
    ],
}

In [26]:
# Invert the grouping: map every individual label to the name of its group.
# If a label were listed in several groups, the last group listed would win.
manual_groupings_dict_inverted = {
    topic: group
    for group, topics in manual_groupings_dict.items()
    for topic in topics
}


#### Adding my manual groups to dataframe:

In [27]:
# Function to map labels to their groupings
def get_label_group(label):
    """Return the manual group that `label` belongs to, or 'unknown' if it is not in the mapping."""
    if label in manual_groupings_dict_inverted:
        return manual_groupings_dict_inverted[label]
    return 'unknown'

# Add the manual topic for the assigned label, the predicted label and the cluster name
topic_columns = [
    ('Assigned label', 'Assigned topic name'),
    ('Predicted label', 'Predicted topic name'),
    ('Cluster name', 'Cluster topic name'),
]
for label_column, topic_column in topic_columns:
    BERT_bank_df[topic_column] = BERT_bank_df[label_column].apply(get_label_group)


#### Counting label mismatches (Assigned vs. Predicted, Assigned vs. Cluster):

In [28]:
# Count rows whose assigned label differs from the classifier's label,
# and rows whose assigned label differs from the cluster's name.
num_utterances = len(BERT_bank_df)

assigned_vs_predicted = BERT_bank_df["Assigned label"] != BERT_bank_df["Predicted label"]
assigned_vs_cluster = BERT_bank_df["Assigned label"] != BERT_bank_df["Cluster name"]

incorrect_predicted = assigned_vs_predicted.sum()
incorrect_cluster = assigned_vs_cluster.sum()

print("Number of Utterances where Assigned != Predicted:",incorrect_predicted)
print("Number of Utterances where Assigned != Cluster:",incorrect_cluster)

Number of Utterances where Assigned != Predicted: 223
Number of Utterances where Assigned != Cluster: 223


In [34]:
# BERT_bank_df[(BERT_bank_df["Assigned label"] == "topping_up_by_card") & (BERT_bank_df["Predicted label"] == "topping_up_by_card")]

In [None]:
# BERT_bank_df.to_csv('BERT_bank.csv')

## New Dataframe of Different Types of Misclassifications:

### Dataframe where Assigned != Predicted:

In [None]:
# Rows the classifier labelled differently from the assigned label,
# ordered by predicted label and then by predicted confidence.
label_mismatch = BERT_bank_df["Assigned label"] != BERT_bank_df["Predicted label"]
predicted_misclass = (
    BERT_bank_df[label_mismatch]
    .sort_values(by=['Predicted label', 'Predicted confidence'])
)

In [None]:
# Saving this dataframe to csv file:
# predicted_misclass.to_csv('predicted_misclassification.csv')

### Dataframe where Assigned != Predicted and Misclassification is Within Topic:

In [None]:
# Misclassified rows whose predicted label still falls in the same manual topic
label_mismatch = BERT_bank_df["Assigned label"] != BERT_bank_df["Predicted label"]
same_topic = BERT_bank_df["Assigned topic name"] == BERT_bank_df["Predicted topic name"]
predicted_misclass_in_topic = (
    BERT_bank_df[label_mismatch & same_topic]
    .sort_values(by=['Predicted label', 'Predicted confidence'])
)

in_topic_count = len(predicted_misclass_in_topic)
in_topic_mean_confidence = predicted_misclass_in_topic["Predicted confidence"].mean()
print("Number of Utterances where Assigned != Predicted and the Misclassification is within topic:",in_topic_count,"\n")
print("Average Predicted Confidence for these Utterances:",in_topic_mean_confidence)

Number of Utterances where Assigned != Predicted and the Misclassification is within topic: 133 

Average Predicted Confidence for these Utterances: 0.7296622961311412


In [None]:
# predicted_misclass_in_topic.to_csv('predicted_misclassification_in_topic.csv')

### Dataframe where Assigned != Predicted and Misclassification is Out of Topic:

In [None]:
# Misclassified rows whose predicted label falls in a different manual topic
label_mismatch = BERT_bank_df["Assigned label"] != BERT_bank_df["Predicted label"]
different_topic = BERT_bank_df["Assigned topic name"] != BERT_bank_df["Predicted topic name"]
predicted_misclass_out_of_topic = (
    BERT_bank_df[label_mismatch & different_topic]
    .sort_values(by=['Predicted label', 'Predicted confidence'])
)

out_of_topic_count = len(predicted_misclass_out_of_topic)
out_of_topic_mean_confidence = predicted_misclass_out_of_topic["Predicted confidence"].mean()
print("Number of Utterances where Assigned != Predicted and the Misclassification is out of topic:",out_of_topic_count,"\n")
print("Average Predicted Confidence for these Utterances:",out_of_topic_mean_confidence)

Number of Utterances where Assigned != Predicted and the Misclassification is out of topic: 90 

Average Predicted Confidence for these Utterances: 0.6246961305538813


In [None]:
# predicted_misclass_out_of_topic.to_csv('predicted_misclassification_out_of_topic.csv')

### Dataframe where Assigned != Cluster:

In [None]:
# Rows whose assigned label differs from the name of the cluster they landed in
# (no topic filter is applied here), ordered by cluster name and then cluster confidence.
cluster_mismatch = BERT_bank_df["Assigned label"] != BERT_bank_df["Cluster name"]
cluster_misclass = (
    BERT_bank_df[cluster_mismatch]
    .sort_values(by=['Cluster name', 'Cluster confidence'])
)
cluster_misclass

Unnamed: 0,Utterance,Assigned label,Predicted label,Predicted confidence,Cluster confidence,Cluster,Cluster name,Assigned topic name,Predicted topic name,Cluster topic name
1692,I want a refund because my package has been ta...,request_refund,request_refund,0.737842,0.534777,37,Refund_not_showing_up,transactions,transactions,transactions
2091,I'm not sure why my account has been refunded ...,reverted_card_payment?,Refund_not_showing_up,0.718704,0.849409,37,Refund_not_showing_up,card_payment,transactions,transactions
2100,I was contacted by a seller with a message tha...,reverted_card_payment?,Refund_not_showing_up,0.384072,1.000000,37,Refund_not_showing_up,card_payment,transactions,transactions
2469,What can I pay with? Does cash work?,top_up_by_cash_or_cheque,cash_withdrawal_charge,0.371041,0.085610,33,atm_support,top_up,atm_cash_withdrawal,atm_cash_withdrawal
2468,how come i can't find anywhere to load using cash,top_up_by_cash_or_cheque,wrong_exchange_rate_for_cash_withdrawal,0.333534,0.201356,33,atm_support,top_up,exchange_rate_currency,atm_cash_withdrawal
...,...,...,...,...,...,...,...,...,...,...
2446,Need to deposit a cheque into my account,top_up_by_cash_or_cheque,top_up_by_cash_or_cheque,0.547070,0.000000,-1,,top_up,top_up,unknown
2531,Can I make multiple online transactions with m...,virtual_card_not_working,get_disposable_virtual_card,0.476906,0.000000,-1,,virtual_disposable_cards,virtual_disposable_cards,unknown
2617,how do i get a virtual card for one time use,get_disposable_virtual_card,get_disposable_virtual_card,0.741567,0.000000,-1,,virtual_disposable_cards,virtual_disposable_cards,unknown
2642,I have a top up that didn't go through. Why?,top_up_failed,top_up_failed,0.775243,0.000000,-1,,top_up,top_up,unknown


### Calculating Normalized Mutual Info Score on Labels:

In [None]:
# Saving this dataframe to csv file:
# sorted_incorrect_groups.to_csv('incorrect_groups.csv')

In [None]:
# #Removing Outliers
# no_outliers = BERT_bank_df.dropna(subset=['true label', 'cluster label'])

# # Extracting true labels and cluster labels
# labels_true = no_outliers['true label']
# labels_pred = no_outliers['cluster label']

# # Compute normalized mutual information score
# nmi_score = normalized_mutual_info_score(labels_true, labels_pred)

# print("BERT Bank Normalized Mutual Information Score on labels:", nmi_score)