In [31]:
# Uncomment and run these installs when working in Google Colab
# !pip install umap-learn
# !pip install hdbscan

In [2]:
import pandas as pd
import numpy as np
import umap
import torch

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import silhouette_score
from hdbscan import HDBSCAN

from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel

from sklearn.metrics import confusion_matrix

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Read the train and test splits from CSV files given by relative path
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# HDBSCAN with BERT Embeddings on the Utterance Level

### BERT uncased

In [5]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [6]:
def get_bert_embeddings(text_list):
    """Embed each text as the mean of its last-layer token vectors.

    Texts are encoded one at a time with the module-level ``tokenizer`` and
    ``model``; inputs are truncated to at most 512 tokens.

    Args:
        text_list: Iterable of texts to embed.

    Returns:
        A list holding one NumPy array per input text, in input order.
    """
    def embed_one(text):
        tokens = tokenizer(text, return_tensors="pt", padding=True,
                           truncation=True, max_length=512)
        hidden = model(tokens["input_ids"]).last_hidden_state
        # Average over dim=1 (the token positions), then take the first
        # (and only) item of the batch as the sentence embedding
        return torch.mean(hidden, dim=1)[0].numpy()

    # Inference only, so gradient tracking stays off for the whole pass
    with torch.no_grad():
        return [embed_one(text) for text in text_list]

# Embed every entry of the test set's "text" column
embeddings = get_bert_embeddings(test["text"].to_list())

In [22]:
# Cluster the embeddings; the fitted labels become each row's "topic"
hdbscan = HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
hdbscan.fit(embeddings)
test["topic"] = hdbscan.labels_

# Distinct cluster labels that were assigned
test["topic"].unique()

array([ 6, -1,  1,  3,  5,  2,  0,  4,  7])

In [28]:
# How many rows received the label -1
test["topic"].eq(-1).sum()

1995

In [29]:
# Share of all rows labelled -1 (noise points)
test["topic"].eq(-1).sum() / len(test)

0.6477272727272727

### Check the classification accuracy within each cluster

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Fine-tuned sequence-classification model used to label each utterance.
# It is bound to clf_* names on purpose: get_bert_embeddings reads the
# module-level `tokenizer` and `model` (the bert-base-uncased encoder), so
# rebinding those names here would make any later or re-run embedding call
# use the classification model, whose output has no `last_hidden_state`.
model_id = 'philschmid/BERT-Banking77'
clf_tokenizer = AutoTokenizer.from_pretrained(model_id)
clf_model = AutoModelForSequenceClassification.from_pretrained(model_id)
classifier = pipeline('text-classification', tokenizer=clf_tokenizer, model=clf_model)

tokenizer_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.95k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [24]:
# Label each utterance with the classifier's top prediction
test["predicted_label"] = test["text"].map(
    lambda utterance: classifier(utterance)[0]["label"]
)

# Per-cluster accuracy: fraction of rows whose prediction equals "category",
# grouped by the row's "topic" label
accuracy_per_cluster = (
    test["category"].eq(test["predicted_label"]).groupby(test["topic"]).mean()
)

In [30]:
# Display the accuracy computed for each "topic" label
accuracy_per_cluster

topic
-1    0.918797
 0    0.857143
 1    1.000000
 2    0.888889
 3    1.000000
 4    1.000000
 5    1.000000
 6    0.941872
 7    1.000000
dtype: float64

### Find the cluster with the lowest accuracy

In [25]:
# Locate the cluster label with the smallest accuracy, then read its value
lowest_accuracy_cluster = accuracy_per_cluster.idxmin()
lowest_accuracy = accuracy_per_cluster.loc[lowest_accuracy_cluster]

print(f"Cluster with lowest accuracy: {lowest_accuracy_cluster}, Accuracy: {lowest_accuracy}")

Cluster with lowest accuracy: 0, Accuracy: 0.8571428571428571


In [26]:
# Derive the cluster id from the computed accuracies instead of hard-coding
# it, so this cell stays correct when the clustering (and therefore the
# worst-scoring cluster) changes between runs.
lowest_accuracy_cluster = accuracy_per_cluster.idxmin()

# Rows that fall in the lowest-accuracy cluster
lowest_df = test[test["topic"] == lowest_accuracy_cluster]

# Check the distribution of categories in this cluster
lowest_df["category"].value_counts()

unable_to_verify_identity    6
why_verify_identity          1
Name: category, dtype: int64