In [None]:
!pip install scikit-network
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel, pipeline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sknetwork.utils import get_membership
from sklearn.decomposition import TruncatedSVD
from IPython.display import SVG
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sknetwork.clustering import Louvain, get_modularity
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from wordcloud import WordCloud

from sknetwork.hierarchy import LouvainHierarchy
from sknetwork.hierarchy import cut_straight, dasgupta_score, tree_sampling_divergence
from sknetwork.visualization import svg_graph, svg_dendrogram

Collecting scikit-network
  Downloading scikit_network-0.32.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-network
Successfully installed scikit-network-0.32.1


In [None]:
# Checkpoint shared by the tokenizer, the bare encoder and the classification model
model_id = 'philschmid/BERT-Banking77'

# Read the test split that the rest of the notebook annotates with new columns
test = pd.read_csv("test.csv")

# Tokenizer plus two loads of the same checkpoint:
#   model                -> used for embeddings (last_hidden_state)
#   classification_model -> used by the text-classification pipeline
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)
classification_model = AutoModelForSequenceClassification.from_pretrained(model_id)

classifier = pipeline(
    task='text-classification',
    model=classification_model,
    tokenizer=tokenizer,
)

def get_bert_embeddings(text):
    """Return L2-normalised first-token embeddings for one or more texts.

    Parameters
    ----------
    text : str or list of str
        Passed straight to the module-level ``tokenizer`` with padding and
        truncation enabled.

    Returns
    -------
    numpy.ndarray
        One row per tokenized input, taken from position 0 of the encoder's
        ``last_hidden_state`` (the [CLS] slot for BERT tokenizers) and scaled
        to unit L2 norm.
    """
    import torch  # local import: the notebook's import cell does not bring torch into scope

    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    # Inference only: disable autograd so no computation graph is recorded per call.
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :].detach().numpy()
    # Unit-length rows let a plain dot product act as cosine similarity downstream.
    return normalize(embeddings, axis=1, norm='l2')

# Embed each utterance as a batch of one and keep the resulting vector per row
def _embed_single(sentence):
    """Return the single embedding row produced for one sentence."""
    return get_bert_embeddings([sentence])[0]


test['embeddings'] = test['text'].apply(_embed_single)

# Preview each text next to its embedding vector
print(test[['text', 'embeddings']])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.95k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

                                                   text  \
0                              How do I locate my card?   
1     I still have not received my new card, I order...   
2     I ordered a card but it has not arrived. Help ...   
3      Is there a way to know when my card will arrive?   
4                          My card has not arrived yet.   
...                                                 ...   
3075      If i'm not in the UK, can I still get a card?   
3076                 How many countries do you support?   
3077              What countries do you do business in?   
3078             What are the countries you operate in.   
3079         Can the card be mailed and used in Europe?   

                                             embeddings  
0     [0.025442068, -0.03386337, 0.0026843003, 0.007...  
1     [0.010722405, 0.007851422, 0.07274575, -0.0132...  
2     [0.0123235015, -0.010357805, 0.07911538, 0.011...  
3     [-0.0020086144, -0.0009892675, 0.08350486, -0....  
4

In [None]:
# Minimum similarity for two texts to stay connected in the graph.
SIMILARITY_THRESHOLD = 0.3

# Stack the per-row embedding vectors into one 2-D array (one row per text).
embedding_matrix = np.vstack(test['embeddings'])

# get_bert_embeddings L2-normalises every vector, so these pairwise dot
# products are cosine similarities.
similarity_matrix = embedding_matrix @ embedding_matrix.T

# Remove weak links so only sufficiently similar pairs remain as weighted edges.
similarity_matrix[similarity_matrix < SIMILARITY_THRESHOLD] = 0

# NOTE(review): the diagonal (self-similarity of unit vectors) survives the
# threshold, so every node keeps a self-loop in the graph - confirm this is intended.
adjacency = csr_matrix(similarity_matrix)

Louvain

In [None]:
# Sweep candidate resolutions and remember whichever clustering scores the highest modularity
best_resolution = 1.0
best_modularity = float('-inf')
best_labels = None  # filled in once a candidate beats the initial -inf score

candidate_resolutions = np.linspace(0.5, 2, 10)
for resolution in candidate_resolutions:
    louvain = Louvain(resolution=resolution)
    labels = louvain.fit_predict(adjacency)
    # Scored with get_modularity's default settings for every candidate
    modularity = get_modularity(adjacency, labels)

    # Only a strictly better score replaces the incumbent, so ties keep the earlier resolution
    if not modularity > best_modularity:
        continue
    best_modularity, best_resolution, best_labels = modularity, resolution, labels

print("Best resolution parameter:", best_resolution)
print("Best modularity:", best_modularity)

Best resolution parameter: 1.0
Best modularity: 0.8147596303450073


In [None]:
# Fit Louvain at the resolution chosen by the sweep instead of a hard-coded
# value, so this cell stays in sync if the sweep's range or the data changes.
louvain = Louvain(resolution=best_resolution)
labels = louvain.fit_predict(adjacency)

In [None]:
# Tally how many items fall into each Louvain cluster and show the ids next to the sizes
unique_and_counts = np.unique(labels, return_counts=True)
labels_unique, counts = unique_and_counts
print(*unique_and_counts)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22] [369 242 227 204 201 164 163 159 155 125 121 121 121 120 119 116  82  73
  43  40  40  38  37]


1. Cluster the dataset with Louvain to get result-1;
2. Use the BERT classifier to get result-2;
3. Compare result-1 and result-2 to find potentially mislabeled items;

In [None]:
# Re-fit Louvain at the resolution selected by the sweep
louvain = Louvain(resolution=best_resolution)
labels = louvain.fit_predict(adjacency)

# Report the cluster ids and how many items each one holds
labels_unique, counts = np.unique(labels, return_counts=True)
print("Unique Labels:", labels_unique)
print("Counts:", counts)


def _first_label(utterance):
    """Return the label of the first result the classifier pipeline gives for one utterance."""
    return classifier(utterance)[0]['label']


# Run the BERT classifier on every utterance and store its label next to the text
predictions = test['text'].apply(_first_label)
test['predictions'] = predictions


Unique Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]
Counts: [369 242 227 204 201 164 163 159 155 125 121 121 121 120 119 116  82  73
  43  40  40  38  37]


How do we compare the results?
(Using the BERT prediction results)

In [None]:
# Column glossary:
#   'predictions' - label predicted by the BERT classifier
#   'labels'      - Louvain cluster id
#   'category'    - ground-truth label shipped with the test set

test['labels'] = labels

# Name each cluster after the BERT prediction that occurs most often inside it.
# mode() can return several values on a tie; [0] keeps the first one it lists.
cluster_to_predicted_label = (
    test.groupby('labels')['predictions']
    .agg(lambda cluster_predictions: cluster_predictions.mode()[0])
    .to_dict()
)

# Louvain ids are arbitrary integers, so replace them with the cluster's
# majority BERT label to make them comparable with the other label columns.
test['Louvain Cluster Prediction'] = test['labels'].map(cluster_to_predicted_label)

# Rows whose own BERT prediction differs from their cluster's majority prediction.
disagrees_with_cluster = test['predictions'] != test['Louvain Cluster Prediction']
potential_mislabeled_indices_based_on_clustering = test.loc[disagrees_with_cluster].index.tolist()

# Rows where the ground-truth label does not match the BERT prediction.
disagrees_with_dataset = test['category'] != test['predictions']
mismatched_predictions_indices = test.loc[disagrees_with_dataset].index.tolist()

# Candidates must fail both checks; sort so the printed list and any export
# built from it have a stable, reproducible order.
potential_mislabeled_indices = sorted(
    set(potential_mislabeled_indices_based_on_clustering) & set(mismatched_predictions_indices)
)

print(f"Indices of potentially mislabeled items based on clustering and BERT predictions mismatch: {potential_mislabeled_indices}")

Indices of potentially mislabeled items based on clustering and BERT predictions mismatch: [2048, 1025, 3, 5, 2055, 521, 11, 2571, 1038, 2063, 2575, 2581, 542, 1055, 32, 555, 2604, 561, 1092, 1096, 2636, 2637, 1103, 2649, 602, 2653, 614, 2160, 2166, 2169, 2685, 2687, 2691, 647, 2186, 2190, 2703, 2195, 2708, 2711, 666, 156, 1187, 2215, 1192, 2233, 189, 1726, 1734, 1736, 1225, 1737, 2761, 2252, 2763, 718, 2256, 1751, 1752, 1758, 246, 2300, 2305, 1794, 773, 1800, 2313, 2829, 1296, 1297, 2321, 2323, 2835, 2328, 281, 284, 2338, 1315, 2341, 812, 301, 1326, 1838, 2351, 1330, 308, 1332, 1333, 312, 1340, 1857, 1862, 1351, 1866, 1871, 849, 339, 851, 852, 345, 863, 1375, 1381, 2917, 873, 1389, 368, 1394, 2426, 2939, 1921, 901, 1413, 1416, 908, 912, 2453, 1431, 416, 2468, 2469, 428, 2477, 945, 434, 956, 453, 461, 1485, 1486, 980, 2523, 476, 2531, 1516, 1517, 1006, 1008, 2544, 2045]


In [None]:
# Source column -> header used in the exported report
export_columns = {
    'text': 'Text',
    'Louvain Cluster Prediction': 'Louvain Cluster Prediction',
    'predictions': 'BERT Prediction',
    'category': 'Dataset',
}

# Pull the flagged rows into a fresh frame with report-friendly headers
mislabeled_df = (
    test.loc[potential_mislabeled_indices, list(export_columns)]
    .rename(columns=export_columns)
)

# Write the report without the index column
csv_file_path = 'potential_mislabeled_items_on_prediction.csv'
mislabeled_df.to_csv(csv_file_path, index=False)

# Hand the file to the browser through the Colab helper
from google.colab import files
files.download(csv_file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Using the original dataset categories

In [None]:
# Name each cluster after the ground-truth category that occurs most often inside it.
# mode() can return several values on a tie; [0] keeps the first one it lists.
cluster_to_category_label = (
    test.groupby('labels')['category']
    .agg(lambda cluster_categories: cluster_categories.mode()[0])
    .to_dict()
)

# Replace the arbitrary Louvain ids with each cluster's majority category.
test['Louvain Cluster Category'] = test['labels'].map(cluster_to_category_label)

# Rows whose ground-truth category differs from their cluster's majority category.
disagrees_with_cluster_category = test['category'] != test['Louvain Cluster Category']
potential_mislabeled_indices_based_on_category = test.loc[disagrees_with_cluster_category].index.tolist()

# Rows where the ground-truth label does not match the BERT prediction;
# on its own this is already a hint that the item may be mislabeled.
disagrees_with_bert = test['category'] != test['predictions']
mismatched_predictions_indices = test.loc[disagrees_with_bert].index.tolist()

# Candidates must fail both checks; sort so the printed list and any export
# built from it have a stable, reproducible order.
potential_mislabeled_indices = sorted(
    set(potential_mislabeled_indices_based_on_category) & set(mismatched_predictions_indices)
)

print(f"Indices of potentially mislabeled items based on clustering and original categories mismatch: {potential_mislabeled_indices}")


Indices of potentially mislabeled items based on clustering and original categories mismatch: [1025, 521, 522, 2571, 1038, 527, 2575, 2581, 1054, 542, 1055, 551, 552, 555, 1067, 2091, 2092, 2094, 2604, 561, 2606, 2100, 2102, 2619, 2622, 1605, 583, 2636, 2637, 2649, 1626, 93, 606, 2653, 2657, 2160, 2675, 2166, 2169, 2685, 2687, 2688, 2691, 2694, 1672, 138, 2186, 2701, 2190, 2703, 2195, 2708, 2711, 156, 1187, 2215, 1192, 1721, 2233, 2745, 2746, 1726, 1730, 707, 1734, 1736, 1225, 1737, 1739, 2248, 2252, 718, 2761, 2256, 721, 2763, 1751, 1752, 2779, 1756, 1245, 1758, 236, 246, 763, 2300, 255, 2305, 773, 775, 264, 1800, 2313, 2829, 271, 1296, 1297, 2321, 2323, 2835, 279, 2328, 281, 284, 799, 2338, 1315, 2852, 2341, 812, 301, 1324, 1326, 2351, 1330, 1331, 308, 1332, 1333, 312, 1337, 1338, 1340, 1855, 1857, 1859, 1862, 1351, 1866, 1867, 1358, 1871, 849, 339, 851, 852, 345, 863, 1375, 357, 1381, 2917, 873, 1389, 1391, 368, 1394, 2426, 2939, 1921, 1922, 901, 1413, 1416, 908, 397, 1934, 912, 245

In [None]:
# Source column -> header used in the exported report
export_columns = {
    'text': 'Text',
    'Louvain Cluster Category': 'Louvain Cluster Category',
    'predictions': 'BERT Prediction',
    'category': 'Dataset',
}

# Pull the flagged rows into a fresh frame with report-friendly headers
mislabeled_df = (
    test.loc[potential_mislabeled_indices, list(export_columns)]
    .rename(columns=export_columns)
)

# Write the report without the index column
csv_file_path = 'potential_mislabeled_items_on_category.csv'
mislabeled_df.to_csv(csv_file_path, index=False)

# Hand the file to the browser through the Colab helper
from google.colab import files
files.download(csv_file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>