In [None]:
!pip install scikit-network
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel, pipeline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sknetwork.utils import get_membership
from sklearn.decomposition import TruncatedSVD
from IPython.display import SVG
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sknetwork.clustering import Louvain, get_modularity
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from wordcloud import WordCloud

from sknetwork.hierarchy import LouvainHierarchy
from sknetwork.hierarchy import cut_straight, dasgupta_score, tree_sampling_divergence
from sknetwork.visualization import svg_graph, svg_dendrogram

Collecting scikit-network
  Downloading scikit_network-0.32.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-network
Successfully installed scikit-network-0.32.1


In [None]:
# Load the dataset
test = pd.read_csv("test.csv")

# Initialize the tokenizer and model from the pre-trained 'philschmid/BERT-Banking77'
model_id = 'philschmid/BERT-Banking77'
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Same checkpoint loaded twice: `model` (AutoModel) exposes hidden states for embeddings,
# `classification_model` (AutoModelForSequenceClassification) backs the classifier pipeline.
model = AutoModel.from_pretrained(model_id)
classification_model = AutoModelForSequenceClassification.from_pretrained(model_id)
classifier = pipeline('text-classification', model=classification_model, tokenizer=tokenizer)

def get_bert_embeddings(text):
    """Return L2-normalized first-token embeddings for the given text(s).

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        text: Input passed straight to the tokenizer (a string or a list of
            strings); inputs are padded and truncated by the tokenizer.

    Returns:
        A 2-D NumPy array with one row per input text. Each row is the
        last-layer hidden state at token position 0 (the [CLS] position for
        BERT-style tokenizers), scaled to unit L2 norm.
    """
    import torch  # local import: the notebook's import cell does not import torch itself

    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    # Inference only: disabling autograd avoids recording a graph for every
    # forward pass, which saves memory and time without changing the values.
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :].detach().numpy()
    normalized_embeddings = normalize(embeddings, axis=1, norm='l2')  # unit-length rows
    return normalized_embeddings

# Apply the function to the 'text' column.
# Each text is wrapped in a one-element list and embedded in its own forward pass;
# the trailing [0] unwraps that single row, so every cell holds one 1-D vector.
test['embeddings'] = test['text'].apply(lambda x: get_bert_embeddings([x])[0])

# Show the DataFrame with embeddings
print(test[['text', 'embeddings']])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.95k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

                                                   text  \
0                              How do I locate my card?   
1     I still have not received my new card, I order...   
2     I ordered a card but it has not arrived. Help ...   
3      Is there a way to know when my card will arrive?   
4                          My card has not arrived yet.   
...                                                 ...   
3075      If i'm not in the UK, can I still get a card?   
3076                 How many countries do you support?   
3077              What countries do you do business in?   
3078             What are the countries you operate in.   
3079         Can the card be mailed and used in Europe?   

                                             embeddings  
0     [0.025442068, -0.03386337, 0.0026843003, 0.007...  
1     [0.010722405, 0.007851422, 0.07274575, -0.0132...  
2     [0.0123235015, -0.010357805, 0.07911538, 0.011...  
3     [-0.0020086144, -0.0009892675, 0.08350486, -0....  
4

In [None]:
# Stack the per-row embedding vectors into a single 2-D matrix (one row per text)
embedding_matrix = np.vstack(test['embeddings'])

# Pairwise dot products between all rows; the rows were L2-normalized when they
# were produced, so these values act as cosine similarities.
similarity_matrix = embedding_matrix @ embedding_matrix.T

# Drop weak links in place: every similarity below 0.3 becomes 0.
# NOTE(review): the diagonal (self-similarity) is kept, so each node carries a
# self-loop in the graph -- confirm that is intended for the clustering below.
np.putmask(similarity_matrix, similarity_matrix < 0.3, 0)

# Sparse weighted adjacency matrix consumed by the Louvain cells
adjacency = csr_matrix(similarity_matrix)

### **Louvain**

In [None]:
import numpy as np
from sknetwork.clustering import Louvain, get_modularity


# Loop over a range of resolution parameters to find the one that optimizes the modularity
best_resolution = 1.0
best_modularity = float('-inf')
best_labels = None # No clusters have been formed yet

# 1000 evenly spaced resolutions in [0.01, 3]; one full Louvain fit per value.
for resolution in np.linspace(0.01, 3, 1000):  
    louvain = Louvain(resolution=resolution)
    labels = louvain.fit_predict(adjacency)
    # get_modularity receives no resolution argument, so every candidate partition
    # is scored with the same call regardless of the resolution used to fit it.
    # NOTE(review): presumably this is the library's default resolution -- confirm.
    modularity = get_modularity(adjacency, labels)

    # Strict '>' keeps the first resolution that reaches the maximum.
    if modularity > best_modularity:
        best_modularity = modularity
        best_resolution = resolution
        best_labels = labels

# After the loop, `louvain`, `labels` and `modularity` hold the LAST iteration's
# values, not the best ones; use the best_* variables for the winning partition.
print("Best resolution parameter:", best_resolution)
print("Best modularity:", best_modularity)

# Size of each community in the best partition
label_counter = Counter(best_labels)
print("Number of nodes in each community:", label_counter)


Best resolution parameter: 1.1892392392392395
Best modularity: 0.8151918641559894
Number of nodes in each community: Counter({0: 283, 1: 243, 2: 242, 3: 229, 4: 203, 5: 164, 6: 163, 7: 159, 8: 125, 11: 121, 10: 121, 9: 121, 12: 120, 13: 119, 14: 116, 15: 111, 16: 86, 17: 82, 18: 74, 19: 43, 21: 40, 20: 40, 22: 38, 23: 37})


In [None]:
# Apply the Louvain method
# Resolution 12.8 is hard-coded and lies outside the 0.01-3 range swept in the
# previous cell. This rebinds the kernel-level `louvain` and `labels`.
louvain = Louvain(resolution=12.8)
labels = louvain.fit_predict(adjacency)
# Cluster ids and how many nodes fall in each
labels_unique, counts = np.unique(labels, return_counts=True)
print(labels_unique, counts)

# Calculate modularity
modularity = get_modularity(adjacency, labels)
print("Modularity at resolution 12.8:", modularity)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76] [82 80 77 76 76 76 74 72 49 45 45 45 45 45 44 44 44 44 43 43 43 43 43 43
 43 42 42 42 42 41 41 41 41 40 40 40 40 40 40 40 40 40 39 39 39 39 39 39
 39 39 39 39 39 39 38 38 38 38 38 37 37 37 36 36 36 36 35 35 33  1  1  1
  1  1  1  1  1]
Modularity at resolution 12.8: 0.6773951157037603


In [None]:
def print_category_distribution_by_cluster(labels, data):
    """Print, for every cluster, how many rows of each category it contains.

    Args:
        labels: Cluster id per row of `data`, in row order (array-like; a plain
            list is accepted). Must have the same length as `data`.
        data: DataFrame with a 'category' column. Rows are matched to `labels`
            by position, not by index value.

    Returns:
        None. One line per cluster is printed as ``<cluster_id>: {category: count, ...}``,
        clusters in ascending id order and categories by descending count.
        If `data` has no 'category' column a message is printed instead.
    """
    if 'category' not in data.columns:
        print("The dataset must have a 'category' column.")
        return

    # Coerce to an ndarray so `labels == label` is an element-wise mask even
    # when the caller passes a plain Python list.
    labels = np.asarray(labels)

    cluster_category_distribution = {}
    for label in np.unique(labels):
        # Positional boolean selection of the rows assigned to this cluster
        cluster_data = data.iloc[labels == label]
        cluster_category_distribution[label] = cluster_data['category'].value_counts().to_dict()

    for cluster_id, counts in cluster_category_distribution.items():
        print(f"{cluster_id}: {counts}")

# Per-cluster category breakdown for the `labels` currently held in the kernel
print_category_distribution_by_cluster(labels, test)

0: {'verify_my_identity': 40, 'why_verify_identity': 39, 'unable_to_verify_identity': 2, 'lost_or_stolen_card': 1}
1: {'getting_virtual_card': 39, 'get_disposable_virtual_card': 36, 'disposable_card_limits': 3, 'getting_spare_card': 1, 'order_physical_card': 1}
2: {'failed_transfer': 38, 'declined_transfer': 31, 'beneficiary_not_allowed': 4, 'contactless_not_working': 1, 'transfer_not_received_by_recipient': 1, 'pending_transfer': 1, 'balance_not_updated_after_bank_transfer': 1}
3: {'card_delivery_estimate': 39, 'card_arrival': 37}
4: {'transfer_timing': 36, 'pending_transfer': 35, 'balance_not_updated_after_bank_transfer': 4, 'transfer_not_received_by_recipient': 1}
5: {'top_up_failed': 38, 'top_up_reverted': 36, 'pending_top_up': 1, 'topping_up_by_card': 1}
6: {'get_physical_card': 39, 'pin_blocked': 34, 'card_not_working': 1}
7: {'balance_not_updated_after_cheque_or_cash_deposit': 37, 'top_up_by_cash_or_cheque': 34, 'declined_transfer': 1}
8: {'declined_card_payment': 38, 'reverted_

In [None]:
import numpy as np
import pandas as pd
from collections import Counter

def print_single_text_clusters(labels, data):
    """Print the text of every row that is the only member of its cluster.

    Args:
        labels: Cluster id per row of `data`, in row order.
        data: DataFrame with a 'text' column.

    Returns:
        None. Prints one numbered line per singleton cluster, in the row
        order of `data`.
    """
    # Align the labels with the frame's own index so the mask below selects
    # the matching rows.
    row_labels = pd.Series(labels, index=data.index)

    # Ids of clusters that occur exactly once
    singleton_ids = [cluster_id for cluster_id, size in Counter(labels).items() if size == 1]

    lonely_texts = data.loc[row_labels.isin(singleton_ids), 'text']

    for position, text in enumerate(lonely_texts, start=1):
        print(f"Cluster with single element #{position}: {text}")

# Applying the clustering logic
# Refits Louvain with the same hard-coded resolution (12.8) used in the earlier
# cell and rebinds the kernel-level `louvain` and `labels`.
louvain = Louvain(resolution=12.8)
labels = louvain.fit_predict(adjacency)

# Call the function with labels and dataset
print_single_text_clusters(labels, test)


Cluster with single element #1: Why did I not get my cash back after I withdrew?
Cluster with single element #2: Why does my account not accept cash deposits?
Cluster with single element #3: Is there places where I can't withdraw money?
Cluster with single element #4: Hi, My card withdrawal was declined this morning. It was working fine till yesterday. Please check and inform me.
Cluster with single element #5: The ATM isn't giving out any money.
Cluster with single element #6: Oh my goodness, my card has been declined twice at ATM! I tried two different ATM, but each one declined my card! Can you tell me what's going on with my account?
Cluster with single element #7: WTF??? I tried to withdraw some money at a Metro bank on High St. Kensington and without any notice it disappeared in the machine. The bank was already closed so I couldn't do anything. How do I get it back?
Cluster with single element #8: how come i can't find anywhere to load using cash


# Andres, Start from HERE!
Running the cells below generates a new CSV file (but without `error reasons` or `topics`).

In [None]:
import pandas as pd
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModel
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
from sknetwork.clustering import Louvain

# 1. Load the data and initialize the models
# NOTE: this rebinds `test`, `tokenizer`, `model` and `classifier` from the earlier cells.
test = pd.read_csv("test.csv")
model_id = 'philschmid/BERT-Banking77'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)  # base model, used for embeddings
# Here the pipeline is given the checkpoint id and loads the classification model itself.
classifier = pipeline('text-classification', model=model_id, tokenizer=tokenizer)

# 2. Compute BERT embeddings and classification predictions
def get_bert_embeddings_and_predictions(text):
    """Embed a text and classify it with the fine-tuned checkpoint.

    Uses the notebook-level `tokenizer`, `model` and `classifier`.

    Args:
        text: Input passed to both the tokenizer and the classification
            pipeline (the notebook calls this with one string at a time).

    Returns:
        A 3-tuple ``(normalized_embeddings, label, score)``:
        - normalized_embeddings: 2-D NumPy array, one L2-normalized row per
          input, taken from the last hidden state at token position 0.
        - label: the 'label' field of the pipeline's first result.
        - score: the 'score' field of the pipeline's first result.
    """
    import torch  # local import: the cell's import list does not import torch itself

    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :].detach().numpy()  # Get embeddings from AutoModel
    normalized_embeddings = normalize(embeddings, axis=1, norm='l2')
    # Truncate here as well so the classifier sees the same (possibly shortened)
    # input as the embedding model instead of failing on an over-long text.
    classification_result = classifier(text, truncation=True)[0]  # Use pipeline to get classification
    return normalized_embeddings, classification_result['label'], classification_result['score']

# Apply the function above to the DataFrame.
# Each returned 3-tuple is wrapped in a pd.Series so that .apply() expands it
# into the three target columns.
test[['embeddings', 'predicted_label', 'predicted_confidence']] = test['text'].apply(
    lambda x: pd.Series(get_bert_embeddings_and_predictions(x))
)

In [None]:
# 3. Compute the similarity matrix and build the adjacency matrix
# Each 'embeddings' cell holds one normalized row; stack them into one 2-D matrix.
embedding_matrix = np.vstack(test['embeddings'])
# Pairwise dot products of unit-length rows, i.e. cosine similarities
similarity_matrix = embedding_matrix @ embedding_matrix.T
# Threshold (tunable): similarities below 0.3 are zeroed in place
np.putmask(similarity_matrix, similarity_matrix < 0.3, 0)
adjacency = csr_matrix(similarity_matrix)

# 4. Cluster with the Louvain algorithm and compute a per-row confidence
louvain = Louvain(resolution=12.8, return_probs=True)
labels = louvain.fit_predict(adjacency)
test['cluster_label'] = labels
# Cluster confidence: the largest value in each row of louvain.probs_.
# NOTE(review): presumably probs_ holds per-node cluster membership
# probabilities (requested via return_probs=True) -- confirm against the library docs.
test['cluster_confidence'] = louvain.probs_.max(axis=1).toarray().flatten()

# 5. Find the primary (most frequent) category of each cluster
def find_primary_labels(data, labels_column='cluster_label', category_column='category'):
    """Map every cluster id to its most frequent category.

    Args:
        data: DataFrame containing `labels_column` and `category_column`.
        labels_column: Name of the column holding cluster ids.
        category_column: Name of the column holding the category of each row.

    Returns:
        dict mapping each distinct value of `labels_column` to the mode of
        `category_column` within that cluster. On ties the first value
        returned by ``Series.mode()`` is used. A cluster with no non-null
        category maps to ``None`` instead of raising.
    """
    primary_labels = {}
    for label in data[labels_column].unique():
        modes = data.loc[data[labels_column] == label, category_column].mode()
        # mode() drops nulls, so it is empty when the cluster has no usable
        # category; indexing it with [0] would raise KeyError in that case.
        primary_labels[label] = modes.iloc[0] if not modes.empty else None
    return primary_labels

primary_labels = find_primary_labels(test)
test['primary_label'] = test['cluster_label'].map(primary_labels)  # replace each cluster id with its primary category name

# 6. Flag potentially mislabeled items (intersection method):
# a row is flagged only when BOTH the cluster's primary category and the
# classifier's prediction disagree with the given category.
test['mislabeled'] = (test['primary_label'] != test['category']) & (test['predicted_label'] != test['category'])

# 7. Compute the confidence difference (classifier score minus cluster confidence)
test['confidence_difference'] = test['predicted_confidence'] - test['cluster_confidence']

# Select the columns to output
output_columns = ['text', 'category', 'predicted_label', 'predicted_confidence', 'primary_label', 'cluster_confidence', 'confidence_difference']

# Filter the potentially mislabeled items and write them to CSV
mislabeled_items = test[test['mislabeled']]
mislabeled_items[output_columns].to_csv('mislabeled_items_analysis.csv', index=False)

print("Analysis completed and saved to 'mislabeled_items_analysis.csv'.")

Analysis completed and saved to 'mislabeled_items_analysis.csv'.
