In [5]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter 

df = pd.read_csv("customer_complaints_1.csv")   

dataset = df['text'].tolist()

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

k = 5
km = KMeans(n_clusters=k)
km.fit(X)

y_pred = km.predict(X)

table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])

table_output = tabulate(table_data, headers="firstrow", tablefmt="grid")

top_terms_per_cluster = []
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    cluster_terms = []
    for ind in order_centroids[i, :10]:
        cluster_terms.append(terms[ind])
    top_terms_per_cluster.append(f"Cluster {i}: {', '.join(cluster_terms)}")

total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples

table_output, top_terms_per_cluster, purity

print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(k):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]: 
        print(f' {terms[ind]}')
    print()

total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]

purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)


Top terms per cluster:
Cluster 0:
 me
 to
 rude
 with
 was
 rep
 am
 people
 my
 comcast

Cluster 1:
 the
 to
 and
 for
 it
 you
 my
 they
 was
 internet

Cluster 2:
 that
 is
 malfunction
 protocol
 investigating
 from
 since
 customer
 the
 their

Cluster 3:
 second
 boxes
 floor
 possible
 account
 adding
 the
 not
 to
 no

Cluster 4:
 day
 on
 failure
 reps
 dropping
 relying
 outages
 unplanned
 totally
 rearranging

Purity: 0.7368421052631579
