In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from collections import Counter
from tabulate import tabulate
import numpy as np
import pandas as pd
import re
from nltk.stem import PorterStemmer

In [2]:
# Load the dataset
file_path = 'customer_complaints_1.csv'
df = pd.read_csv(file_path)

# Display first few rows to understand the structure
df.head()

Unnamed: 0,author,posted_on,rating,text
0,"Alantae of Chesterfeild, MI","Nov. 22, 2016",1,I used to love Comcast. Until all these consta...
1,"Vera of Philadelphia, PA","Nov. 19, 2016",1,I'm so over Comcast! The worst internet provid...
2,"Sarah of Rancho Cordova, CA","Nov. 17, 2016",1,If I could give them a negative star or no sta...
3,"Dennis of Manchester, NH","Nov. 16, 2016",1,I've had the worst experiences so far since in...
4,"Ryan of Bellevue, WA","Nov. 14, 2016",1,Check your contract when you sign up for Comca...


In [3]:
import nltk

# Re-import necessary NLTK components
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Initialize stopwords and stemmer
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Text preprocessing function
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove stopwords
    tokens = text.split()
    tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply the preprocessing to the 'text' column
df['cleaned_text'] = df['text'].apply(clean_text)

# Display the cleaned text for the first few rows
df[['text', 'cleaned_text']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,cleaned_text
0,I used to love Comcast. Until all these consta...,use love comcast constant updat internet cabl ...
1,I'm so over Comcast! The worst internet provid...,im comcast worst internet provid im take onlin...
2,If I could give them a negative star or no sta...,could give neg star star review would never wo...
3,I've had the worst experiences so far since in...,ive worst experi far sinc instal noth problem ...
4,Check your contract when you sign up for Comca...,check contract sign comcast advertis offer mat...


In [4]:
# TEXT CLUSTERING USING TF-IDF VECTORIZER

In [5]:
# Vectorize the dataset
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])

In [6]:
# Perform clustering
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

# Predict the clusters for each document
y_pred = km.predict(X)

# Display the document and its predicted cluster in a table
table_data = [["Cleaned Text", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(df['cleaned_text'], y_pred)])
print(tabulate(table_data, headers="firstrow"))

# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()

Cleaned Text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

In [7]:
#Evaluate results
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples

print("Purity:", purity)

Purity: 0.7894736842105263


In [8]:
# TEXT CLUSTERING USING WORD2VEC VECTORIZER

In [9]:
# Train Word2Vec model
tokenized_dataset = [doc.split() for doc in df['cleaned_text']]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, window=5, min_count=1, workers=4)

In [10]:
# Create document embeddings
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in word2vec_model.wv], axis=0) for doc in df['cleaned_text']])

In [11]:
# Perform clustering
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

# Predict the clusters for each document
y_pred = km.predict(X)

# Tabulate the document and predicted cluster
table_data = [["Cleaned Text", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(df['cleaned_text'], y_pred)])
print(tabulate(table_data, headers="firstrow"))

Cleaned Text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            



In [12]:
#Evaluate results
# Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples

print("Purity:", purity)

Purity: 0.9473684210526315
