# Lab 8: Text Clustering

In [223]:
# Import required libraries
import numpy as np
import re
import string
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter
from gensim.models import Word2Vec

## Exercise 1

In [224]:
# Toy corpus for Exercise 1: five short sentences, one document per entry.
dataset = [
    "I love playing football on the weekends",
    "I enjoy hiking and camping in the mountains",
    "I like to read books and watch movies",
    "I prefer playing video games over sports",
    "I love listening to music and going to concerts",
]

### Data Preprocessing

In [225]:
# Define functions for data preprocessing
def get_cleaned_textdata(sentence):
    """Normalise one raw text string before tokenisation.

    The steps run in this order: HTML-like ``<...>`` tags become a space,
    every character in ``string.punctuation`` becomes a space, each run of
    digits becomes a space, whitespace runs collapse to a single space, and
    the result is lowercased. Leading/trailing whitespace is collapsed to a
    single space but not stripped.

    Parameters
    ----------
    sentence : str
        Raw text of one document.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    # One-to-one table mapping each punctuation character to a space.
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    without_tags = re.sub(r'<.*?>', ' ', sentence)
    without_punctuation = without_tags.translate(punctuation_to_space)
    without_digits = re.sub(r'\d+', ' ', without_punctuation)
    single_spaced = re.sub(r'\s+', ' ', without_digits)
    return single_spaced.lower()

# Stop-word vocabulary consulted by remove_stopwords. It is defined in this
# cell so the function works on a fresh kernel; no other visible cell assigns
# the name.
# NOTE(review): assumes the NLTK English stop-word list is the intended
# vocabulary and that the 'stopwords' corpus is already available locally
# (nltk.download('stopwords')) -- confirm.
stopwords = set(nltk.corpus.stopwords.words('english'))

def remove_stopwords(text):
    """Return the tokens of one document that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The tokens that are absent from the module-level ``stopwords``
        collection, in their original order.
    """
    return [token for token in text if token not in stopwords]

# Shared stemmer instance used by porter_stemming. It is created in this cell
# so the function does not rely on a variable left over from an earlier kernel
# session (no other visible cell assigns it).
porter_stemmer = PorterStemmer()

def porter_stemming(text):
    """Stem every token of one document.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        ``porter_stemmer.stem(token)`` for each token, in order.
    """
    return [porter_stemmer.stem(word) for word in text]

# Shared lemmatizer instance used by lemmatizer(). It is created in this cell
# so the function works on a fresh kernel (no other visible cell assigns it).
# NOTE(review): presumably needs the NLTK 'wordnet' corpus to be available
# locally (nltk.download('wordnet')) -- confirm for the target environment.
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
    """Lemmatize every token of one document.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        ``wordnet_lemmatizer.lemmatize(token)`` for each token, in order.
    """
    return [wordnet_lemmatizer.lemmatize(word) for word in text]

#### Remove Duplicates

In [226]:
# Check number of duplicates in dataset
# `dataset` is a plain list, so wrap it in a Series to use duplicated(); each
# repeat of an earlier sentence is flagged True, and the sum counts the repeats.
dataset_duplicates = pd.Series(dataset).duplicated()
print(dataset_duplicates.sum())

0


#### Data Cleaning and Standardization

In [227]:
# Run the cleaning function over every sentence, then preview the first result.
dataset_clean = list(map(get_cleaned_textdata, dataset))
dataset_clean[0]

'i love playing football on the weekends'

#### Data Tokenization

In [228]:
# Split each cleaned sentence into word tokens, then preview the first document.
# (The variable name keeps its original spelling because later cells read it.)
datset_tokenized = list(map(word_tokenize, dataset_clean))
datset_tokenized[0]

['i', 'love', 'playing', 'football', 'on', 'the', 'weekends']

#### Remove Stopwords

In [229]:
# Filter the stop words out of every tokenized document.
dataset_xstopwords = list(map(remove_stopwords, datset_tokenized))
dataset_xstopwords

[['love', 'playing', 'football', 'weekends'],
 ['enjoy', 'hiking', 'camping', 'mountains'],
 ['like', 'read', 'books', 'watch', 'movies'],
 ['prefer', 'playing', 'video', 'games', 'sports'],
 ['love', 'listening', 'music', 'going', 'concerts']]

#### Stemming

In [230]:
# Reduce every remaining token to its Porter stem, document by document.
dataset_stemm = list(map(porter_stemming, dataset_xstopwords))
dataset_stemm

[['love', 'play', 'footbal', 'weekend'],
 ['enjoy', 'hike', 'camp', 'mountain'],
 ['like', 'read', 'book', 'watch', 'movi'],
 ['prefer', 'play', 'video', 'game', 'sport'],
 ['love', 'listen', 'music', 'go', 'concert']]

#### Lemmatization

In [231]:
# Lemmatize the tokens of each document. The input here is the already-stemmed
# token lists, not the original words.
dataset_lemm = list(map(lemmatizer, dataset_stemm))
dataset_lemm

[['love', 'play', 'footbal', 'weekend'],
 ['enjoy', 'hike', 'camp', 'mountain'],
 ['like', 'read', 'book', 'watch', 'movi'],
 ['prefer', 'play', 'video', 'game', 'sport'],
 ['love', 'listen', 'music', 'go', 'concert']]

In [232]:
# Join the lemmatized words back into sentences (one space-separated string per document)
dataset_cleaned = list(map(' '.join, dataset_lemm))
dataset_cleaned

['love play footbal weekend',
 'enjoy hike camp mountain',
 'like read book watch movi',
 'prefer play video game sport',
 'love listen music go concert']

### TF-IDF VECTORIZER

In [233]:
# Vectorize the dataset
# Learn the vocabulary and IDF weights first, then project the same documents
# into the TF-IDF matrix X.
vectorizer = TfidfVectorizer()
vectorizer.fit(dataset_cleaned)
X = vectorizer.transform(dataset_cleaned)

In [234]:
# Perform clustering
k = 2  # Define the number of clusters
# KMeans starts from random centroids, so an unseeded run can assign different
# cluster ids (or different groupings) on every execution. random_state fixes
# the initialisation; n_init pins the number of restarts, whose library default
# differs between scikit-learn releases.
km = KMeans(n_clusters=k, n_init=10, random_state=42)
km.fit(X)

In [235]:
# Predict the clusters for each document
# X is the same matrix the model was fitted on, so y_pred holds one cluster id
# per training document, in document order.
y_pred = km.predict(X)

In [236]:
# Display the document and its predicted cluster in a table
# The first row is the header; every following row pairs a document with its cluster id.
header_row = ["Document", "Predicted Cluster"]
table_data = [header_row] + [list(pair) for pair in zip(dataset_cleaned, y_pred)]
print(tabulate(table_data, headers="firstrow"))

Document                        Predicted Cluster
----------------------------  -------------------
love play footbal weekend                       1
enjoy hike camp mountain                        0
like read book watch movi                       0
prefer play video game sport                    1
love listen music go concert                    1


In [237]:
# Print top terms per cluster
print("\nTop terms per cluster:")
# For every centroid: vocabulary column indices sorted by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i, ranked_columns in enumerate(order_centroids[:k]):
    print("Cluster %d:" % i)
    # Up to ten highest-weighted terms of this centroid.
    for ind in ranked_columns[:10]:
        print(' %s' % terms[ind])
    print()


Top terms per cluster:
Cluster 0:
 hike
 camp
 enjoy
 mountain
 book
 read
 movi
 watch
 like
 concert

Cluster 1:
 love
 play
 footbal
 weekend
 game
 sport
 music
 concert
 listen
 prefer



In [238]:
# Calculate purity
# NOTE(review): no ground-truth labels are consulted here. Counter(y_pred)
# counts how many documents received each cluster id, so the value printed
# below is (size of the largest cluster) / (number of documents). Standard
# purity compares each cluster against true class labels -- confirm whether
# this metric is what the exercise intends.
total_samples = len(y_pred)
# Single-element list holding the cluster-id -> document-count mapping.
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6


### WORD2VEC VECTORIZER

In [211]:
# Word2Vec expects each document as a list of tokens.
tokenized_dataset = [doc.split() for doc in dataset_cleaned]
# seed + a single worker thread make training repeatable between runs;
# multi-threaded training introduces ordering jitter even with a fixed seed.
# NOTE(review): full cross-session reproducibility may additionally require
# PYTHONHASHSEED to be set -- see the gensim documentation.
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100,
                          window=5, min_count=1, workers=1, seed=42)

In [212]:
# Represent each document by the mean of its word vectors.
# A document with no in-vocabulary tokens (for example an empty string) would
# make np.mean([]) return nan and break the 2-D array, so it falls back to a
# zero vector of the embedding size instead.
embedding_dim = word2vec_model.wv.vector_size
embedding_dtype = word2vec_model.wv.vectors.dtype
doc_vectors = []
for doc in dataset_cleaned:
    word_vectors = [word2vec_model.wv[word] for word in doc.split()
                    if word in word2vec_model.wv]
    if word_vectors:
        doc_vectors.append(np.mean(word_vectors, axis=0))
    else:
        doc_vectors.append(np.zeros(embedding_dim, dtype=embedding_dtype))
X = np.array(doc_vectors)

In [213]:
k = 2  # Define the number of clusters
# Seeded initialisation and an explicit restart count keep the cluster
# assignments stable between runs; unseeded KMeans starts from random centroids.
km = KMeans(n_clusters=k, n_init=10, random_state=42)
km.fit(X)



In [214]:
# Predict the clusters for each document
# X holds the averaged Word2Vec vectors the model was fitted on, so y_pred is
# one cluster id per document, in document order.
y_pred = km.predict(X)

In [215]:
# Tabulate the document and predicted cluster
# Header row first, then one [document, cluster id] row per document.
header_row = ["Document", "Predicted Cluster"]
table_data = [header_row] + [list(pair) for pair in zip(dataset_cleaned, y_pred)]
print(tabulate(table_data, headers="firstrow"))

Document                        Predicted Cluster
----------------------------  -------------------
love play footbal weekend                       1
enjoy hike camp mountain                        0
like read book watch movi                       0
prefer play video game sport                    1
love listen music go concert                    0


In [216]:
# Calculate purity
# NOTE(review): no ground-truth labels are used. Counter(y_pred) counts the
# documents per cluster id, so the printed value is the largest cluster's
# share of all documents rather than purity against true classes -- confirm
# this is the intended metric.
total_samples = len(y_pred)
# Single-element list holding the cluster-id -> document-count mapping.
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6


TF-IDF VECTORIZER
- Before data preprocessing = 0.6
- After data preprocessing = 0.6

WORD2VEC VECTORIZER
- Before data preprocessing = 0.6
- After data preprocessing = 0.6

For both the TF-IDF and Word2Vec vectorization methods, the reported purity is 0.6 before and after data preprocessing. This suggests that the preprocessing steps did not change the clustering outcome for this particular dataset. Because the dataset contains only five short documents, this observation should not be generalized to larger corpora.

## Exercise 2

In [176]:
# Load the customer-complaint records from a CSV file in the working directory.
# The 'text' column of this frame is the input to the Exercise 2 pipeline.
data = pd.read_csv("customer_complaints_1.csv")
# Show the loaded frame as the cell output.
data

Unnamed: 0,author,posted_on,rating,text
0,"Alantae of Chesterfeild, MI","Nov. 22, 2016",1,I used to love Comcast. Until all these consta...
1,"Vera of Philadelphia, PA","Nov. 19, 2016",1,I'm so over Comcast! The worst internet provid...
2,"Sarah of Rancho Cordova, CA","Nov. 17, 2016",1,If I could give them a negative star or no sta...
3,"Dennis of Manchester, NH","Nov. 16, 2016",1,I've had the worst experiences so far since in...
4,"Ryan of Bellevue, WA","Nov. 14, 2016",1,Check your contract when you sign up for Comca...
5,"Terri of Mobile, AL","Nov. 9, 2016",1,Thank God. I am changing to Dish. They gave me...
6,"Kellie of Salt Lake City, UT","Nov. 9, 2016",1,I Have been a long time customer and only have...
7,"Kathleen of New Haven, CT","Nov. 6, 2016",2,There is a malfunction on the DVR manager whic...
8,"Shira of Bloomfield, NJ","Nov. 5, 2016",1,Charges overwhelming. Comcast service rep was ...
9,"Kristy of Alpharetta, GA","Nov. 2, 2016",1,"I have had cable, DISH, and U-verse, etc. in t..."


### Data Preprocessing

In [186]:
# Work on the free-text complaint column only.
data_t = data['text']
# Alias kept so existing references keep working; nothing below reads it.
data_rdup = data_t

# Remove duplicate data
# Missing values are dropped first: the cleaning function passes each value to
# re.sub, which raises TypeError on a non-string such as NaN.
data_t_rdup = data_t.dropna().drop_duplicates(keep='first')

# Get clean data
data_t_clean = data_t_rdup.apply(get_cleaned_textdata)

# Tokenize the data
data_t_tokenized = data_t_clean.apply(word_tokenize)

# Remove stopwords
data_t_xstopwords = data_t_tokenized.apply(remove_stopwords)

# Stemming
data_t_stemmed = data_t_xstopwords.apply(porter_stemming)

# Lemmatization
data_t_lemmatized = data_t_stemmed.apply(lemmatizer)

# Join each token list back into one space-separated string per document.
data_cleaned = [' '.join(sentence) for sentence in data_t_lemmatized]


### TF-IDF VECTORIZER

In [269]:
# Vectorize the dataset
# Learn the vocabulary and IDF weights from the cleaned complaints, then
# project the same documents into the TF-IDF matrix X.
vectorizer = TfidfVectorizer()
vectorizer.fit(data_cleaned)
X = vectorizer.transform(data_cleaned)

In [270]:
# Perform clustering
k = 2  # Define the number of clusters
# Unseeded KMeans starts from random centroids, so cluster ids and groupings
# can change on every run. random_state fixes the initialisation; n_init pins
# the number of restarts, whose default differs between scikit-learn releases.
km = KMeans(n_clusters=k, n_init=10, random_state=42)
km.fit(X)

In [271]:
# Predict the clusters for each document
# X is the TF-IDF matrix the model was fitted on, so y_pred holds one cluster
# id per cleaned complaint, in the order of data_cleaned.
y_pred = km.predict(X)

In [272]:
# Display the document and its predicted cluster in a table
# Complaint texts are long enough to make the table unreadably wide, so each
# document is cut to a short preview; the cluster ids are unaffected.
preview_width = 80
table_data = [["Document", "Predicted Cluster"]]
table_data.extend(
    [doc if len(doc) <= preview_width else doc[:preview_width] + "...", cluster]
    for doc, cluster in zip(data_cleaned, y_pred)
)
print(tabulate(table_data, headers="firstrow"))

Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [273]:
# Print top terms per cluster
print("\nTop terms per cluster:")
# For every centroid: vocabulary column indices sorted by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i, ranked_columns in enumerate(order_centroids[:k]):
    print("Cluster %d:" % i)
    # Up to ten highest-weighted terms of this centroid.
    for ind in ranked_columns[:10]:
        print(' %s' % terms[ind])
    print()


Top terms per cluster:
Cluster 0:
 internet
 cabl
 time
 work
 comcast
 secur
 hour
 sometim
 technician
 promis

Cluster 1:
 servic
 custom
 contract
 rude
 call
 would
 comcast
 speed
 mbp
 told



In [274]:
# Calculate purity
# NOTE(review): no ground-truth labels are used. Counter(y_pred) counts the
# documents per cluster id, so the printed value is the largest cluster's
# share of all documents rather than purity against true classes -- confirm
# this is the intended metric.
total_samples = len(y_pred)
# Single-element list holding the cluster-id -> document-count mapping.
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.6842105263157895


### WORD2VEC VECTORIZER

In [275]:
# Word2Vec expects each document as a list of tokens.
tokenized_dataset = [doc.split() for doc in data_cleaned]
# seed + a single worker thread make training repeatable between runs;
# multi-threaded training introduces ordering jitter even with a fixed seed.
# NOTE(review): full cross-session reproducibility may additionally require
# PYTHONHASHSEED to be set -- see the gensim documentation.
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100,
                          window=5, min_count=1, workers=1, seed=42)

In [276]:
# Represent each complaint by the mean of its word vectors.
# A complaint left with no in-vocabulary tokens after preprocessing would make
# np.mean([]) return nan and break the 2-D array, so it falls back to a zero
# vector of the embedding size instead.
embedding_dim = word2vec_model.wv.vector_size
embedding_dtype = word2vec_model.wv.vectors.dtype
doc_vectors = []
for doc in data_cleaned:
    word_vectors = [word2vec_model.wv[word] for word in doc.split()
                    if word in word2vec_model.wv]
    if word_vectors:
        doc_vectors.append(np.mean(word_vectors, axis=0))
    else:
        doc_vectors.append(np.zeros(embedding_dim, dtype=embedding_dtype))
X = np.array(doc_vectors)

In [277]:
k = 2  # Define the number of clusters
# Seeded initialisation and an explicit restart count keep the cluster
# assignments stable between runs; unseeded KMeans starts from random centroids.
km = KMeans(n_clusters=k, n_init=10, random_state=42)
km.fit(X)



In [278]:
# Predict the clusters for each document
# X holds the averaged Word2Vec vectors the model was fitted on, so y_pred is
# one cluster id per cleaned complaint, in the order of data_cleaned.
y_pred = km.predict(X)

In [279]:
# Tabulate the document and predicted cluster
# Complaint texts are long enough to make the table unreadably wide, so each
# document is cut to a short preview; the cluster ids are unaffected.
preview_width = 80
table_data = [["Document", "Predicted Cluster"]]
table_data.extend(
    [doc if len(doc) <= preview_width else doc[:preview_width] + "...", cluster]
    for doc, cluster in zip(data_cleaned, y_pred)
)
print(tabulate(table_data, headers="firstrow"))

Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [280]:
# Calculate purity
# NOTE(review): no ground-truth labels are used. Counter(y_pred) counts the
# documents per cluster id, so the printed value is the largest cluster's
# share of all documents rather than purity against true classes -- confirm
# this is the intended metric.
total_samples = len(y_pred)
# Single-element list holding the cluster-id -> document-count mapping.
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples
print("Purity:", purity)

Purity: 0.631578947368421
