# Lab 8: Text Clustering

In [7]:
# Import required libraries
import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from tabulate import tabulate
from collections import Counter

## Using Dataset

In [8]:
# Create documents
dataset = ["I love playing football on the weekends",
"I enjoy hiking and camping in the mountains",
"I like to read books and watch movies",
"I prefer playing video games over sports",
"I love listening to music and going to concerts"]

In [9]:
# Vectorize the dataset
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)

In [10]:
# Perform clustering
k = 2 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)



In [11]:
# Predict the clusters for each document
y_pred = km.predict(X)

In [12]:
# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(dataset, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                                           Predicted Cluster
-----------------------------------------------  -------------------
I love playing football on the weekends                            0
I enjoy hiking and camping in the mountains                        0
I like to read books and watch movies                              1
I prefer playing video games over sports                           0
I love listening to music and going to concerts                    1


In [13]:
# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()


Top terms per cluster:
Cluster 0:
 playing
 the
 weekends
 on
 football
 video
 sports
 prefer
 over
 games

Cluster 1:
 to
 and
 read
 watch
 movies
 like
 books
 concerts
 going
 music



## Using Preprocessing Dataset

In [14]:
# Import required libraries
import pandas as pd

In [15]:
data = pd.read_csv("customer_complaints_1.csv")
data

Unnamed: 0,author,posted_on,rating,text
0,"Alantae of Chesterfeild, MI","Nov. 22, 2016",1,I used to love Comcast. Until all these consta...
1,"Vera of Philadelphia, PA","Nov. 19, 2016",1,I'm so over Comcast! The worst internet provid...
2,"Sarah of Rancho Cordova, CA","Nov. 17, 2016",1,If I could give them a negative star or no sta...
3,"Dennis of Manchester, NH","Nov. 16, 2016",1,I've had the worst experiences so far since in...
4,"Ryan of Bellevue, WA","Nov. 14, 2016",1,Check your contract when you sign up for Comca...
5,"Terri of Mobile, AL","Nov. 9, 2016",1,Thank God. I am changing to Dish. They gave me...
6,"Kellie of Salt Lake City, UT","Nov. 9, 2016",1,I Have been a long time customer and only have...
7,"Kathleen of New Haven, CT","Nov. 6, 2016",2,There is a malfunction on the DVR manager whic...
8,"Shira of Bloomfield, NJ","Nov. 5, 2016",1,Charges overwhelming. Comcast service rep was ...
9,"Kristy of Alpharetta, GA","Nov. 2, 2016",1,"I have had cable, DISH, and U-verse, etc. in t..."


In [16]:
# Check number of rows and columns
data.shape

(19, 4)

### Data Preprocessing

#### Remove Duplicates

In [17]:
data_t = data['text']
data_t.head()

0    I used to love Comcast. Until all these consta...
1    I'm so over Comcast! The worst internet provid...
2    If I could give them a negative star or no sta...
3    I've had the worst experiences so far since in...
4    Check your contract when you sign up for Comca...
Name: text, dtype: object

In [18]:
# Check number of duplicates in 'Text' column
data_t_duplicates = data_t.duplicated()
print(data_t_duplicates.sum())

0


#### Data Cleaning & Standardization

In [19]:
# Import required libraries for data cleaning and standardization
import re
import string

In [20]:
def get_cleaned_textdata(sentence):
    modified_sentence = re.sub(r'<.*?>',' ', sentence)
    modified_sentence = ''.join([i if i not in string.punctuation else ' ' for i in modified_sentence])
    modified_sentence = re.sub(r'\d+', ' ', modified_sentence)
    modified_sentence = re.sub(r'\s+', ' ', modified_sentence)
    modified_sentence = modified_sentence.lower()
    return modified_sentence

In [21]:
data_t_clean = data_t.apply(get_cleaned_textdata)

In [22]:
print(data_t[0])

I used to love Comcast. Until all these constant updates. My internet and cable crash a lot at night, and sometimes during the day, some channels don't even work and on demand sometimes don't play either. I wish they will do something about it. Because just a few mins ago, the internet have crashed for about 20 mins for no reason. I'm tired of it and thinking about switching to Wow or something. Please do not get Xfinity.


#### Tokenization

In [23]:
import nltk
from nltk.tokenize import word_tokenize

In [24]:
data_token = data_t_clean.apply(word_tokenize)

In [25]:
# Check result of tokenization
print(data_token[0])

['i', 'used', 'to', 'love', 'comcast', 'until', 'all', 'these', 'constant', 'updates', 'my', 'internet', 'and', 'cable', 'crash', 'a', 'lot', 'at', 'night', 'and', 'sometimes', 'during', 'the', 'day', 'some', 'channels', 'don', 't', 'even', 'work', 'and', 'on', 'demand', 'sometimes', 'don', 't', 'play', 'either', 'i', 'wish', 'they', 'will', 'do', 'something', 'about', 'it', 'because', 'just', 'a', 'few', 'mins', 'ago', 'the', 'internet', 'have', 'crashed', 'for', 'about', 'mins', 'for', 'no', 'reason', 'i', 'm', 'tired', 'of', 'it', 'and', 'thinking', 'about', 'switching', 'to', 'wow', 'or', 'something', 'please', 'do', 'not', 'get', 'xfinity']


#### Remove Stopwords

In [26]:
stopwords = nltk.corpus.stopwords.words('english')

In [27]:
def remove_stopwords(text):
    output = []
    for i in text:
        if i not in stopwords:
            output.append(i)
    return output

In [28]:
data_xstopwords = data_token.apply(remove_stopwords)

In [29]:
print(data_xstopwords[0])

['used', 'love', 'comcast', 'constant', 'updates', 'internet', 'cable', 'crash', 'lot', 'night', 'sometimes', 'day', 'channels', 'even', 'work', 'demand', 'sometimes', 'play', 'either', 'wish', 'something', 'mins', 'ago', 'internet', 'crashed', 'mins', 'reason', 'tired', 'thinking', 'switching', 'wow', 'something', 'please', 'get', 'xfinity']


#### Stemming

In [30]:
from nltk.stem.porter import PorterStemmer

In [31]:
porter_stemmer = PorterStemmer()

In [32]:
def porter_stemming(text):
    stem_text = []
    for word in text:
        stemmed_word = porter_stemmer.stem(word)
        stem_text.append(stemmed_word)
    return stem_text

In [33]:
data_porterstem = data_xstopwords.apply(porter_stemming)

In [34]:
print(data_porterstem[0])

['use', 'love', 'comcast', 'constant', 'updat', 'internet', 'cabl', 'crash', 'lot', 'night', 'sometim', 'day', 'channel', 'even', 'work', 'demand', 'sometim', 'play', 'either', 'wish', 'someth', 'min', 'ago', 'internet', 'crash', 'min', 'reason', 'tire', 'think', 'switch', 'wow', 'someth', 'pleas', 'get', 'xfiniti']


#### Lemmatization

In [35]:
from nltk.stem import WordNetLemmatizer

In [36]:
wordnet_lemmatizer = WordNetLemmatizer()

In [37]:
def lemmatizer(text):
    lemm_text = []
    for word in text:
        lemmatized_word = wordnet_lemmatizer.lemmatize(word)
        lemm_text.append(lemmatized_word)
    return lemm_text

In [44]:
data_lemm = data_porterstem.apply(lemmatizer)

In [47]:
print(data_lemm)

0     [use, love, comcast, constant, updat, internet...
1     [comcast, worst, internet, provid, take, onlin...
2     [could, give, neg, star, star, review, would, ...
3     [worst, experi, far, sinc, instal, noth, probl...
4     [check, contract, sign, comcast, advertis, off...
5     [thank, god, chang, dish, gave, awesom, price,...
6     [long, time, custom, xfiniti, isp, local, walm...
7     [malfunct, dvr, manag, prevent, u, ad, record,...
8     [charg, overwhelm, comcast, servic, rep, ignor...
9     [cabl, dish, u, vers, etc, past, eh, know, com...
10    [tell, new, custom, run, nowher, run, tri, tur...
11    [disappoint, comcast, xfiniti, custom, almost,...
12    [peopl, uneth, disturb, oblivi, custom, need, ...
13    [unplan, unexpect, day, outag, rude, servic, r...
14    [warn, hidden, fee, sign, servic, charg, extra...
15    [comcast, overal, terribl, experi, everyon, el...
16    [call, infin, custom, servic, center, complain...
17    [outrag, take, month, get, internet, servi

### Data Process

In [56]:
lemmatized_text = [' '.join(tokens) for tokens in data_lemm]

In [58]:
# Vectorize the dataset
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(lemmatized_text)

In [63]:
# Perform clustering
k = 3 # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

In [64]:
# Predict the clusters for each document
y_pred = km.predict(X)

In [65]:
# Display the document and its predicted cluster in a table
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(lemmatized_text, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [67]:
# Print top terms per cluster
print("\nTop terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()


Top terms per cluster:
Cluster 0:
 contract
 mbp
 sign
 told
 chang
 blast
 xfiniti
 custom
 fee
 know

Cluster 1:
 servic
 rude
 call
 second
 comcast
 would
 sinc
 investig
 custom
 box

Cluster 2:
 internet
 day
 servic
 comcast
 speed
 tech
 technician
 cabl
 time
 week

