# LAB 8 - TEXT CLUSTERING (Word2Vec)

## load dataset

In [1]:
#Import the libraries
import pandas as pd
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from tabulate import tabulate
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
import numpy as np

In [2]:
#Load the data
df = pd.read_csv('customer_complaints_1.csv')
df

Unnamed: 0,author,posted_on,rating,text
0,"Alantae of Chesterfeild, MI","Nov. 22, 2016",1,I used to love Comcast. Until all these consta...
1,"Vera of Philadelphia, PA","Nov. 19, 2016",1,I'm so over Comcast! The worst internet provid...
2,"Sarah of Rancho Cordova, CA","Nov. 17, 2016",1,If I could give them a negative star or no sta...
3,"Dennis of Manchester, NH","Nov. 16, 2016",1,I've had the worst experiences so far since in...
4,"Ryan of Bellevue, WA","Nov. 14, 2016",1,Check your contract when you sign up for Comca...
5,"Terri of Mobile, AL","Nov. 9, 2016",1,Thank God. I am changing to Dish. They gave me...
6,"Kellie of Salt Lake City, UT","Nov. 9, 2016",1,I Have been a long time customer and only have...
7,"Kathleen of New Haven, CT","Nov. 6, 2016",2,There is a malfunction on the DVR manager whic...
8,"Shira of Bloomfield, NJ","Nov. 5, 2016",1,Charges overwhelming. Comcast service rep was ...
9,"Kristy of Alpharetta, GA","Nov. 2, 2016",1,"I have had cable, DISH, and U-verse, etc. in t..."


In [3]:
columns_to_drop = df.columns[df.columns != 'text']
df.drop(columns=columns_to_drop, inplace=True)

In [4]:
df

Unnamed: 0,text
0,I used to love Comcast. Until all these consta...
1,I'm so over Comcast! The worst internet provid...
2,If I could give them a negative star or no sta...
3,I've had the worst experiences so far since in...
4,Check your contract when you sign up for Comca...
5,Thank God. I am changing to Dish. They gave me...
6,I Have been a long time customer and only have...
7,There is a malfunction on the DVR manager whic...
8,Charges overwhelming. Comcast service rep was ...
9,"I have had cable, DISH, and U-verse, etc. in t..."


# PREPROCESSING

## remove punctuation

In [6]:
def remove_punctuation(text):
    punctuation_free = ""
    
    for i in text:
        if i not in string.punctuation:
            punctuation_free += i
            
    return punctuation_free

df['Clean_Punctuation'] = df['text'].apply(remove_punctuation)
df

Unnamed: 0,text,Clean_Punctuation
0,I used to love Comcast. Until all these consta...,I used to love Comcast Until all these constan...
1,I'm so over Comcast! The worst internet provid...,Im so over Comcast The worst internet provider...
2,If I could give them a negative star or no sta...,If I could give them a negative star or no sta...
3,I've had the worst experiences so far since in...,Ive had the worst experiences so far since ins...
4,Check your contract when you sign up for Comca...,Check your contract when you sign up for Comca...
5,Thank God. I am changing to Dish. They gave me...,Thank God I am changing to Dish They gave me a...
6,I Have been a long time customer and only have...,I Have been a long time customer and only have...
7,There is a malfunction on the DVR manager whic...,There is a malfunction on the DVR manager whic...
8,Charges overwhelming. Comcast service rep was ...,Charges overwhelming Comcast service rep was s...
9,"I have had cable, DISH, and U-verse, etc. in t...",I have had cable DISH and Uverse etc in the pa...


## convert to lower case

In [7]:
df['Clean_Lower'] = df['Clean_Punctuation'].str.lower()
df

Unnamed: 0,text,Clean_Punctuation,Clean_Lower
0,I used to love Comcast. Until all these consta...,I used to love Comcast Until all these constan...,i used to love comcast until all these constan...
1,I'm so over Comcast! The worst internet provid...,Im so over Comcast The worst internet provider...,im so over comcast the worst internet provider...
2,If I could give them a negative star or no sta...,If I could give them a negative star or no sta...,if i could give them a negative star or no sta...
3,I've had the worst experiences so far since in...,Ive had the worst experiences so far since ins...,ive had the worst experiences so far since ins...
4,Check your contract when you sign up for Comca...,Check your contract when you sign up for Comca...,check your contract when you sign up for comca...
5,Thank God. I am changing to Dish. They gave me...,Thank God I am changing to Dish They gave me a...,thank god i am changing to dish they gave me a...
6,I Have been a long time customer and only have...,I Have been a long time customer and only have...,i have been a long time customer and only have...
7,There is a malfunction on the DVR manager whic...,There is a malfunction on the DVR manager whic...,there is a malfunction on the dvr manager whic...
8,Charges overwhelming. Comcast service rep was ...,Charges overwhelming Comcast service rep was s...,charges overwhelming comcast service rep was s...
9,"I have had cable, DISH, and U-verse, etc. in t...",I have had cable DISH and Uverse etc in the pa...,i have had cable dish and uverse etc in the pa...


## remove numbers and dash (-)

In [8]:
def remove_numbers(text):
        return re.sub("[\d-]",'',text)
    
df['Clean_Number'] = df['Clean_Lower'].apply(remove_numbers)
df

Unnamed: 0,text,Clean_Punctuation,Clean_Lower,Clean_Number
0,I used to love Comcast. Until all these consta...,I used to love Comcast Until all these constan...,i used to love comcast until all these constan...,i used to love comcast until all these constan...
1,I'm so over Comcast! The worst internet provid...,Im so over Comcast The worst internet provider...,im so over comcast the worst internet provider...,im so over comcast the worst internet provider...
2,If I could give them a negative star or no sta...,If I could give them a negative star or no sta...,if i could give them a negative star or no sta...,if i could give them a negative star or no sta...
3,I've had the worst experiences so far since in...,Ive had the worst experiences so far since ins...,ive had the worst experiences so far since ins...,ive had the worst experiences so far since ins...
4,Check your contract when you sign up for Comca...,Check your contract when you sign up for Comca...,check your contract when you sign up for comca...,check your contract when you sign up for comca...
5,Thank God. I am changing to Dish. They gave me...,Thank God I am changing to Dish They gave me a...,thank god i am changing to dish they gave me a...,thank god i am changing to dish they gave me a...
6,I Have been a long time customer and only have...,I Have been a long time customer and only have...,i have been a long time customer and only have...,i have been a long time customer and only have...
7,There is a malfunction on the DVR manager whic...,There is a malfunction on the DVR manager whic...,there is a malfunction on the dvr manager whic...,there is a malfunction on the dvr manager whic...
8,Charges overwhelming. Comcast service rep was ...,Charges overwhelming Comcast service rep was s...,charges overwhelming comcast service rep was s...,charges overwhelming comcast service rep was s...
9,"I have had cable, DISH, and U-verse, etc. in t...",I have had cable DISH and Uverse etc in the pa...,i have had cable dish and uverse etc in the pa...,i have had cable dish and uverse etc in the pa...


## tokenize data

In [9]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to C:\Users\End
[nltk_data]     User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
df['Tokenize_Data'] = df['Clean_Number'].apply(word_tokenize)
df

Unnamed: 0,text,Clean_Punctuation,Clean_Lower,Clean_Number,Tokenize_Data
0,I used to love Comcast. Until all these consta...,I used to love Comcast Until all these constan...,i used to love comcast until all these constan...,i used to love comcast until all these constan...,"[i, used, to, love, comcast, until, all, these..."
1,I'm so over Comcast! The worst internet provid...,Im so over Comcast The worst internet provider...,im so over comcast the worst internet provider...,im so over comcast the worst internet provider...,"[im, so, over, comcast, the, worst, internet, ..."
2,If I could give them a negative star or no sta...,If I could give them a negative star or no sta...,if i could give them a negative star or no sta...,if i could give them a negative star or no sta...,"[if, i, could, give, them, a, negative, star, ..."
3,I've had the worst experiences so far since in...,Ive had the worst experiences so far since ins...,ive had the worst experiences so far since ins...,ive had the worst experiences so far since ins...,"[ive, had, the, worst, experiences, so, far, s..."
4,Check your contract when you sign up for Comca...,Check your contract when you sign up for Comca...,check your contract when you sign up for comca...,check your contract when you sign up for comca...,"[check, your, contract, when, you, sign, up, f..."
5,Thank God. I am changing to Dish. They gave me...,Thank God I am changing to Dish They gave me a...,thank god i am changing to dish they gave me a...,thank god i am changing to dish they gave me a...,"[thank, god, i, am, changing, to, dish, they, ..."
6,I Have been a long time customer and only have...,I Have been a long time customer and only have...,i have been a long time customer and only have...,i have been a long time customer and only have...,"[i, have, been, a, long, time, customer, and, ..."
7,There is a malfunction on the DVR manager whic...,There is a malfunction on the DVR manager whic...,there is a malfunction on the dvr manager whic...,there is a malfunction on the dvr manager whic...,"[there, is, a, malfunction, on, the, dvr, mana..."
8,Charges overwhelming. Comcast service rep was ...,Charges overwhelming Comcast service rep was s...,charges overwhelming comcast service rep was s...,charges overwhelming comcast service rep was s...,"[charges, overwhelming, comcast, service, rep,..."
9,"I have had cable, DISH, and U-verse, etc. in t...",I have had cable DISH and Uverse etc in the pa...,i have had cable dish and uverse etc in the pa...,i have had cable dish and uverse etc in the pa...,"[i, have, had, cable, dish, and, uverse, etc, ..."


## remove stopwords

In [11]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to C:\Users\End
[nltk_data]     User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def remove_stopwords(text):
    output = []
    for i in text:
        if i not in stopwords:
            output.append(i)
    return output

df['Clean_Stopwords'] = df['Tokenize_Data'].apply(remove_stopwords)
df

Unnamed: 0,text,Clean_Punctuation,Clean_Lower,Clean_Number,Tokenize_Data,Clean_Stopwords
0,I used to love Comcast. Until all these consta...,I used to love Comcast Until all these constan...,i used to love comcast until all these constan...,i used to love comcast until all these constan...,"[i, used, to, love, comcast, until, all, these...","[used, love, comcast, constant, updates, inter..."
1,I'm so over Comcast! The worst internet provid...,Im so over Comcast The worst internet provider...,im so over comcast the worst internet provider...,im so over comcast the worst internet provider...,"[im, so, over, comcast, the, worst, internet, ...","[im, comcast, worst, internet, provider, im, t..."
2,If I could give them a negative star or no sta...,If I could give them a negative star or no sta...,if i could give them a negative star or no sta...,if i could give them a negative star or no sta...,"[if, i, could, give, them, a, negative, star, ...","[could, give, negative, star, stars, review, w..."
3,I've had the worst experiences so far since in...,Ive had the worst experiences so far since ins...,ive had the worst experiences so far since ins...,ive had the worst experiences so far since ins...,"[ive, had, the, worst, experiences, so, far, s...","[ive, worst, experiences, far, since, install,..."
4,Check your contract when you sign up for Comca...,Check your contract when you sign up for Comca...,check your contract when you sign up for comca...,check your contract when you sign up for comca...,"[check, your, contract, when, you, sign, up, f...","[check, contract, sign, comcast, advertised, o..."
5,Thank God. I am changing to Dish. They gave me...,Thank God I am changing to Dish They gave me a...,thank god i am changing to dish they gave me a...,thank god i am changing to dish they gave me a...,"[thank, god, i, am, changing, to, dish, they, ...","[thank, god, changing, dish, gave, awesome, pr..."
6,I Have been a long time customer and only have...,I Have been a long time customer and only have...,i have been a long time customer and only have...,i have been a long time customer and only have...,"[i, have, been, a, long, time, customer, and, ...","[long, time, customer, xfinity, isp, local, wa..."
7,There is a malfunction on the DVR manager whic...,There is a malfunction on the DVR manager whic...,there is a malfunction on the dvr manager whic...,there is a malfunction on the dvr manager whic...,"[there, is, a, malfunction, on, the, dvr, mana...","[malfunction, dvr, manager, preventing, us, ad..."
8,Charges overwhelming. Comcast service rep was ...,Charges overwhelming Comcast service rep was s...,charges overwhelming comcast service rep was s...,charges overwhelming comcast service rep was s...,"[charges, overwhelming, comcast, service, rep,...","[charges, overwhelming, comcast, service, rep,..."
9,"I have had cable, DISH, and U-verse, etc. in t...",I have had cable DISH and Uverse etc in the pa...,i have had cable dish and uverse etc in the pa...,i have had cable dish and uverse etc in the pa...,"[i, have, had, cable, dish, and, uverse, etc, ...","[cable, dish, uverse, etc, past, eh, know, com..."


## stemming

In [13]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

In [14]:
def stemming(text):
    stem_text = []
    for word in text:
        stemmed_word = porter_stemmer.stem(word)
        stem_text.append(stemmed_word)
    return stem_text

df['Clean_StemmedWord'] = df['Clean_Stopwords'].apply(stemming)

In [15]:
df

Unnamed: 0,text,Clean_Punctuation,Clean_Lower,Clean_Number,Tokenize_Data,Clean_Stopwords,Clean_StemmedWord
0,I used to love Comcast. Until all these consta...,I used to love Comcast Until all these constan...,i used to love comcast until all these constan...,i used to love comcast until all these constan...,"[i, used, to, love, comcast, until, all, these...","[used, love, comcast, constant, updates, inter...","[use, love, comcast, constant, updat, internet..."
1,I'm so over Comcast! The worst internet provid...,Im so over Comcast The worst internet provider...,im so over comcast the worst internet provider...,im so over comcast the worst internet provider...,"[im, so, over, comcast, the, worst, internet, ...","[im, comcast, worst, internet, provider, im, t...","[im, comcast, worst, internet, provid, im, tak..."
2,If I could give them a negative star or no sta...,If I could give them a negative star or no sta...,if i could give them a negative star or no sta...,if i could give them a negative star or no sta...,"[if, i, could, give, them, a, negative, star, ...","[could, give, negative, star, stars, review, w...","[could, give, neg, star, star, review, would, ..."
3,I've had the worst experiences so far since in...,Ive had the worst experiences so far since ins...,ive had the worst experiences so far since ins...,ive had the worst experiences so far since ins...,"[ive, had, the, worst, experiences, so, far, s...","[ive, worst, experiences, far, since, install,...","[ive, worst, experi, far, sinc, instal, noth, ..."
4,Check your contract when you sign up for Comca...,Check your contract when you sign up for Comca...,check your contract when you sign up for comca...,check your contract when you sign up for comca...,"[check, your, contract, when, you, sign, up, f...","[check, contract, sign, comcast, advertised, o...","[check, contract, sign, comcast, advertis, off..."
5,Thank God. I am changing to Dish. They gave me...,Thank God I am changing to Dish They gave me a...,thank god i am changing to dish they gave me a...,thank god i am changing to dish they gave me a...,"[thank, god, i, am, changing, to, dish, they, ...","[thank, god, changing, dish, gave, awesome, pr...","[thank, god, chang, dish, gave, awesom, price,..."
6,I Have been a long time customer and only have...,I Have been a long time customer and only have...,i have been a long time customer and only have...,i have been a long time customer and only have...,"[i, have, been, a, long, time, customer, and, ...","[long, time, customer, xfinity, isp, local, wa...","[long, time, custom, xfiniti, isp, local, walm..."
7,There is a malfunction on the DVR manager whic...,There is a malfunction on the DVR manager whic...,there is a malfunction on the dvr manager whic...,there is a malfunction on the dvr manager whic...,"[there, is, a, malfunction, on, the, dvr, mana...","[malfunction, dvr, manager, preventing, us, ad...","[malfunct, dvr, manag, prevent, us, ad, record..."
8,Charges overwhelming. Comcast service rep was ...,Charges overwhelming Comcast service rep was s...,charges overwhelming comcast service rep was s...,charges overwhelming comcast service rep was s...,"[charges, overwhelming, comcast, service, rep,...","[charges, overwhelming, comcast, service, rep,...","[charg, overwhelm, comcast, servic, rep, ignor..."
9,"I have had cable, DISH, and U-verse, etc. in t...",I have had cable DISH and Uverse etc in the pa...,i have had cable dish and uverse etc in the pa...,i have had cable dish and uverse etc in the pa...,"[i, have, had, cable, dish, and, uverse, etc, ...","[cable, dish, uverse, etc, past, eh, know, com...","[cabl, dish, uvers, etc, past, eh, know, comca..."


## lemmatization

In [16]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to C:\Users\End
[nltk_data]     User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [17]:
def lemmatizer(text):
    lemm_text = []
    for word in text:
        lemmatized_word = wordnet_lemmatizer.lemmatize(word)
        lemm_text.append(lemmatized_word)
    return lemm_text

df['Clean_LemmatizedWord'] = df['Clean_StemmedWord'].apply(lemmatizer)
df

Unnamed: 0,text,Clean_Punctuation,Clean_Lower,Clean_Number,Tokenize_Data,Clean_Stopwords,Clean_StemmedWord,Clean_LemmatizedWord
0,I used to love Comcast. Until all these consta...,I used to love Comcast Until all these constan...,i used to love comcast until all these constan...,i used to love comcast until all these constan...,"[i, used, to, love, comcast, until, all, these...","[used, love, comcast, constant, updates, inter...","[use, love, comcast, constant, updat, internet...","[use, love, comcast, constant, updat, internet..."
1,I'm so over Comcast! The worst internet provid...,Im so over Comcast The worst internet provider...,im so over comcast the worst internet provider...,im so over comcast the worst internet provider...,"[im, so, over, comcast, the, worst, internet, ...","[im, comcast, worst, internet, provider, im, t...","[im, comcast, worst, internet, provid, im, tak...","[im, comcast, worst, internet, provid, im, tak..."
2,If I could give them a negative star or no sta...,If I could give them a negative star or no sta...,if i could give them a negative star or no sta...,if i could give them a negative star or no sta...,"[if, i, could, give, them, a, negative, star, ...","[could, give, negative, star, stars, review, w...","[could, give, neg, star, star, review, would, ...","[could, give, neg, star, star, review, would, ..."
3,I've had the worst experiences so far since in...,Ive had the worst experiences so far since ins...,ive had the worst experiences so far since ins...,ive had the worst experiences so far since ins...,"[ive, had, the, worst, experiences, so, far, s...","[ive, worst, experiences, far, since, install,...","[ive, worst, experi, far, sinc, instal, noth, ...","[ive, worst, experi, far, sinc, instal, noth, ..."
4,Check your contract when you sign up for Comca...,Check your contract when you sign up for Comca...,check your contract when you sign up for comca...,check your contract when you sign up for comca...,"[check, your, contract, when, you, sign, up, f...","[check, contract, sign, comcast, advertised, o...","[check, contract, sign, comcast, advertis, off...","[check, contract, sign, comcast, advertis, off..."
5,Thank God. I am changing to Dish. They gave me...,Thank God I am changing to Dish They gave me a...,thank god i am changing to dish they gave me a...,thank god i am changing to dish they gave me a...,"[thank, god, i, am, changing, to, dish, they, ...","[thank, god, changing, dish, gave, awesome, pr...","[thank, god, chang, dish, gave, awesom, price,...","[thank, god, chang, dish, gave, awesom, price,..."
6,I Have been a long time customer and only have...,I Have been a long time customer and only have...,i have been a long time customer and only have...,i have been a long time customer and only have...,"[i, have, been, a, long, time, customer, and, ...","[long, time, customer, xfinity, isp, local, wa...","[long, time, custom, xfiniti, isp, local, walm...","[long, time, custom, xfiniti, isp, local, walm..."
7,There is a malfunction on the DVR manager whic...,There is a malfunction on the DVR manager whic...,there is a malfunction on the dvr manager whic...,there is a malfunction on the dvr manager whic...,"[there, is, a, malfunction, on, the, dvr, mana...","[malfunction, dvr, manager, preventing, us, ad...","[malfunct, dvr, manag, prevent, us, ad, record...","[malfunct, dvr, manag, prevent, u, ad, record,..."
8,Charges overwhelming. Comcast service rep was ...,Charges overwhelming Comcast service rep was s...,charges overwhelming comcast service rep was s...,charges overwhelming comcast service rep was s...,"[charges, overwhelming, comcast, service, rep,...","[charges, overwhelming, comcast, service, rep,...","[charg, overwhelm, comcast, servic, rep, ignor...","[charg, overwhelm, comcast, servic, rep, ignor..."
9,"I have had cable, DISH, and U-verse, etc. in t...",I have had cable DISH and Uverse etc in the pa...,i have had cable dish and uverse etc in the pa...,i have had cable dish and uverse etc in the pa...,"[i, have, had, cable, dish, and, uverse, etc, ...","[cable, dish, uverse, etc, past, eh, know, com...","[cabl, dish, uvers, etc, past, eh, know, comca...","[cabl, dish, uvers, etc, past, eh, know, comca..."


# WORD2VEC

In [18]:
tokenized_dataset = [doc.split() for doc in df]
word2vec_model = Word2Vec(sentences=tokenized_dataset, vector_size=100, window=5, min_count=1, workers=4)

In [19]:
#Create document embedding
X = np.array([np.mean([word2vec_model.wv[word] for word in doc.split() if word in word2vec_model.wv], axis=0) for doc in df])

In [20]:
#Perform clustering
k = 2  # Define the number of clusters
km = KMeans(n_clusters=k)
km.fit(X)

  super()._check_params_vs_input(X, default_n_init=10)


In [21]:
#Predict the clusters for each document
y_pred = km.predict(X)

In [22]:
#Tabulate the document and predicted cluster
table_data = [["Document", "Predicted Cluster"]]
table_data.extend([[doc, cluster] for doc, cluster in zip(df, y_pred)])
print(tabulate(table_data, headers="firstrow"))

Document                Predicted Cluster
--------------------  -------------------
text                                    0
Clean_Punctuation                       0
Clean_Lower                             1
Clean_Number                            0
Tokenize_Data                           1
Clean_Stopwords                         0
Clean_StemmedWord                       1
Clean_LemmatizedWord                    0


In [23]:
#Calculate purity
total_samples = len(y_pred)
cluster_label_counts = [Counter(y_pred)]
purity = sum(max(cluster.values()) for cluster in cluster_label_counts) / total_samples

print("Purity:", purity)

Purity: 0.625
