In [None]:
# !pip install wordcloud
# Can apply K Mediods as well 

In [1]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score

from wordcloud import WordCloud, STOPWORDS

In [2]:
with open("StopWords.txt") as f:
    stop_words = f.read()

stop_words = stop_words.split("\n")
stop_words_dict = {}
for stop_word in stop_words:
    stop_words_dict[stop_word] = 1

In [3]:
utterances_data = pd.read_csv("utterance_data.csv")

In [4]:
utterances_data

Unnamed: 0,utterance
0,i need $20000 transferred from my savings to m...
1,complete a transaction from savings to checkin...
2,transfer $20000 from my savings account to che...
3,take $20000 from savings and put it in checking
4,put $20000 into my checking account from my sa...
...,...
2245,give weather update now
2246,want to know the weather
2247,tell me the weather for today
2248,what is the current weather like



**We can have multiple pre-processing approaches:**

1. Feed the plain text
2. Remove the stop words.
3. Consider only unique tokens.
4. Consider only alphabetic tokens and ignoring numeric as well as alphanumeric tokens.
5. Removing the Proper Nouns from the sentences


An under-developed idea -> for each intent we can figure out most occuring words and then can see their distributions among all the clusters. We can make the use of the verb + noun to form the intent (make the use of corenlp.run)

**We will use multiple word embeddings with the above ideas and then perform the clustering to see the results**

In this notebook, we will use the following:

1. spacy small model
2. spacy medium model
3. spacy large model
4. BERT encoding of the sentence.

**We will use 3 clusters size for our experimentation and we will see the silhoutte score of the three**

1. This can be followed by calculation of our own metric
2. We can view the WordCloud for the same then.

# 1. Spacy small language model

In [5]:
spacy_small_model = spacy.load("en_core_web_sm")

### a. No Pre-processing

In [6]:
utterance_vector_list = []

for index, row in utterances_data.iterrows():
    utterance_vector = spacy_small_model(row["utterance"]).vector
    utterance_vector_list.append(utterance_vector)

In [7]:
spacy_small_utterance_vector_df = pd.DataFrame(utterance_vector_list)

In [8]:
spacy_small_utterance_vector_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,0.122316,0.409654,-0.315183,-0.396481,0.572325,-0.116967,0.068741,0.207870,0.518313,-0.358065,...,-0.059726,-0.139980,0.445532,-0.432499,0.463519,0.347772,-0.067814,0.047516,0.517396,-0.034318
1,0.310290,-0.086365,0.213203,-0.341037,0.436035,0.281055,0.016376,0.159820,0.827004,-0.615656,...,-0.129014,-0.352905,0.589819,-0.326162,0.036239,0.496323,0.136967,-0.293375,0.186098,0.227689
2,0.588311,0.083776,0.111749,-0.369836,0.340179,-0.151471,0.062078,0.255667,0.288381,-0.279114,...,-0.329753,-0.371157,0.375856,-0.151384,0.091602,0.219578,-0.051006,0.134960,0.169545,-0.191705
3,0.030160,0.081639,-0.128173,-0.223819,0.647052,0.492340,0.114859,0.036179,0.769509,-0.137410,...,-0.406225,-0.505913,0.392183,-0.206106,-0.393821,0.504962,0.253086,0.185312,0.250657,0.286735
4,0.654632,0.418925,-0.200710,-0.365614,0.362681,-0.282776,-0.014846,0.580091,0.320796,-0.357220,...,-0.140821,-0.255328,0.485503,-0.304104,0.206398,0.280626,0.042500,0.103559,0.538039,-0.139011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2245,0.374229,0.313673,-0.002116,-0.183850,0.249785,0.226041,0.133056,0.674722,-0.127945,-0.716156,...,-0.265404,-0.246779,-0.652022,0.546815,-0.402363,0.093033,-0.360259,0.241924,-0.400351,-0.289231
2246,0.315445,0.118097,-0.541725,0.079783,0.004337,0.118448,0.079157,0.084522,-0.218408,-0.751499,...,0.542794,-0.122069,0.075047,-0.314177,-0.333620,-0.442252,-0.784682,0.041041,-0.208583,-0.243999
2247,0.447661,0.461515,-0.515398,-0.428918,0.192940,-0.295573,-0.433412,-0.289281,0.182548,-0.777191,...,0.834781,-0.125038,-0.265332,0.245498,-0.244638,0.111728,-0.363638,-0.116971,-0.410932,0.064658
2248,0.182239,-0.083130,-0.241828,-0.188214,0.264909,0.061547,-0.523108,0.135706,0.300140,-0.764321,...,0.506645,0.138761,-0.020453,-0.624038,-0.172071,-0.330216,-0.106386,0.391430,-0.204941,-0.206408


In [9]:
# defining a common function to be used which computes the word cloud object

def create_word_cloud(cluster_number, cluster_indices_dict, utterance_data):
    word_cloud_string = ""
    indices = cluster_indices_dict[cluster_number]
    word_cloud_string = " ".join(utterances_data.iloc[cluster_indices_dict[cluster_number]]["utterance"])
    wordcloud = WordCloud(
        width = 800, height = 800, background_color ='white', min_font_size = 10, max_words=10
    ).generate(word_cloud_string)
    
    return wordcloud


### 5 clusters - K Means

In [10]:
num_clusters = 5
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(spacy_small_utterance_vector_df)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(spacy_small_utterance_vector_df, k_means_obj.labels_, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

cluster_indices_dict = {}
for i in range(num_clusters):
    cluster_indices_dict[i] = np.where(labels == i)[0]

Fitted 5 clusters, got a Silhoutte Score of 0.14667460197536936


### 5 clusters - K Medoids

In [15]:
num_clusters = 5
k_med_obj = KMedoids(n_clusters=num_clusters)
k_med_obj.fit(spacy_small_utterance_vector_df)
labels = k_med_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(spacy_small_utterance_vector_df, k_med_obj.labels_, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

cluster_indices_dict = {}
for i in range(num_clusters):
    cluster_indices_dict[i] = np.where(labels == i)[0]

Fitted 5 clusters, got a Silhoutte Score of 0.03719554420359964


In [26]:
# fig, (ax1, ax2, ax3) = plt.subplots(3, 2, figsize=(20,20))

# ax_list = [ax1, ax2, ax3]
# wordcloud_list = [create_word_cloud(i, cluster_indices_dict) for i in range(num_clusters)]

# ax1[0].set_title("Cluster 1")
# ax1[1].set_title("Cluster 2")
# ax2[0].set_title("Cluster 3")
# ax2[1].set_title("Cluster 4")
# ax3[0].set_title("Cluster 5")

# ax1[0].imshow(wordcloud_list[0])
# ax1[1].imshow(wordcloud_list[1])
# ax2[0].imshow(wordcloud_list[2])
# ax2[1].imshow(wordcloud_list[3])
# ax3[0].imshow(wordcloud_list[4])

### 10 clusters - K means

In [13]:
num_clusters = 10
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(spacy_small_utterance_vector_df)
labels = k_means_obj.labels_

sil_score = silhouette_score(spacy_small_utterance_vector_df, k_means_obj.labels_, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

cluster_indices_dict = {}
for i in range(num_clusters):
    cluster_indices_dict[i] = np.where(labels == i)[0]

Fitted 10 clusters, got a Silhoutte Score of 0.1492924686759916


In [24]:
# fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(5, 2, figsize=(10 ,30))
# wordcloud_list = [create_word_cloud(i, cluster_indices_dict) for i in range(num_clusters)]

# ax1[0].set_title("Cluster 1")
# ax1[1].set_title("Cluster 2")
# ax2[0].set_title("Cluster 3")
# ax2[1].set_title("Cluster 4")
# ax3[0].set_title("Cluster 5")
# ax3[1].set_title("Cluster 6")
# ax4[0].set_title("Cluster 7")
# ax4[1].set_title("Cluster 8")
# ax5[0].set_title("Cluster 9")
# ax5[1].set_title("Cluster 10")

# ax1[0].imshow(wordcloud_list[0])
# ax1[1].imshow(wordcloud_list[1])
# ax2[0].imshow(wordcloud_list[2])
# ax2[1].imshow(wordcloud_list[3])
# ax3[0].imshow(wordcloud_list[4])
# ax3[1].imshow(wordcloud_list[5])
# ax4[0].imshow(wordcloud_list[6])
# ax4[1].imshow(wordcloud_list[7])
# ax5[0].imshow(wordcloud_list[8])
# ax5[1].imshow(wordcloud_list[9])

### 15 clusters

In [22]:
num_clusters = 15
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(spacy_small_utterance_vector_df)
labels = k_means_obj.labels_

sil_score = silhouette_score(spacy_small_utterance_vector_df, k_means_obj.labels_, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

cluster_indices_dict = {}
for i in range(num_clusters):
    cluster_indices_dict[i] = np.where(labels == i)[0]

Fitted 15 clusters, got a Silhoutte Score of 0.13985773923517666


In [25]:
# fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(5, 3, figsize=(20 ,30))
# wordcloud_list = [create_word_cloud(i, cluster_indices_dict) for i in range(num_clusters)]

# ax1[0].set_title("Cluster 1")
# ax1[1].set_title("Cluster 2")
# ax1[2].set_title("Cluster 3")
# ax2[0].set_title("Cluster 4")
# ax2[1].set_title("Cluster 5")
# ax2[2].set_title("Cluster 6")
# ax3[0].set_title("Cluster 7")
# ax3[1].set_title("Cluster 8")
# ax3[2].set_title("Cluster 9")
# ax4[0].set_title("Cluster 10")
# ax4[1].set_title("Cluster 11")
# ax4[2].set_title("Cluster 12")
# ax5[0].set_title("Cluster 13")
# ax3[1].set_title("Cluster 14")
# ax5[2].set_title("Cluster 15")

# ax1[0].imshow(wordcloud_list[0])
# ax1[1].imshow(wordcloud_list[1])
# ax1[2].imshow(wordcloud_list[2])
# ax2[0].imshow(wordcloud_list[3])
# ax2[1].imshow(wordcloud_list[4])
# ax2[2].imshow(wordcloud_list[5])
# ax3[0].imshow(wordcloud_list[6])
# ax3[1].imshow(wordcloud_list[7])
# ax3[2].imshow(wordcloud_list[8])
# ax4[0].imshow(wordcloud_list[9])
# ax4[1].imshow(wordcloud_list[10])
# ax4[2].imshow(wordcloud_list[11])
# ax5[0].imshow(wordcloud_list[12])
# ax5[1].imshow(wordcloud_list[13])
# ax5[2].imshow(wordcloud_list[14])

### b. Remove Stop Words

In [58]:
def _remove_stop_words_from_sentence(sentence):
    tokens = sentence.split()
    new_tokens = []
    
    for token in tokens:
        if token in stop_words_dict:
            continue
        else:
            new_tokens.append(token)
            
    return " ".join(new_tokens)

In [61]:
utterance_vector_list = []

for index, row in utterances_data.iterrows():
    updated_sentence = _remove_stop_words_from_sentence(row["utterance"])
    utterance_vector = spacy_small_model(updated_sentence).vector
    utterance_vector_list.append(utterance_vector)

In [78]:
spacy_small_utterance_vector_df = pd.DataFrame(utterance_vector_list)

# remove all the nan values from the df
spacy_small_utterance_vector_df = spacy_small_utterance_vector_df.dropna(how="all")

spacy_small_utterance_vector_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,0.562662,0.473347,0.485400,-0.073912,0.114090,-0.287616,-0.207515,0.294349,0.060189,-0.324030,...,-1.296390,-0.685214,0.786285,-0.520211,-0.098837,0.185193,0.334207,-0.271178,0.228217,0.082919
1,0.451630,0.607523,-0.091802,-0.479464,0.076419,-0.227846,-0.203803,0.118133,-0.213195,-0.222603,...,-0.569544,-0.575336,0.382257,-0.471808,-0.378182,0.389817,0.303882,-0.158105,-0.155189,0.312928
2,1.016668,0.671490,0.500754,-0.390076,0.058386,-0.484407,-0.012161,0.660756,-0.458217,0.172499,...,-0.644170,-0.399399,0.128259,-0.319772,-0.273530,0.009402,0.052007,0.028806,-0.131704,0.091029
3,0.525074,0.327527,0.454826,-0.145802,0.085472,-0.089179,-0.020425,0.358983,-0.055223,-0.027690,...,-1.021162,-0.792806,0.575606,-0.528279,-0.035741,0.073623,0.320703,0.276655,-0.179833,0.095648
4,1.013240,0.643869,0.205970,-0.221575,0.193491,-0.213916,-0.091728,0.563753,-0.222484,-0.093423,...,-0.548465,-0.414092,0.233206,-0.451401,-0.438795,0.016098,0.068943,0.047777,0.068495,-0.011403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2245,1.021287,0.561233,0.296712,-0.089666,-0.498656,0.140400,-0.158539,0.470795,-0.380939,-0.473017,...,-0.216688,-0.460209,-0.908340,-0.544224,-0.771503,-0.043804,-0.308694,-0.324932,-0.723985,-0.423137
2246,0.459708,0.418363,-0.071218,-0.603996,-0.368249,0.419205,0.288630,-0.816621,-1.184973,-0.290134,...,0.308164,0.459253,-0.518824,-0.715148,-0.152395,0.292750,-0.828462,-0.014193,-0.424802,-0.574992
2247,0.884753,0.254954,-0.138904,-0.339366,-0.443717,-0.121391,-0.322605,-0.388895,-0.289793,-0.424638,...,-0.320883,-0.024732,-0.758790,0.547576,-0.338948,0.546233,-0.507967,0.014007,-0.540997,-0.092341
2248,1.059937,0.382292,0.082401,-0.108312,-0.213050,-0.013790,-0.739656,0.151443,-0.383809,-0.585789,...,-0.252340,-0.177131,-0.622695,-1.225290,-0.847620,0.253067,0.380374,0.584055,0.051917,-0.728880


In [110]:
num_clusters = 5
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(spacy_small_utterance_vector_df)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(spacy_small_utterance_vector_df, k_means_obj.labels_, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

cluster_indices_dict = {}
for i in range(num_clusters):
    cluster_indices_dict[i] = np.where(labels == i)[0]

Fitted 5 clusters, got a Silhoutte Score of 0.1365807664773829


In [123]:
num_clusters = 10
k_means_obj = KMeans(n_clusters=num_clusters, random_state=144)
k_means_obj.fit(spacy_small_utterance_vector_df)
labels = k_means_obj.labels_

sil_score = silhouette_score(spacy_small_utterance_vector_df, k_means_obj.labels_, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

cluster_indices_dict = {}
for i in range(num_clusters):
    cluster_indices_dict[i] = np.where(labels == i)[0]

Fitted 10 clusters, got a Silhoutte Score of 0.1731811522968166


In [121]:
num_clusters = 15
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(spacy_small_utterance_vector_df)
labels = k_means_obj.labels_

sil_score = silhouette_score(spacy_small_utterance_vector_df, k_means_obj.labels_, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

cluster_indices_dict = {}
for i in range(num_clusters):
    cluster_indices_dict[i] = np.where(labels == i)[0]

Fitted 15 clusters, got a Silhoutte Score of 0.14006449590651904


### c. Remove stop words + Remove numeric and alphanumeric tokens

In [142]:
def _remove_stop_words_keep_alphabetic_tokens(sentence):
    tokens = sentence.split()
    new_tokens = []
    
    for token in tokens:
        if token in stop_words_dict:
            continue
        elif not token.isalpha():
            continue
        else:
            new_tokens.append(token)
            
    return " ".join(new_tokens)

In [143]:
utterance_vector_list = []

for index, row in utterances_data.iterrows():
    updated_sentence = _remove_stop_words_keep_alphabetic_tokens(row["utterance"])
    utterance_vector = spacy_small_model(updated_sentence).vector
    utterance_vector_list.append(utterance_vector)

In [144]:
spacy_small_utterance_vector_df = pd.DataFrame(utterance_vector_list)

# remove all the nan values from the df
spacy_small_utterance_vector_df = spacy_small_utterance_vector_df.dropna(how="all")

spacy_small_utterance_vector_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,0.536053,0.081076,0.266426,0.256311,0.468932,0.053690,-0.872798,0.123359,-0.081703,-0.239513,...,-1.113839,-0.430697,0.452142,-0.267146,-0.552450,0.437249,0.929540,0.030206,-0.218935,-0.459127
1,0.834263,0.362954,0.230274,-0.223844,0.054927,0.185400,-0.451608,0.183808,-0.478668,-0.080900,...,-0.581383,-0.516338,-0.105411,-0.483467,-0.908574,0.321635,0.835348,-0.077314,-0.454880,-0.446741
2,1.306845,0.593806,0.334518,-0.127633,0.194318,-0.249198,0.033167,0.399669,-0.656645,0.195300,...,-0.611698,-0.077100,0.011761,-0.122307,-0.450592,0.076623,0.436669,-0.064404,-0.392655,-0.484399
3,0.316495,-0.016700,0.126092,-0.041294,0.476462,0.374032,0.002066,-0.414573,-0.003818,0.147108,...,-0.653411,-0.723515,0.203930,-0.384576,-0.179991,0.727567,0.831206,0.435847,-0.479633,-0.487526
4,1.023111,0.387359,0.110086,-0.011617,0.257807,-0.026622,-0.052976,0.318639,-0.442661,0.139326,...,-0.662048,-0.045154,-0.071173,-0.272868,-0.531742,-0.045500,0.173036,0.105475,-0.212058,-0.564833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2245,1.021287,0.561233,0.296712,-0.089666,-0.498656,0.140400,-0.158539,0.470795,-0.380939,-0.473017,...,-0.216688,-0.460209,-0.908340,-0.544224,-0.771503,-0.043804,-0.308694,-0.324932,-0.723985,-0.423137
2246,0.459708,0.418363,-0.071218,-0.603996,-0.368249,0.419205,0.288630,-0.816621,-1.184973,-0.290134,...,0.308164,0.459253,-0.518824,-0.715148,-0.152395,0.292750,-0.828462,-0.014193,-0.424802,-0.574992
2247,0.884753,0.254954,-0.138904,-0.339366,-0.443717,-0.121391,-0.322605,-0.388895,-0.289793,-0.424638,...,-0.320883,-0.024732,-0.758790,0.547576,-0.338948,0.546233,-0.507967,0.014007,-0.540997,-0.092341
2248,1.059937,0.382292,0.082401,-0.108312,-0.213050,-0.013790,-0.739656,0.151443,-0.383809,-0.585789,...,-0.252340,-0.177131,-0.622695,-1.225290,-0.847620,0.253067,0.380374,0.584055,0.051917,-0.728880


In [145]:
num_clusters = 5
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(spacy_small_utterance_vector_df)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(spacy_small_utterance_vector_df, k_means_obj.labels_, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

cluster_indices_dict = {}
for i in range(num_clusters):
    cluster_indices_dict[i] = np.where(labels == i)[0]

Fitted 5 clusters, got a Silhoutte Score of 0.1551293009951892


In [193]:
num_clusters = 10
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(spacy_small_utterance_vector_df)
labels = k_means_obj.labels_

sil_score = silhouette_score(spacy_small_utterance_vector_df, k_means_obj.labels_, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

cluster_indices_dict = {}
for i in range(num_clusters):
    cluster_indices_dict[i] = np.where(labels == i)[0]

Fitted 10 clusters, got a Silhoutte Score of 0.16127457638324633


In [194]:
num_clusters = 15
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(spacy_small_utterance_vector_df)
labels = k_means_obj.labels_

sil_score = silhouette_score(spacy_small_utterance_vector_df, k_means_obj.labels_, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

cluster_indices_dict = {}
for i in range(num_clusters):
    cluster_indices_dict[i] = np.where(labels == i)[0]

Fitted 15 clusters, got a Silhoutte Score of 0.1562252423001354
