In [8]:
!python -m spacy download en_core_web_lg
#!pip install spacy
# K-Medoids can be applied as well

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.0MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp37-none-any.whl size=829180944 sha256=ce4ea6011ba259cbf6fa0f69c4ab8b64bf9f6b1000a633667924abaf21592a8e
  Stored in directory: /tmp/pip-ephem-wheel-cache-axcrp8ub/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [19]:
import spacy
import en_core_web_lg
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from wordcloud import WordCloud, STOPWORDS

In [13]:
# Read the stop-word file and split it into one entry per line.
with open("StopWords.txt") as f:
    stop_words = f.read().split("\n")

# Dict keyed by stop word (value is a dummy 1); the filter functions below
# only test membership against it.
stop_words_dict = dict.fromkeys(stop_words, 1)

In [20]:
# Load the utterances to cluster; later cells read the "utterance" column.
utterances_data = pd.read_csv("utterance_data.csv")

In [21]:
# Preview the loaded utterances.
utterances_data

Unnamed: 0,utterance
0,i need $20000 transferred from my savings to m...
1,complete a transaction from savings to checkin...
2,transfer $20000 from my savings account to che...
3,take $20000 from savings and put it in checking
4,put $20000 into my checking account from my sa...
...,...
2245,give weather update now
2246,want to know the weather
2247,tell me the weather for today
2248,what is the current weather like


# 1. spaCy large language model (en_core_web_lg)

In [22]:
# Instantiate the pipeline from the installed en_core_web_lg package.
spacy_large_model = en_core_web_lg.load()

### a. No Pre-processing

In [23]:
# Embed every raw utterance and keep each document's vector.
# `Language.pipe` streams the texts through the pipeline instead of building a
# pandas row object with `iterrows()` and invoking the model once per row.
utterance_vector_list = [
    doc.vector
    for doc in spacy_large_model.pipe(utterances_data["utterance"].tolist())
]

In [24]:
# Stack the per-utterance vectors into a DataFrame, one row per utterance.
spacy_large_utterance_vector_df = pd.DataFrame(utterance_vector_list)

In [25]:
# Preview the utterance-vector frame.
spacy_large_utterance_vector_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,0.008710,0.103141,-0.187033,0.030073,0.117481,0.079269,-0.111468,-0.210211,0.094328,1.763606,-0.312072,0.331742,0.162489,-0.075774,-0.154293,0.073596,-0.240892,1.366226,-0.140109,0.027980,0.042039,0.083535,-0.115051,0.102069,-0.091163,-0.031709,-0.200916,-0.184558,0.005096,0.029901,0.033209,0.133902,0.099660,0.165319,-0.165010,-0.031426,-0.135085,0.092813,0.086956,-0.066198,...,-0.067916,0.251919,0.062462,-0.017890,0.071531,-0.249271,-0.051243,0.167699,0.153493,-0.216393,0.352093,-0.079537,-0.115577,0.071479,-0.041945,0.052253,-0.089571,0.087267,-0.092642,0.047988,-0.071220,0.042365,-0.033118,0.015880,-0.016062,-0.092144,0.129053,0.015199,-0.115557,0.090926,-0.234095,0.121123,0.153193,-0.013857,0.029047,-0.075116,0.040600,0.048890,-0.061637,0.068836
1,-0.072819,0.057493,-0.101307,0.092494,0.111823,-0.012875,-0.105055,-0.135047,0.162695,1.672547,-0.229134,0.340677,0.024188,0.029691,-0.155931,-0.022752,-0.206816,1.490959,-0.059834,-0.075830,0.001136,0.108235,-0.193158,0.108995,-0.022056,-0.033842,-0.057781,-0.101532,-0.031044,0.088442,0.143829,-0.034016,0.223242,0.152086,-0.096072,-0.128594,-0.216509,0.153222,0.172887,-0.091478,...,-0.012384,0.302101,0.066471,0.072542,0.079076,-0.302846,0.040048,0.096990,0.476864,-0.353935,0.211312,-0.074391,-0.054336,0.034417,-0.022154,0.039698,-0.072059,0.133685,0.038626,-0.083567,0.043181,0.060807,0.019262,0.073107,-0.046561,-0.083199,0.046957,-0.017627,-0.025284,0.010635,-0.339528,0.177026,0.146313,-0.049051,-0.053600,-0.056010,-0.124576,0.010702,0.001371,0.067189
2,-0.087281,0.138547,-0.196484,0.091076,0.015438,0.083043,-0.118889,-0.167810,0.051155,1.758687,-0.282497,0.489109,0.110465,0.052203,-0.145880,0.097750,-0.281727,1.430369,-0.024612,-0.057365,0.005242,0.130776,-0.247962,0.076910,-0.008992,0.011627,-0.158251,-0.122897,-0.030298,0.078909,0.208844,0.121292,0.238144,0.176585,-0.295879,-0.039094,-0.223194,0.080566,0.210854,-0.136292,...,0.003178,0.259909,0.047121,-0.009564,0.010561,-0.331886,-0.019365,0.164659,0.433274,-0.323280,0.338691,-0.133718,0.046018,0.161899,-0.078188,0.143504,-0.111765,0.090804,0.065631,-0.102952,-0.052652,0.111530,-0.009929,0.163079,-0.112496,0.019432,0.015938,0.075931,-0.085741,0.049415,-0.367307,0.218683,0.267774,-0.007988,0.070087,-0.144897,-0.003220,0.080603,-0.064214,0.014527
3,-0.104942,0.134014,-0.046438,0.001980,0.124451,0.015563,-0.178291,-0.128011,0.110251,1.936557,-0.267043,0.294472,0.025675,-0.044318,-0.063867,-0.011535,-0.169269,1.188463,-0.197377,-0.108089,0.009009,0.169460,-0.048015,-0.016890,-0.045877,-0.095558,-0.119784,-0.146330,0.054310,-0.051040,0.040585,0.087417,0.084326,0.092900,-0.005159,-0.129693,-0.181298,0.116256,0.115921,0.027632,...,0.057582,0.289245,0.048371,0.015016,0.187155,-0.244078,-0.099080,0.058112,0.254930,-0.211799,0.210126,-0.141400,-0.008832,0.082557,-0.050988,0.040196,0.001268,0.230401,-0.059002,0.093552,-0.010465,0.029255,-0.060795,0.102639,-0.002565,-0.059402,0.199567,-0.016377,-0.022074,-0.010259,-0.305518,0.060414,0.183411,-0.094418,-0.107402,-0.036865,-0.054459,-0.005308,0.016235,0.049531
4,-0.070286,0.128930,-0.193588,0.143074,0.091759,0.084423,-0.187915,-0.162326,0.068780,1.879506,-0.326269,0.451947,0.103250,-0.007552,-0.072042,0.118239,-0.266627,1.252244,-0.091144,0.020665,0.038999,0.159509,-0.149045,0.081734,0.002988,-0.077006,-0.125131,-0.078582,0.030878,0.040446,0.169456,0.205537,0.123285,0.205128,-0.231660,-0.071511,-0.195423,0.044971,0.145734,-0.124433,...,0.004078,0.285665,0.071474,-0.060108,0.148492,-0.255272,-0.056435,0.155645,0.290250,-0.310708,0.327110,-0.086750,-0.036878,0.012461,-0.098884,0.063059,-0.127921,0.049730,0.065749,-0.089003,-0.048793,0.107118,-0.036780,0.050762,-0.080974,0.010597,0.136837,0.016425,-0.089489,-0.004806,-0.356972,0.153508,0.294198,0.024258,-0.033749,-0.124483,0.035318,0.054003,-0.081949,-0.056036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2245,-0.113771,0.173270,-0.141332,-0.118344,-0.102967,-0.099597,0.155304,-0.242512,-0.087302,2.131250,-0.281875,-0.214697,-0.044575,-0.293848,-0.253242,-0.143998,-0.070473,1.182820,-0.000658,-0.218206,-0.012980,-0.063548,0.107755,-0.016530,-0.140189,0.188254,-0.129542,-0.229865,0.372325,0.006036,-0.176522,-0.055325,0.161905,0.030025,0.041837,0.118639,0.093270,-0.005341,0.005257,-0.074524,...,0.227218,0.078165,0.100536,0.001185,0.075830,-0.294086,-0.239904,0.253758,0.422380,-0.000031,0.112350,0.085515,-0.124973,-0.035300,-0.082347,0.127560,0.018218,0.028642,0.157685,0.182599,0.104395,-0.083374,-0.099881,0.113421,-0.026673,-0.191384,0.214993,-0.205132,0.128940,-0.067607,-0.310955,0.188608,-0.047555,-0.093398,0.189595,-0.338406,-0.331312,-0.106147,-0.112004,0.060834
2246,0.129389,0.179358,-0.236456,-0.013042,0.038800,0.028361,0.038872,0.107254,-0.050246,2.434420,-0.453556,-0.159449,0.153114,0.000636,-0.318486,-0.181144,-0.209716,1.092784,-0.129176,-0.141446,0.120583,-0.050696,-0.056241,-0.015911,-0.160589,0.044012,-0.163812,-0.116961,0.121442,-0.100160,-0.077931,0.208130,-0.087437,0.153371,0.245871,0.009363,0.160641,0.102450,0.005509,-0.109538,...,0.090877,0.182970,0.123538,-0.088508,0.151381,-0.142293,-0.199947,0.282183,0.332864,0.160268,0.107840,-0.011075,-0.165344,-0.119903,-0.027631,0.023271,0.085413,0.116069,-0.097634,0.117677,0.017980,-0.046387,-0.054779,0.151967,-0.003290,-0.093152,0.265622,-0.180505,0.130802,-0.020339,-0.214001,0.210640,-0.079368,-0.171908,0.134174,-0.215216,-0.246844,0.017182,0.138357,0.145429
2247,-0.051182,0.282077,-0.256207,-0.020200,0.093289,-0.132514,0.079985,-0.013077,-0.085716,2.433050,-0.354327,-0.110326,0.067992,-0.257195,-0.292850,-0.134692,-0.134513,0.979447,-0.072433,-0.145411,0.009081,-0.051227,-0.097910,0.043165,-0.055257,0.063406,-0.172380,-0.058990,0.076725,-0.077545,-0.029719,0.076893,0.041301,0.064946,0.179934,0.149692,0.035098,-0.050557,0.078402,-0.003740,...,0.143436,0.218373,-0.097845,-0.127992,0.064828,-0.214136,-0.074663,0.242318,0.345582,0.048601,0.115565,0.037482,-0.170564,0.017087,-0.051325,0.152871,0.083245,-0.019662,-0.056329,0.115639,0.087195,-0.010857,-0.086911,0.031347,-0.034017,-0.006080,0.324621,-0.235672,0.035060,-0.004556,-0.255987,-0.036358,0.101168,-0.130347,0.121947,-0.234553,-0.062716,0.047022,-0.071197,0.116277
2248,0.041140,0.230624,-0.041323,-0.056232,0.039635,0.065667,0.013528,0.030725,-0.011634,2.419017,-0.265467,-0.177208,0.110246,-0.028629,-0.183182,-0.258568,-0.078003,1.193075,-0.150595,-0.226410,0.041653,-0.137593,-0.134277,-0.029620,0.018633,0.075986,-0.128799,-0.036828,-0.012947,0.077046,-0.052290,0.179162,0.065965,0.052147,0.109097,0.163297,0.095764,-0.024075,-0.056347,-0.136012,...,0.204883,0.147243,0.067337,-0.087509,0.099338,-0.105724,-0.154060,0.194394,0.503307,0.024448,0.051745,0.039638,-0.177345,-0.165991,0.002009,-0.027055,0.075871,0.096923,0.026864,0.112015,0.079200,0.009244,-0.127711,0.104063,0.073674,-0.093760,0.226504,-0.210764,0.261290,-0.109963,-0.127216,0.201910,-0.000709,-0.154989,0.098442,0.002269,-0.236802,-0.089033,-0.004892,0.087839


In [26]:
# defining a common function to be used which computes the word cloud object

def create_word_cloud(cluster_number, cluster_indices_dict, utterance_data=None):
    """Build a word cloud from the utterances assigned to one cluster.

    Args:
        cluster_number: Key into ``cluster_indices_dict`` selecting the cluster.
        cluster_indices_dict: Mapping of cluster id to the positional row
            indices (consumed via ``.iloc``) of the utterances in that cluster.
        utterance_data: DataFrame with an ``"utterance"`` column. When omitted,
            the notebook-level ``utterances_data`` frame is used.

    Returns:
        The ``WordCloud`` object generated from the cluster's utterances
        joined by single spaces.
    """
    # Previously the argument was ignored and the global frame was always
    # read; honour the argument and only fall back to the global when absent.
    if utterance_data is None:
        utterance_data = utterances_data

    cluster_utterances = utterance_data.iloc[cluster_indices_dict[cluster_number]]["utterance"]
    word_cloud_string = " ".join(cluster_utterances)

    return WordCloud(
        width=800, height=800, background_color='white', min_font_size=10, max_words=10
    ).generate(word_cloud_string)


### 5 clusters

In [30]:
num_clusters = 5
# Pin the seed so the clustering (and the score printed below) is reproducible
# across kernel restarts; several other K-Means cells in this notebook also
# pass random_state.
k_means_obj = KMeans(n_clusters=num_clusters, random_state=42)
k_means_obj.fit(spacy_large_utterance_vector_df)
labels = k_means_obj.labels_

## Calculating silhouette score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.
# NOTE(review): the score uses the cosine metric while K-Means is fitted with
# its default objective — confirm this combination is intended.

sil_score = silhouette_score(spacy_large_utterance_vector_df, labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

# Map each cluster id to the positional indices of the rows assigned to it.
cluster_indices_dict = {}
for i in range(num_clusters):
    cluster_indices_dict[i] = np.where(labels == i)[0]

Fitted 5 clusters, got a Silhoutte Score of 0.20056748375583944


In [None]:
# fig, (ax1, ax2, ax3) = plt.subplots(3, 2, figsize=(20,20))

# ax_list = [ax1, ax2, ax3]
# wordcloud_list = [create_word_cloud(i, cluster_indices_dict, utterances_data) for i in range(num_clusters)]

# ax1[0].set_title("Cluster 1")
# ax1[1].set_title("Cluster 2")
# ax2[0].set_title("Cluster 3")
# ax2[1].set_title("Cluster 4")
# ax3[0].set_title("Cluster 5")

# ax1[0].imshow(wordcloud_list[0])
# ax1[1].imshow(wordcloud_list[1])
# ax2[0].imshow(wordcloud_list[2])
# ax2[1].imshow(wordcloud_list[3])
# ax3[0].imshow(wordcloud_list[4])

### 10 clusters

In [46]:
num_clusters = 10

# Fit K-Means on the utterance vectors and keep the per-row cluster labels.
k_means_obj = KMeans(n_clusters=num_clusters, random_state=69).fit(
    spacy_large_utterance_vector_df
)
labels = k_means_obj.labels_

# Cosine-metric silhouette score of this assignment.
sil_score = silhouette_score(
    spacy_large_utterance_vector_df, labels, metric="cosine"
)
print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

# cluster id -> positional indices of the rows assigned to that cluster
cluster_indices_dict = {
    cluster_id: np.flatnonzero(labels == cluster_id)
    for cluster_id in range(num_clusters)
}

Fitted 10 clusters, got a Silhoutte Score of 0.2025511466109177


In [None]:
# fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(5, 2, figsize=(10 ,30))
# wordcloud_list = [create_word_cloud(i, cluster_indices_dict, utterances_data) for i in range(num_clusters)]

# ax1[0].set_title("Cluster 1")
# ax1[1].set_title("Cluster 2")
# ax2[0].set_title("Cluster 3")
# ax2[1].set_title("Cluster 4")
# ax3[0].set_title("Cluster 5")
# ax3[1].set_title("Cluster 6")
# ax4[0].set_title("Cluster 7")
# ax4[1].set_title("Cluster 8")
# ax5[0].set_title("Cluster 9")
# ax5[1].set_title("Cluster 10")

# ax1[0].imshow(wordcloud_list[0])
# ax1[1].imshow(wordcloud_list[1])
# ax2[0].imshow(wordcloud_list[2])
# ax2[1].imshow(wordcloud_list[3])
# ax3[0].imshow(wordcloud_list[4])
# ax3[1].imshow(wordcloud_list[5])
# ax4[0].imshow(wordcloud_list[6])
# ax4[1].imshow(wordcloud_list[7])
# ax5[0].imshow(wordcloud_list[8])
# ax5[1].imshow(wordcloud_list[9])

### 15 clusters

In [56]:
num_clusters = 15

# Cluster the utterance vectors with a fixed seed.
k_means_obj = KMeans(random_state=21, n_clusters=num_clusters)
k_means_obj.fit(spacy_large_utterance_vector_df)
labels = k_means_obj.labels_

# Score the assignment with the cosine-metric silhouette.
sil_score = silhouette_score(
    spacy_large_utterance_vector_df,
    labels,
    metric="cosine",
)
print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

# Collect, per cluster id, the positions of its member rows.
cluster_indices_dict = {}
for cluster_id in range(num_clusters):
    (member_rows,) = np.where(labels == cluster_id)
    cluster_indices_dict[cluster_id] = member_rows

Fitted 15 clusters, got a Silhoutte Score of 0.2330695405333003


In [None]:
# fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(5, 3, figsize=(20 ,30))
# wordcloud_list = [create_word_cloud(i, cluster_indices_dict, utterances_data) for i in range(num_clusters)]

# ax1[0].set_title("Cluster 1")
# ax1[1].set_title("Cluster 2")
# ax1[2].set_title("Cluster 3")
# ax2[0].set_title("Cluster 4")
# ax2[1].set_title("Cluster 5")
# ax2[2].set_title("Cluster 6")
# ax3[0].set_title("Cluster 7")
# ax3[1].set_title("Cluster 8")
# ax3[2].set_title("Cluster 9")
# ax4[0].set_title("Cluster 10")
# ax4[1].set_title("Cluster 11")
# ax4[2].set_title("Cluster 12")
# ax5[0].set_title("Cluster 13")
# ax5[1].set_title("Cluster 14")
# ax5[2].set_title("Cluster 15")

# ax1[0].imshow(wordcloud_list[0])
# ax1[1].imshow(wordcloud_list[1])
# ax1[2].imshow(wordcloud_list[2])
# ax2[0].imshow(wordcloud_list[3])
# ax2[1].imshow(wordcloud_list[4])
# ax2[2].imshow(wordcloud_list[5])
# ax3[0].imshow(wordcloud_list[6])
# ax3[1].imshow(wordcloud_list[7])
# ax3[2].imshow(wordcloud_list[8])
# ax4[0].imshow(wordcloud_list[9])
# ax4[1].imshow(wordcloud_list[10])
# ax4[2].imshow(wordcloud_list[11])
# ax5[0].imshow(wordcloud_list[12])
# ax5[1].imshow(wordcloud_list[13])
# ax5[2].imshow(wordcloud_list[14])

### b. Remove Stop Words

In [57]:
def _remove_stop_words_from_sentence(sentence):
    """Drop every whitespace-separated token of ``sentence`` found in ``stop_words_dict``.

    The surviving tokens keep their original order and are re-joined with
    single spaces.
    """
    kept_tokens = (word for word in sentence.split() if word not in stop_words_dict)
    return " ".join(kept_tokens)

In [59]:
# Strip stop words from every utterance, then embed the cleaned texts.
# `Series.map` + `Language.pipe` replace the row-by-row `iterrows()` loop that
# invoked the pipeline once per utterance.
stop_word_free_utterances = utterances_data["utterance"].map(
    _remove_stop_words_from_sentence
)
utterance_vector_list = [
    doc.vector
    for doc in spacy_large_model.pipe(stop_word_free_utterances.tolist())
]

In [60]:
# Build the vector frame (one row per utterance) and discard rows whose
# values are NaN in every column.
# NOTE(review): if a row were ever dropped here, the positional indices later
# stored in cluster_indices_dict would no longer line up with
# utterances_data.iloc — verify before relying on the word clouds.
spacy_large_utterance_vector_df = (
    pd.DataFrame(utterance_vector_list)
    .dropna(how="all")
)

spacy_large_utterance_vector_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,-0.143532,0.056766,0.000244,-0.090598,0.047727,0.143285,-0.260088,-0.291740,0.192298,1.257934,-0.113792,0.597844,0.078033,-0.057676,-0.117974,0.070104,-0.302965,1.334798,-0.061957,-0.136521,-0.064339,0.271282,-0.129680,0.208356,-0.046742,-0.072746,-0.245080,-0.179100,-0.006840,0.045058,0.075173,-0.130947,0.465040,0.147204,-0.242676,-0.073085,-0.221636,0.187415,0.214137,0.033391,...,0.019102,0.364323,-0.093112,-0.045274,0.047703,-0.282160,0.022005,0.025126,0.255102,-0.408202,0.310990,-0.181315,0.053954,0.230261,-0.122114,0.122254,-0.128995,0.185242,-0.120520,-0.152194,-0.125850,0.130693,-0.006596,0.173718,0.012657,0.042385,-0.043864,0.110784,-0.258898,0.230020,-0.304286,0.151673,0.206961,-0.026336,-0.088562,-0.027133,-0.024951,0.018906,-0.023593,-0.050930
1,-0.194127,0.053348,-0.058426,0.024769,0.103554,0.050281,-0.182575,-0.189400,0.264250,1.202345,-0.191908,0.490407,-0.002566,0.071390,-0.180383,0.040601,-0.271808,1.544649,-0.005461,-0.012490,-0.020755,0.308922,-0.102088,0.087304,-0.025872,-0.078096,-0.107939,-0.135701,0.028008,0.104933,0.126420,-0.198023,0.465857,0.164895,-0.263982,-0.098136,-0.218417,0.209606,0.210617,-0.018458,...,0.033042,0.295900,0.104373,0.042954,0.028227,-0.327532,0.080789,0.090239,0.609793,-0.389110,0.170580,-0.106521,-0.025875,0.146434,-0.054740,0.149435,-0.135647,0.132051,0.041497,-0.230247,-0.025448,0.075542,0.083388,0.157385,0.009085,-0.003066,-0.011806,0.093144,-0.069837,0.087612,-0.417989,0.193529,0.132885,-0.001235,-0.000165,-0.122186,-0.131087,0.024031,0.126454,0.007267
2,-0.184551,0.175481,-0.151998,-0.013791,-0.071378,0.117145,-0.130629,-0.224463,0.081302,1.499439,-0.237876,0.616939,0.069771,0.112929,-0.129047,0.115248,-0.313076,1.510613,0.012439,-0.095227,-0.034702,0.204074,-0.251856,0.037189,0.026717,0.002196,-0.174755,-0.132658,0.042582,0.105797,0.253236,-0.007617,0.453979,0.173289,-0.398570,-0.028614,-0.218894,0.097609,0.202495,-0.124318,...,0.110583,0.275505,0.025821,-0.003959,-0.040165,-0.342906,0.032444,0.119358,0.618196,-0.364366,0.292240,-0.168445,0.129371,0.253216,-0.128761,0.247298,-0.179716,0.091497,0.142178,-0.210664,-0.041114,0.177051,0.010071,0.282891,-0.160852,0.122865,-0.081380,0.150260,-0.131045,0.100787,-0.432687,0.241725,0.260814,-0.066272,0.124759,-0.152037,-0.040363,0.057172,-0.003413,-0.043399
3,-0.214198,0.126226,-0.022564,-0.076736,0.095619,0.053929,-0.216402,-0.242196,0.176926,1.276754,-0.248412,0.502791,0.046951,-0.022450,-0.044340,-0.002648,-0.281563,1.208956,-0.147481,-0.111636,0.028343,0.389468,0.028262,0.081939,-0.072924,-0.114837,-0.222278,-0.187412,0.118344,-0.024634,0.137589,-0.064811,0.362852,0.127298,-0.057538,-0.150061,-0.208716,0.166050,0.164097,0.093597,...,0.043974,0.396504,0.032068,-0.039780,0.145266,-0.316857,-0.008270,0.049908,0.312901,-0.282608,0.234834,-0.108082,0.047476,0.239487,-0.074538,0.056368,-0.124899,0.218354,-0.119064,-0.045816,-0.135003,0.119907,0.021903,0.266640,-0.004037,0.003408,0.096778,0.062469,-0.175024,0.196159,-0.335238,0.115077,0.239860,-0.074249,-0.120138,-0.056909,-0.010232,-0.018103,0.094197,-0.099168
4,-0.216296,0.190858,-0.140754,0.015634,-0.061801,0.062668,-0.181691,-0.200373,0.087487,1.612538,-0.309929,0.578965,-0.002198,0.083022,-0.031425,0.083837,-0.268579,1.266454,-0.058129,-0.063456,0.042714,0.287250,-0.082787,0.014845,-0.002377,-0.066641,-0.144778,-0.131003,0.086345,0.023587,0.253375,0.022015,0.359391,0.214067,-0.294393,-0.095101,-0.219531,0.088307,0.167538,-0.079325,...,0.153664,0.316003,0.030492,-0.018592,0.097331,-0.329692,0.032473,0.138883,0.572787,-0.290554,0.265621,-0.123481,0.083645,0.192526,-0.139856,0.130886,-0.174542,0.106659,0.144183,-0.192860,-0.087648,0.214350,0.040721,0.214293,-0.163132,0.133103,0.035519,0.099406,-0.116109,0.095108,-0.423101,0.207861,0.242283,-0.047272,0.040010,-0.183578,-0.033927,-0.022221,0.027321,-0.132194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2245,-0.034511,0.142406,-0.076045,-0.169038,-0.347430,0.195750,0.255785,-0.156305,-0.098385,1.718950,-0.535635,-0.502350,-0.040765,-0.318815,-0.459115,-0.253130,-0.078105,1.428200,0.195763,-0.295972,0.021475,0.030290,0.083015,-0.045806,-0.010744,0.282547,-0.271835,-0.258925,0.409295,0.246101,-0.331135,-0.306340,0.203410,0.081661,-0.015985,0.177778,0.284569,-0.057042,-0.004840,-0.018530,...,0.171306,0.028683,0.016679,-0.283675,-0.136310,-0.339958,-0.467860,0.514261,0.356895,-0.117895,-0.055395,0.212670,-0.144216,-0.046939,0.007630,0.321835,0.031520,-0.168315,0.069915,0.273982,0.207185,-0.274439,-0.061435,0.183375,0.065865,-0.284805,0.221800,-0.189905,0.196380,-0.139263,-0.352385,0.465705,-0.064376,-0.060780,0.363360,-0.476825,-0.501345,0.000425,-0.210417,0.073737
2246,-0.004789,0.374390,-0.207165,-0.206445,-0.019480,0.186485,0.182125,0.159710,-0.198281,2.378900,-0.477175,-0.294037,-0.003820,-0.129990,-0.626430,-0.244388,-0.222495,0.915410,0.029915,-0.293711,0.121675,-0.096365,-0.135737,-0.049957,-0.082002,-0.072459,-0.274270,-0.125365,0.385235,-0.112439,-0.140148,0.108375,-0.013470,0.018148,0.139887,0.373485,0.393000,0.055950,0.040880,-0.024154,...,0.267235,0.152516,0.013186,-0.233195,0.010948,0.092987,-0.432301,0.609255,0.209655,0.300960,-0.268227,0.125113,-0.319565,-0.157934,-0.137330,0.252946,0.146077,-0.252138,-0.168861,0.029968,-0.028875,-0.258153,-0.138965,0.253430,0.149507,0.112510,0.501475,-0.333675,0.111890,0.037387,-0.244710,0.272275,-0.250735,-0.160248,0.331720,-0.322605,-0.424230,0.058473,0.212533,0.051537
2247,-0.084311,0.417737,-0.173832,-0.105170,0.021784,-0.120931,0.073115,0.013163,-0.168329,2.483167,-0.363793,-0.389302,0.044197,-0.247843,-0.486437,-0.154800,-0.156825,0.811610,0.029527,-0.290079,0.011570,-0.011071,-0.003125,-0.007968,-0.023929,0.070772,-0.319400,-0.077618,0.079347,-0.089260,-0.044423,-0.035073,0.190806,-0.090901,0.161034,0.408157,0.176434,0.044377,0.064029,0.027037,...,0.314932,0.124680,-0.140748,-0.270117,-0.032950,-0.144860,-0.284543,0.347410,0.312723,0.315987,-0.192573,0.090016,-0.130831,0.149608,-0.120597,0.260593,0.043395,-0.072711,-0.139984,-0.050124,0.109053,-0.142953,-0.117130,0.228610,0.030739,0.249270,0.353787,-0.239242,-0.000700,0.135938,-0.275337,0.040247,-0.039220,-0.137000,0.168200,-0.376473,-0.204331,0.026930,0.062849,0.056905
2248,0.141238,0.173069,0.263020,0.102215,-0.190480,0.095840,0.000140,0.183595,-0.033345,2.199250,-0.360690,-0.327745,-0.014490,-0.089899,-0.347910,-0.379645,-0.072225,1.475250,0.078280,-0.333235,0.048855,-0.138830,-0.207375,-0.049837,0.105768,0.193962,-0.295910,-0.028269,0.040800,0.336596,-0.198095,-0.106408,0.086575,-0.099034,0.053480,0.429690,0.338295,-0.174745,0.164702,-0.065228,...,0.064475,0.195480,0.165443,-0.255814,-0.048926,-0.248243,-0.463910,0.510254,0.299335,-0.032890,-0.088825,0.282360,-0.174902,-0.077014,0.117950,0.024640,0.111570,0.106830,-0.180541,-0.036753,0.013751,-0.086320,-0.354385,0.516860,0.209490,0.108895,0.250075,-0.313370,0.255685,0.009313,-0.147990,0.458210,0.011245,-0.214987,0.206380,-0.204490,-0.365450,-0.164570,-0.100182,-0.094973


In [73]:
num_clusters = 5

# Fit K-Means with a fixed seed and read back the label of every row.
k_means_obj = KMeans(n_clusters=num_clusters, random_state=11).fit(
    spacy_large_utterance_vector_df
)
labels = k_means_obj.labels_

# Silhouette score (cosine metric): 1 is best, -1 is worst, and values close
# to 0 point to overlapping clusters.
sil_score = silhouette_score(
    spacy_large_utterance_vector_df, labels, metric="cosine"
)

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

# cluster id -> positional indices of its member rows
cluster_indices_dict = {
    cluster_id: np.nonzero(labels == cluster_id)[0]
    for cluster_id in range(num_clusters)
}

Fitted 5 clusters, got a Silhoutte Score of 0.23011123691674917


In [89]:
num_clusters = 10

# Seeded K-Means over the stop-word-free utterance vectors.
k_means_obj = KMeans(random_state=42, n_clusters=num_clusters)
k_means_obj.fit(spacy_large_utterance_vector_df)
labels = k_means_obj.labels_

# Cosine-metric silhouette of the resulting assignment.
sil_score = silhouette_score(
    spacy_large_utterance_vector_df,
    labels,
    metric="cosine",
)
print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

# Record which row positions fall into each cluster.
cluster_indices_dict = {}
for cluster_id in range(num_clusters):
    in_cluster = labels == cluster_id
    cluster_indices_dict[cluster_id] = np.flatnonzero(in_cluster)

Fitted 10 clusters, got a Silhoutte Score of 0.31541729933251694


In [102]:
num_clusters = 15

# Fit the seeded model and keep its per-row labels.
k_means_obj = KMeans(n_clusters=num_clusters, random_state=72).fit(
    spacy_large_utterance_vector_df
)
labels = k_means_obj.labels_

# Evaluate the clustering with the cosine-metric silhouette score.
sil_score = silhouette_score(
    spacy_large_utterance_vector_df, labels, metric="cosine"
)
print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

# cluster id -> array of row positions belonging to it
cluster_indices_dict = dict(
    (cluster_id, np.where(labels == cluster_id)[0])
    for cluster_id in range(num_clusters)
)

Fitted 15 clusters, got a Silhoutte Score of 0.3413227979498536


### c. Remove stop words + Remove numeric and alphanumeric tokens

In [103]:
def _remove_stop_words_keep_alphabetic_tokens(sentence):
    """Keep only the tokens of ``sentence`` that are purely alphabetic and not stop words.

    Tokens are obtained by whitespace splitting; a token is dropped when it is
    a key of ``stop_words_dict`` or when ``str.isalpha`` is false for it. The
    survivors keep their order and are re-joined with single spaces.
    """
    def _is_kept(token):
        return token not in stop_words_dict and token.isalpha()

    return " ".join(filter(_is_kept, sentence.split()))

In [104]:
# Keep only alphabetic, non-stop-word tokens in every utterance, then embed
# the cleaned texts. `Series.map` + `Language.pipe` replace the row-by-row
# `iterrows()` loop that invoked the pipeline once per utterance.
alphabetic_utterances = utterances_data["utterance"].map(
    _remove_stop_words_keep_alphabetic_tokens
)
utterance_vector_list = [
    doc.vector
    for doc in spacy_large_model.pipe(alphabetic_utterances.tolist())
]

In [105]:
# Stack the vectors into a frame (one row per utterance), then drop any row
# that is NaN across all of its columns.
spacy_large_utterance_vector_df = pd.DataFrame(utterance_vector_list).dropna(
    axis=0, how="all"
)

spacy_large_utterance_vector_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,-0.000193,-0.123207,-0.173320,0.044660,-0.174622,0.274310,-0.377301,-0.336219,0.027147,2.065867,-0.094447,0.479827,0.137135,0.027050,-0.209990,0.030707,-0.179315,1.441800,-0.033180,0.314438,-0.044312,0.013573,-0.212207,0.125598,-0.089924,-0.030224,-0.131280,-0.129380,0.024830,-0.107279,0.102628,-0.076358,0.450573,0.163010,-0.488583,0.125417,-0.088491,0.230494,0.126078,-0.254575,...,0.069652,0.200518,-0.118173,0.052986,0.119351,-0.027207,-0.042703,0.177687,0.218893,-0.403163,0.324517,-0.138012,0.093463,0.064005,-0.001747,0.063286,0.000698,0.254452,-0.075634,-0.237444,0.014189,-0.233248,-0.066873,0.224467,-0.126744,0.007167,-0.045481,-0.126991,-0.263583,0.146466,-0.446097,0.235077,0.121358,0.104740,0.231737,-0.224013,0.092440,0.059983,-0.100873,0.148646
1,-0.111920,-0.083340,-0.217935,0.183895,-0.035294,0.102048,-0.231727,-0.171589,0.176363,1.780500,-0.216457,0.348176,0.001461,0.199467,-0.280600,-0.003698,-0.163492,1.729825,0.044371,0.387745,0.016058,0.134460,-0.150187,-0.035290,-0.047824,-0.048878,0.045981,-0.076712,0.069185,0.020618,0.172635,-0.190619,0.455415,0.185595,-0.459065,0.038215,-0.116948,0.253011,0.142813,-0.260357,...,0.077925,0.138835,0.184320,0.160762,0.072225,-0.159003,0.061651,0.237216,0.759983,-0.375785,0.110520,-0.036646,-0.036158,-0.020172,0.069223,0.118800,-0.041703,0.157363,0.156171,-0.333210,0.129783,-0.224990,0.083171,0.187280,-0.097253,-0.052205,0.003009,-0.094006,0.021180,-0.046257,-0.581198,0.277011,0.031645,0.109622,0.284258,-0.317373,-0.096113,0.057401,0.143517,0.186047
2,-0.114956,0.114984,-0.317034,0.098086,-0.252429,0.185304,-0.149172,-0.224240,-0.062187,2.080800,-0.275902,0.553766,0.101927,0.232006,-0.188687,0.109668,-0.242931,1.645140,0.059465,0.191866,-0.010831,0.022566,-0.350242,-0.080932,0.030192,0.057687,-0.078346,-0.084250,0.081353,0.038690,0.340935,0.074468,0.440874,0.193206,-0.608472,0.108276,-0.137910,0.087535,0.145003,-0.360181,...,0.177505,0.141695,0.058357,0.071523,-0.032323,-0.214233,-0.002205,0.248588,0.741708,-0.343808,0.292856,-0.137315,0.183244,0.162645,-0.059200,0.261936,-0.122188,0.095525,0.274189,-0.285202,0.076804,-0.022770,-0.019429,0.357010,-0.313897,0.133926,-0.097357,0.023386,-0.082715,-0.001038,-0.569134,0.327788,0.230994,-0.003600,0.402266,-0.320127,0.023907,0.097125,-0.041709,0.079360
3,-0.117971,-0.007441,-0.211333,0.067764,-0.094802,0.125383,-0.304491,-0.253646,0.001526,2.097234,-0.318813,0.321406,0.085331,0.085760,-0.087268,-0.090546,-0.143645,1.232063,-0.175719,0.355913,0.110158,0.210550,0.051030,-0.085096,-0.133560,-0.100374,-0.093276,-0.143233,0.233470,-0.223433,0.206655,0.033868,0.280259,0.129833,-0.180020,-0.002876,-0.066957,0.194887,0.042677,-0.154232,...,0.111105,0.254153,0.090460,0.062142,0.281956,-0.085036,-0.093161,0.218991,0.315226,-0.193840,0.197590,-0.015956,0.082666,0.079381,0.077547,-0.046524,0.007525,0.309638,-0.073207,-0.060146,-0.001065,-0.251226,-0.019376,0.379337,-0.154568,-0.057794,0.188922,-0.207515,-0.123793,0.090031,-0.497683,0.174084,0.176190,0.024885,0.179109,-0.273640,0.116971,-0.001698,0.095443,0.068251
4,-0.159398,0.136512,-0.301292,0.139282,-0.239021,0.109036,-0.220659,-0.190514,-0.053528,2.239140,-0.376776,0.500603,0.001171,0.190136,-0.052015,0.065692,-0.180635,1.303318,-0.039331,0.236345,0.097551,0.139012,-0.113546,-0.112214,-0.010540,-0.038685,-0.036377,-0.081932,0.142621,-0.076404,0.341129,0.115953,0.308452,0.250296,-0.462624,0.015194,-0.138802,0.074512,0.096062,-0.297191,...,0.237819,0.198392,0.064897,0.051037,0.160171,-0.195733,-0.002165,0.275922,0.678135,-0.240472,0.255590,-0.074366,0.119228,0.077679,-0.074732,0.098958,-0.114945,0.116751,0.276996,-0.260276,0.011656,0.029449,0.023481,0.260972,-0.317089,0.148260,0.066301,-0.047809,-0.061805,-0.008989,-0.555714,0.280379,0.205050,0.022999,0.283618,-0.364284,0.032916,-0.014026,0.001318,-0.044954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2245,-0.034511,0.142406,-0.076045,-0.169038,-0.347430,0.195750,0.255785,-0.156305,-0.098385,1.718950,-0.535635,-0.502350,-0.040765,-0.318815,-0.459115,-0.253130,-0.078105,1.428200,0.195763,-0.295972,0.021475,0.030290,0.083015,-0.045806,-0.010744,0.282547,-0.271835,-0.258925,0.409295,0.246101,-0.331135,-0.306340,0.203410,0.081661,-0.015985,0.177778,0.284569,-0.057042,-0.004840,-0.018530,...,0.171306,0.028683,0.016679,-0.283675,-0.136310,-0.339958,-0.467860,0.514261,0.356895,-0.117895,-0.055395,0.212670,-0.144216,-0.046939,0.007630,0.321835,0.031520,-0.168315,0.069915,0.273982,0.207185,-0.274439,-0.061435,0.183375,0.065865,-0.284805,0.221800,-0.189905,0.196380,-0.139263,-0.352385,0.465705,-0.064376,-0.060780,0.363360,-0.476825,-0.501345,0.000425,-0.210417,0.073737
2246,-0.004789,0.374390,-0.207165,-0.206445,-0.019480,0.186485,0.182125,0.159710,-0.198281,2.378900,-0.477175,-0.294037,-0.003820,-0.129990,-0.626430,-0.244388,-0.222495,0.915410,0.029915,-0.293711,0.121675,-0.096365,-0.135737,-0.049957,-0.082002,-0.072459,-0.274270,-0.125365,0.385235,-0.112439,-0.140148,0.108375,-0.013470,0.018148,0.139887,0.373485,0.393000,0.055950,0.040880,-0.024154,...,0.267235,0.152516,0.013186,-0.233195,0.010948,0.092987,-0.432301,0.609255,0.209655,0.300960,-0.268227,0.125113,-0.319565,-0.157934,-0.137330,0.252946,0.146077,-0.252138,-0.168861,0.029968,-0.028875,-0.258153,-0.138965,0.253430,0.149507,0.112510,0.501475,-0.333675,0.111890,0.037387,-0.244710,0.272275,-0.250735,-0.160248,0.331720,-0.322605,-0.424230,0.058473,0.212533,0.051537
2247,-0.084311,0.417737,-0.173832,-0.105170,0.021784,-0.120931,0.073115,0.013163,-0.168329,2.483167,-0.363793,-0.389302,0.044197,-0.247843,-0.486437,-0.154800,-0.156825,0.811610,0.029527,-0.290079,0.011570,-0.011071,-0.003125,-0.007968,-0.023929,0.070772,-0.319400,-0.077618,0.079347,-0.089260,-0.044423,-0.035073,0.190806,-0.090901,0.161034,0.408157,0.176434,0.044377,0.064029,0.027037,...,0.314932,0.124680,-0.140748,-0.270117,-0.032950,-0.144860,-0.284543,0.347410,0.312723,0.315987,-0.192573,0.090016,-0.130831,0.149608,-0.120597,0.260593,0.043395,-0.072711,-0.139984,-0.050124,0.109053,-0.142953,-0.117130,0.228610,0.030739,0.249270,0.353787,-0.239242,-0.000700,0.135938,-0.275337,0.040247,-0.039220,-0.137000,0.168200,-0.376473,-0.204331,0.026930,0.062849,0.056905
2248,0.141238,0.173069,0.263020,0.102215,-0.190480,0.095840,0.000140,0.183595,-0.033345,2.199250,-0.360690,-0.327745,-0.014490,-0.089899,-0.347910,-0.379645,-0.072225,1.475250,0.078280,-0.333235,0.048855,-0.138830,-0.207375,-0.049837,0.105768,0.193962,-0.295910,-0.028269,0.040800,0.336596,-0.198095,-0.106408,0.086575,-0.099034,0.053480,0.429690,0.338295,-0.174745,0.164702,-0.065228,...,0.064475,0.195480,0.165443,-0.255814,-0.048926,-0.248243,-0.463910,0.510254,0.299335,-0.032890,-0.088825,0.282360,-0.174902,-0.077014,0.117950,0.024640,0.111570,0.106830,-0.180541,-0.036753,0.013751,-0.086320,-0.354385,0.516860,0.209490,0.108895,0.250075,-0.313370,0.255685,0.009313,-0.147990,0.458210,0.011245,-0.214987,0.206380,-0.204490,-0.365450,-0.164570,-0.100182,-0.094973


In [113]:
num_clusters = 5
# Pin the seed so the clustering (and the score printed below) is reproducible
# across kernel restarts; several other K-Means cells in this notebook also
# pass random_state.
k_means_obj = KMeans(n_clusters=num_clusters, random_state=42)
k_means_obj.fit(spacy_large_utterance_vector_df)
labels = k_means_obj.labels_

## Calculating silhouette score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(spacy_large_utterance_vector_df, labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

# Map each cluster id to the positional indices of the rows assigned to it.
cluster_indices_dict = {}
for i in range(num_clusters):
    cluster_indices_dict[i] = np.where(labels == i)[0]

Fitted 5 clusters, got a Silhoutte Score of 0.2354484488243004


In [146]:
num_clusters = 10

# Seeded K-Means over the alphabetic-token utterance vectors.
k_means_obj = KMeans(random_state=19, n_clusters=num_clusters)
k_means_obj.fit(spacy_large_utterance_vector_df)
labels = k_means_obj.labels_

# Cosine-metric silhouette score for this labelling.
sil_score = silhouette_score(
    spacy_large_utterance_vector_df, labels, metric="cosine"
)
print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

# cluster id -> positional indices of the rows in that cluster
cluster_indices_dict = {
    cluster_id: np.flatnonzero(labels == cluster_id)
    for cluster_id in range(num_clusters)
}

Fitted 10 clusters, got a Silhoutte Score of 0.33135953789345596


In [154]:
num_clusters = 15
# Pin the seed so the clustering (and the score printed below) is reproducible
# across kernel restarts; several other K-Means cells in this notebook also
# pass random_state.
k_means_obj = KMeans(n_clusters=num_clusters, random_state=42)
k_means_obj.fit(spacy_large_utterance_vector_df)
labels = k_means_obj.labels_

# Silhouette score with the cosine metric for the fitted labels.
sil_score = silhouette_score(spacy_large_utterance_vector_df, labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

# Map each cluster id to the positional indices of the rows assigned to it.
cluster_indices_dict = {}
for i in range(num_clusters):
    cluster_indices_dict[i] = np.where(labels == i)[0]

Fitted 15 clusters, got a Silhoutte Score of 0.3742338246477373
