In [2]:
import nltk
import gensim
import gensim.downloader as api
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from nltk.stem import WordNetLemmatizer


In [8]:
lemmatizer =  WordNetLemmatizer()

# Loading Stopwords
with open("StopWords.txt") as f:
    stop_words = f.read()

stop_words = stop_words.split("\n")
stop_words_dict = {}
for stop_word in stop_words:
    stop_words_dict[stop_word] = 1

# Function to remove stopwords,numeric and alpha numeric characters  
def _remove_stop_words_keep_alphabetic_tokens(sentence):
    tokens = sentence.split()
    new_tokens = []
    
    for token in tokens:
        token = lemmatizer.lemmatize(token)
        
        if token in stop_words_dict:
            continue
        elif not token.isalpha():
            continue
        else:
            new_tokens.append(token)
            
    return " ".join(new_tokens)

# # Function to calculate sentence embeddings
def calculate_sentence_embeddings(tokens):
    Vector_mean=np.mean(glove_embedding.vectors,axis=0)
    word_embedding = np.zeros((len(tokens),glove_embedding.vector_size))
    for index,word in enumerate(tokens):
        
        if word in glove_embedding.vocab.keys():
            embedding = glove_embedding.get_vector(word)
            word_embedding[index,:] = embedding
        else:
            word_embedding[index,:] = Vector_mean
    
    sentence_embeddings = (np.sum(word_embedding,axis=0))/len(tokens)
    
    return sentence_embeddings

# Funtion to remove stop words
def _remove_stop_words_from_sentence(sentence):
    tokens = sentence.split()
    new_tokens = []
    
    for token in tokens:
        token = lemmatizer.lemmatize(token)
        if token in stop_words_dict:
            continue
        else:
            new_tokens.append(token)
            
    return " ".join(new_tokens)
    

In [9]:
# Loading the required dataset
utterances_data = pd.read_csv("utterance_data.csv")
utterances_data["utterance"]

0       i need $20000 transferred from my savings to m...
1       complete a transaction from savings to checkin...
2       transfer $20000 from my savings account to che...
3         take $20000 from savings and put it in checking
4       put $20000 into my checking account from my sa...
                              ...                        
2245                              give weather update now
2246                             want to know the weather
2247                        tell me the weather for today
2248                     what is the current weather like
2249                              las vegas weather today
Name: utterance, Length: 2250, dtype: object

In [5]:
# Loading word2vec pretrained model for english language
glove_embedding = api.load('glove-wiki-gigaword-300')

## 1. Clustering without removing stopwords


In [10]:
# tokenizing sentences 
tokenized_data = utterances_data["utterance"].apply(word_tokenize)

In [11]:
# fectching emebeddings for the utterances data

data_embeddings = np.zeros((tokenized_data.shape[0],glove_embedding.vector_size))

for index,sentence in enumerate(tokenized_data):
    data_embeddings[index,:]=calculate_sentence_embeddings(sentence)

## 5 Clusters

In [12]:
num_clusters =5
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 5 clusters, got a Silhoutte Score of 0.16434900831205654


## 10 Clusters

In [13]:
num_clusters =10
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 10 clusters, got a Silhoutte Score of 0.1627663607728707


## 15 Clusters

In [14]:
num_clusters =15
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 15 clusters, got a Silhoutte Score of 0.20488595789482783


## After preprocessing

## Removing Stopwords

In [15]:
# removing stopwordsand tokenizing sentences 
no_stopwords_tokenized_data = utterances_data["utterance"].apply(_remove_stop_words_from_sentence).apply(word_tokenize)

In [18]:
# fectching emebeddings for the utterances data

no_stopwords_data_embeddings = np.zeros((no_stopwords_tokenized_data.shape[0],glove_embedding.vector_size))

for index,sentence in enumerate(no_stopwords_tokenized_data):
    no_stopwords_data_embeddings[index,:]=calculate_sentence_embeddings(sentence)

no_stopwords_data_embeddings = pd.DataFrame(no_stopwords_data_embeddings)

In [19]:
no_stopwords_data_embeddings.fillna(value=no_stopwords_data_embeddings.mean(axis=0),inplace=True)

## 5 Clusters

In [20]:
num_clusters =5
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(no_stopwords_data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(no_stopwords_data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 5 clusters, got a Silhoutte Score of 0.19795586113360233


## 10 Clusters

In [21]:
num_clusters =10
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(no_stopwords_data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(no_stopwords_data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 10 clusters, got a Silhoutte Score of 0.2769502434359487


## 15 Clusters

In [22]:
num_clusters =15
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(no_stopwords_data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(no_stopwords_data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 15 clusters, got a Silhoutte Score of 0.2769482453398693


### b. Removing stopwords,nunmeric and alpha numeric characters

In [23]:
# removing stopwords,,nunmeric and alpha numeric  and tokenizing sentences 
preprocessed_tokenized_data = utterances_data["utterance"].apply(_remove_stop_words_keep_alphabetic_tokens).apply(word_tokenize)

In [24]:
# fectching emebeddings for the utterances data

preprocessed_data_embeddings = np.zeros((preprocessed_tokenized_data.shape[0],glove_embedding.vector_size))

for index,sentence in enumerate(preprocessed_tokenized_data):
    preprocessed_data_embeddings[index,:]=calculate_sentence_embeddings(sentence)

preprocessed_data_embeddings = pd.DataFrame(preprocessed_data_embeddings)

  sentence_embeddings = (np.sum(word_embedding,axis=0))/len(tokens)


In [25]:
preprocessed_data_embeddings.fillna(value=preprocessed_data_embeddings.mean(axis=0),inplace = True)

## 5 Clusters

In [26]:
num_clusters =5
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(preprocessed_data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(preprocessed_data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 5 clusters, got a Silhoutte Score of 0.21104735684723155


## 10 Clusters

In [27]:
num_clusters =10
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(preprocessed_data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(preprocessed_data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 10 clusters, got a Silhoutte Score of 0.29494133011001644


## 15 Clusters

In [28]:
num_clusters =15
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(preprocessed_data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(preprocessed_data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 15 clusters, got a Silhoutte Score of 0.3561676161352428
