In [177]:
import nltk
import gensim
from gensim.models import FastText
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize,wordpunct_tokenize
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


In [232]:
# Loading Stopwords
with open("StopWords.txt") as f:
    stop_words = f.read()

stop_words = stop_words.split("\n")
stop_words_dict = {}
for stop_word in stop_words:
    stop_words_dict[stop_word] = 1

# Function to remove stopwords,numeric and alpha numeric characters  
def _remove_stop_words_keep_alphabetic_tokens(sentence):
    tokens = sentence.split()
    new_tokens = []
    
    for token in tokens:
        if token in stop_words_dict:
            continue
        elif not token.isalpha():
            continue
        else:
            new_tokens.append(token)
            
    return " ".join(new_tokens)


# Function to calculate sentence embeddings
def calculate_sentence_embeddings(tokens):
    Vector_mean=np.mean(FastText_embedding.vectors,axis=0)
    word_embedding = np.zeros((len(tokens),FastText_embedding.vector_size))
    
    for index,word in enumerate(tokens):
        embedding = FastText_embedding.get_vector(word)
        if embedding is not None:
            word_embedding[index,:] = embedding
        else:
            word_embedding[index,:] = Vector_mean
    
    sentence_embedding = (np.sum(word_embedding,axis=0))/len(tokens)
    return sentence_embedding



# Funtion to remove stop words
def _remove_stop_words_from_sentence(sentence):
    tokens = sentence.split()
    new_tokens = []
    
    for token in tokens:
        if token in stop_words_dict:
            continue
        else:
            new_tokens.append(token)
            
    return " ".join(new_tokens)
    

In [162]:
# Loading the required dataset
utterances_data = pd.read_csv("utterance_data.csv")
utterances_data["utterance"]

0       i need $20000 transferred from my savings to m...
1       complete a transaction from savings to checkin...
2       transfer $20000 from my savings account to che...
3         take $20000 from savings and put it in checking
4       put $20000 into my checking account from my sa...
                              ...                        
2245                              give weather update now
2246                             want to know the weather
2247                        tell me the weather for today
2248                     what is the current weather like
2249                              las vegas weather today
Name: utterance, Length: 2250, dtype: object

In [50]:
# Loading Fasttext pretrained model for english language
FastText_embedding = gensim.models.fasttext.load_facebook_vectors("wiki.en.bin")

## 1. Clustering without removing stopwords


In [208]:
# tokenizing sentences 
tokenized_data = utterances_data["utterance"].apply(word_tokenize)

In [None]:
# fectching emebeddings for the utterances data
FastText_embedding_for_data = load_embeddings(utterances_data["utterance"])
data_embeddings = np.zeros((tokenized_data.shape[0],FastText_embedding.vector_size))

for index,sentence in enumerate(tokenized_data):
    data_embeddings[index,:]=calculate_sentence_embeddings(sentence)

## 5 Clusters

In [203]:
num_clusters =5
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 5 clusters, got a Silhoutte Score of 0.11621700013158452


## 10 Clusters

In [204]:
num_clusters =10
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 10 clusters, got a Silhoutte Score of 0.17415951924692652


## 15 Clusters

In [205]:
num_clusters =15
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 15 clusters, got a Silhoutte Score of 0.18029232457404518


## After preprocessing

## a. Removing Stopwords

In [235]:
# removing stopwordsand tokenizing sentences 
no_stopwords_tokenized_data = utterances_data["utterance"].apply(_remove_stop_words_from_sentence).apply(word_tokenize)

In [237]:
# fectching emebeddings for the utterances data

no_stopwords_data_embeddings = np.zeros((no_stopwords_tokenized_data.shape[0],FastText_embedding.vector_size))

for index,sentence in enumerate(no_stopwords_tokenized_data):
    no_stopwords_data_embeddings[index,:]=calculate_sentence_embeddings(sentence)

no_stopwords_data_embeddings = pd.DataFrame(no_stopwords_data_embeddings).fillna(value=df.mean(axis=0))

  sentence_embeddings = (np.sum(word_embedding,axis=0))/len(tokens)


## 5 Clusters

In [238]:
num_clusters =5
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(no_stopwords_data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(no_stopwords_data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 5 clusters, got a Silhoutte Score of 0.16123890891394987


## 10 Clusters

In [239]:
num_clusters =10
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(no_stopwords_data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(no_stopwords_data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 10 clusters, got a Silhoutte Score of 0.2561561317872256


## 15 Clusters

In [240]:
num_clusters =15
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(no_stopwords_data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(no_stopwords_data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 15 clusters, got a Silhoutte Score of 0.30028990690370394


## b. Removing stopwords,nunmeric and alpha numeric characters

In [212]:
# removing stopwords,,nunmeric and alpha numeric  and tokenizing sentences 
preprocessed_tokenized_data = utterances_data["utterance"].apply(_remove_stop_words_keep_alphabetic_tokens).apply(word_tokenize)

In [228]:
# fectching emebeddings for the utterances data

preprocessed_data_embeddings = np.zeros((preprocessed_tokenized_data.shape[0],FastText_embedding.vector_size))

for index,sentence in enumerate(preprocessed_tokenized_data):
    preprocessed_data_embeddings[index,:]=calculate_sentence_embeddings(sentence)

preprocessed_data_embeddings = pd.DataFrame(preprocessed_data_embeddings).fillna(value=df.mean(axis=))

## 5 Clusters

In [229]:
num_clusters =5
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(preprocessed_data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(preprocessed_data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 5 clusters, got a Silhoutte Score of 0.1998719724165426


## 10 Clusters

In [230]:
num_clusters =10
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(preprocessed_data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(preprocessed_data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 10 clusters, got a Silhoutte Score of 0.2911512920189505


## 15 Clusters

In [231]:
num_clusters =15
k_means_obj = KMeans(n_clusters=num_clusters)
k_means_obj.fit(preprocessed_data_embeddings)
labels = k_means_obj.labels_

## Calculating silhoutte score
## The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters.

sil_score = silhouette_score(preprocessed_data_embeddings,labels, metric="cosine")

print(f"Fitted {num_clusters} clusters, got a Silhoutte Score of {sil_score}")

Fitted 15 clusters, got a Silhoutte Score of 0.3157928506490441
