## User Input: Number of Clusters

In [None]:
# Cluster count used for every component; fixed for the whole run.
num_clusters: int = 5

In [None]:
# import packages
import pandas as pd
import numpy as np
import utils
import pickle
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.decomposition import PCA

# Load grouped data by component from saved pickle

In [None]:
# Read the grouped data back from the saved pickle.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted preprocessing run.
with open("DummyPreproccessedForTfidf.pickle", mode="rb") as fh:
    grouped_data = pickle.load(fh)

In [None]:
# Display the object loaded from the pickle (shown in full).
# NOTE(review): later cells index it as grouped_data[<group>]['<column>'];
# consider showing only the keys or a .head() per group if the output is large.
grouped_data

# Examine ALPHA Subgroup

In [None]:
# Subgroups that go through TF-IDF vectorization and k-means clustering; only ALPHA here.
groups = ["ALPHA"]

# Vectorization: TF-IDF

In [None]:
### VECTORIZATION ###
from sklearn.feature_extraction.text import TfidfVectorizer

# Per-group lookups filled in by the loop below:
#   vocabulary[g]      -> the fitted vectorizer's vocabulary_ mapping
#   vocabulary_freq[g] -> result of utils.count_vocab_freq for that group's corpus
vocabulary = {}
vocabulary_freq = {}
for g in groups:
    tfidf_vect = TfidfVectorizer()
    message_list = list(grouped_data[g]['Requirement'])
    X = tfidf_vect.fit_transform(message_list)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and fall back only on releases that predate get_feature_names_out().
    if hasattr(tfidf_vect, 'get_feature_names_out'):
        feature_names = tfidf_vect.get_feature_names_out()
    else:
        feature_names = tfidf_vect.get_feature_names()
    document_term_matrix = pd.DataFrame(X.toarray(), columns=feature_names)
    vocabulary[g] = tfidf_vect.vocabulary_
    vocabulary_freq[g] = utils.count_vocab_freq(vocabulary[g],
                                                corpus=message_list)
    print(g, X.shape)
# NOTE: g, X and tfidf_vect keep the values from the last group in `groups`
# after the loop ends.

In [None]:
# View the first 50 vocabulary-frequency entries for the selected subgroup.
# vocabulary_freq is keyed by group name, so the group's own mapping has to be
# selected before slicing; slicing the outer dict's items would only return the
# single (group, mapping) pair.
list(vocabulary_freq[g].items())[:50]

# Suggestion: Look into the acronyms used

# Word Cloud Visualization

In [None]:
# Draw the subgroup's vocabulary frequencies as a word cloud
print('One System: Word Cloud \n')
one_system_word_cloud = WordCloud(width=3000, height=2000)
one_system_word_cloud.generate_from_frequencies(vocabulary_freq[g])
one_system_word_cloud.background_color = 'white'
fig, ax = plt.subplots(figsize=[15, 10])
ax.axis('off')
ax.imshow(one_system_word_cloud)
# plt.savefig('one_system_word_cloud.png')

# Principal Component Analysis

In [None]:
# for component_name in list(groups):
#     # up to 95% of total variance
#     pca = PCA(n_components=0.95, svd_solver='full')
#     prin_comp = pca.fit_transform(X.todense())
#     cumsum_variance = np.cumsum(pca.explained_variance_)
    
#     print(f'Component Name = {component_name}:')

#     print(f'# of eigenvalues (principal components) needed to reach '
#           f'{100*pca.n_components}% of explained variance: {pca.n_components_}')

#     print(f'Cumulative explained variance per principal component: '
#           f'{cumsum_variance[:4]} ... {cumsum_variance[-4:]} \n')

# K-means on Subgroup

In [None]:
# Display the subgroup key currently held in g (set by the `for g in groups` loop above).
g

In [None]:
### K Means ###

from sklearn.cluster import KMeans

# num_clusters = 6 # number of labels in eval_df
# Fit k-means (k-means++ seeding, fixed random_state) on the matrix X
model = KMeans(n_clusters=num_clusters, init='k-means++', random_state=5)
model.fit(X)

# Number of rows assigned to each cluster label
sizes = np.unique(model.labels_, return_counts=True)[1]
print('Cluster sizes: ', sizes)

# Store each row's cluster label next to the subgroup's data, then preview it
grouped_data[g]['predicted'] = model.labels_
grouped_data[g].head()

# Visualize: What's in the clusters? 

In [None]:
# Show first 3 requirements in each cluster, plus the word cloud.
# Everything computed here uses cluster-local names so the notebook-level
# X / vocabulary / vocabulary_freq / tfidf_vect built for the whole subgroup
# are not replaced by per-cluster values.
subgroup = grouped_data[g]
for count in range(num_clusters):
    in_cluster = subgroup['predicted'] == count
    cluster_descriptions = list(subgroup['Description'].loc[in_cluster])
    cluster_requirements = list(subgroup['Requirement'].loc[in_cluster])

    # TfidfVectorizer cannot be fitted on an empty corpus; report and move on.
    if not cluster_requirements:
        print(f"Cluster #: {count}. No requirements assigned to this cluster.")
        print("\n")
        continue

    cluster_vectorizer = TfidfVectorizer()
    cluster_tfidf = cluster_vectorizer.fit_transform(cluster_requirements)
    cluster_vocab_freq = utils.count_vocab_freq(cluster_vectorizer.vocabulary_,
                                                corpus=cluster_requirements)

    # Word Cloud
    word_cloud = WordCloud(width=3000, height=2000).generate_from_frequencies(cluster_vocab_freq)
    word_cloud.background_color = 'white'

    print(f"Cluster #: {count}. Dimensions of TF-IDF matrix: {cluster_tfidf.shape}")
    print(cluster_descriptions[:3])
    print("\n")
    print(cluster_requirements[:3])
    print("\n")
    print(f"Vocab Frequency: {list(cluster_vocab_freq.items())[:30]}")

    plt.figure(figsize=[15,10])
    plt.axis('off')
    plt.imshow(word_cloud)
    print("\n")

In [None]:
### VISUALIZE ### What requirements are in the cluster (first 20 only)?

# Column to print for each cluster; switch the active line to change it
view = 'Requirement' # preprocessed message
# view = 'Summary' # original summary message

for cluster_number in range(num_clusters):
    # rows of the chosen column whose predicted label matches this cluster
    members = grouped_data[g][view].loc[grouped_data[g]['predicted'] == cluster_number]
    print(f'Component Name: {g}')
    print(f'Cluster Number: {cluster_number}')
    print(f"{list(members.head(20))} \n")

# How to determine optimal K clusters

In [None]:
### Determining optimal K clusters ###
# Uses elbow method and silhouette coefficient
# This takes a while...
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Build the TF-IDF matrix for the whole subgroup here instead of relying on the
# notebook-global X, which other cells may reassign to a different matrix.
X_group = TfidfVectorizer().fit_transform(list(grouped_data[g]['Requirement']))

silhouette_coefficients = []
Sum_of_squared_distances = []
# silhouette_score requires 2 <= k <= n_samples - 1, so cap the candidate range
# at the number of rows (the range's upper bound is exclusive).
K = range(2, min(100, X_group.shape[0]))
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}
for k in K:
    km = KMeans(n_clusters=k, **kmeans_kwargs).fit(X_group)
    # inertia_ feeds the elbow plot; the silhouette score feeds the next cell's plot
    Sum_of_squared_distances.append(km.inertia_)
    silhouette_coefficients.append(silhouette_score(X_group, km.labels_))

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Silhouette coefficient for every candidate k, drawn with the fivethirtyeight style
plt.style.use("fivethirtyeight")
fig, ax = plt.subplots()
ax.plot(K, silhouette_coefficients)
ax.set_xticks(K)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Silhouette Coefficient")
plt.show()

In [None]:
# Find the candidate k with the highest silhouette coefficient.
max_value = max(silhouette_coefficients)
# max_index is a position in silhouette_coefficients, NOT a number of clusters
max_index = silhouette_coefficients.index(max_value)
# K holds the candidate k values in the same order the scores were appended,
# so indexing it converts the position into the actual k.
best_k = K[max_index]
print(f'Best silhouette coefficient {max_value:.4f} at k = {best_k} (list index {max_index})')

# Using YAKE for Keyword Extraction for each cluster

In [None]:
# Create clusters_data to more easily access the requirements in each cluster.
# The 'predicted' labels were assigned by the k-means model fitted with
# num_clusters clusters, so that is the number of clusters to collect;
# clusters_data[i] holds the 'Requirement' texts labelled i.
clusters_data = [
    list(grouped_data[g]['Requirement'].loc[grouped_data[g]['predicted'] == cluster_number])
    for cluster_number in range(num_clusters)
]

In [None]:
# Display every requirement text collected for the first cluster (index 0).
clusters_data[0]

In [None]:
### Use YAKE on each cluster and save the keywords for each cluster

import yake
from sklearn.feature_extraction.text import CountVectorizer

language = "en"
max_ngram_size = 1
deduplication_thresold = 0.9
deduplication_algo = 'seqm'
windowSize = 1
numOfKeywords = 50
# cluster number -> list of (keyword, score) tuples gathered from every
# requirement in that cluster
yake_keywords_clusters = {}
custom_kw_extractor = yake.KeywordExtractor(lan=language,
                                            n=max_ngram_size,
                                            dedupLim=deduplication_thresold,
                                            dedupFunc=deduplication_algo,
                                            windowsSize=windowSize,
                                            top=numOfKeywords,
                                            features=None)
# Iterate over the clusters that were actually collected so the loop can never
# index past the end of clusters_data.
for cluster_number, cluster_texts in enumerate(clusters_data):
    print(cluster_number)
    keywords = []      # one space-joined keyword string per requirement
    best_scores = {}   # keyword -> lowest score seen anywhere in the cluster
    for text in cluster_texts:
        kw = custom_kw_extractor.extract_keywords(text)
        keywords.append(' '.join([word for word, score in kw]))
        for word, score in kw:
            # YAKE documents lower scores as more relevant, so keep the minimum
            if score < best_scores.get(word, float('inf')):
                best_scores[word] = score
    # Save keywords from ALL requirements in the cluster (most relevant first),
    # not just those extracted from the last requirement processed.
    yake_keywords_clusters[cluster_number] = sorted(best_scores.items(),
                                                    key=lambda pair: pair[1])

    # CountVectorizer raises on a corpus without any tokens; skip such clusters.
    if not any(keywords):
        continue

    # Unigram counts only (a lower n-gram bound of 0 is not a meaningful size).
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))
    # Keyword count matrix for this cluster; kept under its own name so the
    # notebook-level X is not overwritten.
    keyword_counts = vectorizer.fit_transform(keywords)
#     print('Keyword count matrix shape: ', end=' ')
#     print(cluster_number, keyword_counts.shape)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and fall back only on releases that predate get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    document_term_matrix = pd.DataFrame(keyword_counts.toarray(),
                                        columns=feature_names)
    
# with open("yake_vocabulary.txt", "w") as text_file:
#     text_file.write(str(yake_vocab))

In [None]:
# Display the keywords saved for cluster 0.
yake_keywords_clusters[0]
# TODO: for each cluster, change list to dictionary