## User Input: Number of Clusters

In [None]:
# number of clusters for each component (fixed)
# Note: There are 19 unique components for this dataset.
# The same value is applied to every component; the K-means cell further down
# falls back to a single cluster when a component has fewer rows than this.
num_clusters = 5

In [None]:
# import packages
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.decomposition import PCA

# Load grouped data by component from saved pickle

In [None]:
# Load the object that later cells index by component name (grouped_data[g]).
# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# only open pickle files that come from a trusted source.
with open("DummyPreproccessedForDoc2Vec.pickle", "rb") as pickle_file:
    grouped_data = pickle.load(pickle_file)

In [None]:
# Display the loaded object to sanity-check its contents.
# NOTE(review): this renders the whole object — presumably fine for the dummy data; confirm for larger inputs.
grouped_data

# Examine by Subgroup

In [None]:
# Select subgroup to perform Doc2Vec vectorization and k-means clustering.
# `groups` holds the grouped_data keys that the cells below iterate over.
groups = ['ALPHA']
# To process every component instead, use:
# groups = grouped_data.keys()

# Vectorization: Doc2Vec

In [None]:
# Doc to Vec
import gensim

def tagged_document(list_of_lists):
    """Lazily wrap each token list in a gensim ``TaggedDocument``.

    Each item of ``list_of_lists`` becomes the document's words, and its
    zero-based position in the input becomes the document's single tag.
    """
    make_tagged = gensim.models.doc2vec.TaggedDocument
    for doc_id, tokens in enumerate(list_of_lists):
        yield make_tagged(tokens, [doc_id])

In [None]:
# Maps each group name to its matrix of inferred Doc2Vec vectors (one row per requirement).
cluster_X = {}

for g in groups:
    # Each requirement is a space-separated string (inference below splits it on ' ').
    # Tokenise ONCE and reuse the tokens for both training and inference:
    # handing the raw string to TaggedDocument would make Doc2Vec iterate over
    # its characters, so the model would be trained on letters but queried with words.
    tokenized = [row.split(' ') for row in grouped_data[g]['Requirement']]
    corpus = list(tagged_document(tokenized))

    d2v = gensim.models.doc2vec.Doc2Vec(vector_size=200, dm=0, min_count=2, epochs=50, seed=5)
    # Re-seed the model's RNG before every stochastic step so re-running the cell
    # gives the same vectors.
    d2v.random.seed(5)
    d2v.build_vocab(corpus)
    d2v.random.seed(5)
    d2v.train(corpus, total_examples=d2v.corpus_count, epochs=d2v.epochs)

    # Infer one vector per requirement with the fitted language model.
    X = []
    for tokens in tokenized:
        d2v.random.seed(5)
        X.append(d2v.infer_vector(tokens))

    X = np.array(X)
    cluster_X[g] = X
    print('Word embeddings shape: ', end=' ')
    print(X.shape)

# Principal Component Analysis

In [None]:
# for component_name in list(groups):
#     # up to 95% of total variance
#     pca = PCA(n_components=0.95, svd_solver='full')
#     prin_comp = pca.fit_transform(cluster_X[component_name])
#     cumsum_variance = np.cumsum(pca.explained_variance_)
    
#     print(f'Component Name = {component_name}:')

#     print(f'# of eigenvalues (principal components) needed to reach '
#           f'{100*pca.n_components}% of explained variance: {pca.n_components_}')

#     print(f'Cumulative explained variance per principal component: '
#           f'{cumsum_variance[:4]} ... {cumsum_variance[-4:]} \n')

# K-means on one Subgroup

In [None]:
# Cluster
from sklearn.cluster import KMeans

for g in groups:

    #FIXME: Each subgroup will have a different optimal K clusters...
    # Choose the cluster count for THIS group only. The notebook-level
    # `num_clusters` is left untouched, so one small group no longer forces
    # every later group (and every later cell) down to a single cluster, and
    # re-running this cell stays idempotent.
    group_k = num_clusters if len(grouped_data[g]) >= num_clusters else 1

    # n_init is pinned to 10 (the historical scikit-learn default) so the result
    # does not change when the library default changes.
    model = KMeans(n_clusters=group_k, init='k-means++', n_init=10, random_state=5).fit(cluster_X[g])
    _, sizes = np.unique(model.labels_, return_counts=True)
    print('Cluster sizes: ', end=' ')
    print(sizes)

    # Store the assigned cluster id next to each requirement for later inspection.
    grouped_data[g]['Predicted_Cluster_#'] = model.labels_
    # Print every 500th row as a light-weight sample of the labelled data.
    print(grouped_data[g][0:len(model.labels_):500])

# Doc2Vec: Inspect Requirements for Each Cluster

**Note:** For visualization purposes, shuffle the rows in the dataset for each cluster using `DataFrame.sample(frac=1)`.

In [None]:
# Show first 5 requirements in each cluster

# For every selected group, print the group name, then each cluster number
# followed by the first five requirements assigned to that cluster.
for g in groups:
    print(g)
    frame = grouped_data[g]
    for count in range(num_clusters):
        print(count)
        in_cluster = frame['Predicted_Cluster_#'] == count
        preview = frame['Requirement'].loc[in_cluster].head()
        print(list(preview))

In [None]:
### VISUALIZE ### What requirements are in the cluster?

# Update the following to inspect the clusters
view = 'Requirement' # Uncomment to view preprocessed message
# view = 'Summary' # Uncomment to view original summary message

# Iterate over the selected groups explicitly rather than relying on whatever
# value `g` was left holding by an earlier cell's loop.
for g in groups:
    for cluster_number in range(num_clusters):
        print(f'Component Name: {g}')
        print(f'Cluster Number: {cluster_number}')
        # Show up to 20 entries of the chosen column for this cluster.
        print(f"{list(grouped_data[g][view].loc[grouped_data[g]['Predicted_Cluster_#'] == cluster_number].head(20))} \n")

# Determine optimal K clusters

In [None]:
### Determining optimal K clusters ###
# Uses elbow method and silhouette coefficient
# This takes a while...
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def determine_optimal_k(K, data=None):
    """Plot elbow and silhouette curves for K-means and locate the best silhouette.

    Parameters
    ----------
    K : iterable of int
        Candidate cluster counts. Each value must satisfy
        2 <= k <= n_samples - 1, otherwise silhouette_score raises.
    data : array-like, optional
        Matrix to cluster. When omitted, the notebook-level ``X`` is used
        (the original behaviour). Passing it explicitly protects against ``X``
        having been rebound by another cell.

    Returns
    -------
    int
        The POSITION in ``K`` of the highest silhouette coefficient, not the
        cluster count itself; the corresponding count is ``K[max_index]``.

    Raises
    ------
    ValueError
        If ``K`` is empty.
    """
    if data is None:
        data = X  # fall back to the notebook-level matrix
    K = list(K)  # K is consumed several times below; also accepts generators
    if not K:
        raise ValueError('K must contain at least one candidate number of clusters')

    silhouette_coefficients = []
    sum_of_squared_distances = []
    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }
    for k in K:
        km = KMeans(n_clusters=k, **kmeans_kwargs).fit(data)
        sum_of_squared_distances.append(km.inertia_)
        silhouette_coefficients.append(silhouette_score(data, km.labels_))

    # Elbow plot: inertia for each candidate k.
    plt.plot(K, sum_of_squared_distances, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()

    # Silhouette plot. NOTE: style.use changes the global matplotlib style, so
    # it also affects every figure drawn after this call.
    plt.style.use("fivethirtyeight")
    plt.plot(K, silhouette_coefficients)
    plt.xticks(K)
    plt.xlabel("Number of Clusters")
    plt.ylabel("Silhouette Coefficient")
    plt.show()

    # First position of the maximum silhouette coefficient.
    max_index = silhouette_coefficients.index(max(silhouette_coefficients))
    print('max_index',max_index)
    return max_index

In [None]:
# Candidate numbers of clusters to evaluate.
# silhouette_score only accepts 2 <= k <= n_samples - 1, so the upper bound is
# capped by the number of embedded requirements instead of being hand-edited
# per dataset (presumably the reason range(2, 100) once had to become range(2, 87)).
K = range(2, min(100, X.shape[0]))
max_index = determine_optimal_k(K)
# max_index is a position in K, not a cluster count; translate it back to k.
optimal_k = K[max_index]
print('optimal k', optimal_k)

# YAKE

In [None]:
# Create clusters_data to more easily access the requirements in each cluster.
# clusters_data[i] is the list of requirements assigned to cluster i for group `g`
# (the last group iterated by the cells above).
# The number of clusters is read from the labels actually stored in
# 'Predicted_Cluster_#'. `max_index` is only a list position returned by the
# silhouette search and does not say how many clusters were predicted.
cluster_labels = grouped_data[g]['Predicted_Cluster_#']
clusters_data = [
    list(grouped_data[g]['Requirement'].loc[cluster_labels == cluster_number])
    for cluster_number in range(int(cluster_labels.max()) + 1)
]

In [None]:
# clusters_data[0]

In [None]:
### Use YAKE on each cluster and save the keywords for each cluster

import yake
from sklearn.feature_extraction.text import CountVectorizer

# YAKE configuration
language = "en"
max_ngram_size = 1
deduplication_thresold = 0.9
deduplication_algo = 'seqm'
windowSize = 1
numOfKeywords = 10

# cluster number -> list of (keyword, score) pairs found in that cluster
yake_keywords_clusters = {}

# The extractor is configured identically for every cluster, so build it once.
custom_kw_extractor = yake.KeywordExtractor(lan=language,
                                            n=max_ngram_size,
                                            dedupLim=deduplication_thresold,
                                            dedupFunc=deduplication_algo,
                                            windowsSize=windowSize,
                                            top=numOfKeywords,
                                            features=None)

# Walk the clusters that actually exist in clusters_data instead of a separately
# computed count, so the loop can never index past the end of the list.
for cluster_number, cluster_texts in enumerate(clusters_data):
    keywords = []          # one space-joined keyword string per requirement
    cluster_keywords = []  # every (keyword, score) pair extracted in this cluster
    for text in cluster_texts:
        kw = custom_kw_extractor.extract_keywords(text)
        cluster_keywords.extend(kw)
        keywords.append(' '.join([word for word, score in kw]))
    # Keep the keywords of ALL requirements in the cluster; previously only the
    # result for the last requirement survived the inner loop.
    yake_keywords_clusters[cluster_number] = cluster_keywords

    # CountVectorizer raises on an empty vocabulary, so skip clusters that
    # produced no keywords at all.
    if not any(keywords):
        continue

    # Unigram counts of the extracted keywords (rows: requirements, columns: keywords).
    # ngram_range=(1, 1) states the unigram-only intent explicitly; a lower bound of 0 has no meaning.
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))
    # NOTE: this rebinds the notebook-level name X (previously the Doc2Vec matrix)
    # to a sparse document-term count matrix.
    X = vectorizer.fit_transform(keywords)
    # get_feature_names() was removed from newer scikit-learn releases; prefer
    # the replacement when it exists and fall back on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    document_term_matrix = pd.DataFrame(X.toarray(),
                                        columns=feature_names)
    
# with open("yake_vocabulary.txt", "w") as text_file:
#     text_file.write(str(yake_vocab))

In [None]:
# Display the dictionary of YAKE keyword results keyed by cluster number.
yake_keywords_clusters