## User Input: Number of Clusters

In [None]:
# Number of K-Means clusters (fixed). This single value is passed to KMeans for both the
# TF-IDF and the Doc2Vec clustering, each of which is run on the whole dataset.
num_clusters = 5

In [None]:
# import packages
import pandas as pd
import numpy as np
import utils
import pickle
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import utils

# TF-IDF: Load cleaned dataset from saved pickle

In [None]:
# Clustering here is done once over the entire dataset rather than separately
# for each component, so the full cleaned table is loaded in one go.
# NOTE: pickle.load executes arbitrary code from the file -- only open pickles you produced yourself.
with open("cleaned_dummy_tfidf.pickle", "rb") as tfidf_pickle:
    cleaned_data = pickle.load(tfidf_pickle)

In [None]:
# Show dataset: as the last expression of the cell, the loaded table is rendered
# by Jupyter as the cell output.
cleaned_data

# TF-IDF: Vectorization

In [None]:
### VECTORIZATION ###
# Fit a TF-IDF model on the 'Requirement' column, one document per row.
tfidf_vect = TfidfVectorizer()
message_list = list(cleaned_data['Requirement'])
X = tfidf_vect.fit_transform(message_list)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()

# Dense view of the TF-IDF matrix with one column per vocabulary term.
document_term_matrix = pd.DataFrame(X.toarray(), columns=feature_names)

vocabulary = tfidf_vect.vocabulary_
vocabulary_freq = utils.count_vocab_freq(vocabulary, corpus=message_list)

# View size of document term matrix for TF-IDF
print(f'Document Term Matrix shape: {X.shape}')
print('\n')

# View vocabulary list (first 250 entries only, to keep the output readable)
print('Vocabulary:')
print(list(vocabulary_freq.items())[:250])

# TF-IDF: Word Cloud Visualization

In [None]:
# Draw a word cloud for the whole-dataset vocabulary, using the per-term
# frequencies computed during vectorization.
print('Dummy Dataset: Word Cloud \n')

cloud_builder = WordCloud(width=3000, height=2000)
dataset_word_cloud = cloud_builder.generate_from_frequencies(vocabulary_freq)
dataset_word_cloud.background_color = 'white'  # use a white background

plt.figure(figsize=[15,10])
plt.axis('off')  # hide the axes; only the image is of interest
plt.imshow(dataset_word_cloud)
# plt.savefig('dataset_word_cloud.png')

# TF-IDF: Principal Component Analysis

In [None]:
# # up to 95% of total variance
# pca = PCA(n_components=0.95, svd_solver='full')
# prin_comp = pca.fit_transform(X.toarray())  # .toarray() gives an ndarray; .todense() returns np.matrix, which recent scikit-learn rejects
# cumsum_variance = np.cumsum(pca.explained_variance_)

# print(f'# of eigenvalues (principal components) needed to reach '
#       f'{100*pca.n_components}% of explained variance: {pca.n_components_}')

# print(f'Cumulative explained variance per principal component: '
#       f'{cumsum_variance[:4]} ... {cumsum_variance[-4:]} \n')

# TF-IDF: K-Means for the Whole Dataset

In [None]:
### K Means ###

# Cluster the TF-IDF vectors of the whole dataset.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto' in 1.4,
# so leaving it implicit makes the labels depend on the installed version even though
# random_state is fixed.
model = KMeans(n_clusters=num_clusters,
               init='k-means++',
               n_init=10,
               random_state=5).fit(X)

# sizes[i] is the number of requirements labelled i; minlength keeps the array aligned
# with the cluster numbers even if some label never occurs.
sizes = np.bincount(model.labels_, minlength=num_clusters)
print('Cluster sizes (# of requirements per cluster): ', end=' ')
print(sizes)

# Attach the predicted cluster number to every requirement.
cleaned_data['Predicted_Cluster_#'] = model.labels_

In [None]:
# Persist the dataset, now including the 'Predicted_Cluster_#' column, for later use.
with open("Clusters_cleaned_dummy_tfidf.pickle", "wb") as clusters_out:
    pickle.dump(cleaned_data, clusters_out, protocol=pickle.HIGHEST_PROTOCOL)

# TF-IDF: Inspect Requirements for Each Cluster

**Note:** For visualization purposes, the rows of each cluster are shuffled with `DataFrame.sample(frac=1)` before being printed.

In [None]:
# Columns needed to inspect the clustering result.
data_reqs = cleaned_data[['Component/s', 'Requirement', 'Predicted_Cluster_#']]

# Per-cluster vocabularies keyed by cluster number. They are created once, before the
# loop, so the entries of every cluster are kept (re-creating them inside the loop
# would leave only the last cluster behind).
vocabulary = {}
vocabulary_freq = {}

for k in range(num_clusters):
    in_cluster = cleaned_data['Predicted_Cluster_#'] == k
    # Loop-local names are used on purpose: the whole-dataset `X`, `tfidf_vect`,
    # `message_list` and `document_term_matrix` must stay intact so that the K-Means
    # cell can be re-run after this one.
    cluster_descriptions = list(cleaned_data.loc[in_cluster, 'Description'])
    cluster_requirements = list(data_reqs.loc[in_cluster, 'Requirement'])

    if not cluster_requirements:
        # TfidfVectorizer cannot be fitted on an empty corpus.
        print(f"Cluster #: {k}. No requirements assigned to this cluster.")
        print("\n")
        continue

    # Re-fit TF-IDF on this cluster only to obtain the cluster's own vocabulary.
    cluster_vect = TfidfVectorizer()
    cluster_tfidf = cluster_vect.fit_transform(cluster_requirements)
    vocabulary[k] = cluster_vect.vocabulary_
    vocabulary_freq[k] = utils.count_vocab_freq(vocabulary[k],
                                                corpus=cluster_requirements)
    # Word Cloud
    word_cloud = WordCloud(width=3000, height=2000).generate_from_frequencies(vocabulary_freq[k])
    word_cloud.background_color = 'white'

    print(f"Cluster #: {k}. Dimensions of TF-IDF matrix: {cluster_tfidf.shape}")
    print(cluster_descriptions[:3])
    print("\n")
    print(cluster_requirements[:3])
    print("\n")
    print(f"Vocab Frequency: {list(vocabulary_freq[k].items())[:30]}")

    plt.figure(figsize=[15,10])
    plt.axis('off')
    plt.imshow(word_cloud)
    # Emit the figure now so it appears next to this cluster's text output instead of
    # all figures being appended after the loop has finished.
    plt.show()
    print("\n")
    # Shuffle the rows for display; seeded (same value as the K-Means random_state) so
    # that a fresh run prints the same order.
    print(data_reqs.loc[in_cluster].sample(frac=1, random_state=5))
    print('\n')

# Doc2Vec: Load cleaned dataset from saved pickle

In [None]:
# As with TF-IDF, clustering is run on the whole dataset instead of component by
# component -- some test cases can span requirements from multiple system components.
# NOTE: pickle.load executes arbitrary code from the file -- only open pickles you produced yourself.
with open("cleaned_dummy_doc2vec.pickle", "rb") as doc2vec_pickle:
    cleaned_data_doc = pickle.load(doc2vec_pickle)

In [None]:
# Show the dataset loaded for Doc2Vec (rendered by Jupyter as the cell output).
cleaned_data_doc

# Doc2Vec: Vectorization

In [None]:
# Doc to Vec
import gensim

def tagged_document(list_of_lists):
    """Lazily wrap each entry of ``list_of_lists`` in a gensim ``TaggedDocument``.

    Every entry is passed through as the document's words, and its position in
    ``list_of_lists`` (0-based) is used as the document's single tag.

    Yields:
        gensim.models.doc2vec.TaggedDocument: one per entry, in input order.
    """
    make_document = gensim.models.doc2vec.TaggedDocument
    for position, tokens in enumerate(list_of_lists):
        yield make_document(tokens, [position])

In [None]:
# Tokenise once so that training and inference see the same word-level tokens.
# 'Requirement' holds plain strings (inference splits them on spaces); passing a raw
# string as a TaggedDocument's words makes gensim iterate it character by character,
# so the model would be trained on letters and then queried with words.
tokenized_reqs = [row.split(' ') for row in cleaned_data_doc['Requirement']]

corpus = list(tagged_document(tokenized_reqs))
# workers=1: the gensim documentation notes that a fixed seed only gives reproducible
# results when training is limited to a single worker thread.
d2v = gensim.models.doc2vec.Doc2Vec(vector_size=200, dm=0, min_count=2, epochs=50, seed=5,
                                    workers=1)
d2v.random.seed(5)
d2v.build_vocab(corpus)
d2v.random.seed(5)
d2v.train(corpus, total_examples=d2v.corpus_count, epochs=d2v.epochs)

# Infer one embedding per requirement with the trained model. The model's random state
# is re-seeded before every call so each inferred vector is repeatable.
X = []
for tokens in tokenized_reqs:
    d2v.random.seed(5)
    X.append(d2v.infer_vector(tokens))

# Stack the per-requirement vectors into a single 2-D array for clustering.
cluster_X = np.array(X)
print('Word embeddings shape: ', end=' ')
print(cluster_X.shape)

# Doc2Vec: Principal Component Analysis

In [None]:
# # up to 95% of total variance
# pca = PCA(n_components=0.95, svd_solver='full')
# prin_comp = pca.fit_transform(cluster_X)  # cluster_X is already a dense NumPy array (it has no .todense())
# cumsum_variance = np.cumsum(pca.explained_variance_)

# print(f'# of eigenvalues (principal components) needed to reach '
#       f'{100*pca.n_components}% of explained variance: {pca.n_components_}')

# print(f'Cumulative explained variance per principal component: '
#       f'{cumsum_variance[:4]} ... {cumsum_variance[-4:]} \n')

# Doc2Vec: K-Means for the Whole Dataset

In [None]:
### K Means ###

# Cluster the Doc2Vec embeddings of the whole dataset.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto' in 1.4,
# so leaving it implicit makes the labels depend on the installed version even though
# random_state is fixed.
model = KMeans(n_clusters=num_clusters,
               init='k-means++',
               n_init=10,
               random_state=5).fit(cluster_X)

# sizes[i] is the number of requirements labelled i; minlength keeps the array aligned
# with the cluster numbers even if some label never occurs.
sizes = np.bincount(model.labels_, minlength=num_clusters)
print('Cluster sizes (# of requirements per cluster): ', end=' ')
print(sizes)

# Attach the predicted cluster number to every requirement.
cleaned_data_doc['Predicted_Cluster_#'] = model.labels_

# Doc2Vec: Inspect Requirements for Each Cluster

**Note:** For visualization purposes, the rows of each cluster are shuffled with `DataFrame.sample(frac=1)` before being printed.

In [None]:
# Columns needed to inspect the Doc2Vec clustering result.
data_reqs = cleaned_data_doc[['Component/s', 'Requirement', 'Predicted_Cluster_#']]

for k in range(num_clusters):
    cluster_rows = data_reqs.loc[data_reqs['Predicted_Cluster_#'] == k]
    # Shuffle the rows for display; seeded (same value as the K-Means random_state) so
    # that a fresh run prints the same order.
    print(cluster_rows.sample(frac=1, random_state=5))
    print('\n')
    print('\n')