## User Input: Number of Clusters

In [None]:
# Cluster count handed to K-means for each component group
# (a single fixed value shared by all groups).
num_clusters: int = 5

In [None]:
# import packages
import pandas as pd
import numpy as np
import utils
import pickle
from sklearn.decomposition import PCA
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Load grouped data by component from saved pickle

In [None]:
# Read the grouped-by-component data back from the saved pickle.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open("DummyPreproccessedForTfidf.pickle", "rb") as in_file:
    grouped_data = pickle.load(in_file)

In [None]:
# # Example: How to access a single document text:
# grouped_data['ALPHA']['Description'].iloc[0]

# Examine by Subgroup

In [None]:
# Collect the group names (the keys of grouped_data); the vectorization,
# vocabulary, word-cloud and K-means cells all iterate over `groups`.
groups = grouped_data.keys()

# Vectorization: TF-IDF

In [None]:
### VECTORIZATION ###
from sklearn.feature_extraction.text import TfidfVectorizer

# Per-group results, keyed by group name
vocabulary = {}       # fitted vectorizer's vocabulary_ mapping
vocabulary_freq = {}  # output of utils.count_vocab_freq for the group's corpus
group_X = {}          # TF-IDF matrix returned by fit_transform
for g in groups:
    # A fresh vectorizer per group, so every group is fitted on its own documents only
    tfidf_vect = TfidfVectorizer()
    message_list = list(grouped_data[g]['Requirement'])
    X = tfidf_vect.fit_transform(message_list)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # and fall back to the old name only on installs that predate it.
    if hasattr(tfidf_vect, 'get_feature_names_out'):
        feature_names = tfidf_vect.get_feature_names_out()
    else:
        feature_names = tfidf_vect.get_feature_names()

    # Dense view of the matrix with one column per term (overwritten on every iteration)
    document_term_matrix = pd.DataFrame(X.toarray(),
                                        columns=feature_names)
    vocabulary[g] = tfidf_vect.vocabulary_
    vocabulary_freq[g] = utils.count_vocab_freq(vocabulary[g],
                                                corpus=message_list)
    group_X[g] = X

    print(g, X.shape)

In [None]:
# Save the per-group TF-IDF matrices (group_X) to disk
with open("VectorizationTfidf.pickle", "wb") as out_file:
    pickle.dump(group_X, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# View the vocabulary frequencies of a single component group.
# The lookup is the cell's last expression, so the notebook displays its value.
component_group = 'ALPHA' # change to examine each component vocabulary
vocabulary_freq[component_group]

# Suggestion: Look into acronyms used

In [None]:
# Print the first 30 vocabulary-frequency items of every component group
for component_name in groups:
    preview = list(vocabulary_freq[component_name].items())[:30]
    print(f'Component Name = {component_name}:')
    print(f'{preview} \n')

# Word Cloud Visualization

In [None]:
# Draw one word cloud per group from that group's vocabulary frequencies
for g in groups:
    print(f'{g}: Word Cloud')
    # Set the background colour in the constructor instead of patching the attribute afterwards
    word_cloud = WordCloud(width=3000, height=2000,
                           background_color='white').generate_from_frequencies(vocabulary_freq[g])
    plt.figure(figsize=[15,10])
    plt.axis('off')
    plt.imshow(word_cloud)
    # plt.savefig('word_cloud.png')
    # Render now so each figure appears directly under its printed label;
    # otherwise the inline backend only emits the figures when the cell finishes.
    plt.show()
    print('\n')

# Principal Component Analysis

In [None]:
# for component_name in list(groups):
#     # up to 95% of total variance
#     pca = PCA(n_components=0.95, svd_solver='full')
#     prin_comp = pca.fit_transform(group_X[component_name].todense())
#     cumsum_variance = np.cumsum(pca.explained_variance_)
    
#     print(f'Component Name = {component_name}:')

#     print(f'# of eigenvalues (principal components) needed to reach '
#           f'{100*pca.n_components}% of explained variance: {pca.n_components_}')

#     print(f'Cumulative explained variance per principal component: '
#           f'{cumsum_variance[:4]} ... {cumsum_variance[-4:]} \n')

# K-means on Subgroup

In [None]:
### K Means ###

from sklearn.cluster import KMeans

# Fit one K-means model per group on its TF-IDF matrix and store the
# resulting cluster label of every row in a 'predicted' column.
for g in groups:

    #FIXME: Each subgroup will have a different optimal K clusters...
    # Decide k in a per-group local. Assigning to num_clusters here would
    # overwrite the notebook-wide setting, so every group processed after a
    # small one (and every re-run of this cell) would be clustered with k=1.
    if len(grouped_data[g]) < num_clusters:
        n_clusters_for_group = 1
    else:
        n_clusters_for_group = num_clusters

    model = KMeans(n_clusters=n_clusters_for_group,
                   init='k-means++',
                   random_state=5).fit(group_X[g])

    # Number of rows assigned to each cluster label
    _, sizes = np.unique(model.labels_, return_counts=True)
    print(g, 'Cluster sizes: ', end=' ')
    print(sizes)

    grouped_data[g]['predicted'] = model.labels_

In [None]:
# Save grouped_data (now carrying the 'predicted' cluster labels) to disk
with open("ClustersTfidf.pickle", "wb") as out_file:
    pickle.dump(grouped_data, out_file, protocol=pickle.HIGHEST_PROTOCOL)