## Reading the data

In [11]:
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import matplotlib.pyplot as plt

df = pd.read_csv("../../corpus_sprint2_balanced_cpsi.csv", encoding="utf-8")
df = df.dropna()

model = Doc2Vec.load("../../model_doc2vec_balanced_20epochs")

## One hot encoding and concatenating

In [12]:
vectors = model.docvecs.vectors_docs

In [13]:
#print(df)
one_hot = pd.Categorical(df['Product'])
df_dummies = pd.get_dummies(one_hot)

one_hot2 = pd.Categorical(df['Sub-product'])
df_dummies2 = pd.get_dummies(one_hot2)

one_hot3 = pd.Categorical(df['Issue'])
df_dummies3 = pd.get_dummies(one_hot3)

one_hot4 = pd.Categorical(df['Sub-issue'])
df_dummies4 = pd.get_dummies(one_hot4)

v_df = pd.DataFrame(vectors)
df_concat = pd.concat([v_df, df_dummies, df_dummies2, df_dummies3, df_dummies4], axis = 1)
print(df_concat)

               0         1         2         3         4         5         6  \
0       0.133436 -0.624834 -0.594827  0.570151  0.342196  0.020293  0.133006   
1      -0.318046  0.242658 -0.007933 -0.041328 -0.222670 -0.074306 -0.143333   
2       0.023057 -0.411038  0.192171  0.336090  0.266424  0.342068 -0.418245   
3      -0.008265 -0.581705  0.510446  0.362619  0.014993  0.225840 -0.340280   
4      -0.279806 -0.600087  0.297371  0.130487 -0.266591  0.507612 -0.490625   
...          ...       ...       ...       ...       ...       ...       ...   
111629 -0.203572  0.011650 -0.021257  0.030504  0.428672  0.291125  0.159538   
111630  0.261742 -0.519895 -0.311496 -0.137755  0.381747 -0.065420 -0.922225   
111631 -0.075438  0.441664  0.628079  0.151694  0.677278 -0.191139 -0.296138   
111632 -0.035411 -0.189166  0.361630 -0.160894  0.240512  0.507424 -0.449626   
111633 -0.204480  0.115633  0.179984 -0.154141 -0.375160  0.394541 -0.029626   

               7         8         9  .

## Applying k-means

In [14]:
from sklearn.cluster import KMeans
from sklearn import metrics

### Visual with elbow

In [15]:
inertia = []
k_min=1
k_max=25
for i in range(k_min, k_max):
    print(i)
    km = KMeans(n_clusters=i,init='k-means++')
    km.fit(df_concat)
    inertia.append(km.inertia_)

plt.plot(range(k_min,k_max), inertia,marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Cluster inertia')
plt.show()

1
2
3
4
5
6
7
8
9
10
11
12


KeyboardInterrupt: 

### Automatic with silhouette

In [8]:
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm
range_n_clusters = [2, 3, 4]

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(vectors) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(vectors)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(vectors, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(vectors, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(vectors[:, 0], vectors[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

plt.show()

2
Train done
Silhouette done
0.0722061
13
Train done
Silhouette done
0.20267813
Cluster id labels for inputted data
[9 9 7 ... 2 2 2]
Centroids data
[[-8.00356716e-02  3.50124314e-02 -8.20507854e-03 ... -3.36513040e-11
   1.28755346e-07 -1.01979822e-07]
 [ 1.38851963e-02  2.15379782e-02 -9.50428173e-02 ...  6.45741238e-11
   1.30617991e-07 -1.03842467e-07]
 [ 3.02012600e-02  1.05728664e-01 -5.62472753e-02 ...  2.09183781e-11
   1.29686669e-07 -1.02911144e-07]
 ...
 [-4.47234437e-02  2.66684126e-02 -1.10124908e-01 ... -1.62799552e-10
   1.26659870e-07 -9.94186848e-08]
 [ 6.79797083e-02  2.80941594e-02 -8.12916160e-02 ... -5.36601874e-11
   1.28522515e-07 -1.01514161e-07]
 [-8.99374038e-02 -3.88852283e-02 -9.72059965e-02 ... -3.15594662e-10
   1.24098733e-07 -9.68575478e-08]]
Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):
-352372.62
Silhouette_score: 
0.20267813
