In [None]:
## Importing the necessary libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import linear_sum_assignment
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Activation
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import Sequential
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Ignore every warning for the rest of the session; this keeps cell output
# short but also hides deprecation and convergence warnings.
warnings.filterwarnings('ignore')
# Seed NumPy's global random number generator so NumPy-driven randomness repeats
# across runs.
np.random.seed(7)

In [None]:
def load_embeddings(path="data.csv", label_col="Label"):
    """
    Load the stress dataset into a pandas DataFrame and standardise its features.

    :param path: CSV file to read (defaults to ``"data.csv"``)
    :param label_col: name of the ground-truth label column; it is split off
        and returned unscaled (defaults to ``"Label"``)
    :return: tuple ``(stress_scaled, true_labels)`` - the feature columns after
        ``StandardScaler`` and the label column
    :raises KeyError: if ``label_col`` is not a column of the CSV
    """
    # loading data
    stress_raw = pd.read_csv(path)
    if label_col not in stress_raw.columns:
        raise KeyError(f"label column {label_col!r} not found in {path!r}")
    true_labels = stress_raw[label_col]
    stress = stress_raw.drop(columns=[label_col])

    # checking data shape
    row, col = stress.shape
    print(f'There are {row} rows and {col} columns')
    print(stress.head(10))

    # Scaling the data to keep the different attributes in same range.
    # A new frame is built (rather than assigning into a copy) so the result
    # keeps the original column names and index while ``stress`` stays untouched.
    stress_scaled = pd.DataFrame(
        StandardScaler().fit_transform(stress),
        columns=stress.columns,
        index=stress.index,
    )
    print(stress_scaled.describe())

    return stress_scaled, true_labels

In [None]:
def pca_embeddings(df_scaled):
    """Reduce the scaled stress data to two dimensions with PCA.

    The per-component and cumulative explained variance ratios are printed.

    :param df_scaled: scaled data
    :return: tuple of (2-component PCA projection of ``df_scaled``, the fitted
        PCA object, which callers can reuse when plotting)
    """
    reducer = PCA(n_components=2)
    projected = reducer.fit_transform(df_scaled)

    variance_ratios = reducer.explained_variance_ratio_
    print('Explained Variation per Principal Component: {}'.format(variance_ratios))

    cumulative_variance = np.sum(variance_ratios)
    print('Cumulative Variance Explained by 2 Principal Components: {:.2%}'.format(
        cumulative_variance))

    return projected, reducer


In [None]:
def kmean_hyper_param_tuning(data, candidate_clusters=(2, 3, 4)):
    """
    Pick the number of KMeans clusters with the highest silhouette score.

    A KMeans model is fitted on ``data`` for every candidate. The silhouette,
    Calinski-Harabasz and Davies-Bouldin scores are printed for each one, but
    only the silhouette score decides the winner. A bar chart of the silhouette
    scores is shown before returning.

    :param data: feature matrix to cluster (``main`` passes the scaled data)
    :param candidate_clusters: cluster counts to try (defaults to 2, 3 and 4);
        the silhouette score needs at least 2 clusters and fewer clusters than
        samples
    :return: best number of clusters for the model (used for KMeans n_clusters)
    :raises ValueError: if ``candidate_clusters`` is empty
    """
    # candidate values for our number of cluster
    parameters = list(candidate_clusters)
    if not parameters:
        raise ValueError("candidate_clusters must contain at least one value")

    # instantiating ParameterGrid, pass number of clusters as input
    parameter_grid = ParameterGrid({'n_clusters': parameters})

    best_score = -1
    best_grid = None            # stays None until the first candidate is scored
    kmeans_model = KMeans()     # instantiating KMeans model
    silhouette_scores = []

    # evaluation based on silhouette_score
    for p in parameter_grid:
        kmeans_model.set_params(**p)    # set current hyper parameter
        kmeans_model.fit(data)
        labels = kmeans_model.labels_

        ss = metrics.silhouette_score(data, labels)   # calculate silhouette_score
        calinski_harabaz_score = metrics.calinski_harabasz_score(data, labels)
        davies_bouldin_score = metrics.davies_bouldin_score(data, labels)
        silhouette_scores.append(ss)    # store all the scores

        print('Parameter:', p, 'Score', ss)
        print('Parameter:', p, 'Calinski Harabaz Score:', calinski_harabaz_score)
        print('Parameter:', p, 'Davies Bouldin Score:', davies_bouldin_score)
        print("  ")

        # check p which has the best score; the first candidate always counts,
        # so a result exists even if no score rises above the initial -1
        if best_grid is None or ss > best_score:
            best_score = ss
            best_grid = p

    # plotting silhouette score, one bar per candidate in the order given
    positions = range(len(silhouette_scores))
    plt.bar(positions, silhouette_scores, align='center', color='#722f59', width=0.5)
    plt.xticks(positions, parameters)
    plt.title('Silhouette Score', fontweight='bold')
    plt.xlabel('Number of Clusters')
    plt.show()

    return best_grid['n_clusters']


In [None]:
def visualizing_results(pca_result, label, centroids_pca):
    """ Visualizing the clusters in the plane of the first two PCA components.

    :param pca_result: PCA applied data; columns 0 and 1 are used as x and y
    :param label: K Means labels, used to colour each sample
    :param centroids_pca: PCA format K Means centroids; columns 0 and 1 are plotted
    """
    # ------------------ Using Matplotlib for plotting-----------------------
    _, ax = plt.subplots()

    # plot different colors per cluster
    ax.scatter(pca_result[:, 0], pca_result[:, 1], c=label, alpha=0.5, s=200)
    ax.set_title('Psychological State Clusters')
    ax.set_xlabel('PCA 1')
    ax.set_ylabel('PCA 2')

    # Centroids drawn on top as red crosses with a black outline. The line width
    # is given once: the original also passed the ``lw`` alias with the same value.
    ax.scatter(centroids_pca[:, 0], centroids_pca[:, 1], marker='X', s=200,
               linewidths=1.5, color='red', edgecolors="black")

    plt.show()



In [None]:
def cluster_accuracy(true_labels, cluster_labels):
    """Accuracy of a clustering after optimally matching clusters to classes.

    Cluster ids are arbitrary, so they cannot be compared with class labels
    directly. Each cluster is paired with at most one class (and vice versa) so
    that the number of agreeing samples is maximal; the result is that count
    divided by the number of samples.

    :param true_labels: ground-truth class label per sample (any hashable,
        sortable values)
    :param cluster_labels: cluster id per sample, same length as ``true_labels``
    :return: matched accuracy as a float between 0 and 1
    :raises ValueError: if the inputs are empty or differ in length
    """
    true_arr = np.asarray(true_labels).ravel()
    cluster_arr = np.asarray(cluster_labels).ravel()
    if true_arr.size != cluster_arr.size:
        raise ValueError("true_labels and cluster_labels must have the same length")
    if true_arr.size == 0:
        raise ValueError("at least one sample is required")

    # Re-encode both label sets as 0..k-1 so they can index a contingency table.
    classes, class_idx = np.unique(true_arr, return_inverse=True)
    clusters, cluster_idx = np.unique(cluster_arr, return_inverse=True)

    # contingency[i, j] = number of samples in cluster i that belong to class j
    contingency = np.zeros((clusters.size, classes.size), dtype=np.int64)
    np.add.at(contingency, (cluster_idx, class_idx), 1)

    # One-to-one cluster/class pairing with the largest total overlap.
    rows, cols = linear_sum_assignment(contingency, maximize=True)
    return float(contingency[rows, cols].sum() / true_arr.size)


def main():
    """Run the pipeline: load, reduce with PCA, tune and fit KMeans, plot, score.

    KMeans is tuned and fitted on the scaled data; PCA is only used to project
    the samples and the fitted centroids to 2-D for the scatter plot.
    """
    print("1. Loading Stress dataset")
    data_scaled, true_labels = load_embeddings()

    print("2. Reducing via PCA")
    pca_result, pca_2 = pca_embeddings(data_scaled)

    print("3. HyperTuning the Parameter for KMeans")
    optimum_num_clusters = kmean_hyper_param_tuning(data_scaled)
    print("optimum num of clusters =", optimum_num_clusters)

    # fitting KMeans
    kmeans = KMeans(n_clusters=optimum_num_clusters)
    kmeans.fit(data_scaled)
    centroids = kmeans.cluster_centers_
    # centroids live in the scaled feature space; project them for plotting
    centroids_pca = pca_2.transform(centroids)

    print("4. Visualizing the data")
    visualizing_results(pca_result, kmeans.labels_, centroids_pca)

    print("Normalized Mutual Info Score(NMI): {}".format(metrics.normalized_mutual_info_score(true_labels, kmeans.labels_)))
    print("Adjusted Rand Score(ARI): {}".format(metrics.adjusted_rand_score(true_labels, kmeans.labels_)))
    # NMI and ARI ignore how clusters are numbered, plain accuracy does not, so
    # clusters are matched to classes before counting agreements.
    print("Accuracy: {}".format(cluster_accuracy(true_labels, kmeans.labels_)))

In [None]:
# Run the whole pipeline only when this code executes as the top-level module.
if __name__ == "__main__":
    main()