# Sentence Encoding and Clustering
This file processes feedback (or individual feedback sentences) after sentiment analysis.
It focuses on the negative feedback and tries to extract more useful information from it.

## Load sentiment processed feedback files
The purpose of clustering is to properly classify problems highlighted by the users on Windows Update. 
We should only look at comments where people are stating problems. 

In [None]:
import pandas as pd
import numpy as np

# CSV of feedback sentences to cluster (the negative ones, per the file name).
fileToProcess = "Negative_CleanedFeedbackSentences.csv"

dataset = pd.read_csv(fileToProcess)
# DataFrame.info() writes its summary itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
dataset.info()
# The sentences are read from the column whose header is the string '0'.
feedbackSentences = dataset['0'].values

## Sentence Encoding

The text is stored as strings and must first be encoded into uniform (fixed-length) vectors before the clustering step.
### ref: 
- https://towardsdatascience.com/an-intuitive-explanation-of-sentence-bert-1984d144a868
- https://www.sbert.net/docs/installation.html
- https://www.sbert.net/docs/package_reference/SentenceTransformer.html
- https://www.sbert.net/docs/pretrained_models.html#model-overview

In [None]:
from sentence_transformers import SentenceTransformer, util
# Pre-trained encoder; see the model overview at
# https://www.sbert.net/docs/pretrained_models.html#model-overview
encoderName = 'all-MiniLM-L6-v2'
model = SentenceTransformer(encoderName)

# Encode every feedback sentence into a normalized embedding tensor.
feedbackEmbeddings = model.encode(
    feedbackSentences,
    normalize_embeddings=True,
    convert_to_tensor=True,
)

In [None]:
def PairsToCompareEncoding(i_feedbackEmbeddings, i_feedbackSentences, rangeLimit, printLimit):
    """Print the least and most similar sentence pairs by cosine similarity.

    Every pair (i, j) with i < j <= rangeLimit is scored with util.cos_sim,
    the pairs are ranked from most to least similar, and the bottom and top
    printLimit pairs are printed.

    Args:
        i_feedbackEmbeddings: Sentence embeddings accepted by util.cos_sim;
            row k is assumed to be the embedding of i_feedbackSentences[k].
        i_feedbackSentences: Sentences, indexable by integer position.
        rangeLimit: Highest sentence index to include in the comparison. It
            is capped at the last available sentence so that short inputs
            do not raise IndexError.
        printLimit: Number of pairs to print in each section; values below
            one print only the section headers.

    Returns:
        None. The results are written to standard output.
    """
    # Pairwise similarity of every embedding against every other embedding.
    cosine_scores = util.cos_sim(i_feedbackEmbeddings, i_feedbackEmbeddings)

    # Honour rangeLimit, but never index past the data that actually exists.
    lastIndex = min(rangeLimit, len(i_feedbackSentences) - 1, len(cosine_scores) - 1)

    # Each unordered pair is listed once (i < j); scores become plain floats
    # so that sorting and formatting do not depend on the matrix type.
    pairs = [
        {'index': [i, j], 'score': float(cosine_scores[i][j])}
        for i in range(lastIndex)
        for j in range(i + 1, lastIndex + 1)
    ]
    # Most similar first.
    pairs.sort(key=lambda pair: pair['score'], reverse=True)

    # Explicit bounds: a plain pairs[-printLimit:] returns the whole list
    # when printLimit is 0.
    shown = max(printLimit, 0)
    leastSimilar = pairs[max(len(pairs) - shown, 0):]
    mostSimilar = pairs[:shown]

    print("Non-related--------------------------------------------------------------------------------------------")
    for pair in leastSimilar:
        i, j = pair['index']
        print("{} \t\t {} \t\t Score: {:.4f}".format(i_feedbackSentences[i], i_feedbackSentences[j], pair['score']))
    print("Related-------------------------------------------------------------------------------------------------")
    for pair in mostSimilar:
        i, j = pair['index']
        print("{} \t\t {} \t\t Score: {:.4f}".format(i_feedbackSentences[i], i_feedbackSentences[j], pair['score']))


In [None]:
# Print the 20 least similar and the 20 most similar sentence pairs.
PairsToCompareEncoding(
    feedbackEmbeddings,
    feedbackSentences,
    rangeLimit=100,
    printLimit=20,
)

## K-means Clustering
- K-means is good for roughly dividing the data into equal parts.
- By nature, the feedback here is uneven: people may complain about some things more than others.
- Below is sample code showing how to run k-means clustering.
- With this method, we might be able to separate general comments from detailed comments.
- ref: https://en.wikipedia.org/wiki/Cluster_analysis

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Move the embeddings to host memory before converting: Tensor.numpy() only
# works on CPU tensors, and the encoder may have produced them on a GPU.
feedbackEmbeddingsValue = feedbackEmbeddings.cpu().numpy()

# Elbow method: fit K-means for a range of cluster counts and record the
# within-cluster sum of squares (WCSS, KMeans.inertia_) of each fit.
wcss_list = []

maxClusterTry = 15
# range() excludes its upper bound, so k runs from 1 to maxClusterTry - 1.
clusterCounts = range(1, maxClusterTry)
for i in clusterCounts:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=1)
    kmeans.fit(feedbackEmbeddingsValue)
    wcss_list.append(kmeans.inertia_)

# Plot WCSS against k; the "elbow" of the curve suggests a cluster count.
plt.plot(clusterCounts, wcss_list)
plt.title('The Elbow Method Graph')
plt.xlabel('Number of clusters(k)')
plt.ylabel('wcss_list')
plt.show()

In [None]:
# Fit the final K-means model with the chosen number of clusters.
numCluster = 10
kmeans = KMeans(
    n_clusters=numCluster,
    init='k-means++',
    random_state=1,
)
# labels_ holds the cluster index assigned to each embedding during fit().
y_predict = kmeans.fit(feedbackEmbeddingsValue).labels_

In [None]:
# Tally how many sentences were assigned to each cluster label.
unique, counts = np.unique(y_predict, return_counts=True)
# One row per cluster: [label, count]. Left as the last expression so the
# notebook displays it.
labelCounts = np.asarray((unique, counts)).transpose()
labelCounts


In [None]:
# Write every sentence together with its assigned cluster label to a CSV
# file named after the number of clusters.
outputFile = "cluster{}.csv".format(numCluster)
clusteredData = pd.DataFrame(
    {'text': feedbackSentences, 'clusterLabel': y_predict},
    columns=['text', 'clusterLabel'],
)
clusteredData.to_csv(outputFile, index=False)

# Visualization with Word Cloud

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
def plotWordCloud(text):
    """Render a word cloud of `text` with matplotlib.

    The stopword set passed to WordCloud is the library's STOPWORDS plus
    the extra terms listed below.

    Args:
        text: A single string containing all the words to visualise.
    """
    # Extra terms to exclude, in addition to the library's STOPWORDS.
    domainWords = [
        "update", "updates", "upgraded", "upgrades", "windows", "window",
        "microsoft", "computer", "win", "upgrade", "PC", "will", "take",
        "even", "work", "think", "laptop", "use",
    ]
    excluded = set(STOPWORDS).union(domainWords)

    # Build the cloud on a white background.
    cloud = WordCloud(background_color="white", stopwords=excluded)
    cloud.generate(text)

    # Show the image without axes.
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

In [None]:
# Draw one word cloud per cluster from the sentences assigned to it.
for clusterId in range(numCluster):
    members = feedbackSentences[y_predict == clusterId]
    plotWordCloud(" ".join(members))
