### Importing section

In [3]:
import os
import string
from nltk import tokenize
from gensim.models import Word2Vec
from nltk import word_tokenize
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
import pandas as pd
import gensim
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

In [None]:
embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [5]:
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'A man is eating pasta.',
          'The girl is carrying a baby.',
          'The baby is carried by the woman',
          'A man is riding a horse.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.',
          'A cheetah is running behind its prey.',
          'A cheetah chases prey on across a field.'
          ]

In [6]:
corpus_embeddings = embedder.encode(corpus)

In [7]:
num_clusters = 5
# Fix the seed so the centroid initialisation, and therefore the cluster
# numbering printed further down, is identical on every Restart & Run All.
clustering_model = KMeans(n_clusters=num_clusters, random_state=42)
clustering_model.fit(corpus_embeddings)
# labels_[i] is the cluster index assigned to the i-th row of corpus_embeddings.
cluster_assignment = clustering_model.labels_

In [8]:
# One bucket per cluster; bucket k collects the sentences labelled k.
clustered_sentences = [[] for _bucket in range(num_clusters)]
for sentence, cluster_id in zip(corpus, cluster_assignment):
    clustered_sentences[cluster_id].append(sentence)

In [None]:
# Show each bucket under a 1-based cluster heading, followed by a blank line.
for cluster_number, members in enumerate(clustered_sentences, start=1):
    print("Cluster ", cluster_number)
    print(members)
    print("")

### Reading the data from the folder and cleaning it.

In [4]:
# Folder holding the essay text files; kept in one place so the listing and
# the open() call cannot drift apart.
essays_dir = r"D:\UDE\6th Semester\MEMS\MEWS Data\MEWS_Essays\MEWS_Essays\Essays_all\TelevisonMergedT1+T2"

sentences = []
for filename in os.listdir(essays_dir):
    # "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark. Relying on
    # the platform default encoding is what turned the BOM into "ï»¿".
    # errors="replace" keeps the load best-effort if a file is not valid UTF-8.
    # NOTE(review): assumes the essays are UTF-8 encoded — confirm for the corpus.
    with open(os.path.join(essays_dir, filename), encoding="utf-8-sig", errors="replace") as f:
        text = f.read()
    # Safety net for text in which the mis-decoded BOM was already saved literally.
    text = text.replace("ï»¿", "")
    sentences.extend(tokenize.sent_tokenize(text))

# Build the stop-word lookup once; calling stopwords.words() inside the
# comprehension re-evaluated the whole list for every single token.
english_stopwords = set(stopwords.words('english'))

tokensSentenceslist = []
for s in sentences:
    # simple_preprocess lowercases, strips punctuation and tokenises.
    wordsList = gensim.utils.simple_preprocess(s)
    filtered_words = [word for word in wordsList if word not in english_stopwords]
    tokensSentenceslist.append(filtered_words)


##################### Uncomment below section for testing #########################
# print(len(sentences))
#
# for s in sentences:
#      print("The sentence is : ")
#      print(s)
#      print("-----------------------End of the sentence -------------")
#
# print (sentences)


# print (len(tokensSentenceslist))
# print (tokensSentenceslist)


In [None]:
print (tokensSentenceslist[0])

In [None]:
testSentences = gensim.utils.simple_preprocess("This is just a test and i don't know")

print (testSentences)

In [None]:
print (len(sentences))
print(sentences[0:5])


### Generating the Word2Vec Model

In [None]:
#model = Word2Vec(tokensSentenceslist, min_count=1)

#model = Word2Vec(tokensSentenceslist, vector_size=50, min_count=1, sg=1)
#model = Word2Vec(sentences=tokensSentenceslist, vector_size=100, workers=1, seed=42)

# Configure the model without a corpus so vocabulary building and training
# can be run as two explicit steps below.
# NOTE(review): gensim documents that `seed` alone does not give a fully
# reproducible run when more than one worker thread is used (workers=6 here);
# confirm whether run-to-run variation in the embeddings is acceptable.
model = Word2Vec(window=10, min_count=2,workers=6,vector_size=100,seed=42,sg=0)
# First pass over the tokenised sentences: build the vocabulary.
model.build_vocab(tokensSentenceslist, progress_per=1000)
# Second pass: train, using the example count and epoch count stored on the model.
model.train(tokensSentenceslist, total_examples=model.corpus_count, epochs=model.epochs)


##################### Uncomment below section for testing #########################


# print(list(model.wv.index_to_key))
# print(len(list(model.wv.index_to_key)))

In [None]:
model.corpus_count
#model.epochs

In [None]:
#model.wv.most_similar("television")
model.wv.most_similar("argument")
#model.wv.similarity("tv","television")

### Vectorizing each sentence using the average of the word embeddings of its words

In [6]:
def vectorize(list_of_docs, model, strategy):
    """Generate one vector per document using a word embedding.

    Each document is a list of tokens. Tokens missing from the embedding are
    ignored; a document with no known token is represented by a zero vector.

    Args:
        list_of_docs: List of documents (each a list of tokens).
        model: Gensim word embedding. Either an object exposing the vectors
            under ``.wv`` or the keyed vectors themselves; it must provide
            ``vector_size``.
        strategy: Aggregation strategy, "average" or "min-max". "min-max"
            concatenates the element-wise minimum and maximum, so its output
            is twice ``vector_size`` long.

    Raises:
        ValueError: If the strategy is other than "average" or "min-max".

    Returns:
        List of vectors, one per document, in input order.
    """
    # Validate up front: previously the check was only reached for documents
    # with at least one known token, so a bad strategy could pass silently.
    if strategy not in ("average", "min-max"):
        raise ValueError(f"Aggregation strategy {strategy} does not exist!")

    # Accept either a full model (vectors live under .wv) or bare keyed vectors.
    embedding_dict = model.wv if hasattr(model, "wv") else model
    size_output = model.vector_size * 2 if strategy == "min-max" else model.vector_size

    features = []
    for tokens in list_of_docs:
        vectors = [embedding_dict[token] for token in tokens if token in embedding_dict]
        if not vectors:
            # No in-vocabulary token: fall back to an all-zero vector.
            features.append(np.zeros(size_output))
            continue

        vectors = np.asarray(vectors)
        if strategy == "min-max":
            features.append(np.concatenate((vectors.min(axis=0), vectors.max(axis=0))))
        else:
            features.append(vectors.mean(axis=0))
    return features

### Apply the function above

In [7]:
vectorized_docs = vectorize(tokensSentenceslist, model=model, strategy="average")

Test

In [None]:
print(len(vectorized_docs), len(vectorized_docs[0]))
print(model.wv["argument"])

print("#######################################################")
print(vectorized_docs[0])

### Kmeans algorithm with mini batch

In [8]:
def mbkmeans_clusters(X, k, mb=500, print_silhouette_values=False, random_state=None):
    """Fit mini-batch k-means on X and print quality metrics.

    Prints the number of clusters, the overall silhouette coefficient and the
    inertia. Optionally prints per-cluster silhouette statistics, ordered by
    descending average silhouette.

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches. Defaults to 500.
        print_silhouette_values: Print silhouette values per cluster.
        random_state: Seed forwarded to MiniBatchKMeans. The default (None)
            keeps the previous unseeded behaviour; pass an int for
            reproducible clusters.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb, random_state=random_state).fit(X)
    labels = km.labels_
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, labels):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, labels)
        print("Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[labels == i]
            if cluster_silhouette_values.size == 0:
                # .min()/.max() raise on an empty array, so report the empty
                # cluster explicitly instead of crashing the whole summary.
                print(f"    Cluster {i}: Size:0 (no samples assigned)")
                continue
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        # Best-separated clusters (highest average silhouette) first.
        silhouette_values.sort(key=lambda tup: tup[2], reverse=True)
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, labels

### Applying the Kmeans algorithm

In [None]:
clustering, cluster_labels = mbkmeans_clusters(
    X=vectorized_docs,
    k=19,
    print_silhouette_values=True,
)

# One row per sentence: the raw text, its space-joined tokens, and its cluster label.
df_clusters = pd.DataFrame(
    {
        "text": sentences,
        "tokens": list(map(" ".join, tokensSentenceslist)),
        "cluster": cluster_labels,
    }
)

### Evaluate top terms of the cluster

In [None]:
print("Top terms per cluster (based on centroids):")
# Walk over every fitted centroid rather than a hard-coded count, so the
# report always covers exactly the clusters the model was trained with.
for i, centroid in enumerate(clustering.cluster_centers_):
    # Vocabulary words whose embeddings are closest to the centroid.
    most_representative = model.wv.most_similar(positive=[centroid], topn=10)
    tokens_per_cluster = "".join(f"{term} " for term, _score in most_representative)
    print(f"Cluster {i}: {tokens_per_cluster}")

In [None]:
from collections import Counter

# Cover every fitted cluster instead of a hard-coded range(10).
for i in range(len(clustering.cluster_centers_)):
    # Concatenate the token strings of the cluster's rows and count the words.
    cluster_tokens = " ".join(df_clusters.query(f"cluster == {i}")["tokens"]).split()
    most_frequent = Counter(cluster_tokens).most_common(5)
    tokens_per_cluster = "".join(f"{token}({count}) " for token, count in most_frequent)
    print(f"Cluster {i}: {tokens_per_cluster}")

### Retrieve a random sample of documents for a given cluster


In [None]:
cluster_rows = df_clusters.query(f"cluster == {0}")
# DataFrame.sample raises ValueError when asked for more rows than exist
# (without replacement), so cap the sample at the cluster size.
for _row_label, row in cluster_rows.sample(min(10, len(cluster_rows))).iterrows():
    print(row["text"])
    print("-------------")

In [10]:
df_clusters.to_excel('clusteredArgument- Word2Vec - k=19.xlsx')

In [None]:
df_clusters.shape

# df_clusters.tokens[0]
# df_clusters.head()

### Most representative clusters

In [None]:
test_cluster = 0
# Euclidean distance from every document vector to the chosen centroid.
distances_to_centroid = np.linalg.norm(
    vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1
)
# Document indices ordered from closest to farthest.
most_representative_docs = np.argsort(distances_to_centroid)
for doc_index in most_representative_docs[:10]:
    print(doc_index)
    print(sentences[doc_index])
    print("-------------")

In [None]:
# Derive the cluster count from the fitted model instead of repeating the
# literal 19 in the loop and in both file names.
n_clusters = len(clustering.cluster_centers_)
top_n = 3  # sentences kept per cluster

# Convert once; the list of vectors was re-converted on every iteration before.
doc_matrix = np.asarray(vectorized_docs)

# Collect plain records and build each DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
representative_rows = []
count_rows = []
for i, centroid in enumerate(clustering.cluster_centers_):
    # Document indices ordered by distance to this centroid, closest first.
    ranked_docs = np.argsort(np.linalg.norm(doc_matrix - centroid, axis=1))
    queryMostCount = int((df_clusters["cluster"] == i).sum())
    count_rows.append({"cluster": i, "Sentences Count": queryMostCount})
    # Keep at most top_n sentences, and never more than the cluster contains.
    for d in ranked_docs[:min(top_n, queryMostCount)]:
        representative_rows.append({"text": sentences[d], "cluster": i})

df_mostRepresentative = pd.DataFrame(representative_rows, columns=["text", "cluster"])
df_ClsuteringSentencesCount = pd.DataFrame(count_rows, columns=["cluster", "Sentences Count"])

# NOTE(review): the file names say "Sbert" and "10MostRepresentative" although
# the vectors come from Word2Vec and only top_n sentences are stored. They are
# left unchanged in case downstream files depend on them; confirm and rename.
df_mostRepresentative.to_csv(f"SbertClustering-All-Essays- {n_clusters} -Clusters - 10MostRepresentative.csv")
df_ClsuteringSentencesCount.to_csv(f"SbertClustering-All-Essays- {n_clusters} - Clusters - ClusteringCount.csv")

In [None]:
#array = clustering.cluster_centers_[0]
#print(len(vectorized_docs))
#print(array)
array = vectorized_docs[120].reshape(1,-1)
convertedArray = array.astype(float)
# vectorized_docs[i] = sentences[i] it is the same
clustering.predict(convertedArray)

In [None]:
#print(array.reshape(1,-1))

print(array)

print(convertedArray)

### Predicting new clusters for testing

In [None]:
## testing
def vectorizeSentenceTest(sentences):
    """Tokenise raw sentences the same way the training corpus was tokenised.

    The Word2Vec vocabulary in this notebook is built from
    ``gensim.utils.simple_preprocess`` output (lowercased, punctuation
    removed). Tokenising new text with ``word_tokenize`` kept capitals and
    punctuation, so words such as "Buying" could never match the vocabulary.

    Args:
        sentences: Iterable of raw sentence strings.

    Returns:
        List with one list of tokens per input sentence, in input order.
    """
    return [gensim.utils.simple_preprocess(s) for s in sentences]

testTokens = vectorizeSentenceTest(["Buying some products can be very expensive"])

vectorized_docs_tesing = vectorize(testTokens, model=model, strategy="average")

def predictTest(vectorizedDocsTest):
    """Print the cluster index predicted for each given document vector.

    Args:
        vectorizedDocsTest: Sequence of document vectors, e.g. the output of
            ``vectorize``.

    Returns:
        None. The predictions are printed.
    """
    # Cast to the dtype of the fitted centroids, mirroring the explicit
    # astype() done elsewhere in this notebook before calling predict().
    array = np.asarray(vectorizedDocsTest, dtype=clustering.cluster_centers_.dtype)
    print(clustering.predict(array))
#len(vectorized_docs_tesing), len(vectorized_docs_tesing[0])
#print(vectorized_docs_tesing)

predictTest(vectorized_docs_tesing)

In [None]:
# The built-in type() is enough to show the container's class name; importing
# Cython just for this added an unnecessary third-party dependency.
print(type(vectorized_docs).__name__)

print(most_representative_docs.shape)

In [None]:
# A document "is" the centroid only if its whole vector matches. Compare
# row-wise with a floating-point tolerance; element-wise exact == reported
# every single coordinate that happened to coincide.
doc_matrix = np.asarray(vectorized_docs)
row_matches = np.isclose(doc_matrix, clustering.cluster_centers_[0]).all(axis=1)
itemindex = np.where(row_matches)

print(itemindex)

In [25]:
testDocs = np.sort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[0], axis=1)
)

In [None]:
print(vectorized_docs[16035])

### Testing whether the centroids are sentences or not

In [None]:
# Row-wise, tolerance-based comparison: a document counts as a match only if
# every coordinate of its vector is (numerically) equal to the centroid's.
doc_matrix = np.asarray(vectorized_docs)
row_matches = np.isclose(doc_matrix, clustering.cluster_centers_[9]).all(axis=1)
itemindex = np.where(row_matches)

# Indices of the documents whose vector coincides with the centroid (empty if none).
print(itemindex[0])