### importing necessary libs

In [71]:
import os
import string
from nltk import tokenize
from gensim.models import Word2Vec
from nltk import word_tokenize
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
import pandas as pd
import gensim
from nltk.corpus import stopwords
import sentence_transformers
from sentence_transformers import SentenceTransformer,util
from sklearn.cluster import KMeans

### defining the model with the pretrainded data

In [72]:
model = SentenceTransformer('all-mpnet-base-v2', device="cuda")
# embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')

### Adding only 100 Essays for embeddings

In [73]:
sentences = []
for filename in os.listdir(r"D:\UDE\6th Semester\MEMS\MEWS Data\MEWS_Essays\MEWS_Essays\Essays_all\TelevisonMergedT1+T2"):
   with open(os.path.join(r"D:\UDE\6th Semester\MEMS\MEWS Data\MEWS_Essays\MEWS_Essays\Essays_all\TelevisonMergedT1+T2", filename)) as f:
       text = f.read()
       text = text.replace("ï»¿","")
       sents = tokenize.sent_tokenize(text)
       for s in sents:
           #s = s.lower()
           #s = s.translate(str.maketrans('', '', string.punctuation))
           sentences.append(s)


### Getting the embeddings

In [74]:
corpus_embeddings = model.encode(sentences, show_progress_bar =True, device="cuda")

corpus_embeddings.shape

Batches:   0%|          | 0/1210 [00:00<?, ?it/s]

(38715, 768)

### Testing torch with cuda

In [75]:
def mbkmeans_clusters(X, k, mb=500, print_silhouette_values=False):
    """Generate clusters.

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches. Defaults to 500.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

### applying the clustering method

In [88]:
k = 100

clustering, cluster_labels = mbkmeans_clusters(X=corpus_embeddings, k=k, print_silhouette_values=True)

df_clusters = pd.DataFrame({
    "text": sentences,
    "cluster": cluster_labels
})

For n_clusters = 100
Silhouette coefficient: 0.02
Inertia:18140.359375
Silhouette values:
    Cluster 0: Size:162 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 60: Size:163 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 67: Size:141 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 92: Size:144 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 4: Size:162 | Avg:1.00 | Min:1.00 | Max: 1.00
    Cluster 74: Size:162 | Avg:0.85 | Min:0.09 | Max: 0.92
    Cluster 30: Size:10 | Avg:0.43 | Min:-0.07 | Max: 0.64
    Cluster 99: Size:216 | Avg:0.28 | Min:0.07 | Max: 0.43
    Cluster 45: Size:230 | Avg:0.14 | Min:0.01 | Max: 0.33
    Cluster 14: Size:20 | Avg:0.12 | Min:-0.08 | Max: 0.26
    Cluster 47: Size:328 | Avg:0.10 | Min:-0.25 | Max: 0.24
    Cluster 61: Size:77 | Avg:0.10 | Min:-0.06 | Max: 0.22
    Cluster 65: Size:170 | Avg:0.08 | Min:-0.07 | Max: 0.18
    Cluster 17: Size:398 | Avg:0.08 | Min:-0.14 | Max: 0.20
    Cluster 23: Size:253 | Avg:0.07 | Min:-0.05 | Max: 0.16
    Cluster 34: Size:24

### random statments from cluster

In [21]:
test_cluster = 0
queryCount = len(df_clusters.query(f"cluster == {test_cluster}"))

print(queryCount)

if (queryCount > 10):
    for i,t in enumerate(df_clusters.query(f"cluster == {test_cluster}").sample(10).iterrows()):
        print(t[1]["text"])
        print("-------------")
else:
    for i,t in enumerate(df_clusters.query(f"cluster == {test_cluster}").sample(queryCount).iterrows()):
        print(t[1]["text"])
        print("-------------")


37
So without advertising the movie is not so long and the kids can play outside.
-------------
The most children are not interested in this advertsing and do not watch them, so that it is unnessecary to produce television advertising directed to young children, then they do not see them.
-------------
Nowadays it is common, that even little children spend some time in front of the TV and since there has to be some sort of advertising between two shows that are made for children, it wouldn't be too clever to put something in there, that isn't directed toward children as well.
-------------
They can only watch it and cannot understand, what is shown in the advertisement.
-------------
So if their parents are sitting right beside them and are also looking at the stuff their children watch, the tv can be switched off when an ad comes and parents can protect the kids from unsuitable content or advertising.
-------------
I think that Children in the age of two to five are too young to under

### Most representative clusters

In [22]:
#test_cluster = 0
most_representative_docs = np.argsort(
    np.linalg.norm(corpus_embeddings - clustering.cluster_centers_[test_cluster], axis=1)
)
# print(most_representative_docs[0])

if (queryCount > 10):
    for d in most_representative_docs[:10]:
        print(d)
        print(sentences[d])
        print("-------------")
else:
    for d in most_representative_docs[:queryCount]:
        print(d)
        print(sentences[d])
        print("-------------")


7
The most children are not interested in this advertsing and do not watch them, so that it is unnessecary to produce television advertising directed to young children, then they do not see them.
-------------
589
But I think children are very easy to manipulate, so on some tv channels for children there isnÂ´t television advertising.
-------------
1145
The reason for that is that the television advertise directly toward young children.
-------------
299
The Children probably don't even understand the television advertisment, but the parents like the advertisment more when it is a well made advertisment.
-------------
563
Advertising on channels who are made for kids are something else - the way the product is getting presented will be a lot more interesting for a kid than for an adult.
-------------
343
But in my opinion they should keep the amount of advertisements they show between kids TV channels to a minimum and they should also just play the advertisement at the end of a show so

### Saving to .csv File

In [89]:
df_clusters.to_csv(f"SbertClustering-All-Essays- {k} -Clusters.csv")

### saving most representative to csv file

In [90]:
df_mostRepresentative = pd.DataFrame({})
df_ClsuteringSentencesCount = pd.DataFrame({})

for i in range (k) :
    listSentencesMostRep =[]
    most_representative_docs_Save = np.argsort(
    np.linalg.norm(corpus_embeddings - clustering.cluster_centers_[i], axis=1)
    )
    queryMostCount = len(df_clusters.query(f"cluster == {i}"))
    df_ClsuteringSentencesCount = df_ClsuteringSentencesCount.append({
       "cluster" : i,
        "Sentences Count": queryMostCount
    },ignore_index=True)
    if (queryMostCount > 10):
        for d in most_representative_docs_Save[:10]:
            listSentencesMostRep.append(sentences[d])
    else:
        for d in most_representative_docs_Save[:queryMostCount]:
            listSentencesMostRep.append(sentences[d])

    for sent in listSentencesMostRep:
        df_mostRepresentative = df_mostRepresentative.append({
         "text": sent,
         "cluster": i
         }, ignore_index=True)



df_mostRepresentative.to_csv(f"SbertClustering-All-Essays- {k} -Clusters - 10MostRepresentative.csv")
df_ClsuteringSentencesCount.to_csv(f"SbertClustering-All-Essays- {k} - Clusters - ClusteringCount.csv")

  df_ClsuteringSentencesCount = df_ClsuteringSentencesCount.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_ClsuteringSentencesCount = df_ClsuteringSentencesCount.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_mostRepresentative = df_mostRepresentative.append({
  df_mo

### Test

In [None]:
listTest = ["Ahmed" , "mido", "test test"]
listNumbers = [1 , 10, 15]
df_mostRepresentative = pd.DataFrame({
})

df_mostRepresentative = df_mostRepresentative.append({
    "text": listTest,
    "cluster": listNumbers
}, ignore_index=True)

listTest = ["After" , "after mido", "test after"]
listNumbers = [2 , 3, 65]

df_mostRepresentative = df_mostRepresentative.append({
    "text": listTest,
    "cluster": listNumbers
}, ignore_index=True)

df_mostRepresentative.to_csv('test.csv')

### Testing torch with cuda


In [46]:
import torch

torch.cuda.is_available()

True

### Testing word_embeddings

In [None]:
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'A man is eating pasta.',
          'The girl is carrying a baby.',
          'The baby is carried by the woman',
          'A man is riding a horse.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.',
          'A cheetah is running behind its prey.',
          'A cheetah chases prey on across a field.'
          ]
corpus_embeddings = model.encode(corpus)

########################## TESTING #########################################################
#Compute cosine similarity between all pairs
cos_sim = util.cos_sim(corpus_embeddings, corpus_embeddings)

#Add all pairs to a list with their cosine similarity score
all_sentence_combinations = []
for i in range(len(cos_sim)-1):
    for j in range(i+1, len(cos_sim)):
        all_sentence_combinations.append([cos_sim[i][j], i, j])

#Sort list by the highest cosine similarity score
all_sentence_combinations = sorted(all_sentence_combinations, key=lambda x: x[0], reverse=True)

print("Top-5 most similar pairs:")
for score, i, j in all_sentence_combinations[0:5]:
    print("{} \t {} \t {:.4f}".format(corpus[i], corpus[j], cos_sim[i][j]))

for sentence, embedding in zip(corpus, corpus_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")