### Importing section

In [1]:
import os
import string
from nltk import tokenize
from gensim.models import Word2Vec
from nltk import word_tokenize
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
import pandas as pd
import gensim
from nltk.corpus import stopwords

In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

In [4]:
embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [5]:
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'A man is eating pasta.',
          'The girl is carrying a baby.',
          'The baby is carried by the woman',
          'A man is riding a horse.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.',
          'A cheetah is running behind its prey.',
          'A cheetah chases prey on across a field.'
          ]

In [6]:
corpus_embeddings = embedder.encode(corpus)

In [7]:
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

In [8]:
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

In [9]:
for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  1
['A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.']

Cluster  2
['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.']

Cluster  3
['A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.']

Cluster  4
['The girl is carrying a baby.', 'The baby is carried by the woman']

Cluster  5
['A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']



### Reading the data from the folder and cleaning it.

In [26]:
sentences = []
for filename in os.listdir(r"D:\UDE\6th Semester\MEMS\MEWS Data\MEWS_Essays\MEWS_Essays\Essays_all\TelevisonMergedT1+T2"):
   with open(os.path.join(r"D:\UDE\6th Semester\MEMS\MEWS Data\MEWS_Essays\MEWS_Essays\Essays_all\TelevisonMergedT1+T2", filename)) as f:
       text = f.read()
       text = text.replace("ï»¿","")
       sents = tokenize.sent_tokenize(text)
       for s in sents:
           #s = s.lower()
           #s = s.translate(str.maketrans('', '', string.punctuation))
           sentences.append(s)

tokensSentenceslist = []

for s  in sentences:
    wordsList = gensim.utils.simple_preprocess(s)
    filtered_words = [word for word in wordsList if word not in stopwords.words('english')]
    tokensSentenceslist.append(filtered_words)


##################### Uncomment below section for testing #########################
# print(len(sentences))
#
# for s in sentences:
#      print("The sentence is : ")
#      print(s)
#      print("-----------------------End of the sentence -------------")
#
# print (sentences)


# print (len(tokensSentenceslist))
# print (tokensSentenceslist)


In [27]:
print (tokensSentenceslist[0])

['advertising', 'television', 'becoming', 'clever']


In [28]:
testSentences = gensim.utils.simple_preprocess("This is just a test and i don't know")

print (testSentences)

['this', 'is', 'just', 'test', 'and', 'don', 'know']


In [29]:
print (len(sentences))
print(sentences[0:5])

38715
['Advertising in Television is becoming more and more clever.', 'Lots of those ads invade our subconsciousness and try to make us buy the product even if we do not actually need it.', "When it comes to children, lots of them don't have enough money to afford these products.", "They don't even understand the concept of money.", 'Even if they wanted it, they can not get the thing they were told they need.']



### Generating the Word2Vec Model

In [30]:
#model = Word2Vec(tokensSentenceslist, min_count=1)

#model = Word2Vec(tokensSentenceslist, vector_size=50, min_count=1, sg=1)
#model = Word2Vec(sentences=tokensSentenceslist, vector_size=100, workers=1, seed=42)

model = Word2Vec(window=10, min_count=2,workers=6,vector_size=100,seed=42,sg=0)
model.build_vocab(tokensSentenceslist, progress_per=1000)
model.train(tokensSentenceslist, total_examples=model.corpus_count, epochs=model.epochs)


##################### Uncomment below section for testing #########################


# print(list(model.wv.index_to_key))
# print(len(list(model.wv.index_to_key)))

(1313567, 1737390)

In [31]:
model.corpus_count
#model.epochs

38715

In [32]:
#model.wv.most_similar("television")
model.wv.most_similar("argument")
#model.wv.similarity("tv","television")

[('reason', 0.9420978426933289),
 ('point', 0.9409478902816772),
 ('side', 0.9202869534492493),
 ('agree', 0.8974602818489075),
 ('aspect', 0.8958882093429565),
 ('reasons', 0.8917336463928223),
 ('support', 0.8810402154922485),
 ('personally', 0.8794190287590027),
 ('disagree', 0.8751274943351746),
 ('points', 0.8721221089363098)]

### Vectorizing each sentence using the avg of the Word embidings of each word

In [33]:
def vectorize(list_of_docs, model, strategy):
    """Generate vectors for list of documents using a Word Emx`bedding.

    Args:
        list_of_docs: List of documents.
        model: Gensim Word Embedding.
        strategy: Aggregation strategy ("average", or "min-max".)

    Raises:
        ValueError: If the strategy is other than "average" or "min-max".

    Returns:
        List of vectors.
    """
    features = []
    size_output = model.vector_size
    embedding_dict = model

    if strategy == "min-max":
        size_output *= 2

    if hasattr(model, "wv"):
        embedding_dict = model.wv

    for tokens in list_of_docs:
        zero_vector = np.zeros(size_output)
        vectors = []
        for token in tokens:
            if token in embedding_dict:
                try:
                    vectors.append(embedding_dict[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            if strategy == "min-max":
                min_vec = vectors.min(axis=0)
                max_vec = vectors.max(axis=0)
                features.append(np.concatenate((min_vec, max_vec)))
            elif strategy == "average":
                avg_vec = vectors.mean(axis=0)
                features.append(avg_vec)
            else:
                raise ValueError(f"Aggregation strategy {strategy} does not exist!")
        else:
            features.append(zero_vector)
    return features

### Apply the function above

In [34]:
vectorized_docs = vectorize(tokensSentenceslist, model=model, strategy="average")

Test

In [35]:
print(len(vectorized_docs), len(vectorized_docs[0]))
print(model.wv["argument"])

print("#######################################################")
print(vectorized_docs[0])

38715 100
[ 0.14049838  0.26483944 -0.31178233  0.04889421 -0.4739972   0.7198708
 -0.45431072  1.288989   -0.8510209  -0.12554932  0.16687194  0.74362314
  0.53374034  0.13064659 -0.07579185 -0.48557985 -0.23639245 -1.2327642
 -0.43592197  0.48491108 -1.0318269  -0.54556245  0.5922161   0.38107365
  0.540607    0.5871719  -0.15847757 -0.05579238 -0.326022   -0.9108644
 -0.06940957  0.16453063 -0.01091491 -0.40957034  0.45949882 -0.33499575
  0.37234515 -0.44981864 -0.2568936   0.5194661   0.32487923 -0.8094511
  0.45209676 -0.0290043   0.12825406  0.5299665   0.19131848  0.63766295
  1.0680393  -0.7605646   0.39149797 -0.67647886 -0.9543225  -0.27332538
  0.5363619   0.27479282  0.1006263  -0.05415126 -0.6557303   0.17720665
 -0.5534491   0.67129415 -1.1589129   0.18394628 -0.16723034 -0.38107216
  0.28851694 -0.10897604 -1.6831535  -0.22472814  0.6801643  -0.63879627
 -0.02100011 -0.24930024  0.6857125   0.03476049 -1.1521822  -1.1853676
  0.18162669 -0.25959158  0.11647352 -0.735303

### Kmeans algorithm with mini batch

In [36]:
def mbkmeans_clusters(X, k, mb=500, print_silhouette_values=False):
    """Generate clusters.

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches. Defaults to 500.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

### Applying the Kmeans algorithm

In [37]:
clustering, cluster_labels = mbkmeans_clusters(X=vectorized_docs, k=10, print_silhouette_values=True)

df_clusters = pd.DataFrame({
    "text": sentences,
    "tokens": [" ".join(text) for text in tokensSentenceslist],
    "cluster": cluster_labels
})

For n_clusters = 10
Silhouette coefficient: 0.13
Inertia:102984.12434970376
Silhouette values:
    Cluster 8: Size:163 | Avg:1.00 | Min:0.85 | Max: 1.00
    Cluster 5: Size:492 | Avg:0.54 | Min:0.07 | Max: 0.59
    Cluster 7: Size:5798 | Avg:0.21 | Min:0.01 | Max: 0.44
    Cluster 0: Size:4777 | Avg:0.14 | Min:-0.13 | Max: 0.39
    Cluster 9: Size:6689 | Avg:0.11 | Min:-0.06 | Max: 0.32
    Cluster 2: Size:7661 | Avg:0.11 | Min:-0.08 | Max: 0.31
    Cluster 3: Size:2207 | Avg:0.11 | Min:-0.20 | Max: 0.33
    Cluster 4: Size:4894 | Avg:0.11 | Min:-0.11 | Max: 0.32
    Cluster 1: Size:2895 | Avg:0.09 | Min:-0.11 | Max: 0.31
    Cluster 6: Size:3139 | Avg:0.03 | Min:-0.18 | Max: 0.28


### Evaluate top terms of the cluster

In [38]:
print("Top terms per cluster (based on centroids):")
for i in range(10): # number of cluster k should be put here!!
    tokens_per_cluster = ""
    most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=10)
    #print(clustering.cluster_centers_[i])
    for t in most_representative:
        tokens_per_cluster += f"{t[0]} "
    print(f"Cluster {i}: {tokens_per_cluster}")

Top terms per cluster (based on centroids):
Cluster 0: anyway letting donâ okey sometimes childen shouldnâ put controll whatch 
Cluster 1: probably afford tell best mad advert disappoint want course sad 
Cluster 2: advertises childre firstly necessary difference however thats personaly affected desicide 
Cluster 3: allowed forbidden nutshell opinion schould childern difficult question tein allowd 
Cluster 4: childern schould personaly difficult shouldnâ nutshell oppinion sense adversting necessary 
Cluster 5: versuchen sie zeit stuffed shoes die hair wie fantasies app 
Cluster 6: addicted others thy nothing helpful behave doesnâ forget better healthy 
Cluster 7: partents possibility addition wont pretty wonâ wich worse thought simply 
Cluster 8: anhand wird ihr beurteilt qualitã carrot ten shouts boredom speaker 
Cluster 9: case course furthermore cannot useful realise anything worse actually mean 


In [39]:
from collections import Counter

for i in range(10):
    tokens_per_cluster = ""
    most_frequent = Counter(" ".join(df_clusters.query(f"cluster == {i}")["tokens"]).split()).most_common(5)
    for t in most_frequent:
        tokens_per_cluster += f"{t[0]}({str(t[1])}) "
    print(f"Cluster {i}: {tokens_per_cluster}")

Cluster 0: tv(2584) children(2273) watch(2252) television(1513) watching(988) 
Cluster 1: buy(1308) parents(1262) want(1101) toy(696) child(664) 
Cluster 2: children(3768) advertising(2152) young(1449) television(1434) think(927) 
Cluster 3: children(1625) young(1358) advertising(1345) television(1179) directed(1177) 
Cluster 4: children(3905) young(2465) advertising(2239) television(1856) directed(1048) 
Cluster 5: sasageyo(1092) die(654) zeit(325) sie(324) diese(177) 
Cluster 6: learn(966) children(964) things(634) play(530) life(528) 
Cluster 7: children(586) like(535) child(463) example(432) television(350) 
Cluster 8: ihr(163) text(163) wird(163) anhand(163) beurteilt(163) 
Cluster 9: children(3087) parents(2020) want(1391) get(1048) things(1042) 


### Retrieve a random sample of documents for a given cluster


In [40]:
for i,t in enumerate(df_clusters.query(f"cluster == {0}").sample(10).iterrows()):
    print(t[1]["text"])
    print("-------------")

The problem is that children from two to five arenÂ´t even able to read in this age so they start watching TV before they are able to read a book.
-------------
If they are constantly confronted with something they like they wonÂ´t stop watching it causing them to be addicted to television  which is bad for a young child that should grow.
-------------
So I think that it is important that the parents show their kids films which are appropriated too the childrens ages.
-------------
First of all, it could be a problem if the children don't watch television.
-------------
It also depends on what time it is.
-------------
Another problem, however is their increasing time in front of the TV.
-------------
It's all much more important than showing them how to turn on a tv.
-------------
Kids from young age on get used to a TV, a mobile phone and the computer.
-------------
In fact we have to  ask ourselves why a five year old children would watch that much tv that they see some advertisemen

In [18]:
df_clusters.to_csv('clusteredArgument.csv')

In [19]:
df_clusters.shape

# df_clusters.tokens[0]
# df_clusters.head()

(38715, 3)

### Most representative clusters

In [46]:
test_cluster = 0
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
)
# print(most_representative_docs[0])
for d in most_representative_docs[:10]:
    print(d)
    print(sentences[d])
    print("-------------")

23971
In my opinion it is not good to let the children watch TV everytime but the solution is not to just forbid it because lots of children like to watch TV and for example in the morning when everyone is still sleeping and when it is cold outside the children are glad to watch TV so they can pass a little time and it is also easier for the parents so they are able to sleep longer.
-------------
11348
There is no doubt that it isnÂ´t that good for young children to watch TV because they can get sick or they only want to do this all day long.
-------------
35281
However nowadays there are too many parents not being able to handle parenting and just letting their children watch tv all day.
-------------
36914
That means that parents have to know what their children watch and when series are on the TV specially produced for childrens learning.
-------------
17941
I think itÂ´s ok when children watch Tv, but the parent must controll the programs and the time.
-------------
31267
In this g

In [21]:
#array = clustering.cluster_centers_[0]
#print(len(vectorized_docs))
#print(array)
array = vectorized_docs[120].reshape(1,-1)
convertedArray = array.astype(float)
# vectorized_docs[i] = sentences[i] it is the same
clustering.predict(convertedArray)

array([6])

In [22]:
#print(array.reshape(1,-1))

print(array)

print(convertedArray)

[[ 0.16868031 -0.11518344  0.04375385 -0.16961181 -0.32910988  0.07519562
  -0.03473708  0.2881326  -0.22865133 -0.30180973 -0.2749814   0.05678155
   0.22902955  0.25514063 -0.08736596 -0.4990135  -0.0990077  -0.01215302
  -0.6290117  -0.21519752  0.07409988  0.7055213   0.22721244  0.12753491
  -0.03387943  0.01707247 -0.2618402   0.33965072 -0.17506336  0.1853242
  -0.40236542 -0.16700786  0.3402451  -0.11415518 -0.32711414 -0.12293297
   0.32198665 -0.3418258   0.10985515  0.17799467  0.2483626  -0.08238377
   0.2432443  -0.50477004  0.32689932 -0.09225568  0.0034604   0.32537356
   0.67596817 -0.65887624  0.18741415 -0.30047625 -0.18230766  0.08700434
   0.10143799  0.18505335 -0.13225521  0.24774665 -0.10696658 -0.04113882
  -0.32722214 -0.33064002 -0.24776226  0.09273881  0.3967111  -0.05888849
   0.14747654 -0.1701913  -0.14426146  0.06001846  0.2656059   0.10673421
  -0.04613359 -0.6488152   0.15446118 -0.22528869  0.26844132 -0.23360653
  -0.22219734 -0.25563705  0.04363551  

### Predicting new clusters for testing

In [45]:
## testing
def vectorizeSentenceTest(sentences):
    tokensSentenceslist = []
    for s  in sentences:
        wordsList = word_tokenize(s)
        tokensSentenceslist.append(wordsList)
    return tokensSentenceslist

testTokens = vectorizeSentenceTest(["Buying some products can be very expensive"])

vectorized_docs_tesing = vectorize(testTokens, model=model, strategy="average")

def predictTest(vectorizedDocsTest):
    array = vectorizedDocsTest
    print(clustering.predict(array))
    return
#len(vectorized_docs_tesing), len(vectorized_docs_tesing[0])
#print(vectorized_docs_tesing)

predictTest(vectorized_docs_tesing)

[1]


In [24]:
from Cython import typeof

print(typeof(vectorized_docs))

# print(vectorized_docs[0])
# print(vectorized_docs[0].shape)
# print ("############################################")
#
# print(clustering.cluster_centers_[0])
# print(clustering.cluster_centers_[0].shape)
print (most_representative_docs.shape)

list
(38715,)


In [24]:
itemindex = np.where(vectorized_docs == clustering.cluster_centers_[0])

print(itemindex)

(array([], dtype=int64), array([], dtype=int64))


In [25]:
testDocs = np.sort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[0], axis=1)
)

In [26]:
print(vectorized_docs[16035])

[-0.12176871 -0.79313385 -0.00996342 -0.24011901 -0.24725829  0.19357361
 -0.46350023  0.23181807 -0.21608464  0.12644906  0.04430683  0.75692064
 -0.07661071  1.2821914   0.3033127   0.27832875  0.19656463 -1.1782662
 -0.41312933 -0.08160181 -0.48066178  0.05651013 -0.23837197  0.36745435
  1.1551768   0.14768282 -0.11665241  0.10531715 -0.42095342  0.1757475
 -0.30491138 -0.2725099  -0.24853076  0.20087126  0.8452291  -0.2359287
 -0.3064599  -1.2386026  -0.09007625  0.20649533 -0.11888672  0.23353544
  0.75751346 -0.21393278  0.21745685 -0.2003168   0.13339151  0.92616767
  0.73504096 -0.2729033 ]


### Testing wether the centroid are sentences or not

In [27]:
itemindex = np.where(vectorized_docs == clustering.cluster_centers_[9])

print(itemindex[0])

[]
