### Importing section

In [1]:
import os
import string
from nltk import tokenize
from gensim.models import Word2Vec
from nltk import word_tokenize
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
import pandas as pd
import gensim
from nltk.corpus import stopwords

### Reading the data from the folder and cleaning it.

In [2]:
sentences = []
for filename in os.listdir(r"D:\UDE\6th Semester\MEMS\MEWS Data\MEWS_Essays\MEWS_Essays\Essays_all\TelevisonMergedT1+T2"):
   with open(os.path.join(r"D:\UDE\6th Semester\MEMS\MEWS Data\MEWS_Essays\MEWS_Essays\Essays_all\TelevisonMergedT1+T2", filename)) as f:
       text = f.read()
       text = text.replace("ï»¿","")
       sents = tokenize.sent_tokenize(text)
       for s in sents:
           #s = s.lower()
           #s = s.translate(str.maketrans('', '', string.punctuation))
           sentences.append(s)

tokensSentenceslist = []

for s  in sentences:
    wordsList = gensim.utils.simple_preprocess(s)
    filtered_words = [word for word in wordsList if word not in stopwords.words('english')]
    tokensSentenceslist.append(filtered_words)


##################### Uncomment below section for testing #########################
# print(len(sentences))
#
# for s in sentences:
#      print("The sentence is : ")
#      print(s)
#      print("-----------------------End of the sentence -------------")
#
# print (sentences)


# print (len(tokensSentenceslist))
# print (tokensSentenceslist)


In [3]:
print (tokensSentenceslist[0])

['advertising', 'television', 'becoming', 'clever']


In [4]:
testSentences = gensim.utils.simple_preprocess("This is just a test and i don't know")

print (testSentences)

['this', 'is', 'just', 'test', 'and', 'don', 'know']


In [6]:
print (len(sentences))
print(sentences[0:5])

38715
['Advertising in Television is becoming more and more clever.', 'Lots of those ads invade our subconsciousness and try to make us buy the product even if we do not actually need it.', "When it comes to children, lots of them don't have enough money to afford these products.", "They don't even understand the concept of money.", 'Even if they wanted it, they can not get the thing they were told they need.']


Generating the Word2Vec Model

In [7]:
#model = Word2Vec(tokensSentenceslist, min_count=1)

#model = Word2Vec(tokensSentenceslist, vector_size=50, min_count=1, sg=1)
#model = Word2Vec(sentences=tokensSentenceslist, vector_size=100, workers=1, seed=42)

model = Word2Vec(window=10, min_count=2,workers=6,vector_size=100,seed=42,sg=0)
model.build_vocab(tokensSentenceslist, progress_per=1000)
model.train(tokensSentenceslist, total_examples=model.corpus_count, epochs=model.epochs)


##################### Uncomment below section for testing #########################


# print(list(model.wv.index_to_key))
# print(len(list(model.wv.index_to_key)))

(1313549, 1737390)

In [8]:
model.corpus_count
#model.epochs

38715

In [9]:
#model.wv.most_similar("television")
model.wv.most_similar("argument")
#model.wv.similarity("tv","television")

[('reason', 0.9564138650894165),
 ('point', 0.947143018245697),
 ('side', 0.9210261106491089),
 ('support', 0.9066071510314941),
 ('points', 0.8929862380027771),
 ('agree', 0.8925236463546753),
 ('aspect', 0.88764488697052),
 ('positive', 0.8861747980117798),
 ('reasons', 0.8837171196937561),
 ('disagree', 0.8779966235160828)]

### Vectorizing each sentence using the avg of the Word embidings of each word

In [10]:
def vectorize(list_of_docs, model, strategy):
    """Generate vectors for list of documents using a Word Emx`bedding.

    Args:
        list_of_docs: List of documents.
        model: Gensim Word Embedding.
        strategy: Aggregation strategy ("average", or "min-max".)

    Raises:
        ValueError: If the strategy is other than "average" or "min-max".

    Returns:
        List of vectors.
    """
    features = []
    size_output = model.vector_size
    embedding_dict = model

    if strategy == "min-max":
        size_output *= 2

    if hasattr(model, "wv"):
        embedding_dict = model.wv

    for tokens in list_of_docs:
        zero_vector = np.zeros(size_output)
        vectors = []
        for token in tokens:
            if token in embedding_dict:
                try:
                    vectors.append(embedding_dict[token])
                except KeyError:
                    continue
        if vectors:
            vectors = np.asarray(vectors)
            if strategy == "min-max":
                min_vec = vectors.min(axis=0)
                max_vec = vectors.max(axis=0)
                features.append(np.concatenate((min_vec, max_vec)))
            elif strategy == "average":
                avg_vec = vectors.mean(axis=0)
                features.append(avg_vec)
            else:
                raise ValueError(f"Aggregation strategy {strategy} does not exist!")
        else:
            features.append(zero_vector)
    return features

### Apply the function above

In [11]:
vectorized_docs = vectorize(tokensSentenceslist, model=model, strategy="average")

Test

In [12]:
print(len(vectorized_docs), len(vectorized_docs[0]))
print(model.wv["argument"])

print("#######################################################")
print(vectorized_docs[0])

38715 100
[ 0.11528822  0.18933415 -0.08412005  0.10589357 -0.4265991   1.0531337
 -0.13715024  1.298568   -0.8779157  -0.1582561  -0.0780249   0.9799034
  0.44961306  0.5217004  -0.37189397 -0.7606606  -0.12328722 -1.425695
 -0.83165705  0.6247762  -0.95108575 -0.16939847  0.6635875   0.8707157
  0.59455246  0.29358917 -0.42626062  0.38870233 -0.44945085 -0.76935554
 -0.42892438  0.149744   -0.04381364 -0.356411    0.37657246 -0.4028334
  0.43931213 -0.53439194 -0.19522092  0.3774061   0.1344047  -0.5865974
  0.07677748  0.10171629 -0.21673799  0.46849895  0.36986643  0.5009073
  0.76379424 -0.7040103   0.36664996 -0.1903421  -0.61849546  0.01906019
  0.5785666   0.04054277 -0.06649713  0.02076649 -0.6030253   0.1796957
 -0.4925605   0.7685024  -1.3589346   0.18270253 -0.3059741  -0.17928098
  0.43794832 -0.28152093 -1.6943305  -0.2687159   0.7706033  -0.56518394
 -0.16859932 -0.23504497  0.4105878  -0.04653804 -0.9585702  -0.91110015
  0.09044305 -0.11684474 -0.07303404 -0.5460595  -

### Kmeans algorithm with mini batch

In [13]:
def mbkmeans_clusters(X, k, mb=500, print_silhouette_values=False):
    """Generate clusters.

    Args:
        X: Matrix of features.
        k: Number of clusters.
        mb: Size of mini-batches. Defaults to 500.
        print_silhouette_values: Print silhouette values per cluster.

    Returns:
        Trained clustering model and labels based on X.
    """
    km = MiniBatchKMeans(n_clusters=k, batch_size=mb).fit(X)
    print(f"For n_clusters = {k}")
    print(f"Silhouette coefficient: {silhouette_score(X, km.labels_):0.2f}")
    print(f"Inertia:{km.inertia_}")

    if print_silhouette_values:
        sample_silhouette_values = silhouette_samples(X, km.labels_)
        print(f"Silhouette values:")
        silhouette_values = []
        for i in range(k):
            cluster_silhouette_values = sample_silhouette_values[km.labels_ == i]
            silhouette_values.append(
                (
                    i,
                    cluster_silhouette_values.shape[0],
                    cluster_silhouette_values.mean(),
                    cluster_silhouette_values.min(),
                    cluster_silhouette_values.max(),
                )
            )
        silhouette_values = sorted(
            silhouette_values, key=lambda tup: tup[2], reverse=True
        )
        for s in silhouette_values:
            print(
                f"    Cluster {s[0]}: Size:{s[1]} | Avg:{s[2]:.2f} | Min:{s[3]:.2f} | Max: {s[4]:.2f}"
            )
    return km, km.labels_

### Applying the Kmeans algorithm

In [14]:
clustering, cluster_labels = mbkmeans_clusters(X=vectorized_docs, k=10, print_silhouette_values=True)

df_clusters = pd.DataFrame({
    "text": sentences,
    "tokens": [" ".join(text) for text in tokensSentenceslist],
    "cluster": cluster_labels
})

For n_clusters = 10
Silhouette coefficient: 0.14
Inertia:103203.68269933804
Silhouette values:
    Cluster 7: Size:655 | Avg:0.58 | Min:0.21 | Max: 0.67
    Cluster 2: Size:313 | Avg:0.45 | Min:-0.02 | Max: 0.50
    Cluster 3: Size:2442 | Avg:0.25 | Min:-0.04 | Max: 0.46
    Cluster 4: Size:6745 | Avg:0.16 | Min:-0.05 | Max: 0.37
    Cluster 5: Size:647 | Avg:0.14 | Min:-0.09 | Max: 0.42
    Cluster 0: Size:5430 | Avg:0.13 | Min:-0.13 | Max: 0.38
    Cluster 9: Size:5366 | Avg:0.12 | Min:-0.10 | Max: 0.36
    Cluster 8: Size:8451 | Avg:0.11 | Min:-0.07 | Max: 0.30
    Cluster 1: Size:6422 | Avg:0.09 | Min:-0.10 | Max: 0.30
    Cluster 6: Size:2244 | Avg:0.07 | Min:-0.11 | Max: 0.33


### Evaluate top terms of the cluster

In [15]:
print("Top terms per cluster (based on centroids):")
for i in range(10): # number of cluster k should be put here!!
    tokens_per_cluster = ""
    most_representative = model.wv.most_similar(positive=[clustering.cluster_centers_[i]], topn=10)
    #print(clustering.cluster_centers_[i])
    for t in most_representative:
        tokens_per_cluster += f"{t[0]} "
    print(f"Cluster {i}: {tokens_per_cluster}")

Top terms per cluster (based on centroids):
Cluster 0: sometimes stupid arenâ anyway electronics must whatch wathing theire childen 
Cluster 1: childern anyway necessary adversting permit advertisings advertises think firstly difficult 
Cluster 2: outweigh controverse discussion schreibaufgabe iâ cons answers controversy write answer 
Cluster 3: allowed forbidden opinion adversting difficult personally permit childern question ages 
Cluster 4: sort secondly addition absolutly advertsing overwhelmed doubt smart danger risks 
Cluster 5: reasons disagree personally sides sum points conclusion difficult arguments conclude 
Cluster 6: bored else fun anymore healthy forget together seeing encourage love 
Cluster 7: shoes stuffed bar cream ils drink truck fantasies wheels wars 
Cluster 8: furthermore useful difference case worse sure canâ belive thats cannot 
Cluster 9: probably decision stop anything course mean cannot mad actually makes 


In [16]:
from collections import Counter

for i in range(10):
    tokens_per_cluster = ""
    most_frequent = Counter(" ".join(df_clusters.query(f"cluster == {i}")["tokens"]).split()).most_common(5)
    for t in most_frequent:
        tokens_per_cluster += f"{t[0]}({str(t[1])}) "
    print(f"Cluster {i}: {tokens_per_cluster}")

Cluster 0: tv(2638) children(2377) watch(2097) television(1614) watching(1083) 
Cluster 1: children(5062) young(2767) advertising(2666) television(2189) think(1163) 
Cluster 2: schreibaufgabe(293) essay(161) following(156) statement(151) welches(149) 
Cluster 3: children(2164) young(1744) advertising(1682) television(1417) directed(1390) 
Cluster 4: children(747) advertising(662) television(560) would(429) like(412) 
Cluster 5: statement(327) agree(171) arguments(118) following(90) opinion(77) 
Cluster 6: play(599) children(417) new(393) like(365) outside(345) 
Cluster 7: sasageyo(1092) die(654) qualitã(325) zeit(325) sie(324) 
Cluster 8: children(4031) advertising(1443) young(1126) television(1077) things(1039) 
Cluster 9: parents(2376) want(1897) buy(1872) children(1832) child(1018) 


### Retrieve a random sample of documents for a given cluster


In [17]:
for i,t in enumerate(df_clusters.query(f"cluster == {0}").sample(10).iterrows()):
    print(t[1]["text"])
    print("-------------")

And if they watch there shouldn't be advertising in between the series they are watching.
-------------
If the children look TV, they get attracted by this advertisement.
-------------
You can barely find 7 years old kids on instagram.
-------------
Moreover there are much more effective, healthy and communnicative ways for children to learn a new language and imrpove their speaking skills than watching television.
-------------
For example the german tv channel 'KIKA'.
-------------
A TV can not attack you or do anything by its own.
-------------
Television adevertising is on every channel.
-------------
But in the end it should not be forbidden to watch TV but it should be very rare like once a week or something like that.
-------------
For that parents can just install a code for those channels so the children will not have a chance to watch these.
-------------
If you think television adds are bad for your child, just don't let it watch tv, what it doesn't know, it doesn't miss.
--

In [18]:
df_clusters.to_csv('clusteredArgument.csv')

In [19]:
df_clusters.shape

# df_clusters.tokens[0]
# df_clusters.head()

(38715, 3)

### Most representative clusters

In [20]:
test_cluster = 0
most_representative_docs = np.argsort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[test_cluster], axis=1)
)
# print(most_representative_docs[0])
for d in most_representative_docs[:10]:
    print(d)
    print(sentences[d])
    print("-------------")

23971
In my opinion it is not good to let the children watch TV everytime but the solution is not to just forbid it because lots of children like to watch TV and for example in the morning when everyone is still sleeping and when it is cold outside the children are glad to watch TV so they can pass a little time and it is also easier for the parents so they are able to sleep longer.
-------------
25361
And if people think that they're kids are getting brainwashed, then maybe they shouldn't put them right in front of the TV.
-------------
21924
Children who watch too much TV do not care anymore about their real lifewith their true friends and prefer to watch Tv than to go out.These people may have a big disadvantage in their lives.
-------------
10737
So, we just decided that watching TV can sometimes be helpful even when itÂ´s not that healthy.
-------------
36675
Because children have enough imagination for other activities if you let them think and don't put them in front of the Tv w

In [21]:
#array = clustering.cluster_centers_[0]
#print(len(vectorized_docs))
#print(array)
array = vectorized_docs[120].reshape(1,-1)
convertedArray = array.astype(float)
# vectorized_docs[i] = sentences[i] it is the same
clustering.predict(convertedArray)

array([6])

In [22]:
#print(array.reshape(1,-1))

print(array)

print(convertedArray)

[[ 0.16868031 -0.11518344  0.04375385 -0.16961181 -0.32910988  0.07519562
  -0.03473708  0.2881326  -0.22865133 -0.30180973 -0.2749814   0.05678155
   0.22902955  0.25514063 -0.08736596 -0.4990135  -0.0990077  -0.01215302
  -0.6290117  -0.21519752  0.07409988  0.7055213   0.22721244  0.12753491
  -0.03387943  0.01707247 -0.2618402   0.33965072 -0.17506336  0.1853242
  -0.40236542 -0.16700786  0.3402451  -0.11415518 -0.32711414 -0.12293297
   0.32198665 -0.3418258   0.10985515  0.17799467  0.2483626  -0.08238377
   0.2432443  -0.50477004  0.32689932 -0.09225568  0.0034604   0.32537356
   0.67596817 -0.65887624  0.18741415 -0.30047625 -0.18230766  0.08700434
   0.10143799  0.18505335 -0.13225521  0.24774665 -0.10696658 -0.04113882
  -0.32722214 -0.33064002 -0.24776226  0.09273881  0.3967111  -0.05888849
   0.14747654 -0.1701913  -0.14426146  0.06001846  0.2656059   0.10673421
  -0.04613359 -0.6488152   0.15446118 -0.22528869  0.26844132 -0.23360653
  -0.22219734 -0.25563705  0.04363551  

### Predicting new clusters for testing

In [23]:
## testing
def vectorizeSentenceTest(sentences):
    tokensSentenceslist = []
    for s  in sentences:
        wordsList = word_tokenize(s)
        tokensSentenceslist.append(wordsList)
    return tokensSentenceslist

testTokens = vectorizeSentenceTest(["Watching tv for long hours leads to lazness"])

vectorized_docs_tesing = vectorize(testTokens, model=model, strategy="average")

def predictTest(vectorizedDocsTest):
    array = vectorizedDocsTest
    print(clustering.predict(array))
    return
#len(vectorized_docs_tesing), len(vectorized_docs_tesing[0])
#print(vectorized_docs_tesing)

predictTest(vectorized_docs_tesing)

[0]


In [24]:
from Cython import typeof

print(typeof(vectorized_docs))

# print(vectorized_docs[0])
# print(vectorized_docs[0].shape)
# print ("############################################")
#
# print(clustering.cluster_centers_[0])
# print(clustering.cluster_centers_[0].shape)
print (most_representative_docs.shape)

list
(38715,)


In [24]:
itemindex = np.where(vectorized_docs == clustering.cluster_centers_[0])

print(itemindex)

(array([], dtype=int64), array([], dtype=int64))


In [25]:
testDocs = np.sort(
    np.linalg.norm(vectorized_docs - clustering.cluster_centers_[0], axis=1)
)

In [26]:
print(vectorized_docs[16035])

[-0.12176871 -0.79313385 -0.00996342 -0.24011901 -0.24725829  0.19357361
 -0.46350023  0.23181807 -0.21608464  0.12644906  0.04430683  0.75692064
 -0.07661071  1.2821914   0.3033127   0.27832875  0.19656463 -1.1782662
 -0.41312933 -0.08160181 -0.48066178  0.05651013 -0.23837197  0.36745435
  1.1551768   0.14768282 -0.11665241  0.10531715 -0.42095342  0.1757475
 -0.30491138 -0.2725099  -0.24853076  0.20087126  0.8452291  -0.2359287
 -0.3064599  -1.2386026  -0.09007625  0.20649533 -0.11888672  0.23353544
  0.75751346 -0.21393278  0.21745685 -0.2003168   0.13339151  0.92616767
  0.73504096 -0.2729033 ]


### Testing wether the centroid are sentences or not

In [27]:
itemindex = np.where(vectorized_docs == clustering.cluster_centers_[9])

print(itemindex[0])

[]
