# scikit-learn and kmeans (version 0.20.2)
## this notebook uses nltk (version 3.4) for preprocessing; scikit learn for ML; k-means for clustering
## using the data set from our client, this is our first attempt to apply ML algorithms and clustering. The results are not great. 

In [37]:
import numpy as np
import pandas as pd
import nltk
import re
from sklearn import feature_extraction
import mpld3
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

In [38]:
!pip install mpld3



In [39]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("spanish")

In [40]:
# preparar funciones de procesamiento de texto
def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
            
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [41]:
dataset = pd.read_csv('mattermost_running.csv', sep=',', parse_dates=['creation_date'])

In [42]:
text = dataset.text.dropna()
text.shape

(287,)

In [43]:
palabras = []

for i in text:
    if i is float:
        continue
    if len(i) < 2:
        continue
    if (re.search(r'\d', i)):
        continue
    if i.startswith('http') :
        continue       
    p = tokenize_and_stem(i)
    palabras.extend(p)

In [44]:
# lista de "stopwords"
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('spanish')

stopwords.append('https')
stopwords.append('rt')
stopwords.append('//t.co/86i0lev9kv')
stopwords.append('hola')
stopwords.append('Hola')

f_text = [word for word in palabras if word not in stopwords]

In [45]:
vocab_frame = pd.DataFrame({'words': f_text}, index = range(len(f_text)))
print(vocab_frame[0:10])

     words
0      hol
1  necesit
2    conoc
3     sobr
4    preci
5    segur
6      aut
7    prueb
8      hol
9   client


## ML 

In [46]:
# Compute the term frequency-inverse document frequency matrix
tfidf_vectorizer = TfidfVectorizer(binary=True, max_df=.95,
                                 min_df=15, stop_words=stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,5))

tfidf_matrix = tfidf_vectorizer.fit_transform(text.astype('U'))
print("La matrix tiene %i filas (documentos) y %i columnas (palabras)\n" % tfidf_matrix.shape)

  sorted(inconsistent))


La matrix tiene 287 filas (documentos) y 17 columnas (palabras)



In [47]:
terms = tfidf_vectorizer.get_feature_names()
print("Hay en total %i palabras:\n" % len(terms))
print(terms)

Hay en total 17 palabras:

['aut', 'buen', 'com', 'envi', 'graci', 'hac', 'hol', 'necesit', 'pag', 'par', 'pas', 'pued', 'q', 'quier', 'sab', 'segur', 'si']


In [48]:
num_clusters = 7
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [49]:
# Recuento del número de elementos en cada cluster
for i in range(num_clusters):
    print ('El cluster %i tiene %i elementos' % (i, clusters.count(i)))

El cluster 0 tiene 16 elementos
El cluster 1 tiene 161 elementos
El cluster 2 tiene 29 elementos
El cluster 3 tiene 21 elementos
El cluster 4 tiene 18 elementos
El cluster 5 tiene 24 elementos
El cluster 6 tiene 18 elementos


In [50]:
dist = 1 - cosine_similarity(tfidf_matrix)

In [36]:
print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]     
        
for i in range(num_clusters):
    print("[[ Cluster %d ]]" % i, end='\n\n')
    
    print("   WORDS /// ", end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(terms[ind], end=' / ')
    print('\n\n')

Top terms per cluster:

[[ Cluster 0 ]]

   WORDS /// segur / hac / sab / si / pag / aut / 


[[ Cluster 1 ]]

   WORDS /// graci / necesit / pag / si / aut / sab / 


[[ Cluster 2 ]]

   WORDS /// envi / pag / quier / si / buen / par / 


[[ Cluster 3 ]]

   WORDS /// pued / pas / com / quier / hac / pag / 


[[ Cluster 4 ]]

   WORDS /// q / aut / pas / par / quier / pued / 


[[ Cluster 5 ]]

   WORDS /// par / segur / quier / com / pas / necesit / 


[[ Cluster 6 ]]

   WORDS /// hol / buen / aut / segur / par / pag / 


