# Clustering with scikit-learn and kmeans 
## NLTK for preprocessing; scikit learn for vectorizing, matrix; k-means for clustering
## This is our first attempt to apply ML algorithms with the data set from our client. The results are not great. 
scikit learn (version 0.20.2); nltk (version 3.4)

In [10]:
import numpy as np
import pandas as pd
import nltk
import re
import mpld3
from nltk.corpus import stopwords
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [11]:
!pip install mpld3



In [12]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("spanish")

### Obtain a tokenizer function to use in our vectorizer

In [4]:
# preparar funciones de procesamiento de texto
def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
            
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

### Add words to stopwords list that we want the ML to ignore

In [13]:
# lista de "stopwords"
stopwords = nltk.corpus.stopwords.words('spanish')

stopwords.append('https')
stopwords.append('rt')
stopwords.append('//t.co/86i0lev9kv')
stopwords.append('hola')
stopwords.append('Hola')

## Import our raw text

In [15]:
dataset = pd.read_csv('mattermost_running.csv', sep=',', parse_dates=['creation_date'])

In [16]:
text = dataset.text.dropna()
text.shape
print(type(text))

<class 'pandas.core.series.Series'>


## Create a vectorizer 

In [28]:
# Compute the term frequency-inverse document frequency matrix
tfidf_vectorizer = TfidfVectorizer(binary=True, max_df=.95,
                                 min_df=15, stop_words=stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,5))

## Transform our vector object into a matrix

In [29]:
tfidf_matrix = tfidf_vectorizer.fit_transform(text.astype('U'))
print("La matrix tiene %i filas (documentos) y %i columnas (palabras)\n" % tfidf_matrix.shape)

  sorted(inconsistent))


La matrix tiene 287 filas (documentos) y 17 columnas (palabras)



In [30]:
terms = tfidf_vectorizer.get_feature_names()
print("Hay en total %i palabras:\n" % len(terms))
print(terms)

Hay en total 17 palabras:

['aut', 'buen', 'com', 'envi', 'graci', 'hac', 'hol', 'necesit', 'pag', 'par', 'pas', 'pued', 'q', 'quier', 'sab', 'segur', 'si']


## k_means: form clusters

In [31]:
num_clusters = 7
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [32]:
# Recuento del número de elementos en cada cluster
for i in range(num_clusters):
    print ('El cluster %i tiene %i elementos' % (i, clusters.count(i)))

El cluster 0 tiene 146 elementos
El cluster 1 tiene 22 elementos
El cluster 2 tiene 15 elementos
El cluster 3 tiene 24 elementos
El cluster 4 tiene 35 elementos
El cluster 5 tiene 20 elementos
El cluster 6 tiene 25 elementos


In [33]:
print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]     
        
for i in range(num_clusters):
    print("[[ Cluster %d ]]" % i, end='\n\n')
    
    print("   WORDS /// ", end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(terms[ind], end=' / ')
    print('\n\n')

Top terms per cluster:

[[ Cluster 0 ]]

   WORDS /// graci / pas / hac / com / necesit / quier / 


[[ Cluster 1 ]]

   WORDS /// pued / pag / si / com / aut / hac / 


[[ Cluster 2 ]]

   WORDS /// par / com / hac / necesit / envi / pas / 


[[ Cluster 3 ]]

   WORDS /// hol / buen / aut / par / segur / pag / 


[[ Cluster 4 ]]

   WORDS /// segur / hac / sab / par / quier / pas / 


[[ Cluster 5 ]]

   WORDS /// q / aut / pas / par / quier / pued / 


[[ Cluster 6 ]]

   WORDS /// envi / pag / necesit / quier / si / par / 


