# Clustering with scikit-learn and kmeans 
## NLTK for preprocessing; scikit learn for vectorizing, matrix; k-means for clustering
## This is our second attempt to apply ML algorithms with the data set from our client. The results a little better. 
scikit learn (version 0.20.2); nltk (version 3.4)

In [1]:
#!pip install mpld3

In [2]:
import numpy as np
import pandas as pd
import nltk
import re
#import mpld3
from nltk.corpus import stopwords
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [3]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("spanish")

### Obtain a tokenizer function to use in our vectorizer

In [4]:
# preparar funciones de procesamiento de texto
def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
            
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

### Add words to stopwords list that we want the ML to ignore

In [5]:
# lista de "stopwords"
stopwords = nltk.corpus.stopwords.words('spanish')

stopwords.append('https')
stopwords.append('rt')
stopwords.append('//t.co/86i0lev9kv')
stopwords.append('hola')
stopwords.append('Hola')
stopwords.append('de')
stopwords.append('del')
stopwords.append('gracias')
stopwords.append('muchas')


## Import our raw text

In [14]:
df = pd.read_csv('new_mattermost.csv')
x = df.groupby('UserName')

clients = x.get_group('whatsapp')
#print(type(clients))
rawmess = clients.Mensaje.dropna()

no_newlines = []
for line in rawmess:
    line = str(line)
    line = line.split('\n')
    no_newlines.append(line)
text = []
for var in no_newlines:
    for content in var:
        if content.isdigit():
            continue 
        if re.search("the",content):
            continue
        if re.search("channel",content):
            continue
        if re.search('/www.+?',content):
            continue
        if re.search('http',content):
            continue
        if re.search('buen dia',content):
            continue
        if re.search('turno gratis',content):
            continue    
        if content == "":
            continue
        text.append(content)
print(type(text))

<class 'list'>


In [15]:
texts = pd.Series(text)

## Create a vectorizer 

In [16]:
# Compute the term frequency-inverse document frequency matrix
tfidf_vectorizer = TfidfVectorizer(binary=True, max_df=.95,
                                 min_df=15, stop_words=stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,5))

## Transform our vector object into a matrix

In [9]:
tfidf_matrix = tfidf_vectorizer.fit_transform(texts.astype('U'))
print("La matrix tiene %i filas (documentos) y %i columnas (palabras)\n" % tfidf_matrix.shape)

  sorted(inconsistent))


La matrix tiene 6151 filas (documentos) y 302 columnas (palabras)



In [10]:
terms = tfidf_vectorizer.get_feature_names()
print("Hay en total %i palabras:\n" % len(terms))
print(terms)

Hay en total 302 palabras:

['abon', 'aca', 'adar', 'agend', 'ahi', 'ahor', 'algun', 'arañit', 'asegur', 'asi', 'asist', 'atiend', 'aut', 'avis', 'años', 'barri', 'bien', 'buen', 'buen dia', 'buen dias', 'buen tard', 'cambi', 'campañ', 'campañ gratuit', 'campañ gratuit medicin', 'campañ gratuit medicin estet', 'capital', 'carg', 'carl', 'cas', 'centr', 'claudi', 'cobertur', 'cobr', 'com', 'comprob', 'comunic', 'confirm', 'consult', 'consult par', 'contest', 'cordob', 'corre', 'corre electron', 'cost', 'cuand', 'cuant', 'cuot', 'd', 'dal', 'dar', 'dat', 'deb', 'debit', 'desd', 'despu', 'despues', 'deteccion', 'deteccion lunar', 'dia', 'dias', 'direccion', 'disculp', 'dni', 'domicili', 'dond', 'dos', 'dr', 'dr.', 'dra', 'electron', 'entonc', 'envi', 'envi link', 'esper', 'estan', 'estet', 'estudi', 'falt', 'favor', 'fech', 'fil', 'fil type', 'fil type not', 'fil type not support', 'fil type not support yet', 'flebolog', 'flebologi', 'fot', 'gabriel', 'gmail.com', 'gnc', 'graci', 'gratis'

## k_means: form clusters

In [11]:
num_clusters = 7
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [12]:
# Recuento del número de elementos en cada cluster
for i in range(num_clusters):
    print ('El cluster %i tiene %i elementos' % (i, clusters.count(i)))

El cluster 0 tiene 44 elementos
El cluster 1 tiene 54 elementos
El cluster 2 tiene 574 elementos
El cluster 3 tiene 4267 elementos
El cluster 4 tiene 313 elementos
El cluster 5 tiene 418 elementos
El cluster 6 tiene 481 elementos


In [13]:
print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]     
        
for i in range(num_clusters):
    print("[[ Cluster %d ]]" % i, end='\n\n')
    
    print("   WORDS /// ", end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(terms[ind], end=' / ')
    print('\n\n')

Top terms per cluster:

[[ Cluster 0 ]]

   WORDS /// domicili / mari / barri / fech / vill / dni / 


[[ Cluster 1 ]]

   WORDS /// par deteccion lunar / par deteccion / turn par deteccion / turn par deteccion lunar / deteccion / deteccion lunar / 


[[ Cluster 2 ]]

   WORDS /// par / turn / turn par / dia / par flebologi / flebologi / 


[[ Cluster 3 ]]

   WORDS /// buen / hol / tard / flebologi / pued / si / 


[[ Cluster 4 ]]

   WORDS /// si / bien / quier / pas / mañan / varic / 


[[ Cluster 5 ]]

   WORDS /// quier turn / turn campañ gratuit / turn campañ / quier turn campañ / quier turn campañ gratuit / campañ gratuit / 


[[ Cluster 6 ]]

   WORDS /// graci / much graci / much / ok / ok graci / buen / 


