# Clustering *La Voz del Interior* with a COOC
* preprocessing
* clustering tools used
* parameters
Why?
What outcome did I expect?
* final clusters

#### Imports

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numpy
import pickle
import nltk
import os
import re
import sys
import sklearn.manifold
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from collections import Counter, defaultdict
from nltk.cluster import kmeans, cosine_distance
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
nltk.download("punkt")

[nltk_data] Downloading package punkt to /users/bjames/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

this code is if you want to just download the data directly

In [None]:
%%bash

mkdir -p ./data
python -m spacy download es
curl -L -o ./data/lavoz.txt.tar.gz https://cs.famaf.unc.edu.ar/~laura/corpus/lavoztextodump.txt.tar.gz
tar xvf ./data/lavoz.txt.tar.gz -C ./data
ls ./data
head -n 6 ./data/lavoztextodump.txt

prepare the articles for the pipeline

In [2]:
def corpus_iterator(corpus_file):
    document = {
        "title": None,
        "body": None
    }

    with open(corpus_file, "r") as fh:
        for line in fh:
            if line.strip() == "-":
                new_document = True
                document = {
                    "title": None,
                    "body": None
                }
            elif new_document:
                document["title"] = line.strip()
                new_document = False
            else:
                document["body"] = line.strip()
                yield document #this yields a dictionary with title numbered under "title" and the body of the text under "body"

-------------------------------------------------------------------------
Helper Code for non GPU: how to use a smaller dataset

In [3]:
!wc ./data/lavoztextodump.txt 

   38809  5615246 34886711 ./data/lavoztextodump.txt


In [4]:
!head -n 5000 ./data/lavoztextodump.txt > ./data/lavoztextodump-short2.txt
# by changing the value following '-n' you can control your input
!wc ./data/lavoztextodump-short2.txt

   5000  745195 4646557 ./data/lavoztextodump-short2.txt


-------------------------------------------------------------------------


In [5]:
filename = "data/lavoztextodump-short2.txt"
text_file = open(filename, "r")
dataset = text_file.read()
text_file.close()

## Preprocess text with nltk
1. sentence tokenize
2. word tokenize
3. eliminate numbers
4. eliminate stopwords
5. lowercase
6. lemmatize

In [47]:
# "stopwords" high frequency
stopwords = nltk.corpus.stopwords.words('spanish')
# Let's add a few pesky stopwords
punctuation = '.', '$','%','*1','*2','*asesor','*concejal','*director','*docente','*doctor','*economista','.','.', '-c','*', '&','#','!',"''",':','?', ';', ')', '(', '``', ',', '-'
stopwords.extend(punctuation)

In [48]:
# create a python dictionary from a real dictionary of lemmas in Spanish (uploaded)
lemma_file = open("lemmatization-es.txt", "r")
lemma_raw = lemma_file.read()
lemma = lemma_raw.split("\n")

lemma_dict = {}
for pair in lemma:
    w = pair.split("\t")
    if len(w) == 2:
        lemma_dict[w[1]] = w[0]

In [49]:
def lemmatize(word): # create a lemmatizing function
    if word in lemma_dict:
        word = lemma_dict[word]
    return word

In [50]:
# create a tokenizing function
def tokenize_text(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if token.isdigit():
            continue
        low_token = token.lower()
        lemma = lemmatize(low_token)
        if lemma not in stopwords:
            filtered_tokens.append(lemma)
    tok = [t for t in filtered_tokens]
    return tok

In [51]:
# execute the preprocessing process
the_news = []
for key, value in enumerate(corpus_iterator("data/lavoztextodump-short2.txt")):
    p = tokenize_text(value["body"])
    the_news.append(p)
print("We are looking at ",len(the_news),"articles.")

We are looking at  1666 articles.


# COOC: four-word window
Context is a essential to meaning in human language. A **co-occurrence matrix** gives us semantic and idiomatic insight into the language by showing which words appear together often and if any dependencies are present between words. 
* Example: **"Cristina"** often appears with **"Fernández"**


In [52]:
def dict_cooc_gen(e,w):
    """Takes as input a list of strings, where each string is a tokenized sentence.
        Returns a list of dictionaries of coocurrences of words and a list of all of words as the index for a matrix. 
        Window, words to the left and the right, is chosen when executing."""    
   
    cooc = [] # our list of dictionaries
    idx = [] # our list of words (for the matrix index)
    for sent in e:
        for m,tok in enumerate(sent):
            
            vecinos = []
            # vecinos a la derecha
            for l in range(1,w+1):
                if m+l > len(sent)-1:
                    break
                else:
                    vecinos.append(sent[m+l])
            # vecinos a la izquierda
            for l in range(1,w+1):
                if m-l < 0:
                    break
                else:
                    vecinos.append(sent[m-l])
            
            
            # para agregar los vecinos de una nueva palabra
            if tok in idx:
                i = idx.index(tok)
                cooc[i][tok] +=1
            else :
                cooc.append({tok:1}) 
                idx.append(tok)
                i = idx.index(tok)

            for v in vecinos:
                if not v in cooc[i]:
                    cooc[i][v] =1
                else:
                    cooc[i][v] +=1
                        

    return cooc, idx

In [53]:
coocs, idx = dict_cooc_gen(the_news,4) 
coocs[9:10]# lets look at one of the dictionaries

[{'hacer': 2491,
  'año': 335,
  'casar': 53,
  'tener': 145,
  'cuatro': 27,
  'claro': 11,
  'decir': 100,
  'pensarlo': 1,
  'preferir': 8,
  'vez': 65,
  'ser': 537,
  'indeciso': 1,
  'acostumbrar': 5,
  'hablar': 19,
  'querer': 47,
  'enojar': 2,
  'disgustar': 1,
  'mil': 23,
  'día': 120,
  'junto': 11,
  'nieto': 3,
  'morir': 13,
  'unir': 34,
  'novio': 2,
  'comentar': 6,
  'precisión': 1,
  'haber': 286,
  'virgen': 1,
  'leyenda': 1,
  'habitar': 1,
  'hogar': 6,
  'entrar': 59,
  'ayacucho': 1,
  'pobrecito': 1,
  'quedar': 20,
  'desaparecer': 4,
  'conversar': 2,
  'saber': 43,
  'compañero': 4,
  'chico': 20,
  'grande': 17,
  'contar': 25,
  'preguntar': 15,
  'crecer': 5,
  'asar': 4,
  'conectarse': 1,
  'plantar': 6,
  'tomar': 34,
  'caminar': 9,
  'sierra': 4,
  'salir': 26,
  'mañana': 13,
  'vario': 32,
  'plaza': 5,
  'próspero': 1,
  'rock': 1,
  'cosquín': 2,
  'suceder': 10,
  'estilar': 4,
  'excepcionar': 2,
  'cachar': 2,
  'yerom': 1,
  'veterano': 1,

# To vector space!
A mathematical representation of our words allows us to apply statistical and probabilistic formulas to them, such as the **k-means** clustering algoritm, which will enable us to see the underlying structure of the language use in *La Voz del Interior*, the newspaper of record in Córdoba, Argentina.

In [54]:
vectorizer = DictVectorizer()
vec = vectorizer.fit_transform(coocs)

In [55]:
terms = vectorizer.get_feature_names() # 'terms' will be necessary for when we print out the clusters
for i in terms[4000:4020]:# let's just look at a snapshot of our key words
    print(i)

bisagra
bisbiseantes
bischoff
biscuit
bismarck
bisnieto
bissutti
bizantino
bizzi
bié
biólogo
blair
blanc
blancaflor
blanco
blancuzco
blandir
blando
blanquear
blanquecino


In [56]:
matriz = pd.DataFrame(vec.toarray(), columns=terms)
matriz = matriz.set_axis(idx,axis=0, inplace=False)
matriz.head(10)

Unnamed: 0,*ex,*experto,*fiscal,*horacio,*integrante,*legislador,*lic,*ministro,*médica,*pablo,...,óxido,úlcera,últimamente,último,única-,únicamente,único,útero,útil,﻿1
claro,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
crespo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rodolfo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
martínez,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
imaginar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,11.0,0.0,0.0,4.0,0.0,0.0,7.0
preferir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
pensarlo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
decir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,21.0,0.0,0.0,6.0,0.0,2.0,10.0
hacer,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,28.0,0.0,0.0,15.0,0.0,0.0,9.0


Our matrix is extremly sparse (lots of zeros) so we must **normalize** by scaling the matrix as decimals (float numbers) in stead of zeros

In [57]:
matrix_normed = matriz / matriz.max(axis=0)
matrix_normed.head(50) # just a small glipse of the matrix

Unnamed: 0,*ex,*experto,*fiscal,*horacio,*integrante,*legislador,*lic,*ministro,*médica,*pablo,...,óxido,úlcera,últimamente,último,única-,únicamente,único,útero,útil,﻿1
claro,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001256,0.0,0.0,0.0,0.0,0.0,0.0
crespo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rodolfo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
martínez,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.003413,0.0,0.0,0.002611
imaginar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.013819,0.0,0.0,0.013652,0.0,0.0,0.018277
preferir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.003413,0.0,0.0,0.0
pensarlo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
decir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.2,0.026382,0.0,0.0,0.020478,0.0,0.1,0.02611
hacer,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.035176,0.0,0.0,0.051195,0.0,0.0,0.023499


In [21]:
filename = "trained/lavoz_matrix.pickle" # save our normalized matrix just in case
fileObj = open(filename, 'wb')
pickle.dump(matrix_normed, fileObj)
fileObj.close()

OverflowError: cannot serialize a bytes object larger than 4 GiB

In [None]:
filename = "trained/lavoz_matrix.pickle"
with open(filename, 'rb') as f:
    matrix_normed = pickle.load(f)

## Word2Vec: it's not a vectorization, it's an embedding!
We will set aside our hand-made cooccurence matrix now in favor of the more 'information-rich' **neural embeddings** of Word2Vec. 
This two-layer neural network provides us with much more than our counts-basd cooccurrence matrix. These neural embeddings carry much lower demensionality and have semantic representation, useful in processes such as [Latent Semantic Analysis](https://en.wikipedia.org/wiki/Latent_semantic_analysis).
    


In [58]:
def gen_vectors(normalized_text):
    print("\ninitialized")
    model = Word2Vec(normalized_text,min_count=1)
    vects = []
    for word in model.wv.vocab:
        vects.append(model.wv[word])

    matrix = numpy.array(vects)
    print("Matrix shape:",matrix.shape)
    print("finished")
    return model.wv.vocab,matrix



In [59]:
vocabulary, vectors = gen_vectors(the_news)


initialized
Matrix shape: (23470, 100)
finished


# Clusters!
We will now begin to take apart our vector space with clustering. 

In [60]:
def gen_clusters(vectors):
    print("\nclustering started")
    vectors = preprocessing.normalize(vectors)
    km_model = KMeans(n_clusters=CLUSTERS_NUMBER)
    km_model.fit(vectors)
    print("clustering finished")
    return km_model

In [61]:
CLUSTERS_NUMBER=50
km_model = gen_clusters(vectors)


clustering started
clustering finished


### Visualize the results

In [62]:
def show_results(vocabulary,model):
    # Show results
    c = Counter(sorted(model.labels_))
    print("\nTotal clusters:",len(c))
    for cluster in c:
        print ("Cluster#",cluster," - Total words:",c[cluster])

    # Show top terms and words per cluster
    print("Top words per cluster:")
    print()

    keysVocab = list(vocabulary.keys())
    for n in range(len(c)):
        print("Cluster %d" % n)
        print("Words:", end='')
        word_indexs = [i for i,x in enumerate(list(model.labels_)) if x == n]
        for i in word_indexs:
            print(' %s' % keysVocab[i], end=',')
        print()
        print()

    print()

In [63]:
show_results(vocabulary,km_model)


Total clusters: 50
Cluster# 0  - Total words: 91
Cluster# 1  - Total words: 3143
Cluster# 2  - Total words: 175
Cluster# 3  - Total words: 170
Cluster# 4  - Total words: 219
Cluster# 5  - Total words: 18
Cluster# 6  - Total words: 1494
Cluster# 7  - Total words: 88
Cluster# 8  - Total words: 64
Cluster# 9  - Total words: 26
Cluster# 10  - Total words: 400
Cluster# 11  - Total words: 472
Cluster# 12  - Total words: 356
Cluster# 13  - Total words: 148
Cluster# 14  - Total words: 1778
Cluster# 15  - Total words: 26
Cluster# 16  - Total words: 379
Cluster# 17  - Total words: 36
Cluster# 18  - Total words: 374
Cluster# 19  - Total words: 583
Cluster# 20  - Total words: 5
Cluster# 21  - Total words: 1796
Cluster# 22  - Total words: 80
Cluster# 23  - Total words: 72
Cluster# 24  - Total words: 670
Cluster# 25  - Total words: 38
Cluster# 26  - Total words: 214
Cluster# 27  - Total words: 15
Cluster# 28  - Total words: 51
Cluster# 29  - Total words: 74
Cluster# 30  - Total words: 11
Cluster# 3

Words: sorprender, descubrir, grande, muerte, separar, parejo, proyecto, c, hermoso, durar, duración, importante, mantener, escolar, mejorar, ojo, obstante, unidad, refacción, miel, reinar, rato, nieto, virgen, leyenda, construir, fábrica, avión, ameno, café, completar, parar, embargar, enamorar, noche, desaparecer, tormenta, honestar, generoso, reflejo, conjuntar, requerir, revisión, respectar, justificar, generar, atentar, sufrir, diferente, distinto, comprometer, época, generalmente, chocar, matar, caminar, asar, sol, tranquilizante, vivo, percepción, factor, aparecer, regularidad, laguna, anécdota, relativo, einstein, célebre, viejo, muchacho, alimentar, cachar, columna, confesar, cumplir, encontrar, profe, tren, suceder, vario, exhausto, desfallecer, jovencito, hombro, cantar, excepcional, descansar, final, lápida, siglo, especie, patrimonio, manzana, reconocimiento, restaurar, otorgar, extranjero, particularidad, seriar, recuperación, funcionar, tras, restauración, allí, planific

 balaceados, revocatorio, liso, miríada, world, fogh, lisboa, damiana, bastille, lucien, ilegalmente, dietílico, moladora, perímetro, píparo, caratulada, gota, putrefacto, 0-800, fantasmagórico, claudicante, desatino, monopólico, silenciarlo, apold, danés, jorgen, estampida, infográfico, 5,72, banana, grafica, cliba, reclamándole, telgopor, pucho, irritabilidad, depresivo, marital, puerperio, posparto, diana, oxhídrico, platense, efraín, inconfesable, lustroso, virginidad, sobrepasar, empañar, prestamista, mariotto, cn23, canillitas, atvc, panaholma, federala, brocheriano, klan, ramin, mohammad, notoriedad, perpetuar, racionamiento, saadismo, antisecuestros, hermetismo, narco, narcos, corporal, right, or, vibración, detenerse, dt, engorroso, valdivieso, ropero, ed, submenú, www.mundod.com.ar, perfeccionista, caverna, macizar, reproducirse, swiss, básquet, changuitos, porro, cáritas, 79/10, perderla, montal, bipartidismo, progresivamente, definirlo, whitecross, villavicencio, pulir, ais

Words: capuchino, engranar, ¡esos, rastrearse, salubridad, jaqueline, coincidentemente, tenerlas, solidarizar, fhc, manganeso, desvestirse, spas, sobresaltar, ¡adelante, walsh, comunicarme, balbucear, encarnadura, laspina, censal, pelotudo, 8.33, imponerlo, preanuncio, tecnoburocracia, bibliotecario, aridez, bezzato, schiare-tti, viveza, compararlo, paulatinamente, 8,6, perinatal, tribunalicias, atenuarse, tanqueta, cumplirlos, prémoli, armarle, autismo, transatlántico, impunemente, lopez, cantina, fregadísimo, federicos, nuevita, uepc, cachetada, argerich, acefalía, encolumnados, radiola, caratulado, uniéndolas, epitafio, profusamente, bandazo, démosle, urzuía, postrero, revitalización, leonés, descoordinación, cirricione, cachoíto, sojera, repudiable, ¿iría, llevadero, aprovisionar, mallorca, mamami, inquirir, intimista, benéfico, volcarla, fasto, preadjudicación, tácitamente, 5.485, atracar, mccain, lúgubre, prolongarse, sollozo, vita, museólogo, royce, 0-810-888-3368, acaloradament

Words: mchardy, march, bancalari, ferrari, entregarlo, aspirina, bein, quincenal, galíndez, irrestricto, precandidatura, ¿con, memorioso, zannini, frizza, ollocco, empréstito, visionario, apremiante, lluch, marianacci, judicializarlos, belaunde, transformarla, beneplácito, patrocinantes, ¿va, cierne, accastellista, avigorar, microatentados, puntal, dormilón, yuspe, hablarlo, rívolo, decirla, latiguillo, ídola, diseñarlo, comparándose, cartonero, criticándolo, tica, agendó, longevo, antikirchneristas, mestrista, benévolo, 2000-2007, bordón, rimbombante, epmr, ensuciarme, hablás, enojarse, fogoneadas, brindarles, balbastre, institucionalización, fortalecernos, nanini, descalzar, facturarle, cecilio, referenciarse, exteriorización, videncia, cta., sutileza, reu-nido, infernal, baipasea, peirotti, chisporrotear, inconducente, irremediable, spinozzi, oncología, pollicita, antiguerrillera, triguero, abstenerse, comodín, ponderación, abriéndole, interpelación, golondrina,

Cluster 8
Words: gr

Words: año, cuatro, personar, do, cada, tres, cincar, mesar, casi, ocho, seis, 6,6, desconcertar, reeducación, pescador, muerta, neelmanin, irreal, desendeudar, 33,7, luctuoso, ripiar, omóplato, incomunicar, abonarse, desolar, bioma, pétalo, atacándolo, 2.700, bronquitis, lanzarlas, 24,2, 0,15, condicionante, calígrafo, bismarck, methol, 45,3, 3,66, 47,6, ambulante, lona, 5.629.000, emancipación, 106.360, subirla, 14,3, imponiéndole, 96,5, cuantificar, anticipo, amamantar, prosecretaria, retroactivo, 19.636.359, 19,33, oscuridad, respirador, incubadora, hispanoamérica, desparejo, imaginable, congelarlos, cobayo, mirarnos, sostenés, anularlo, desafectada, 29,4, 38,5, interrumpirles, 11,67, milani, pasionaria, precipitación, refaccionado, 1975-2002, preimpresa, 20,8, balcánico, 43,8, cuatrimestre, cisjordano, 26,3, 127,4, acuérdate, 3,18, 2,69, abaratamiento, cortoplacistas, afición, frisar, 213.990, lamento, 16,1, 8.544, 2.535,5, masoud, 2.172, ubicándose, papel-cartón, ventiluz, 543,63

Words: ateneo, nuevo, dolor, iniciar, plan, martes, gordita, barrio, secar, callar, edificio, reparación, construcción, existente, entidad, rosa, junior, mañana, exterior, envejecimiento, luego, pastillita, cosquín, rock, plaza, próspero, terrorífico, estancia, jesuíticas, jesuítica, museo, promoción, estadístico, superior, departamento, monumento, zona, realizar, visitar, bodega, salar, energía, ruta, ubicar, punilla, capilla, remodelación, circundar, obraje, indicar, oficial, reclutar, capacitación, subcoordinadores, instructor, informatización, quilino, hacendar, nativo, norte, trabajo, tulumba, amarillo, estrato, defensoría, informar, corte, calle, terciario, anexar, cenma, domingo, sabattini, bioy, rivadavia, avenir, colón, jueves, economía, estudio, transmisor, balneario, aguardar, higiénico, marzo, comercial, industriar, procórdoba, exportador, sur, embarcar, sifcos, afip, asesoramiento, julio, almacenero, minorista, numeroso, cerac, limar, consultor, evento, rosario, recambiar,

Words: cumplimentarse, desarrollarnos, desagregación, granario, criopreservación, vitrificación, subejecutar, etéreo, estirpe, vanguardista, incansable, 2009-2011, abalanzar, marginalmente, ecofin, igualdad.com.ar, proscripción, aprhi, interpartidario, hipersexualidad, cairo, megaplán, 25.916, brasilero, ¿complicidades, alexis, 1.556,9, cocaleros, sibona, franquiciados, platillo, 8.016, proactivo, emisario, anticonstitucional, agradecerle,

Cluster 18
Words: luis, gobernar, entel, provincial, daniel, presidente, humidad, radical, justicia, 26.331, 8.113, ministro, complement, asertivo, cristino, fernández, kirchner, 9.693, cívico, néstor, candidato, scioli, senador, juez, diputar, oscar, federal, peronismo, secretario, ¿yasky, ¿insistirán, megafón, nadia, ¿participará, precandidato, giacomino, aníbal, pluripartidismo, orjan, diferenciación, gabeira, azzaro, verificarse, delia, marchionne, giorgi, bañuelos, turros, passerini, galvarino, patriótico, cirielli, propugnar, meguira, mujica, 

Words: corrimiento, gaza-egipto, consolidarnos, lamentarse, subalterno,

Cluster 21


Words: preferir, juvenil, asegurar, junto, gustar, enojar, acordar, conversar, molestar, dudar, darle, sostener, ilusionar, ¿es, triunfar, comunicación, elegir, económico, conflicto, manejar, ¿la, decidir, institución, medir, reclamar, discutir, debatir, advertir, escuela, abrir, forzar, llamar, agradecer, relacionar, risa, compañero, discusión, pelear, avatar, seguridad, protestar, pendiente, brindar, curricular, continuidad, manifestar, democrático, colateralmente, interpretación, educativo, diálogo, respetar, actor, información, consenso, responsable, ciencia, aceptar, subrayar, apalear, puerta, tranquilizar, locura, posiblemente, visión, excusar, reiterar, postular, mozalbete, ¡nos, estilar, espaciar, frank, madonna, demi, escuchar, resolver, explicar, temer, inclusión, existir, discurso, esclavo, lenarduzzi, profundizar, mantenerla, espinoso, sancionar, incertidumbre, ejecutivo, reglamentación, iniciativo, consultar, intervenir, preservar, sociedad, legislatura, ambientalista, nor

Words: córdoba, ciudad, rasguño, gordillo, rujinsky, liprandi, tasación, molinari, lanzarse, congruente, desdibujar, borocotización, yan, afligir, arregui, 39.322,46, californiano, cobe, dialoguistas, multifunción, ultraizquierda, llevándolo, daniza, imitarla, compañía-, unvm, sicioli, quintitas, azalais, acopiadoras, sistemáticamente, puebladas, 1983-1991, efe, brocherense, andanada, a-5, rusconi, labriego, colocarlas, invocarla, usandivaras, ariete, 1.208.127, deficit, 14,9, pecho, 15.700, regatta, kuka, urrutia, niza, massy-palaiseau, perplejo, *ministro, reformulación, 594.405, 1.497, 42.558, caserío, 1.375, inpe, refuncionalización, reiniciadas, 2011-2016, fénix, 2,20, maria, 21,72, jesus, cij, 140.300,

Cluster 24
Words: litro, crecer, ingreso, entrar, llegar, comprar, consumir, total, facturar, aportar, charly, subir, apenar, mayor, aumentar, estimar, recibir, menor, registrar, 57.500, superar, afrodescendientes, hectárea, mitad, producción, vender, mientras, período, variable, 

Words: apoplejía, pekín, visación, realista, licuación, veintidós, revender, 65.560, 185,37, sorprenderse, durísimas, infrecuente, golpizas, confusional, ayunar, novelesco, examinarlo, colosal, 2004-2009, 4.45, metabolismo, 20,7, escolarizar, 67,6, almuerzo, conflictividad, 51,2, 35,6, anteceder, 7,6, conosur, 3.636, campanada, cosquillear, esbozo, 4.662, alargar, ponernos, impersonal, terminarlo, 35,67, obtenerlos, rotary, expulsarlos, jardinería, prejudicial, basurero, perfección, 9,09, 14,35, 0,446,

Cluster 29
Words: si, chau, bombón, jesuitas., queriéndole, interprétese, ¿milita, distraerse, prometerle, manito, espadón, requetetristeza, verne, consolarme, ción, involucrándonos, arbitrar, detestable, vesania, represenmtan, autorreveló, preocupándose, análogo, concubino, dejarlas, superstars, tropelía, tenerle, implorar, vigilador, volear, mostrándose, representarlos, bamos, megaoperativos, ¡cuánto, apagarlo, expedirse, brocherenses, apoyarse, indubitable, cachetear, anímicamente, p

Words: confesándose, pedacitos, cortometraje, 217,89, procesarla, confiarles, viviente, habernos, eldebate, relicitar, videojuego, cabecita, titularidad, 77,3, centro-derecha, fitr, hipertensivo, dml, auspiciantes, peligrosamente, excarcelar, visitarla, 30,7, chip, boxear,

Cluster 37
Words: promediar, inversión, habitante, pagar, tonelada, 6.954, 4.019, 10.973, 38,8, 12,3, 10,3, 16,78, 11,16, mes, cierto-, 3,6, 7,5, 1996-1997, 17,5, 11,35, 13,34, 31,10, 30,99, 9,4, anual, 8,5, acnur, neto, giro, 1.100, 8.512, 6.832, rondar, 14,5, nordestinos, mensual, incobrable, 18,5, 4.656,1, 42,3, disyuntivo, 51.723, nocheros, cratón, 1.200, oscilar, desembolsar, suscripción, 34,6, 2,5, 0,9, 44.212,10, cotizar, 2.300, 10.300, cien, asatej, 2.398, 3.071, euro, 16.400, 3,8, 3,4, absorber, 53,9, 24,5, 5,6, 109,80, 6,87, 1.028, 137,1, 12,5, e-53, pase, emmel, 47,5, 32,6, 19,5, ram, exportable, estatizado, 5,2, cesanteados, 2,28, 8,35, ansal, jacobo, accesibilidad, 1,60, raspar, acopiadores, 2010/11, 6,

Words: crespo, juventus, movimiento, alegrar, gran, emprendedor, optimista, circunstancia, rodear, compartir, permanente, matrimoniar, refugiar, inseguridad, orgullo, r, lado, postergar, personal, recurso, prioridad, pasión, felicidad, secundario, levantar, colegio, edilicio, suscribir, pelar, centro, victoria, grupo, denominar, acta, continuar, abrazar, comentar, precisión, anticipar, cimiento, militar, charlar, besar, ¡no, ¡salir, coincidir, nervioso, admitir, noble, diverso, causar, motivación, conjugar, sugerir, acción, tendientes, destrabar, distinguir, confusión, infraestructura, desarrollar, reclamo, participación, especialmente, docente, necesidad, canal, irrumpir, pues, argumento, colar, mostrar, desconocimiento, teórico, acercar, vincular, tornar, posicionar, basar, sostenido, mediante, ramírez, cuñado, secreto, convivencia, motivar, ceder, frenar, virtud, maridar, disfrutar, subjetivo, biológico, adolescencia, consolidar, inquietante, cintura, cambio, memoria, íntimo, relata

 neofascista, véneto, burke, ops/oms, externar, turbocam, elegancia, vinaza, marapa, gefitinib, broncear, blusa, ¿les, morterito, cantera, majar, formón, tenazas, autodenomino, acervo, bibiana, icp, cisneros, mentalidad, parapolítica, 1868-1936, inclaudicable, respirarla, explosivamente, calcomanía, taberna, dvd, freeman, sac-d, torrusio, radiómetro, microondas, pesquero, sac-d., largavistas, collins, electromagnético, cofia, drug, ¿san, campazú, nura, syc, beneficiario-, /esposo, ¡esperemos, candangueros, raptar, juramento, universalidad, enigmático, indefenso, autoprotege, placenta, jerome, anegamiento, gontrán, ellauri, radiofonía, canelo, gigliola, bis-choff, clonación, military, zameel, pompilio, regodear, 66,45, scarpín,

Cluster 43
Words: unir, casar, hijo, hoy, lograr, madre, parir, cambiar, edad, mismo, perder, vida, sólo, trabajar, ganar, plazo, faltar, dejar, aunque, empezar, familia, contar, venir, tabú, solo, salir, quedar, adolescente, certero, chico, viceversa, caso, can

Words: ayudarse, prevenirlo, aire, bachiller, soslayarse, roussoniana, interpretarla, deseable, palpar, provecho, juscelino, jk, caminata, dedicatoria, derretir, recepción, predisposición, coimero, metodológico, salamín, frankfurt, exógeno, vanella, csjn, semblante, libremercadista, royalty, etiquetar, fructificar, ¿aun, presbítero, augurio, talante, informalidad, perdés, dormirse, pinamar, abordarlas, marcojuarenses, sabio, achacarle, diabético, insulinodependiente, sarampión, reconversión, ángel, recibirse, circulante, alineándose, homofóbicas, obstinación, prontamente, aunqueadmitieron, puntospara, trizar, principito, psicoterapia, córdova, respaldo, intencionado, anteponerse, 1.625, general-una, tandilense, pavura, puntualización, lanzarlos, intranquilidad, baquero, valorarte, polvillo, gervasio, escopetazo, ginecología, infanto, obstetricia, ¿, fleming, ferretero, empujándolos, tamame, ¡sigan,

Cluster 45
Words: presidio, descollar, desobstruirlos,

Cluster 46
Words: podar, rareza

Words: ameritar, enquistar, valiosísima, destituirla, tapón, radio-televisión, ramognino, colarse, tramitarse, ultraortodoxas, alcaldía, morgue, bochazos, curricula, científico-tecnológico, *fiscal, conceptual, delos,

Cluster 48


Words: rodolfo, martínez, través, obrar, normativo, aglutinar, estudiante, calificar, firmar, deán, funes, nicolás, copérnico, coordinador, interestudiantil, estudiantil, convocar, pedro, luna, carta, radiar, isabel, barrionuevo, aguar, aconsejar, defensa, planificación, proponer, participar, intempestivo, esgrimir, redacción, infundado, legitimar, decano, facultar, presbicia, observador, asesorar, justar, perdidamente, morocha, molina, garcía, veterano, cecilia, moore, curvo, descendente, unesco, juntar, eje, colonia, jesuítico, alfredo, conti, conjunto, nación, declaración, centrar, posta, subterráneo, jesuita, tajamar, designación, adriana, revalorizar, integrar, ente, reunir, lanzamiento, dirección, coordinar, marcial, cragnolini, expediente, aprobar, ambiental, universitario, ambientar, santiago, sobremonte, rural, marco, aprovechamiento, ganadería, industrial, comunicar, ala, esterar, spilimbergo, roberto, arlt, figueroa, alcorta, marino, waissman, agustín, tosco, asamblea, colle

In [64]:
# save clusters to file
filename = "trained/lavoz_clusters.pickle"
fileObj = open(filename, 'wb')
pickle.dump(km_model, fileObj)
fileObj.close()