# Clustering La Voz del Interior with a COOC
* preprocessing
* clustering tools used
* parameters
Why?
What outcome did I expect?
* final clusters

#### Imports

In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numpy
import pickle
import nltk
import os
import re
import sys
import sklearn.manifold
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from collections import Counter, defaultdict
from nltk.cluster import kmeans, cosine_distance
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
nltk.download("punkt")

[nltk_data] Downloading package punkt to /users/bjames/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

this code is if you want to just download the data directly

In [None]:
%%bash

mkdir -p ./data
python -m spacy download es
curl -L -o ./data/lavoz.txt.tar.gz https://cs.famaf.unc.edu.ar/~laura/corpus/lavoztextodump.txt.tar.gz
tar xvf ./data/lavoz.txt.tar.gz -C ./data
ls ./data
head -n 6 ./data/lavoztextodump.txt

prepare the articles for the pipeline

In [10]:
def corpus_iterator(corpus_file):
    document = {
        "title": None,
        "body": None
    }

    with open(corpus_file, "r") as fh:
        for line in fh:
            if line.strip() == "-":
                new_document = True
                document = {
                    "title": None,
                    "body": None
                }
            elif new_document:
                document["title"] = line.strip()
                new_document = False
            else:
                document["body"] = line.strip()
                yield document #this yields a dictionary with title numbered under "title" and the body of the text under "body"

-------------------------------------------------------------------------
Helper Code for non GPU: how to use a smaller dataset

In [2]:
!wc ./data/lavoztextodump.txt 

   38809  5615246 34886711 ./data/lavoztextodump.txt


In [11]:
!head -n 1000 ./data/lavoztextodump.txt > ./data/lavoztextodump-short2.txt
# by changing the value following '-n' you can control your input
!wc ./data/lavoztextodump-short2.txt

  1000 153493 958085 ./data/lavoztextodump-short2.txt


-------------------------------------------------------------------------


In [12]:
filename = "data/lavoztextodump-short2.txt"
text_file = open(filename, "r")
dataset = text_file.read()
text_file.close()

## Preprocess text with nltk
1. sentence tokenize
2. word tokenize
3. eliminate numbers
4. eliminate stopwords
5. 

In [13]:
# "stopwords" high frequency
stopwords = nltk.corpus.stopwords.words('spanish')
# Let's add a few pesky stopwords
punctuation = '.', '-c','*', '&','#','!',"''",':','?', ';', ')', '(', '``', ',', '-'
stopwords.extend(punctuation)

In [14]:
# create a python dictionary from a real dictionary of lemmas in Spanish (uploaded)
lemma_file = open("lemmatization-es.txt", "r")
lemma_raw = lemma_file.read()
lemma = lemma_raw.split("\n")

lemma_dict = {}
for pair in lemma:
    w = pair.split("\t")
    if len(w) == 2:
        lemma_dict[w[1]] = w[0]

In [15]:
def lemmatize(word): # create a lemmatizing function
    if word in lemma_dict:
        word = lemma_dict[word]
    return word

In [7]:
# create a tokenizing function
def tokenize_text(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if token.isdigit():
            continue
        low_token = token.lower()
        lemma = lemmatize(low_token)
        if lemma not in stopwords:
            filtered_tokens.append(lemma)
    tok = [t for t in filtered_tokens]
    return tok

In [16]:
# execute the preprocessing process
the_news = []
for key, value in enumerate(corpus_iterator("data/lavoztextodump-short2.txt")):
    p = tokenize_text(value["body"])
    the_news.append(p)
print("We are looking at ",len(the_news),"articles.")

We are looking at  333 articles.


# COOC: four-word window
Context is a essential to meaning in human language. A **co-occurrence matrix** gives us semantic and idiomatic insight into the language by showing which words appear together often and if any dependencies are present between words. 
* Example: **"Cristina"** often appears with **"Fernández"**


In [17]:
def dict_cooc_gen(e,w):
    """Takes as input a list of strings, where each string is a tokenized sentence.
        Returns a list of dictionaries of coocurrences of words and a list of all of words as the index for a matrix. 
        Window, words to the left and the right, is chosen when executing."""    
   
    cooc = [] # our list of dictionaries
    idx = [] # our list of words (for the matrix index)
    for sent in e:
        for m,tok in enumerate(sent):
            
            vecinos = []
            # vecinos a la derecha
            for l in range(1,w+1):
                if m+l > len(sent)-1:
                    break
                else:
                    vecinos.append(sent[m+l])
            # vecinos a la izquierda
            for l in range(1,w+1):
                if m-l < 0:
                    break
                else:
                    vecinos.append(sent[m-l])
            
            
            # para agregar los vecinos de una nueva palabra
            if tok in idx:
                i = idx.index(tok)
                cooc[i][tok] +=1
            else :
                cooc.append({tok:1}) 
                idx.append(tok)
                i = idx.index(tok)

            for v in vecinos:
                if not v in cooc[i]:
                    cooc[i][v] =1
                else:
                    cooc[i][v] +=1
                        

    return cooc, idx

In [19]:
coocs, idx = dict_cooc_gen(the_news,4) 

# To vector space!
A mathematical representation of our words allows us to apply statistical and probabilistic formulas to them, such as the **k-means** clustering algoritm, which will enable us to see the underlying structure of the language use in *La Voz del Interior*, the newspaper of record in Córdoba, Argentina.

In [23]:
vectorizer = DictVectorizer()
vec = vectorizer.fit_transform(coocs)

In [24]:
terms = vectorizer.get_feature_names() # 'terms' will be necessary for when we print out the clusters
for i in terms[4000:4020]:# let's just look at a snapshot of our key words
    print(i)

eslavo
eslogan
eslógan
esmerar
espacial
espaciar
espacio
espacioso
espadachín
espadar
espadón
espalda
espantoso
espasmódicamente
españa
españa-córdoba
español
españolar
especial
especialidad


our matrix is extremly sparse (lots of zeros) so we must normalize by scaling the matrix as decimals (float numbers) in stead of zeros

In [29]:
matriz = pd.DataFrame(vec.toarray(), columns=terms)
matriz = matriz.set_axis(idx,axis=0, inplace=False)
# Normalize, although it is still extremely sparse
matrix_normed = matriz / matriz.max(axis=0)
matrix_normed.head(50) # just a small glipse of the matrix

Unnamed: 0,$,%,*ex,*lic,*profesor,*psicopedagoga,-a,-de,-esa,-grupo,...,óseo,óvulo,óxido,último,única-,únicamente,único,útero,útil,﻿1
claro,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
crespo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rodolfo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
martínez,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
imaginar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.006369,0.0,0.0,0.0,0.0,0.0,0.035714
preferir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pensarlo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
decir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.006369,0.0,0.0,0.067797,0.0,0.166667,0.02381
hacer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.012739,0.0,0.0,0.050847,0.0,0.0,0.02381


In [30]:
filename = "trained/lavoz_matrix.pickle" # save our normalized matrix just in case
fileObj = open(filename, 'wb')
pickle.dump(matrix_normed, fileObj)
fileObj.close()

In [31]:
filename = "trained/lavoz_matrix.pickle"
with open(filename, 'rb') as f:
    matrix_normed = pickle.load(f)

## Word2Vec: it's not a vectorization, it's an embedding!
We will set aside our hand-made cooccurence matrix now in favor of the more 'information-rich' **neural embeddings** of Word2Vec. 
This two-layer neural network provides us with much more than our counts-basd cooccurrence matrix. These neural embeddings carry much lower demensionality and have semantic representation, useful in processes such as [Latent Semantic Analysis](https://en.wikipedia.org/wiki/Latent_semantic_analysis).
    


In [32]:
def gen_vectors(normalized_text):
    print("\ninitialized")
    model = Word2Vec(normalized_text,min_count=1)
    vects = []
    for word in model.wv.vocab:
        vects.append(model.wv[word])

    matrix = numpy.array(vects)
    print("Matrix shape:",matrix.shape)
    print("finished")
    return model.wv.vocab,matrix



In [35]:
vocabulary, vectors = gen_vectors(the_news)


initialized
Matrix shape: (10278, 100)
finished


# Clusters!
We will now begin to take apart our vector space with clustering. 

In [36]:
def gen_clusters(vectors):
    print("\nclustering started")
    vectors = preprocessing.normalize(vectors)
    km_model = KMeans(n_clusters=CLUSTERS_NUMBER)
    km_model.fit(vectors)
    print("clustering finished")
    return km_model

In [37]:
CLUSTERS_NUMBER=50
km_model = gen_clusters(vectors)


Clustering started
Clustering finished


### Visualize the results

In [38]:
def show_results(vocabulary,model):
	# Show results
	c = Counter(sorted(model.labels_))
	print("\nTotal clusters:",len(c))
	for cluster in c:
		print ("Cluster#",cluster," - Total words:",c[cluster])

	# Show top terms and words per cluster
	print("Top words per cluster:")
	print()

	keysVocab = list(vocabulary.keys())
	for n in range(len(c)):
		print("Cluster %d" % n)
		print("Words:", end='')
		word_indexs = [i for i,x in enumerate(list(model.labels_)) if x == n]
		for i in word_indexs:
			print(' %s' % keysVocab[i], end=',')
		print()
		print()

	print()

In [228]:
show_results(vocabulary,km_model)


Total clusters: 50
Cluster# 0  - Total words: 328
Cluster# 1  - Total words: 439
Cluster# 2  - Total words: 2
Cluster# 3  - Total words: 1046
Cluster# 4  - Total words: 3018
Cluster# 5  - Total words: 29
Cluster# 6  - Total words: 67
Cluster# 7  - Total words: 2
Cluster# 8  - Total words: 27
Cluster# 9  - Total words: 214
Cluster# 10  - Total words: 1
Cluster# 11  - Total words: 16
Cluster# 12  - Total words: 71
Cluster# 13  - Total words: 1348
Cluster# 14  - Total words: 83
Cluster# 15  - Total words: 3173
Cluster# 16  - Total words: 468
Cluster# 17  - Total words: 301
Cluster# 18  - Total words: 37
Cluster# 19  - Total words: 3
Cluster# 20  - Total words: 427
Cluster# 21  - Total words: 134
Cluster# 22  - Total words: 1737
Cluster# 23  - Total words: 237
Cluster# 24  - Total words: 19
Cluster# 25  - Total words: 51
Cluster# 26  - Total words: 72
Cluster# 27  - Total words: 2
Cluster# 28  - Total words: 10
Cluster# 29  - Total words: 90
Cluster# 30  - Total words: 697
Cluster# 31  - 

Words: unir, pensarlo, casar, hijo, hoy, llevar, lograr, madre, servicial, vivir, perder, después, grande, muerte, trabajar, ganar, largar, hermoso, durar, faltar, agregar, empezar, rato, familia, contar, recién, tabú, solo, quedar, esperar, comienzo, joven, sufrir, -totalmente, apalear, llegás, ciertamente, viejo, alimentar, ocurrir, vario, charly, punto, desfallecer, hombre, viceversa, caso, vaticinio, década, atrás, cantidad, turista, datar, funcionar, rescatar, etapa, anterior, lugar, discapacidad, finar, picar, sustancial, comparación, adulto, terminar, largo, fascinación, redefinir, criar, -quitan, familiar, dinero, repetir, pequeño, doltó, niño, impedimento, hans, mundo, caer, alcanzar, resultar, penalización, considerarlo, encimar, igual, pobreza, personalismo, idéntico, patrón, móvil, verano, producir, alternar, cordón, siguiente, retirarse, vehículo, anonimato, ocasional, yong-ho, víctima, prisión, doceno, detenido, pesar, detener, constantemente, incitar, minero, atrapar, se

Words: nuevo, dolor, iniciar, suscribir, plan, obrar, centro, deán, funes, denominar, unidad, refacción, radiar, anticipar, fábrica, café, coincidir, conjuntar, conjugar, edificio, reparación, construcción, existente, especialmente, canal, irrumpir, mostrar, teórico, entidad, redacción, junior, motivar, chocar, mañana, ¿el, exterior, envejecimiento, adolescencia, luego, cachar, yerom, columna, bello, veterano, george, pancho, madonna, cecilia, demi, curvo, estancia, juntar, manzana, sitio, cordobés, lucir, museo, optimizar, otorgar, promoción, sitiar, circuito, candelaria, accesible, investigación, conjunto, jurisdicción, privar, tras, restauración, histórico, salar, energía, visual, capilla, remodelación, aporte, tajamar, actualidad, encargar, área, indicar, ente, regularmente, oficial, instrucción, b, espectáculo, homosexual, proporcionar, hacendar, blanco, implantar, bosque, nativo, campo, sector, congelar, autorizar, alambrar, instalación, incendio, ischilín, sobremonte, ganadero, 

, recibimiento, france, 6.50, dax, adaptación, descontextualizar, fusco, joseph, rgen, habermas, respetuosamente, caín, auriga, lobular, reingresar, viral, carranza, ansia, greenpace, bacteria, silbido, ngela, gentile, anglosajón, neutrón, radiofármacos, testear, fierro, neurorrehabilitación, neuropsicólogo, shopping, terapistas, patear, caicedo, benéfico, alterman, analógico, opal, disertante, controlnet, trisoil, balaceados, geval, ob, marfrig, argentine, breeders, reasignar, cospelazo, sat, negociable, estrellarse, world, outreach, gainesville, ovelar, lucien, repatriación, tensar, dietílico, perímetro, píparo, edith, drogadicción, putrefacto, valiéndose, monte, descomunal, visca, decker, apold, ars, aidis, iswa, bettina, jorgen, haukohl, ramboll, fcc, pirólisis, 3,16, 2,94, banana, silfrut, deliveries, telgopor, insertar, psicopatológico, incoherente, ivana, bischoff, mezclándose, acebo, catar, rumboso, partitura, asiduamente, arenaza, sunchales, desacelerándose, recriminar, reorde

Words: subcoordinadores, conscientemente, mamani, neelmanin, disparidad, ¿vio, desarrollarnos, polución, rawson, cundir, puntoen, serna, tecnoburocracia, criopreservación, viveza, renquear, paulatinamente, 39.322,46, conductual, limitante, prosecretaria, político-educativos, dificultarles, calentamiento, 11, borrón, 7.05, panteón, confiarles, escasísima, emugas, consolidarnos, gravoso, estrasburgo, reencuentro, eldebate, reemplazo, 20.50, spitale, juanes, transgénico, h., organizándose, titularidad, 15.700, 128,5, nostalgitrippen, 99,9999, proeza, edema, cultivarse, aparejar, remate, guarida, peirotti, lavarme, amperímetro, ¿al, canasto, levantarme, coaccionar, prejudicial, fornido, delos, cupo, villarruel, 0,447,

Cluster 7
Words: evaluativo, oncología,

Cluster 8
Words: mussi, dramatismo, ia63-pampa, argentino-alemana, imitarla, unvm, felpeto, accastellista, acopiadoras, aeroportuario, puebladas, 16.05, etchenique, emviado, tramitarse, criticándolo, labriego, cairo, olorcito, nodeber

Words: preferir, movimiento, juvenil, junto, alegrar, enojar, optimista, acordar, circunstancia, molestar, compartir, ilusionar, permanente, ¿es, refugiar, triunfar, r, comunicación, lado, postergar, recurso, económico, prioridad, manejar, administrarlo, ¿la, secundario, institución, colegio, edilicio, debatir, normativo, estudiante, victoria, abrir, principiar, grupo, forzar, llamar, acta, luna, abrazar, comentar, precisión, leyenda, militar, relacionar, ¡no, risa, compañero, aconsejar, pelear, admitir, inigualable, diverso, causar, acción, tendientes, destrabar, cuestión, pendiente, proponer, docente, necesidad, argumento, esgrimir, colar, desconocimiento, interpretación, acercar, basar, sostenido, información, mediante, responsable, *lic, convivencia, misterioso, flojo, frenar, virtud, subrayar, conectarse, locura, posiblemente, consolidar, cintura, cambio, visión, presbicia, íntimo, reiterar, anécdota, observador, albert, célebre, teoría, profesor, fantasía, justar, perdidamente, r

, deshonrar, cobayos, guatemala, condenable, prostituta, consentimiento, riguroso, conscripto, extraño, defunción, lavoz, pastor, carlín, elenco, guerrilla, lucrar, cocción, glorioso, fusil, disconforme, expedir, pesimista, pedagógico, respetarse, refundar, descreimiento, subdirección, conformidad, territorial, salaberry, cpi, ginebra, ¿pueden, detalladamente, juzgamiento, infinitamente, replantear, compulsión, hiperpesimista, pegoteados, ¿ve, improvisación, richa, supervisar, macroeconómico, botonar, limitador, planeamiento, inusitado, armamentista, burguesía, chrd, carvajal, banquero, murmullo, revestir, erótico, minibombacha, erróneo, cuerdo, callejón, cinismo, mixturar, peronista-sindical, devaluar, mafioso, enredar, comprenderlos, acoger, percance, globalizar, prosperidad, subyacer, callejero, moderado-leve, invocar, digestivo, anemia, diarrea, meckel, malformación, x/y, pediatría, transacción, cuero, laura, aburrir, molecular, dermatólogo, fernanda, toxicidad, reticencia, prefect

Words: cuatro, personar, crecer, tres, entrar, cincar, total, aumentar, promediar, inversión, casi, 34.500, 21.500, ocho, 63.500, 57.500, seis, habitante, hectárea, pagar, tonelada, cuota, 6.954, 4.019, semestre, 10.973, 10,3, 1.550, importador, 11,16, 1.016, mes, 6,6, costar, 3,6, pescador, horrorizar, cifrar, qatar, 7,5, devaluación, porcentual, anticuerpo, emigración, afirmativo, 13,8, novillito, 17,5, 11,35, 13,34, nalga, 31,10, híper, 30,99, 2.500, equivaler, comcord, anual, 8,5, acnur, neto, ffs, giro, 1.080, 1.100, 2.274, interanual, 546,4, 1.602, rondar, mensual, traducirse, 2.608,6, 46,6, 1.167,6, 18,5, 4.656,1, 5.046,4, 81,3, 18,7, 51,4, consabido, ornamentación, 13,2, 51.723, cervecero, nocheros, $, rastra, omt, 2.200, ¿pero, desembolsar, quesero, 2.213,1, punitorios, aguinaldo, suscripción, 34,6, 7,7, bioma, 2,5, sufragar, 0,9, cotizar, bronquitis, giardino, eph, 10.300, cien, mantenerlo, trimestre, 1,2, 3,4, vitorear, absorber, tablón, 53,9, 20,2, 4,7, 3,5, 5,6, sachar, 10

Words: si, bombón, profesionalmente, ameritar, individualmente, pluripartidismo, salubridad, refiriéndose, despojarse, 13a, solidarizar, generarse, desvestirse, mátenme, beligerante, manito, audazmente, pensándolo, roscar, involucrándonos, emboscar, ¿fadea, anecdótico, menospreciarse, mostrarlo, solventarlo, respresentar, divagar, publicarlo, intercorp, contrariedad, autorreveló, endémico, preventivamente, rebañar, evadirlos, denunciarla, mundod, obnubilación, ponerlas, gratificar, verborragia, despedirme, invernadero, seguidismo, preguntarles, saraceno, revoltoso, intrascendente, escasísimo, linga, desazón, vigilador, 5,15, darwinismo, representarlos, presentás, controlarla, frenesí, retenerlo, dándole, disney, erosión, presionándonos, 19.07, entregadora, finitud, desescolarizados, desnutrir, pagarla, cubanizarse, enfoque, afectivamente, separarme, recibirse, recepcionar, territorialmente, tenaza, sesentón, sustancioso, escaramuzar, lulismo, paloccismo, campechano, capos, megaboliche,

Words: francamente, aceptarse, puridad, enfrentándola, warcalde, asegurarme, desobturar, rusconi, distribuirlas, alcaldía, reposicionarse, cabecita, ¿ves, accidentológicos, franquiciados, terminarla, medialuna, sappia, 8.759,

Cluster 25
Words: bueno, aire, mimetización, deseable, caminata, derretir, predisposición, piqueteros, salamín, c., ¡pobre, costosísima, merlo, azzaretti, 20.353, semblante, ¿varió, shan, zarpar, zurich, presbítero, trascolar, talante, informalidad, torcuato, tella, perdés, dormirse, achacarle, reconversión, astuto, ángel, homofóbicas, obstinación, pseudociencia, aunqueadmitieron, riachuelo, murgueros, córdoba-buenos, radioservicio, respaldo, 1.625, muestreo, general-una, berazategui, polvillo, rufino, aplastarlos, fleming, ferretero, ¡sigan,

Cluster 26
Words: -¿los, ¿milita, impartirse, quejarnos, arranquémosle, tomémoslo, certidumbre, obras.no, macristas, ¿con, ción, suprahumana, visionario, ínterin, divo, sinceramente, bancaron, vapulear, relevarla, desopilan

Words: levantándose, sorprenderse, errándole, viviente, bocacalle, oreclamaran, tintorería, condenándolo, zanjón, íntimamente, oh, autoválidos, finalizarlas, 0,13, excarcelar, 30,7,

Cluster 33
Words: enardecerlos, establecerlo, epíteto, prepararnos, medirse, solucionable, mascarilla, volear, mostrándose, tolderías, ¿reprochan, antártico, paramio, remitirla, zé, ruidito, autorizarlo, tufillo, esgrimirlas, inversionista, invalidante, reflujo, donarlas, desarrollarlas,

Cluster 34
Words: capuchino, liberalización, gabeira, buaccar, tenerlas, dañino, kohen, pender, decaimiento, cámaras-, témpore, persuadir, guarapuava, miniturismo, soares, gremialismo, hitleriano, adscripta, bichar, rociarlo, granario, tapón, estofar, boeto, holding, gholamhosein, nutritivo, 11.658, premiarlos, ultracompactas, complejiza, railways, rodriguista, seringueira, daniella, subejecutar, molécula, po, etéreo, eslavo, autismo, napolitano, impunemente, anhelo, convirtiéndose, imprevisión, fiore, odetti, junot, itay

Words: ayudarse, volviéndole, traccionaría, sobrevalorar, sobrenatural, acabadamente, irrestricto, repulsar, deslindar, valiosísima, asumirla, embaucar, penoso, piyama, corrimiento, abstraerse, profesionalidad, antiparras, pertrechos, 3a, castigarlos, espejitos, seductor, alojándose, discodermolida, defenderme, mancillar, superstars, organícense, incompatibilidad, bienintencionado, nimiedad, reinvención, profesionalización, tilo, consulfen, atenderlos, precitado, decisorio, fechoría, maiztegui, biosfera, expedirse, omnímodo, controversial, ¿han, acuérdense, calcaterra, 423-3992, puntospara, dejarle, anímicamente, boston, somnolencia, tirarse, intencionado, bioquímico, perseguible, ¿cobran, provocarla, atornillarse, multicultural, equivocarnos, ponernos, fortalecernos, laxitud, resignificación, lacónico, 1.413.000, esclarecedor, ¿hacen, separarnos, causarle, alquilarse, -públicos, sobresueldo, brasileña-psdb, empujándolos, negarnos, estropicio, comodín, cajoneada, reinician, impostergab

Words: mujer, vez, sólo, cada, día, menos, primero, gordita, ameno, pasar, engranar, pastillita, vengativo, último, hora, fin, semana, matadero, próximo, desconcertar, tinellizados, hospitalizar, whiskería, parroquiano, muerta, indexar, anticipación, perforarse, irreal, festividad, trompa, animador, rezongar, ¿cambió, empalmar, anortosita, neviscar, 16.10, diez, asustás, basándonos, clasificador, minucioso, calvario, desolar, reyerta, pétalo, connacionales, 5.200, celar, comunicarme, paredón, laburo, cuentapropistas, lanzarlas, asatej, condicionante, invocación, granero, materializable, porcinitos, cogote, veinteañeros, 24,6, conseguirse, 24,5, 3,66, champú, agarrarlo, gancho, medidor, usina, multisectorial, yocsina, imponerlo, eslógan, notablemente, 106.360, compactación, aridez, docencia, desgobernar, fantasmal, ¡hoy, letargo, despertarse, cuantificar, anticipo, atenuarse, confort, pavor, lograrlo, esponjar, infrecuente, invasiva, psicópata, sorpresivamente, culminarlo, 4.811, respir

Words: emprendedor, rareza, sábado, miércoles, martes, emplear, avión, ﻿1, noche, callar, honestar, nivel, adolescente, revisión, mediano, gracia, sierra, plantar, tranquilizante, regularidad, aportar, tren, próspero, jovencito, unesco, comparar, implicar, visitante, extranjero, incluir, argentino, mundial, bodega, municipio, actualmente, ejecución, circundar, multiplicar, bajar, ingresar, cerrar, jesuíticos, negro, viernes, operativo, poblacional, país, reclutar, censar, lunes, capacitación, sumarle, ejecutar, tardar, minuto, feriar, comercio, afrodescendientes, originario, predio, quilino, disminuir, abril, lluvia, ancho, árbol, productor, activar, trabajo, tulumba, reservar, rojo, reducción, curso, salino, arbusto, categoría, actividad, sembrar, fijo, elevar, establecimiento, primario, pabellón, sabattini, entregar, diario, racionar, jueves, ¿cuánto, crisis, inicial, capacidad, nacimiento, transmisor, frigorífico, balneario, obtener, bovino, cerdo, viajar, restricción, marzo, invert

Words: juventus, asegurar, cariñoso, darle, proyectar, elegir, decidir, reclamar, pedir, anteproyecto, calificar, estudiantil, carta, agradecer, defensa, seguridad, responder, cuestionar, protestar, cortar, brindar, curricular, continuidad, intempestivo, espasmódicamente, manifestar, democrático, representante, educativo, diálogo, actor, sobrar, infundado, legitimar, públicamente, consenso, ciencia, facultar, olga, sentirme, morocha, exhausto, afirmar, actual, presenciar, temer, gestión, declaración, guión, derecho, reunión, panic, sancionar, iniciativo, sociedad, legislatura, ambientalista, oficialismo, asunto, definir, inconstitucional, rechazar, arranz, asamblea, collegium, anulación, estrategia, ¿cuáles, descalificar, carácter, vaivén, decisión, propio, ejercer, imitar, compromiso, colegiar, plantear, resolutivo, *psicopedagoga, administrativo, frente, reglamentario, control, congreso, transitoriamente, civil, democracia, opinar, impulsar, mensurable, gestionar, agenda, institucion

Words: fe, general, ipem, belgrano, alejandro, carbó, jerónimo, cabrero, ministerio, cornú, rodríguez, maría, encabezar, acto, consejo, jefe, seducción, jesús, interior, capital, cruz, caroya, radical, director, censo, paz, francisco, funcionario, radicalismo, legislador, comisión, universidad, inconstitucionalidad, unc, instituto, preuniversitario, garzón, agulla, pablar, mauro, trettel, dante, grahovac, raquel, martín, referente, salud, vicepresidente, unión, cámara, titular, presentar, municipal, repuesteros, catamarca, 453-1919, gral, cerac, gallego, bonaerense, candidatura, mentiroso, oscar, justicialismo, oficialista, conurbano, porteño, solá, cgt, hugo, peronista, gorila, mengarelli, gennaro, villamariense, justicialista, menem, adjuntar, micheli, arbitral, abogar, ¿yasky, tablita, mestre, apoyar, respaldar, peraltar, vigo, capitalino, eduardo, mondino, testar, walter, saieg, precandidato, blocar, ricardo, marcelino, intendencia, germán, kammerath, ucr, ¡esos, intendenta, angelo

In [160]:
# save clusters to file
filename = "trained/lavoz_clusters.pickle"
fileObj = open(filename, 'wb')
pickle.dump(km_model, fileObj)
fileObj.close()