# Clustering La Voz del Interior with a COOC

#### Imports

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import nltk
import os
import re
import sys
import sklearn.manifold
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from collections import Counter, defaultdict
from nltk.cluster import kmeans, cosine_distance
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
nltk.download("punkt")

[nltk_data] Downloading package punkt to /users/bjames/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### download the data

In [5]:
%%bash

mkdir -p ./data
python -m spacy download es
curl -L -o ./data/lavoz.txt.tar.gz https://cs.famaf.unc.edu.ar/~laura/corpus/lavoztextodump.txt.tar.gz
tar xvf ./data/lavoz.txt.tar.gz -C ./data
ls ./data
head -n 6 ./data/lavoztextodump.txt


[93m    Linking successful[0m
    /users/bjames/miniconda3/envs/gpu_for_u/lib/python3.7/site-packages/es_core_news_sm
    -->
    /users/bjames/miniconda3/envs/gpu_for_u/lib/python3.7/site-packages/spacy/data/es

    You can now load the model via spacy.load('es')

lavoztextodump.txt
lavoztextodump.txt
lavoz.txt.tar.gz
-
"Lo que sostiene a la pareja es el amor"
Clara Crespo (50) y Rodolfo Martínez (54) no se imaginan uno sin el otro. "Prefiero ni pensarlo", dice Clara. Hace 26 años que están casados, y tienen cuatro hijas mujeres. Se conocieron en el Ateneo Juventus, el movimiento juvenil de Capuchinos. Hoy aseguran no estar sorprendidos del tiempo que llevan juntos sino de haber logrado entenderse tan bien. &#226;&#8364;&#8220;¿Qué les gusta y disgusta del otro? ¿Qué quisieran cambiarle?  &#226;&#8364;&#8220;Rodolfo: Me gusta que sea cariñosa, alegre y esté siempre pensando en mí, y que es una gran madre. Me disgustaba que cuando se enojaba no quería hablar, pero ya no lo hace más.

You are using pip version 19.0.3, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  8 11.5M    8 1056k    0     0  16.6M      0 --:--:-- --:--:-- --:--:-- 16.3M100 11.5M  100 11.5M    0     0  36.2M      0 --:--:-- --:--:-- --:--:-- 36.1M


### Data wrangling
#### Seperate the body of the article from the rest

In [2]:
def corpus_iterator(corpus_file):
    document = {
        "title": None,
        "body": None
    }

    with open(corpus_file, "r") as fh:
        for line in fh:
            if line.strip() == "-":
                new_document = True
                document = {
                    "title": None,
                    "body": None
                }
            elif new_document:
                document["title"] = line.strip()
                new_document = False
            else:
                document["body"] = line.strip()
                yield document

In [11]:
!wc ./data/lavoztextodump.txt

   38809  5615246 34886711 ./data/lavoztextodump.txt


#### Before using the entire data set, create a test set to make sure the code works

In [6]:
!head -n 100 ./data/lavoztextodump.txt > ./data/lavoztextodump-short2.txt
#now we are under 1,000,000 words (-n 1000)
!wc ./data/lavoztextodump-short2.txt

   100  17251 107222 ./data/lavoztextodump-short2.txt


In [7]:
filename = "data/lavoztextodump-short2.txt"
text_file = open(filename, "r")
dataset = text_file.read()
text_file.close()

#### stopwords

In [8]:
# lista de "stopwords"
stopwords = nltk.corpus.stopwords.words('spanish')
# Let's add a few pesky stopwords
new_stopwords = '.', '-c','8.113', '&','#','!',"''",':','?', '226', ';','8364', ')', '(', '``', ',', '-'
stopwords.extend(new_stopwords)

# initial filtering, lemmitazation of the text

In [9]:
lemma_file = open("lemmatization-es.txt", "r")
lemma_raw = lemma_file.read()
lemma = lemma_raw.split("\n")

lemma_dict = {}
for pair in lemma:
    w = pair.split("\t")
    if len(w) == 2:
        lemma_dict[w[1]] = w[0]

In [10]:
def lemmatize(word):
    if word in lemma_dict:
        word = lemma_dict[word]
    return word

In [11]:
# preparar funciones de procesamiento de texto
def tokenize_text(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if token.isdigit():
            continue
        low_token = token.lower()
        lemma = lemmatize(low_token)
        if lemma not in stopwords:
            filtered_tokens.append(lemma)
    tok = [t for t in filtered_tokens]
    
    return tok

In [13]:
articles = []
for key, value in enumerate(corpus_iterator("data/lavoztextodump-short2.txt")):
    p = tokenize_text(value["body"])
    articles.append(p)
print(type(articles),len(articles) )

<class 'list'> 33


## The COOC: coocurrence matrix with two-word window

In [14]:
def dict_cooc(e):
    """Toma como entrada una lista de strings, cada string es una oración.
        Devuelve una lista de diccionarios con las coocurrencias y una lista de palabras para los indices de la matriz. 
        Ventana tamaño 2. sin tokens de inicio y final ya que no contribuyen al problema de clusters de palabras."""
    
    cooc = [] # lista de diccionarios
    idx = [] # lista de palabras
    for sent in e:
        for m,tok in enumerate(sent):
            vecinos = []
        
            if m==0:
                if m+1 == len(sent)-1:
                    vecinos.append(sent[m+1])
                else:
                    vecinos.extend([sent[m+1],sent[m+2]])
                                
            elif m==len(sent)-1 :
                if m-1 == 0:
                    vecinos.append(sent[m-1])
                else:
                    vecinos.extend([sent[m-2],sent[m-1]])
                
            else :
                if m+1 == len(sent)-1:
                    vecinos.append(sent[m+1])
                else:
                    vecinos.extend([sent[m+1],sent[m+2]])
                if m-1 == 0:
                    vecinos.append(sent[m-1])
                else:
                    vecinos.extend([sent[m-2],sent[m-1]])
            
            # para agregar los vecinos de una nueva palabra
            if tok in idx:
                i = idx.index(tok)
                cooc[i][tok] +=1
            else :
                cooc.append({tok:1}) 
                idx.append(tok)
                i = idx.index(tok)

            for v in vecinos:
                if not v in cooc[i]:
                    cooc[i][v] =1
                else:
                    cooc[i][v] +=1
                        

    return cooc, idx

In [16]:
coocs, idx = dict_cooc(articles)
coocs # idx is only the words as indexes (keys) in the python dictionary 

[{'claro': 10,
  'crespo': 1,
  'rodolfo': 1,
  'hacer': 1,
  'año': 1,
  'pensarlo': 1,
  'decir': 1,
  'gustar': 1,
  'ser': 5,
  'comer': 2,
  'asegurar': 1,
  'sostener': 1,
  'trabajar': 1,
  'rody': 1,
  'c': 1,
  'siempre': 1,
  'día': 1,
  'llevar': 1,
  'haber': 2,
  'nadie': 1,
  'respuesta': 1,
  'vaco': 1,
  'reglar': 1,
  'comercial': 1,
  'cómo': 1,
  'industriar': 1,
  'ejemplo': 1,
  'medir': 1,
  'dirigencia': 1,
  'avina': 1,
  'indicio': 1,
  'líder': 1,
  'interpretar': 1},
 {'crespo': 1, 'rodolfo': 1, 'martínez': 1, 'claro': 1},
 {'rodolfo': 2,
  'martínez': 1,
  'imaginar': 1,
  'claro': 1,
  'crespo': 1,
  'gustar': 1,
  'ser': 1,
  'querer': 1,
  'cambiarle': 1},
 {'martínez': 3,
  'imaginar': 1,
  'unir': 1,
  'crespo': 1,
  'rodolfo': 1,
  'coordinador': 1,
  'museo': 1,
  'córdoba': 1,
  'adriana': 1,
  'área': 1,
  'bosque': 1,
  'nativo': 1,
  'santiago': 1},
 {'imaginar': 3,
  'unir': 1,
  'preferir': 1,
  'rodolfo': 1,
  'martínez': 1,
  'ir': 1,
  'durar

In [17]:
# def gen_vectors(normalized_text):
# 	# Generate word vectors using neural word embeddings
# 	print("\nGenerating word vectors")
# 	model = Word2Vec(normalized,min_count=1)
# 	vects = []
# 	for word in model.wv.vocab:
# 		vects.append(model.wv[word])

# 	matrix = numpy.array(vects)
# 	print("Matrix shape:",matrix.shape)
# 	print("Vectors generated")
# 	return model.wv.vocab,matrix

### Transport our python dictionary to vector space

In [20]:
# Se obtiene la matriz de coocurrencias
vectorizer = DictVectorizer()
vec = vectorizer.fit_transform(coocs)
terms = vectorizer.get_feature_names()

In [21]:
#dataframe
matriz = pd.DataFrame(vec.toarray(), columns=terms)
matriz = matriz.set_axis(idx,axis=0, inplace=False)
matriz.head(20)

Unnamed: 0,*lic,*psicopedagoga,-de,-quitan,-te,-totalmente,-¿los,...,1,"1,19",....1,ánimo,árbol,área,época,éxito,íntimo,ómnibus,último,único,﻿1
claro,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
crespo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rodolfo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
martínez,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
imaginar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unir,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
preferir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pensarlo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
decir,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hacer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# normalize the matrix
matrix_normed = matriz / matriz.max(axis=0)
matrix_normed.shape

(2923, 2923)

### save it for safety

In [26]:
filename = "trained/lavoz_matrix.pickle"
fileObj = open(filename, 'wb')
pickle.dump(matrix_normed, fileObj)
fileObj.close()

In [27]:
filename = "trained/lavoz_matrix.pickle"
with open(filename, 'rb') as f:
    matrix_normed = pickle.load(f)

## Let's make the Clusters!!!!

In [28]:
num_clusters = 50
km = KMeans(n_clusters=num_clusters)
km.fit(matrix_normed)
clusters = km.labels_.tolist()
# print(clusters)

In [30]:
# Recuento del número de elementos en cada cluster
for i in range(num_clusters):
    print ('El cluster %i tiene %i elementos' % (i, clusters.count(i)))

El cluster 0 tiene 7 elementos
El cluster 1 tiene 1 elementos
El cluster 2 tiene 1 elementos
El cluster 3 tiene 1 elementos
El cluster 4 tiene 1 elementos
El cluster 5 tiene 1 elementos
El cluster 6 tiene 7 elementos
El cluster 7 tiene 1 elementos
El cluster 8 tiene 1 elementos
El cluster 9 tiene 1 elementos
El cluster 10 tiene 5 elementos
El cluster 11 tiene 1 elementos
El cluster 12 tiene 1 elementos
El cluster 13 tiene 1 elementos
El cluster 14 tiene 1 elementos
El cluster 15 tiene 1 elementos
El cluster 16 tiene 8 elementos
El cluster 17 tiene 1 elementos
El cluster 18 tiene 6 elementos
El cluster 19 tiene 5 elementos
El cluster 20 tiene 6 elementos
El cluster 21 tiene 5 elementos
El cluster 22 tiene 1 elementos
El cluster 23 tiene 6 elementos
El cluster 24 tiene 1 elementos
El cluster 25 tiene 1 elementos
El cluster 26 tiene 6 elementos
El cluster 27 tiene 5 elementos
El cluster 28 tiene 6 elementos
El cluster 29 tiene 1 elementos
El cluster 30 tiene 6 elementos
El cluster 31 tien

In [31]:
dist = 1 - cosine_similarity(matriz)

In [32]:
print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]     
        
for i in range(num_clusters):
    print("[[ Cluster %d ]]" % i, end='\n\n')
    
    print("   WORDS /// ", end='')
    
    for ind in order_centroids[i, :10]: #replace 6 with n words per cluster
        print(terms[ind], end=' / ')
    print('\n\n')

Top terms per cluster:

[[ Cluster 0 ]]

   WORDS /// coerción / violencia / moto / actuar / aprender / impactar / través / enganchar / violentar / escena / 


[[ Cluster 1 ]]

   WORDS /// federación / jesuíticos / municipalidad / preparar / buscarla / estimación / delegación / ala / córdoba / 12,3 / 


[[ Cluster 2 ]]

   WORDS /// generoso / implementar / mutual / lágrima / estatuto / neutral / galardonar / grahovac / desastre / argumentar / 


[[ Cluster 3 ]]

   WORDS /// 9.693 / provincial / geográfico / arranz / término / holmberg / nadia / solar / federalismo / promediar / 


[[ Cluster 4 ]]

   WORDS /// maredo / conducción / gastronómico / valoración / basto / delantero / injusticia / doblar / aniversario / autonóma / 


[[ Cluster 5 ]]

   WORDS /// decisivo / moore / ajustarse / festejar / confusión / dibujar / concretar / indispensable / concentrar / industrial / 


[[ Cluster 6 ]]

   WORDS /// peraltar / mandatario / desobedecer / santacruceño / daniel / acto / gobernado

#### Save clusters to file

In [33]:
filename = "trained/lavoz_cluster01.pickle"
fileObj = open(filename, 'wb')
pickle.dump(clusters, fileObj)
fileObj.close()

In [34]:
filename = "trained/lavoz_cluster01.pickle"
with open(filename, 'rb') as f:
    clusters = pickle.load(f)