### Clustering de palabras

In [66]:
import spacy
import numpy as np
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans, AgglomerativeClustering
from nltk.stem import WordNetLemmatizer

In [51]:
# Load corpus from disk

class SentencesIterator:
    def __init__(self, path):
        self.path = path

    def __iter__(self):
        with open(self.path, 'r') as f:
            for l in f.readlines():
                yield l.strip()

In [52]:
sentences = SentencesIterator('corpus')
sents = list(sentences)

In [124]:
class FeatureExtractor:
    def __init__(self, disable_pipe=['parser', 'ner'], max_sents=600, spacy_model='es_core_news_sm'):
        self._nlp = spacy.load(spacy_model)
        self.vocabulary = []
        self._max_sents = max_sents
        self._disable_pipe = disable_pipe
    
    @staticmethod
    def _tag_sent(sent, window=2):
        start_tag = 'start_tag ' * window
        end_tag = 'end_tag ' * window
        return '{} {} {}'.format(start_tag.strip(), sent, end_tag.strip())
    
    def _processing(self, sents):
        tokens = []
        for sent in sents[:self._max_sents]:
            if len(sent) > 7:
                tokens.append([token for token in self._nlp(self._tag_sent(sent), disable=self._disable_pipe)])
        return tokens

        
    def _make_feature_dict(self, doc):
        document_features = []
        wordnet_lemmatizer = WordNetLemmatizer()
        
        for i, token in enumerate(doc):
            if token.text not in ['start_tag', 'end_tag']:
                features = {
                    'word': token.text,
                    'pword': doc[i-1].text,
                    'ppword': doc[i-2].text,
                    'nword': doc[i+1].text,
                    'nnword': doc[i+2].text,
                    'pword_tag': doc[i-1].pos_,
                    'ppword_tag': doc[i-2].pos_,
                    'nword_tag': doc[i+1].pos_,
                    'nnword_tag': doc[i+2].pos_,
                    'is_lower': token.is_lower,
                    'word_tag': token.pos_,
                    'lemma': wordnet_lemmatizer.lemmatize(token.text, pos='v'),
                    'is_stop': token.is_stop,
                    'is_oov': token.is_oov,
                }
                document_features.append(features)
                self.vocabulary.append(token.text)
        return document_features
    
    #Return self nothing else to do here    
    def fit(self, X, y=None):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform(self, sents, y=None):
        tokens = []
        for doc in self._processing(sents):
            tokens += self._make_feature_dict(doc)

        return np.array(tokens)

In [125]:
class ExtendedKMeans(KMeans):
    def get_clusters(self, vocabulary):
        clusters = defaultdict(set)
        
        for i, label in enumerate(self.labels_):
            clusters[label].add(vocabulary[i])
        return dict(clusters).values()

In [126]:
## Configs for each pipeline step

f_e_config = {
    'max_sents' : 200,
}

vectorizer_config = {
        'sparse': False,
}

svd_config = {
        'n_components': 11,
        'n_iter': 5,
}
kmeans_config = {
        'n_clusters': 120,
        'init': 'k-means++',
        'precompute_distances': False,
}


In [127]:
pipeline = Pipeline([
                ('preprocessor', FeatureExtractor(**f_e_config)),
                ('vect', DictVectorizer(**vectorizer_config)),
                ('svd', TruncatedSVD(**svd_config)),
                ('kmeans', ExtendedKMeans(**kmeans_config)),
    
            ])
pipeline.fit(sents)

Pipeline(memory=None,
     steps=[('preprocessor', <__main__.FeatureExtractor object at 0x7fccf04c80f0>), ('vect', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('svd', TruncatedSVD(algorithm='randomized', n_components=11, n_iter=5,
       random_state=666, tol=0.0)), ('kmeans...init=10, n_jobs=None, precompute_distances=False,
        random_state=666, tol=0.0001, verbose=0))])

In [128]:
vocabulary = pipeline.named_steps['preprocessor'].vocabulary
clusters = pipeline.named_steps['kmeans'].get_clusters(vocabulary)

In [130]:
for i in clusters:
    print(sorted(i))

['Abu', 'Alfred', 'Argentina', 'Bolivar', 'Bolívar', 'Caraglio', 'Carrie', 'Cherry', 'Clasificaron', 'Comunista', 'Copa', 'Cruz', 'Deseado', 'Dique', 'Dirección', 'Donovan', 'Dunois', 'España', 'Ferrocarril', 'Française', 'Geneviève', 'Hills', 'Huapi', 'Intercontinental', 'Internacional', 'Keystone', 'Kun', 'Louis', 'Montevideo', 'N', 'Nacional', 'National', 'Naval', 'Nikolái', 'Nueva', 'Pampa', 'Polar', 'Radek', 'René', 'Rolland', 'Rosmer', 'Rushmore', 'SCRA', 'Sainte', 'Santiago', 'Seco', 'Sheikh', 'Simon', 'Simón', 'Sindicalista', 'Sur', 'Unidos', 'Valley', 'York', 'Zayed', 'Álvarez']
['Amanita', 'Aurora', 'Cardinals', 'Comunista', 'Contrario', 'Contó', 'DIMAT', 'Dakota', 'Democrática', 'Deseado', 'Díez', 'Ejecutivo', 'Ejemplar', 'Entra', 'Evangelista', 'Franchet', 'Ingalls', 'Ligas', 'Mamore', 'Marbán', 'Mundial', 'Museo', 'Nacional', 'Origen', 'Patagónico', 'Pedro', 'Provincia', 'Zinóviev', 'dessus', 'Ártico']
['En', 'como', 'del', 'en', 'entre', 'por', 'que']
['Alfred', 'Amédée',