In [27]:
import spacy
from tqdm import tqdm
from collections import defaultdict
from sklearn.pipeline import Pipeline
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
import numpy as np



class SentencesIterator:
    """Re-iterable view over a text file that yields one stripped line at a time.

    The file is reopened on every ``iter()`` call, so the same instance can be
    traversed several times.

    Parameters
    ----------
    path : str
        Location of the text file, one sentence per line.
    encoding : str or None, optional
        Text encoding handed to ``open``. ``None`` (the default) keeps the
        platform default encoding.
    """

    def __init__(self, path, encoding=None):
        self.path = path
        self.encoding = encoding

    def __iter__(self):
        with open(self.path, 'r', encoding=self.encoding) as f:
            # Iterate over the file object directly so lines are streamed
            # instead of being loaded into memory all at once.
            for line in f:
                yield line.strip()

In [28]:
# Read every line of the corpus file 'xaa' into memory as a list of stripped strings.
sentences = SentencesIterator('xaa')
sents = [line for line in sentences]

In [30]:
def tag_sent(sent):
    """Return ``sent`` wrapped between the literal markers ``< <`` and ``> >``."""
    wrapper = '< < {} > >'
    return wrapper.format(sent)

def processing(sents, disable_pipe=['parser', 'ner'], max_sents=50000):
    """Run the spaCy model ``es_core_news_sm`` over marker-wrapped sentences.

    Only the first ``max_sents`` entries of ``sents`` are considered, and of
    those, entries of 7 characters or fewer are skipped. Each remaining
    sentence is wrapped with ``tag_sent`` before being parsed.

    Parameters
    ----------
    sents : sliceable sequence of str
        Raw sentences.
    disable_pipe : list of str
        Forwarded as ``disable`` on every ``nlp`` call.
        NOTE(review): this is a mutable default; it is not modified in this
        function, but a ``None`` sentinel or tuple would be safer.
    max_sents : int
        Upper bound on how many leading sentences are looked at.

    Returns
    -------
    list of list
        One list of tokens per sentence that was kept.
    """
    # The model is loaded on every call.
    nlp = spacy.load('es_core_news_sm')
    return [
        list(nlp(tag_sent(sentence), disable=disable_pipe))
        for sentence in tqdm(sents[:max_sents])
        if len(sentence) > 7
    ]
    
    
    
    

In [51]:
# Tokenise only the first 300 sentences (max_sents=300) instead of the default 50000.
tokens = processing(sents, max_sents=300)

100%|██████████| 300/300 [00:01<00:00, 258.21it/s]


In [52]:


def make_feature_dict(doc):
    """Build one feature dictionary per word token of a marker-wrapped sentence.

    Tokens whose text is ``<`` or ``>`` are skipped. Every other token gets a
    dict describing itself and the two tokens on either side of it.

    Parameters
    ----------
    doc : indexable sequence of tokens
        Each token is read for ``text``, ``pos_``, ``is_lower``, ``is_stop``
        and ``is_oov``. The +/-2 neighbour lookups use plain indexing, so for a
        list input a word token within two positions of either end would wrap
        around (look-behind) or raise ``IndexError`` (look-ahead); the
        ``< <`` / ``> >`` padding added by ``tag_sent`` presumably exists to
        prevent that - TODO confirm.

    Returns
    -------
    tuple of (list of dict, list of str)
        Parallel lists: the feature dicts and the matching token texts.
    """
    # NOTE(review): WordNetLemmatizer comes from NLTK and targets English
    # WordNet, while the tokens here come from a Spanish spaCy model - confirm
    # this lemma feature is meaningful.
    lemmatizer = WordNetLemmatizer()
    markers = ['<', '>']
    feature_rows, surface_forms = [], []

    for idx, tok in enumerate(doc):
        if tok.text in markers:
            continue

        prev1, prev2 = doc[idx - 1], doc[idx - 2]
        next1, next2 = doc[idx + 1], doc[idx + 2]

        # Disabled candidate features in the original: word length, token.prob.
        feature_rows.append({
            'word': tok.text,
            'pword': prev1.text,
            'ppword': prev2.text,
            'nword': next1.text,
            'nnword': next2.text,
            'pword_tag': prev1.pos_,
            'ppword_tag': prev2.pos_,
            'nword_tag': next1.pos_,
            'nnword_tag': next2.pos_,
            'is_lower': tok.is_lower,
            'POS': tok.pos_,
            'lemma': lemmatizer.lemmatize(tok.text, pos='v'),
            'is_stop': tok.is_stop,
            'is_oov': tok.is_oov,
        })
        surface_forms.append(tok.text)

    return feature_rows, surface_forms

In [53]:
# One (features, words) tuple per document, as returned by make_feature_dict.
# NOTE(review): `tokens_f` is rebound to a flat list in the following cell, so
# this value appears to be unused - confirm before removing this cell.
tokens_f = [make_feature_dict(doc) for doc in tokens]
# tokens_f = [make_word_feature_dict(doc) for doc in tokens]
# make_feature_dict(tokens[0])

In [54]:
# Flatten the per-document results into two parallel lists:
# `tokens_f` holds every feature dict, `words` the token text at the same index.
tokens_f, words = [], []
for doc in tokens:
    doc_features, doc_words = make_feature_dict(doc)
    tokens_f.extend(doc_features)
    words.extend(doc_words)

In [55]:
# Wrap the flat list of feature dicts in a NumPy array before it is passed to the pipeline.
# NOTE(review): rebinding `tokens_f` makes this cell depend on the previous one
# having just been run - a separate name would make re-runs safer.
tokens_f = np.array(tokens_f)

In [56]:
# Hyper-parameters for each pipeline stage, kept apart from the pipeline
# definition so they can be tuned in one place.
vectorizer_config = dict(sparse=True)
svd_config = dict(n_components=14, random_state=666, n_iter=15)
kmeans_config = dict(
    n_clusters=45,
    random_state=666,
    init='random',
    # NOTE(review): `precompute_distances` was deprecated in scikit-learn 0.23
    # and removed in 1.0; on a newer install this key needs to be dropped.
    precompute_distances=False,
)

# Feature dicts -> sparse matrix -> 14-component SVD projection -> 45 k-means clusters.
pipeline = Pipeline(steps=[
    ('vect', DictVectorizer(**vectorizer_config)),
    ('svd', TruncatedSVD(**svd_config)),
    ('kmeans', KMeans(**kmeans_config)),
])



In [57]:
# Display the configured pipeline so the effective parameters of each step are visible.
pipeline

Pipeline(memory=None,
     steps=[('vect', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)), ('svd', TruncatedSVD(algorithm='randomized', n_components=14, n_iter=15,
       random_state=666, tol=0.0)), ('kmeans', KMeans(algorithm='auto', copy_x=True, init='random', max_iter=300,
    n_clusters=45, n_init=10, n_jobs=None, precompute_distances=False,
    random_state=666, tol=0.0001, verbose=0))])

In [58]:
# Fit all pipeline steps on the feature dicts and collect the cluster label
# predicted for each entry of `tokens_f`.
labels = pipeline.fit_predict(tokens_f)

In [59]:
# Group the distinct word forms by the cluster label they were assigned.
# `labels` and `words` are parallel: labels[k] belongs to words[k]. The check
# below catches a stale kernel where one of them was recomputed without the other.
assert len(labels) == len(words), "labels and words are out of sync; re-run the cells above"
words_dict = defaultdict(set)
for label, word in zip(labels, words):
    words_dict[label].add(word)
# Convert to a plain dict so looking up an unknown label raises KeyError
# instead of silently creating an empty set.
words_dict = dict(words_dict)


In [60]:
# Print the set of word forms collected for each cluster, one set per line.
for cluster_words in words_dict.values():
    print(cluster_words)

{'Alfred', 'Trotsky', 'Anima', 'Contó', 'Igualmente', 'Dique', 'Stevenson', 'Morris', 'Swanzey', 'Fue', 'Copa', 'Milford', 'Creación', 'Viajan', 'Laurens', 'Se', 'Aparecerían', 'A', 'Gilbertsville', 'Louis', 'Contrario', 'Rosmer', 'Habilitado', 'Entra', 'Siempre', 'Fallece', 'Richfield', 'Santiago', 'Treint', 'Tras', 'Esto', 'Apoya', 'Nacional', 'Clasificaron', 'Cuenta', 'Suele', 'Carrie', 'No', 'Miembro', 'Le', 'Hoy', 'Produce', 'Ejemplar', 'Cooperstown', 'Keystone', 'Nunca', 'Alrededor', 'Vuelve', 'CGT', 'Cuando', 'Desaprobado', 'Final', 'Otego', 'En', 'Retoma', 'Traducción', 'Situado', 'Recibía', 'Cherry', 'También', 'Giovacchini', 'Además', 'Sinónimos', 'Es', 'Monatte', 'Christian', 'Participa', 'Rechazan', 'Va', 'Origen', 'Amanita'}
{'Tintoretto', 'Monnate', 'Central', 'Stevenson', 'Mauá', 'Martov', 'Griot', 'ouvrière', 'Vérité', 'Ferrocarril', 'Ártico', 'Norte', 'Deseado', 'DIGITO', 'Universitario', 'VO', 'Grasset', 'México', 'Ingalls', 'Limacella', 'L', 'Franchet', 'Bolivar', 'R