In [5]:
import spacy
from tqdm import tqdm
from collections import defaultdict
from sklearn.pipeline import Pipeline
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
import numpy as np
from collections import defaultdict


In [8]:
class SentencesIterator:
    """Re-iterable view over a text file that yields one stripped line at a time.

    The file is reopened on every ``iter()`` call, so the same instance can
    be iterated more than once.
    """

    def __init__(self, path, encoding='utf-8'):
        """
        Args:
            path: Location of the text file to read, one sentence per line.
            encoding: Text encoding used to decode the file. Passed explicitly
                so accented characters do not depend on the platform default.
        """
        self.path = path
        self.encoding = encoding

    def __iter__(self):
        """Yield each line of the file with leading/trailing whitespace removed."""
        with open(self.path, 'r', encoding=self.encoding) as f:
            # Iterating the handle streams the file line by line; readlines()
            # would load the whole corpus into memory before yielding anything.
            for line in f:
                yield line.strip()

In [9]:
# Read the corpus file 'xaa' (path relative to the working directory) and
# materialize every stripped line into a list so it can be sliced later.
sentences = SentencesIterator('xaa')
sents = list(sentences)

FileNotFoundError: [Errno 2] No such file or directory: 'xaa'

In [10]:

class FeatureExtractor:
    """Pipeline step that turns raw sentences into one feature dict per token.

    Each sentence is wrapped in ``< <`` / ``> >`` padding tokens before it is
    tokenized, so every real token has two neighbours on each side for the
    context-window features. After ``transform``, ``vocabulary[i]`` is the
    text of the token described by row ``i`` of the returned array.
    """

    def __init__(self, disable_pipe=['parser', 'ner'], max_sents=3000, spacy_model='es_core_news_sm'):
        """
        Args:
            disable_pipe: Names of spaCy pipeline components to skip when
                processing each sentence.
            max_sents: Only the first ``max_sents`` sentences are processed.
            spacy_model: Name of the spaCy model to load.
        """
        self._nlp = spacy.load(spacy_model)
        # Built once and reused, instead of once per document.
        # NOTE(review): WordNet lemmatization targets English while the default
        # model is Spanish; token.lemma_ may be the intended source - confirm.
        self._lemmatizer = WordNetLemmatizer()
        self.vocabulary = []
        self._max_sents = max_sents
        # Copy so the shared default list (or the caller's list) is never aliased.
        self._disable_pipe = list(disable_pipe)

    @staticmethod
    def _tag_sent(sent):
        """Wrap a sentence in two start and two end padding markers."""
        return '< < {} > >'.format(sent)

    def _processing(self, sents):
        """Tokenize the first ``max_sents`` sentences that are longer than 7 characters.

        Returns:
            A list with one list of tokens per kept sentence, padding included.
        """
        docs = []
        for sent in sents[:self._max_sents]:
            if len(sent) > 7:
                doc = self._nlp(self._tag_sent(sent), disable=self._disable_pipe)
                docs.append(list(doc))
        return docs

    def _make_feature_dict(self, doc):
        """Build the feature dict of every non-padding token in ``doc``.

        Also appends each described token's text to ``self.vocabulary``, in
        the same order as the returned list.

        Args:
            doc: Token list for one padded sentence, as produced by ``_processing``.

        Returns:
            A list of feature dicts, one per token whose text is not ``<`` or ``>``.
        """
        document_features = []

        for i, token in enumerate(doc):
            # Padding markers only provide context; they are never described.
            if token.text in ('<', '>'):
                continue

            # The two padding tokens on each side keep these lookups in range
            # for every token of a padded sentence.
            prev1, prev2 = doc[i - 1], doc[i - 2]
            next1, next2 = doc[i + 1], doc[i + 2]

            document_features.append({
                'word': token.text,
                'pword': prev1.text,
                'ppword': prev2.text,
                'nword': next1.text,
                'nnword': next2.text,
                'pword_tag': prev1.pos_,
                'ppword_tag': prev2.pos_,
                'nword_tag': next1.pos_,
                'nnword_tag': next2.pos_,
                'is_lower': token.is_lower,
                'POS': token.pos_,
                'lemma': self._lemmatizer.lemmatize(token.text, pos='v'),
                'is_stop': token.is_stop,
                'is_oov': token.is_oov,
            })
            self.vocabulary.append(token.text)
        return document_features

    def fit(self, X, y=None):
        """No-op: there is nothing to learn, so the instance is returned as is."""
        return self

    def transform(self, sents, y=None):
        """Convert sentences into a flat array of per-token feature dicts.

        Args:
            sents: Sliceable sequence of sentence strings.
            y: Ignored; accepted for pipeline compatibility.

        Returns:
            A numpy array holding one feature dict per kept token.
        """
        # Start from an empty vocabulary so it always lines up with the rows
        # returned by THIS call, even when transform runs more than once.
        self.vocabulary = []

        features = []
        for doc in self._processing(sents):
            features.extend(self._make_feature_dict(doc))

        return np.array(features)

In [11]:
# Run the extractor with its default settings on the loaded sentences;
# `a` is the array of per-token feature dicts returned by transform().
a = FeatureExtractor().transform(sents)

NameError: name 'sents' is not defined

In [44]:
# Peek at the first three feature dicts.
a[:3]

array([{'word': 'Keystone', 'pword': '<', 'ppword': '<', 'nword': 'Dakota', 'nnword': 'del', 'pword_tag': 'NOUN', 'ppword_tag': 'VERB', 'nword_tag': 'PROPN', 'nnword_tag': 'ADP', 'is_lower': False, 'POS': 'PROPN', 'lemma': 'Keystone', 'is_stop': False, 'is_oov': True},
       {'word': 'Dakota', 'pword': 'Keystone', 'ppword': '<', 'nword': 'del', 'nnword': 'Sur', 'pword_tag': 'PROPN', 'ppword_tag': 'NOUN', 'nword_tag': 'ADP', 'nnword_tag': 'PROPN', 'is_lower': False, 'POS': 'PROPN', 'lemma': 'Dakota', 'is_stop': False, 'is_oov': True},
       {'word': 'del', 'pword': 'Dakota', 'ppword': 'Keystone', 'nword': 'Sur', 'nnword': '>', 'pword_tag': 'PROPN', 'ppword_tag': 'PROPN', 'nword_tag': 'PROPN', 'nnword_tag': 'PROPN', 'is_lower': True, 'POS': 'ADP', 'lemma': 'del', 'is_stop': True, 'is_oov': True}],
      dtype=object)

In [6]:
class ExtendedKMeans(KMeans):
    """KMeans that can also group vocabulary items by their assigned cluster.

    The constructor is deliberately inherited rather than overridden, so all
    KMeans keyword arguments (``n_clusters``, ``random_state``, ``init``, ...)
    are accepted and remain visible to ``get_params`` / ``clone``.

    Attributes:
        clusters: Mapping of cluster label to the set of vocabulary items
            assigned to it; set by the most recent ``get_clusters`` call.
    """

    def get_clusters(self, labels, vocabulary):
        """Group vocabulary items by cluster label.

        Args:
            labels: Sequence of cluster labels; ``labels[i]`` is the cluster
                of ``vocabulary[i]``.
            vocabulary: Indexable sequence with at least ``len(labels)`` items.

        Returns:
            A view over the per-cluster sets, in order of first appearance of
            each label.
        """
        # Rebuilt on every call so results from earlier labelings never leak
        # into the current grouping.
        clusters = defaultdict(set)
        for i, label in enumerate(labels):
            clusters[label].add(vocabulary[i])
        self.clusters = clusters
        return dict(clusters).values()

In [7]:
# Single seed shared by the stochastic steps so runs are reproducible.
RANDOM_STATE = 666

# Keyword arguments for each pipeline step, kept separate so they are easy to tune.
f_e_config = {
    'max_sents': 300,
}

vectorizer_config = {
    'sparse': False,
}

svd_config = {
    'n_components': 14,
    'random_state': RANDOM_STATE,
    'n_iter': 15,
}

# 'precompute_distances' is intentionally not set: scikit-learn deprecated the
# argument in 0.23 and removed it in 1.0, so passing it fails on current versions.
kmeans_config = {
    'n_clusters': 45,
    'random_state': RANDOM_STATE,
    'init': 'random',
}

# sentences -> per-token feature dicts -> dense one-hot matrix
# -> truncated SVD projection -> KMeans cluster labels
pipeline = Pipeline([
    ('preprocessor', FeatureExtractor(**f_e_config)),
    ('vect', DictVectorizer(**vectorizer_config)),
    ('svd', TruncatedSVD(**svd_config)),
    ('kmeans', ExtendedKMeans(**kmeans_config)),
])



NameError: name 'FeatureExtractor' is not defined

In [None]:
# Display the assembled pipeline and the parameters of its steps.
pipeline

In [None]:
# Fit every step on the sentences and collect the cluster label that the
# final KMeans step assigns to each feature row.
labels = pipeline.fit_predict(sents)

In [65]:
# Group the extracted tokens by the cluster label assigned to them.
# `defaultdict` is already imported in the notebook's import cell, so it is
# not re-imported here.
words = pipeline.named_steps['preprocessor'].vocabulary
words_dict = defaultdict(set)
for i, label in enumerate(labels):
    # Assumes words[i] is the token behind feature row i, i.e. the vocabulary
    # was filled by the same transform call that produced `labels`.
    words_dict[label].add(words[i])
# Plain dict: looking up an unknown label later raises KeyError instead of
# silently creating an empty cluster.
words_dict = dict(words_dict)


In [66]:
# Print the set of words of each cluster, one cluster per line.
for cluster_words in words_dict.values():
    print(cluster_words)

{'Recibía', 'Louis', 'Contó', 'Nacional', 'Apoya', 'Carrie', 'Situado', 'Viajan', 'Le', 'Trotsky', 'Cooperstown', 'Siempre', 'Habilitado', 'Dique', 'Amanita', 'También', 'Traducción', 'No', 'Se', 'Laurens', 'Gilbertsville', 'Miembro', 'Copa', 'Retoma', 'Alrededor', 'Rosmer', 'Aparecerían', 'Keystone', 'Origen', 'Rechazan', 'Ejemplar', 'Contrario', 'Stevenson', 'Alfred', 'Morris', 'Monatte', 'Cuando', 'Cherry', 'CGT', 'Vuelve', 'Anima', 'Es', 'Fallece', 'Richfield', 'Clasificaron', 'Además', 'A', 'Produce', 'Suele', 'Christian', 'Entra', 'Treint', 'Participa', 'Va', 'En', 'Sinónimos', 'Santiago', 'Hoy', 'Esto', 'Cuenta', 'Otego', 'Igualmente', 'Desaprobado', 'Fue', 'Swanzey', 'Nunca', 'Final', 'Milford', 'Giovacchini', 'Creación', 'Tras'}
{'Real', 'Provincia', 'Grasset', 'VO', 'Tungus', 'Trotsky', 'Horak', 'Marguerite', 'Molinier', 'Formación', 'Díez', 'Étudiants', 'Cachin', 'Rosmer', 'Deseado', 'SFIC', 'Congo', 'DIMAT', 'Alfred', 'du', 'Compañía', 'Ejecutivo', 'Democrática', 'DIGITO', 