In [1]:
class SentencesIterator:
    def __init__(self, path):
        self.path = path

    def __iter__(self):
        with open(self.path, 'r') as f:
            for l in f.readlines()[:3000]:
                yield l.strip().split()

In [2]:
sentences = SentencesIterator('clean_corpus/spanish_billion_words/spanish_billion_words_48')
sents = list(sentences)

In [134]:
from nltk.tokenize import RegexpTokenizer
pattern = r'''(?x)    # set flag to allow verbose regexps
   (?:\d{1,3}(?:\.\d{3})+)  # numbers with '.' in the middle
   | (?:[Ss]r\.|[Ss]ra\.|art\.)  # common spanish abbreviations
   | (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
   | \w+(?:-\w+)*        # words with optional internal hyphens
   | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
   | \.\.\.            # ellipsis
   | [][.,;"'?():-_`]  # these are separate tokens; includes ], [
'''
tokenizer = RegexpTokenizer(pattern)


In [151]:
tokenizer = RegexpTokenizer(pattern)
sents = [tokenizer.tokenize(' '.join(sent)) for sent in sents]
# sents

In [159]:
import re, string, unicodedata
import nltk
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    return [word.lower() for word in words] 

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('spanish'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    return words

# words = normalize(words)

def stem_and_lemmatize(words):
    stems = stem_words(words)
    lemmas = lemmatize_verbs(words)
    return stems, lemmas
from tqdm import tqdm
lemma_sents = [normalize(sent) for sent in sents[:5000]]
# print('Stemmed:\n', stems)
# print('\nLemmatized:\n', lemmas)


 11%|█         | 545/5000 [00:15<00:35, 127.27it/s][A

In [160]:
lemma_sents[:3]

[['fiestas',
  'alquiler',
  'harlem',
  'digito',
  'digito',
  'nightclubs',
  'banquetes',
  'grandes',
  'picnics',
  'libertarios',
  'debemos',
  'reconocer',
  'sido',
  'zonas',
  'liberadas',
  'algun',
  'tipo',
  'menos',
  'tazs',
  'potenciales'],
 ['abierta',
  'solo',
  'pocos',
  'amigos',
  'fiesta',
  'cena',
  'miles',
  'participantes',
  'be',
  'in',
  'fiesta',
  'siempre',
  'abierta',
  'regulada',
  'sometida',
  'orden',
  'puede',
  'planeada',
  'menos',
  'suceda',
  'si',
  'misma',
  'fracaso'],
 ['factor', 'espontaneidad', 'crucial']]

In [8]:
from nltk import pos_tag, word_tokenize

sents2 = [pos_tag(word_tokenize(' '.join(sent))) for sent in sents]

In [19]:
sents3 = [[' '.join(words)]  for sent in sents2 for words in sent]

In [20]:
import gensim
model = gensim.models.Word2Vec(sents3, min_count = 5, size=32)
# # gensim.models.Word2Vec?

In [21]:

word_vectors = model.wv.vectors
len(word_vectors)

3053

In [22]:
from sklearn.cluster import KMeans
num_clusters = 40

# Initalize a k-means object and use it to extract centroids

kmeans_clustering = KMeans(n_clusters=num_clusters, max_iter=100)

idx = kmeans_clustering.fit_predict(word_vectors)

word_centroid_map = dict(zip(model.wv.index2word, idx))

In [23]:
from collections import defaultdict
res = defaultdict(set)
for key, value in word_centroid_map.items():
    res[value].add(key)
res

defaultdict(set,
            {18: {'Administrativos NNP',
              'Ahora NNP',
              'CCAAP NNP',
              'DIGITO NNP',
              'Durante NNP',
              'ICEF NNP',
              'Jurídica NNP',
              'Ley NNP',
              'Nos NNP',
              'Organizaciones NNP',
              'Proyecto NNP',
              'Se NNP',
              'Tamang NNP',
              'acto FW',
              'actos NN',
              'adopte NN',
              'ahora FW',
              'alcanzados NN',
              'aprobado NN',
              'asistencia FW',
              'categoría NN',
              'celebrará NN',
              'clara NN',
              'compatible JJ',
              'consideración NN',
              'consumo FW',
              'crecimiento NN',
              'cura NN',
              'de NNP',
              'dichas NN',
              'diciembre FW',
              'dólares NNS',
              'establezcan JJ',
              'estén NN',
        