In [11]:
import spacy
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import TruncatedSVD
from collections import defaultdict
from tqdm import tqdm
import os
import re
import pickle

In [30]:
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

In [35]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /users/ldefrancesca/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [138]:
pattern = r'''(?x)                          # set flag to allow verbose regexps
   (?:\d{1,3}(?:\.\d{3})+)                  # numbers with '.' in the middle
   | (?:[Ss]r\.|[Ss]ra\.|[Aa]rt\.)          # common spanish abbreviations
   | (?:[A-Z]\.)+                           # abbreviations, e.g. U.S.A.
   | \w+(?:-\w+)*                           # words with optional internal hyphens
   | \$?\d+(?:\.\d+)?%?                     # currency and percentages, e.g. $12.40, 82%
   | \.\.\.                                 # ellipsis
   | [][.,;"'?():-_`]                       # these are separate tokens; includes ], [
'''

In [139]:
tokenizer = RegexpTokenizer(pattern)
corpus = PlaintextCorpusReader('../corpus/infoleg_text', '59712.txt', word_tokenizer=tokenizer)
tokens = list(corpus.sents())

In [140]:
tokens

[['Secretaría',
  'de',
  'Industria',
  ',',
  'Comercio',
  'y',
  'Minería',
  'LEALTAD',
  'COMERCIAL',
  'Resolución',
  '640',
  '99',
  'Modificación',
  'de',
  'la',
  'Resolución',
  'Nº',
  '431',
  '99',
  ',',
  'a',
  'fin',
  'de',
  'establecer',
  'un',
  'plazo',
  'mayor',
  'para',
  'dictaminar',
  'sobre',
  'las',
  'solicitudes',
  'de',
  'organismos',
  'de',
  'certificación',
  ',',
  'laboratorios',
  'de',
  'ensayos',
  'y',
  'de',
  'calibración',
  'y',
  'organismos',
  'de',
  'inspección',
  'interesados',
  'en',
  'participar',
  'en',
  'los',
  'regímenes',
  'de',
  'certificación',
  'obligatoria',
  '.'],
 ['Bs', '.'],
 ['As',
  '.',
  ',',
  '1',
  '9',
  '99',
  'VISTO',
  'el',
  'Expediente',
  'Nº',
  '045-001026',
  '99',
  'del',
  'Registro',
  'del',
  'MINISTERIO',
  'DE',
  'ECONOMIA',
  'Y',
  'OBRAS',
  'Y',
  'SERVICIOS',
  'PUBLICOS',
  ',',
  'y',
  'CONSIDERANDO',
  ':',
  'Que',
  'con',
  'el',
  'objeto',
  'de',
  'asegur

In [131]:
doc = [token for token in nlp(tokens[0]) if not token.is_stop]

TypeError: Argument 'string' has incorrect type (expected str, got list)

In [40]:
def preProcessing(ncorpus):
    """
    input: pathfile corpus
    output: tokenizer doc, without stop-words and DIGITO word
    """
    nlp = spacy.load('es')
    # pipeline spacy
    pipe = ["parser", "tagger"]

    tokenizer_doc = []
    with open(ncorpus) as fd:
        for document in tqdm(fd.readlines()):
            document = document.strip()
            nlp.max_length = max(len(document), nlp.max_length)
            
            # nlp create doc object(token sequence)
            # disable pipeline from spacy
            
            tokens = [ token for token in nlp(document, disable=pipe) 
                       if not token.is_stop
                     ]
            
            # delete documents long less than 8
            if len(tokens) > 8:
                tokenizer_doc.append(tokens)
    return tokenizer_doc

In [97]:
t = preProcessing('../corpus/infoleg_text/59712.txt')





100%|██████████| 2/2 [00:00<00:00, 42.77it/s]A[A


In [150]:
def corpus_processor(dir_corpus):
    nlp = spacy.load('es')
    tokenizer_doc = []
    for fname in tqdm(os.listdir(dir_corpus)):
        with open(os.path.join(dir_corpus, fname), "r") as fh:
            # Careful with this for very large docs
            document = re.sub(r"\s+", " ", fh.read())

        nlp.max_length = max(len(document), nlp.max_length)

        tokens = [
            token for token in nlp(document, disable=["tagger", "parser"])
            if not (token.is_stop or token.ent_type_ == '')
        ]
        
        if len(tokens) > 8:
                tokenizer_doc.append(tokens)
    return tokenizer_doc

In [151]:
tokenizers =  corpus_processor("../corpus/infoleg_text/mini/")





  0%|          | 0/3 [00:00<?, ?it/s][A[A[A[A



100%|██████████| 3/3 [00:00<00:00, 19.62it/s][A[A[A[A


In [152]:
tokenizers

[[Secretaría,
  Hacienda,
  PRESUPUESTO,
  Resolución,
  Presupuesto,
  Administración,
  Nacional,
  Ejercicio,
  2004,
  Tesorería,
  Nación,
  Jurisdicciones,
  Entidades,
  Administración,
  Nacional,
  Ejercicio,
  Artículo,
  12,
  Decisión,
  Administrativa,
  Nº,
  2,
  Bs,
  .,
  As,
  VISTO,
  PRESUPUESTO,
  NACIONAL,
  Ejercicio,
  2004,
  Ley,
  N,
  Decisión,
  Administrativa,
  N,
  CONSIDERANDO,
  Jurisdicciones,
  Entidades,
  NACIONAL,
  Ejercicio,
  TESORERIA,
  NACION,
  SUBSECRETARIA,
  PRESUPUESTO,
  SECRETARIA,
  HACIENDA,
  MINISTERIO,
  Artículo,
  12,
  Decisión,
  Administrativa,
  N,
  percepción,
  Aplicaciones,
  Financieras,
  Artículo,
  12,
  Decisión,
  Administrativa,
  N,
  SECRETARIO,
  Artículo,
  1,
  Modifícase,
  PRESUPUESTO,
  NACIONAL,
  Ejercicio,
  2004,
  Planillas,
  Anexas,
  Art,
  Jurisdicciones,
  Entidades,
  Artículo,
  1,
  TESORERIA,
  NACION,
  SUBSECRETARIA,
  PRESUPUESTO,
  SECRETARIA,
  HACIENDA,
  MINISTERIO,
  Ejercicio,
  200