# The Natural Language Tool Kit (version 3.4)
### Pros: basic preprocessing tools, tokenizing, and stemming work well, and fast; many corpora are available and in multiple languages, including Spanish
### Cons: NLTK fails with more advanced processing in Spanish, such as POS tagging, chunking, named entity reckognition, and text classification



In [1]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.corpus import europarl_raw
from nltk.tokenize import PunktSentenceTokenizer

In [None]:
#DOWNLOADS; not necessary if already loaded
#Also, we will use the NLTK Spanish language corpus from the European Parliament and matplotlib for visualization.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

In [2]:
#start with a message from our dataset
ex_message = "Sres de organización centro: por favor necesito de forma urgente que se informe al sr responsable de la grúa, que se me envíe el recibo de pago por mail o por la vía que le sea más conveniente. Necesito presentarla a mi lugar de trabajo. De haber sabido que no tenía recibo, no abonaba. Siempre actuó de buena fe. Hasta soporte ni sólo el riesgo de vida de mi mascota de 5 kilos, si no el hecho de que el sr en cuestión hablaba permanentemente por celular de cuestiones personales. Debo enviar pruebas de esto también? Gracias"

## TOKENIZE (good)

In [3]:
sent = sent_tokenize(ex_message)
sent

['Sres de organización centro: por favor necesito de forma urgente que se informe al sr responsable de la grúa, que se me envíe el recibo de pago por mail o por la vía que le sea más conveniente.',
 'Necesito presentarla a mi lugar de trabajo.',
 'De haber sabido que no tenía recibo, no abonaba.',
 'Siempre actuó de buena fe.',
 'Hasta soporte ni sólo el riesgo de vida de mi mascota de 5 kilos, si no el hecho de que el sr en cuestión hablaba permanentemente por celular de cuestiones personales.',
 'Debo enviar pruebas de esto también?',
 'Gracias']

In [4]:
words = word_tokenize(ex_message)
words

['Sres',
 'de',
 'organización',
 'centro',
 ':',
 'por',
 'favor',
 'necesito',
 'de',
 'forma',
 'urgente',
 'que',
 'se',
 'informe',
 'al',
 'sr',
 'responsable',
 'de',
 'la',
 'grúa',
 ',',
 'que',
 'se',
 'me',
 'envíe',
 'el',
 'recibo',
 'de',
 'pago',
 'por',
 'mail',
 'o',
 'por',
 'la',
 'vía',
 'que',
 'le',
 'sea',
 'más',
 'conveniente',
 '.',
 'Necesito',
 'presentarla',
 'a',
 'mi',
 'lugar',
 'de',
 'trabajo',
 '.',
 'De',
 'haber',
 'sabido',
 'que',
 'no',
 'tenía',
 'recibo',
 ',',
 'no',
 'abonaba',
 '.',
 'Siempre',
 'actuó',
 'de',
 'buena',
 'fe',
 '.',
 'Hasta',
 'soporte',
 'ni',
 'sólo',
 'el',
 'riesgo',
 'de',
 'vida',
 'de',
 'mi',
 'mascota',
 'de',
 '5',
 'kilos',
 ',',
 'si',
 'no',
 'el',
 'hecho',
 'de',
 'que',
 'el',
 'sr',
 'en',
 'cuestión',
 'hablaba',
 'permanentemente',
 'por',
 'celular',
 'de',
 'cuestiones',
 'personales',
 '.',
 'Debo',
 'enviar',
 'pruebas',
 'de',
 'esto',
 'también',
 '?',
 'Gracias']

## STOPWORDS (good)

In [5]:
stop_words = stopwords.words("spanish")
print(len(stop_words),' total stopwords')
print(stop_words[:10],'...')

313  total stopwords
['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se'] ...


In [6]:
#remove stopwords
filtered_sentence = []
print('before removing stopwords: ',len(words),' words')
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)
print('after: ',len(filtered_sentence),' words')

before removing stopwords:  107  words
after:  61  words


In [7]:
#the famous one-liner: does the same as above
filtered_sentence = [w for w in words if not w in stop_words]
print('after: ',len(filtered_sentence),' words')

after:  61  words


### STEMMING (good)

In [8]:
ss = SnowballStemmer('spanish')

In [9]:
for s in filtered_sentence:
    print(ss.stem(s[:10]))

sres
organizaci
centr
:
favor
necesit
form
urgent
inform
sr
responsabl
gru
,
envi
recib
pag
mail
via
convenient
.
necesit
presentarl
lug
trabaj
.
de
hab
sab
recib
,
abon
.
siempr
actu
buen
fe
.
hast
soport
sol
riesg
vid
mascot
5
kil
,
si
hech
sr
cuestion
habl
permanent
celul
cuestion
personal
.
deb
envi
prueb
?
graci


### LEMMATIZING (not good)
#### lemmatizer.lemmatize('word', pos='n')

In [11]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [14]:
#Spanish
lemma_one = lemmatizer.lemmatize('quiso', pos='v')
lemma_two = lemmatizer.lemmatize('gatas')
#English works!
lemma_three = lemmatizer.lemmatize('wanted', pos='v')
lemma_four = lemmatizer.lemmatize('better', pos='a')
lemma_five = lemmatizer.lemmatize('cats')
lemma_six = lemmatizer.lemmatize('geese')
print(lemma_one)
print(lemma_two)
print(lemma_three)
print(lemma_four)
print(lemma_five)
print(lemma_six)

quiso
gatas
want
good
cat
goose


## Importing a Spanish language corpus for training a tokenizer

#### Here we import the NLTK Spanish language corpus from the European Parliament to train for part of speech and entity reckognition

In [15]:
spanish = europarl_raw.spanish
spanish

<EuroparlCorpusReader in '/Users/brandonjanes/nltk_data/corpora/europarl_raw/spanish'>

In [16]:
train_text = spanish.raw('ep-00-01-17.es')
print(len(train_text))

155329


In [17]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(ex_message)

## POS TAGGING (not good)

In [18]:
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
            
    except Exception as e:
        print(str(e))

In [19]:
#NLTK performs poorly with Spanish language text 
#POS TAG LIST below
process_content()

[('Sres', 'NNS'), ('de', 'VBP'), ('organización', 'FW'), ('centro', 'NN'), (':', ':'), ('por', 'NN'), ('favor', 'NN'), ('necesito', 'FW'), ('de', 'FW'), ('forma', 'FW'), ('urgente', 'JJ'), ('que', 'NN'), ('se', 'JJ'), ('informe', 'NN'), ('al', 'NN'), ('sr', 'NN'), ('responsable', 'JJ'), ('de', 'FW'), ('la', 'FW'), ('grúa', 'FW'), (',', ','), ('que', 'FW'), ('se', 'FW'), ('me', 'PRP'), ('envíe', 'VBP'), ('el', 'JJ'), ('recibo', 'NN'), ('de', 'IN'), ('pago', 'FW'), ('por', 'JJ'), ('mail', 'NN'), ('o', 'NN'), ('por', 'NN'), ('la', 'NN'), ('vía', 'FW'), ('que', 'NN'), ('le', 'FW'), ('sea', 'NN'), ('más', 'NN'), ('conveniente', 'NN'), ('.', '.')]
[('Necesito', 'NNP'), ('presentarla', 'VBZ'), ('a', 'DT'), ('mi', 'JJ'), ('lugar', 'NN'), ('de', 'IN'), ('trabajo', 'NN'), ('.', '.')]
[('De', 'NNP'), ('haber', 'NNP'), ('sabido', 'NN'), ('que', 'NN'), ('no', 'DT'), ('tenía', 'NN'), ('recibo', 'NN'), (',', ','), ('no', 'DT'), ('abonaba', 'NN'), ('.', '.')]
[('Siempre', 'NNP'), ('actuó', 'NN'), ('de

In [None]:
#POS TAG LIST: DO NOT RUN THIS BLOCK
CC	coordinating conjunction
CD	cardinal digit
DT	determiner
EX	existential there (like: "there is" ... think of it like "there exists")
FW	foreign word
IN	preposition/subordinating conjunction
JJ	adjective	'big'
JJR	adjective, comparative	'bigger'
JJS	adjective, superlative	'biggest'
LS	list marker	1)
MD	modal	could, will
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
PDT	predeterminer	'all the kids'
POS	possessive ending	parent's
PRP	personal pronoun	I, he, she
PRP$	possessive pronoun	my, his, hers
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
RP	particle	give up
TO	to	go 'to' the store.
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3d	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-abverb	where, when

## Chunking (not good)

In [20]:
#Using regular expressions, we can search for phrases in the form: verb + proper noun
#Using matplotlib we will also create a dependency tree for each chunk(sentence). Visualization only works if using Jupyter Notebook, but careful: opens several windows.
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = """Chunk: {<VB.?>*<NNP>}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            print(chunked)
            chunked.draw()
            
    except Exception as e:
        print(str(e))

In [21]:
#lots of errors because of Spanish
process_content()

(S
  Sres/NNS
  de/VBP
  organización/FW
  centro/NN
  :/:
  por/NN
  favor/NN
  necesito/FW
  de/FW
  forma/FW
  urgente/JJ
  que/NN
  se/JJ
  informe/NN
  al/NN
  sr/NN
  responsable/JJ
  de/FW
  la/FW
  grúa/FW
  ,/,
  que/FW
  se/FW
  me/PRP
  envíe/VBP
  el/JJ
  recibo/NN
  de/IN
  pago/FW
  por/JJ
  mail/NN
  o/NN
  por/NN
  la/NN
  vía/FW
  que/NN
  le/FW
  sea/NN
  más/NN
  conveniente/NN
  ./.)
(S
  (Chunk Necesito/NNP)
  presentarla/VBZ
  a/DT
  mi/JJ
  lugar/NN
  de/IN
  trabajo/NN
  ./.)
(S
  (Chunk De/NNP)
  (Chunk haber/NNP)
  sabido/NN
  que/NN
  no/DT
  tenía/NN
  recibo/NN
  ,/,
  no/DT
  abonaba/NN
  ./.)
(S (Chunk Siempre/NNP) actuó/NN de/FW buena/FW fe/NN ./.)
(S
  (Chunk Hasta/NNP)
  soporte/NN
  ni/NN
  sólo/NN
  el/FW
  riesgo/NN
  de/FW
  vida/FW
  de/FW
  mi/FW
  mascota/FW
  de/FW
  5/CD
  kilos/NN
  ,/,
  si/VBZ
  no/DT
  el/JJ
  hecho/NN
  de/IN
  que/FW
  el/FW
  sr/FW
  en/FW
  cuestión/NN
  hablaba/NN
  permanentemente/NN
  por/FW
  celular/NN
  de/IN
  c

## Name Entity Reckognition (bad)

In [22]:
#Zero percent accuracy (the abbreviation GPE refers to geographical location... nada que ver)
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            entities = nltk.ne_chunk(tagged)
            print(entities)
    except Exception as e:
        print(str(e))

In [23]:
process_content()

(S
  Sres/NNS
  de/VBP
  organización/FW
  centro/NN
  :/:
  por/NN
  favor/NN
  necesito/FW
  de/FW
  forma/FW
  urgente/JJ
  que/NN
  se/JJ
  informe/NN
  al/NN
  sr/NN
  responsable/JJ
  de/FW
  la/FW
  grúa/FW
  ,/,
  que/FW
  se/FW
  me/PRP
  envíe/VBP
  el/JJ
  recibo/NN
  de/IN
  pago/FW
  por/JJ
  mail/NN
  o/NN
  por/NN
  la/NN
  vía/FW
  que/NN
  le/FW
  sea/NN
  más/NN
  conveniente/NN
  ./.)
(S
  (GPE Necesito/NNP)
  presentarla/VBZ
  a/DT
  mi/JJ
  lugar/NN
  de/IN
  trabajo/NN
  ./.)
(S
  (PERSON De/NNP)
  haber/NNP
  sabido/NN
  que/NN
  no/DT
  tenía/NN
  recibo/NN
  ,/,
  no/DT
  abonaba/NN
  ./.)
(S (GPE Siempre/NNP) actuó/NN de/FW buena/FW fe/NN ./.)
(S
  (GPE Hasta/NNP)
  soporte/NN
  ni/NN
  sólo/NN
  el/FW
  riesgo/NN
  de/FW
  vida/FW
  de/FW
  mi/FW
  mascota/FW
  de/FW
  5/CD
  kilos/NN
  ,/,
  si/VBZ
  no/DT
  el/JJ
  hecho/NN
  de/IN
  que/FW
  el/FW
  sr/FW
  en/FW
  cuestión/NN
  hablaba/NN
  permanentemente/NN
  por/FW
  celular/NN
  de/IN
  cuestiones/NNS

### CONCLUSION: NLTK is great for basic preprocessing and its extensive corpora, but we cannot take advantage of the higher level processing (i.e. POS tags, NER) because, I believe, the library is not trained for the Spanish language.
### One option would be to train a tagger to reckognize parts of speech and named entities in the Spanish language text, or find a 'pre-trained' Spanish text, such as http://www.lsi.upc.edu/~nlp/wikicorpus/. Other modules already exist with more Spanish language capability, such as spaCy and textacy. Perhaps it is best to use them for higher order preprocessing in Spanish. 