<a href="https://colab.research.google.com/github/ademirsantosjr/utfpr-ai-9-text-mining-and-nlp/blob/feature%2Fintro-and-nlp-tasks/wiki_1_intro/week1_text_mining_pln_spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy

## Tokenization & POS Tagging

In [16]:
# Load the large English pipeline trained on web text
nlp_en = spacy.load("en_core_web_lg")

# Run the pipeline over a sample sentence to build the document
sentence_en = "Attitude is a little thing that makes a big difference."
doc_en = nlp_en(sentence_en)

In [27]:
# One row per token: surface text, tag, lemma and stop-word flag
for token in doc_en:
    row = f'{token.text:20}\t {token.tag_:4}\t {token.lemma_:20}\t {token.is_stop}'
    print(row)


Attitude            	 NN  	 attitude            	 False
is                  	 VBZ 	 be                  	 True
a                   	 DT  	 a                   	 True
little              	 JJ  	 little              	 False
thing               	 NN  	 thing               	 False
that                	 WDT 	 that                	 True
makes               	 VBZ 	 make                	 False
a                   	 DT  	 a                   	 True
big                 	 JJ  	 big                 	 False
difference          	 NN  	 difference          	 False
.                   	 .   	 .                   	 False


In [5]:
# Load the small Portuguese pipeline trained on news text
nlp_pt = spacy.load("pt_core_news_sm")

# Run the pipeline over a sample sentence to build the document
sentence_pt = "Maior que a tristeza de não haver vencido é a vergonha de não ter lutado!"
doc_pt = nlp_pt(sentence_pt)

In [29]:
# One row per token: surface text, tag, lemma and stop-word flag.
# Note: tags longer than four characters (e.g. SCONJ, PUNCT) overflow the
# 4-wide tag column, so those rows are slightly misaligned in the output.
for token in doc_pt:
    row = f'{token.text:20}\t {token.tag_:4}\t {token.lemma_:20}\t {token.is_stop}'
    print(row)

Maior               	 ADJ 	 Maior               	 True
que                 	 SCONJ	 que                 	 True
a                   	 DET 	 o                   	 True
tristeza            	 NOUN	 tristeza            	 False
de                  	 SCONJ	 de                  	 True
não                 	 ADV 	 não                 	 True
haver               	 AUX 	 haver               	 False
vencido             	 VERB	 vencer              	 False
é                   	 AUX 	 ser                 	 True
a                   	 DET 	 o                   	 True
vergonha            	 NOUN	 vergonha            	 False
de                  	 SCONJ	 de                  	 True
não                 	 ADV 	 não                 	 True
ter                 	 AUX 	 ter                 	 True
lutado              	 VERB	 lutar               	 False
!                   	 PUNCT	 !                   	 False


In [20]:
from spacy import displacy

In [31]:
# Draw the dependency parse of the English sentence inline in the notebook
displacy.render(doc_en, style='dep', jupyter=True)

In [32]:
# Draw the dependency parse of the Portuguese sentence inline in the notebook
displacy.render(doc_pt, style='dep', jupyter=True)

### Named Entity Recognition

In [24]:
# Named Entities
# Sample passage about the Java language, processed with the English pipeline
java_text = '''Created in the early 1990s by James Gosling at Sun Microsystems,
in the heart of Silicon Valley, the Java programming language was born with
an ideal: ‘write once, run anywhere.’ But more than a technical slogan,
this promise reflects a philosophy of portability, stability, and responsibility.
Unlike languages that age with hardware, Java stays young by moving across platforms,
generations, and paradigms. James Gosling didn’t just create a
language—he defined a developer’s ethic, where clarity, robustness,
and predictability are not optional virtues, but essential duties. Just as
a writer chooses words with care, a Java developer must choose their classes,
methods, and names as if writing for the future—for others, and for themselves.
'''
java_doc = nlp_en(java_text)

# List every entity span found in the passage alongside its label
entities = java_doc.ents
for ent in entities:
    print(f'{ent.text:20}\t {ent.label_:20}')

the early 1990s     	 DATE                
James Gosling       	 PERSON              
Sun Microsystems    	 ORG                 
Silicon Valley      	 LOC                 
Java                	 ORG                 
Java                	 ORG                 
James Gosling       	 PERSON              
Java                	 ORG                 


In [27]:
# Highlight the named entities of the Java passage inline in the notebook
displacy.render(java_doc, style='ent', jupyter=True)

### Lemmas & Stopwords

In [26]:
# Lemmas (lemmatization, as opposed to stemming): show each token next to its lemma
for token in java_doc:
    line = f'{token.text:30} -> {token.lemma_}'
    print(line)

Created                        -> create
in                             -> in
the                            -> the
early                          -> early
1990s                          -> 1990
by                             -> by
James                          -> James
Gosling                        -> Gosling
at                             -> at
Sun                            -> Sun
Microsystems                   -> Microsystems
,                              -> ,

                              -> 

in                             -> in
the                            -> the
heart                          -> heart
of                             -> of
Silicon                        -> Silicon
Valley                         -> Valley
,                              -> ,
the                            -> the
Java                           -> Java
programming                    -> programming
language                       -> language
was                            -> be
born              

In [28]:
# Stop-words
from spacy.lang.en import stop_words

# Show the whole set of English stop words that ships with spaCy
english_stop_words = stop_words.STOP_WORDS
print(english_stop_words)

{'’s', 'thereupon', 'before', '’re', 'six', 'was', 'or', 'became', 'under', "'d", 'whereby', 'top', 'in', 'sixty', 'which', 'bottom', 'back', 'amount', "n't", 'whoever', 'thereby', 'everyone', 'put', 'himself', 'anyhow', 'and', 'for', 'here', 'herein', 'around', 'ten', 'always', 'the', 'hereby', 'ever', 'have', 'two', 'within', 'wherein', 'by', 'when', 're', 'of', 'beforehand', 'latterly', 'name', 'quite', 'we', 'least', 'what', 'already', 'anyway', 'besides', 'forty', 'along', 'another', 'less', 'toward', 'below', 'where', 'me', 'mostly', 'seems', 'make', 'its', 'nobody', 'someone', 'until', 'against', 'cannot', 'well', 'used', 'n’t', 'among', '’m', 'doing', 'your', 'may', 'very', 'therefore', 'just', 'hereafter', 'n‘t', 'are', 'whole', 'perhaps', 'hundred', 'at', 'once', 'were', 'had', '‘s', 'themselves', '‘re', 'all', 'will', 'everywhere', 'much', 'who', 'wherever', 'otherwise', 'further', 'she', 'during', 'per', 'does', 'how', 'none', 'do', 'as', 'itself', 'my', 'noone', 'sometimes

In [33]:
# Label every token of the Java passage as a stop word or not
for token in java_doc:
    if token.is_stop:
        is_stop = 'Stop Word'
    else:
        is_stop = 'NOT a stop word'
    print(f'{token.text:20} -> {is_stop}')

Created              -> NOT a stop word
in                   -> Stop Word
the                  -> Stop Word
early                -> NOT a stop word
1990s                -> NOT a stop word
by                   -> Stop Word
James                -> NOT a stop word
Gosling              -> NOT a stop word
at                   -> Stop Word
Sun                  -> NOT a stop word
Microsystems         -> NOT a stop word
,                    -> NOT a stop word

                    -> NOT a stop word
in                   -> Stop Word
the                  -> Stop Word
heart                -> NOT a stop word
of                   -> Stop Word
Silicon              -> NOT a stop word
Valley               -> NOT a stop word
,                    -> NOT a stop word
the                  -> Stop Word
Java                 -> NOT a stop word
programming          -> NOT a stop word
language             -> NOT a stop word
was                  -> Stop Word
born                 -> NOT a stop word
with          