In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
text = nlp("Hey there! I'm using a NLP text for studies")

In [10]:
for token in text:
    print(token.text, token.pos_, token.dep_)

Hey INTJ intj
there ADV ROOT
! PUNCT punct
I PRON nsubj
'm AUX aux
using VERB ROOT
a DET det
NLP PROPN compound
text NOUN dobj
for ADP prep
studies NOUN pobj


In [6]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x253da307b30>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x253da33b270>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x253da2c9ee0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x253da35e160>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x253da2db380>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x253da2e4600>)]

In [7]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

# Span

In [14]:
text = nlp("In 2017, 'people started talking about Day Zero', a possible time when taps would need to be turned off. Real worries that a city of four and a half million people would need to queue in the streets to get water. 'The Dead Sea in Jordan: 30 years ago, water reached this motorway, but rivers were diverted for farming.'")

In [17]:
text

In 2017, 'people started talking about Day Zero', a possible time when taps would need to be turned off. Real worries that a city of four and a half million people would need to queue in the streets to get water. 'The Dead Sea in Jordan: 30 years ago, water reached this motorway, but rivers were diverted for farming.'

In [18]:
type(text)

spacy.tokens.doc.Doc

In [23]:
span_text = text[3:11]

In [24]:
span_text

'people started talking about Day Zero'

In [25]:
type(span_text)

spacy.tokens.span.Span

In [26]:
for sentences in text.sents:
    print(sentences)

In 2017, 'people started talking about Day Zero', a possible time when taps would need to be turned off.
Real worries that a city of four and a half million people would need to queue in the streets to get water. '
The Dead Sea in Jordan: 30 years ago, water reached this motorway, but rivers were diverted for farming.'


In [31]:
text = nlp("Sentence 1: Test. Sentence 2: Test. Sentence 3: Test.")

In [32]:
for sentences in text.sents:
    print(sentences)

Sentence 1: Test.
Sentence 2: Test.
Sentence 3: Test.


In [34]:
text[6]

2

In [36]:
text[6].is_sent_start

False

## Tokenization

In [37]:
text = nlp("Always bring cinnamon buns on a deep-sea diving expedition. The truth is that you pay for your lifestyle in hours. He always wore his sunglasses at night.")

In [41]:
for token in text:
    print(token.text)

Always
bring
cinnamon
buns
on
a
deep
-
sea
diving
expedition
.
The
truth
is
that
you
pay
for
your
lifestyle
in
hours
.
He
always
wore
his
sunglasses
at
night
.


In [46]:
for entity in text.ents:
    print(entity, entity.label_)
    print(str(spacy.explain(entity.label_)))

lifestyle in hours TIME
Times smaller than a day
night TIME
Times smaller than a day


In [48]:
for chunk in text.noun_chunks:
    print(chunk, chunk.label_)
    print(str(spacy.explain(chunk.label_)))
    print("\n")

cinnamon buns NP
noun phrase


a deep-sea diving expedition NP
noun phrase


The truth NP
noun phrase


you NP
noun phrase


your lifestyle NP
noun phrase


hours NP
noun phrase


He NP
noun phrase


his sunglasses NP
noun phrase


night NP
noun phrase




In [49]:
from spacy import displacy

In [51]:
displacy.render(text, jupyter = True, options = {"distance": 100})

In [52]:
displacy.render(text, jupyter = True, style = "ent")

## Stemmizer

In [54]:
import nltk
nltk.download("rslp")

[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\Caio\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping stemmers\rslp.zip.


True

In [55]:
stemmer = nltk.stem.RSLPStemmer()

In [56]:
lista = ["run", "running", "runner", "runs", "ran", "commodities", "fairly"]

In [58]:
for word in lista:
    print(word + " ------> " + stemmer.stem(word))

run ------> run
running ------> running
runner ------> runn
runs ------> rum
ran ------> ran
commodities ------> commoditi
fairly ------> fairly


In [60]:
from nltk.stem.porter import PorterStemmer

In [61]:
p_stemmer = PorterStemmer()

In [65]:
for word in lista:
    print(word + " ------> " + p_stemmer.stem(word))

run ------> run
running ------> run
runner ------> runner
runs ------> run
ran ------> ran
commodities ------> commod
fairly ------> fairli
