In [2]:
import spacy

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
text = nlp("Hey there! I'm using a NLP text for studies")

In [5]:
for token in text:
    print(token.text, token.pos_, token.dep_)

Hey INTJ intj
there ADV ROOT
! PUNCT punct
I PRON nsubj
'm AUX aux
using VERB ROOT
a DET det
NLP PROPN compound
text NOUN dobj
for ADP prep
studies NOUN pobj


In [6]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2510cbb3b30>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2510cbe7f90>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2510cba6520>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2510cba6880>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2510cb82dc0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2510cbe1340>)]

In [7]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

# Span

In [8]:
text = nlp("In 2017, 'people started talking about Day Zero', a possible time when taps would need to be turned off. Real worries that a city of four and a half million people would need to queue in the streets to get water. 'The Dead Sea in Jordan: 30 years ago, water reached this motorway, but rivers were diverted for farming.'")

In [9]:
text

In 2017, 'people started talking about Day Zero', a possible time when taps would need to be turned off. Real worries that a city of four and a half million people would need to queue in the streets to get water. 'The Dead Sea in Jordan: 30 years ago, water reached this motorway, but rivers were diverted for farming.'

In [10]:
type(text)

spacy.tokens.doc.Doc

In [11]:
span_text = text[3:11]

In [12]:
span_text

'people started talking about Day Zero'

In [13]:
type(span_text)

spacy.tokens.span.Span

In [14]:
for sentences in text.sents:
    print(sentences)

In 2017, 'people started talking about Day Zero', a possible time when taps would need to be turned off.
Real worries that a city of four and a half million people would need to queue in the streets to get water. '
The Dead Sea in Jordan: 30 years ago, water reached this motorway, but rivers were diverted for farming.'


In [15]:
text = nlp("Sentence 1: Test. Sentence 2: Test. Sentence 3: Test.")

In [16]:
for sentences in text.sents:
    print(sentences)

Sentence 1: Test.
Sentence 2: Test.
Sentence 3: Test.


In [17]:
text[6]

2

In [18]:
text[6].is_sent_start

False

## Tokenization

In [19]:
text = nlp("Always bring cinnamon buns on a deep-sea diving expedition. The truth is that you pay for your lifestyle in hours. He always wore his sunglasses at night.")

In [20]:
for token in text:
    print(token.text)

Always
bring
cinnamon
buns
on
a
deep
-
sea
diving
expedition
.
The
truth
is
that
you
pay
for
your
lifestyle
in
hours
.
He
always
wore
his
sunglasses
at
night
.


In [21]:
for entity in text.ents:
    print(entity, entity.label_)
    print(str(spacy.explain(entity.label_)))

lifestyle in hours TIME
Times smaller than a day
night TIME
Times smaller than a day


In [22]:
for chunk in text.noun_chunks:
    print(chunk, chunk.label_)
    print(str(spacy.explain(chunk.label_)))
    print("\n")

cinnamon buns NP
noun phrase


a deep-sea diving expedition NP
noun phrase


The truth NP
noun phrase


you NP
noun phrase


your lifestyle NP
noun phrase


hours NP
noun phrase


He NP
noun phrase


his sunglasses NP
noun phrase


night NP
noun phrase




In [23]:
from spacy import displacy

In [24]:
displacy.render(text, jupyter = True, options = {"distance": 100})

In [25]:
displacy.render(text, jupyter = True, style = "ent")

## Stemmizer

In [26]:
import nltk
nltk.download("rslp")

[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\Caio\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

In [27]:
stemmer = nltk.stem.RSLPStemmer()

In [28]:
lista = ["run", "running", "runner", "runs", "ran", "commodities", "fairly"]

In [34]:
for word in lista:
    print(word + " ------> " + stemmer.stem(word))

TypeError: Argument 'string' has incorrect type (expected str, got list)

In [30]:
from nltk.stem.porter import PorterStemmer

In [31]:
p_stemmer = PorterStemmer()

In [32]:
for word in lista:
    print(word + " ------> " + p_stemmer.stem(word))

run ------> run
running ------> run
runner ------> runner
runs ------> run
ran ------> ran
commodities ------> commod
fairly ------> fairli


## Lemma

In [37]:
text = nlp("runner is running and We always like to run and he ran yesterday again at 20pm")

In [43]:
for word in text:
    print(f"{word.text} -----> {word.lemma_}")

runner -----> runner
is -----> be
running -----> run
and -----> and
We -----> we
always -----> always
like -----> like
to -----> to
run -----> run
and -----> and
he -----> he
ran -----> run
yesterday -----> yesterday
again -----> again
at -----> at
20pm -----> 20pm


## Stopwords 

In [44]:
nlp.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [45]:
len(nlp.Defaults.stop_words)

326

In [46]:
type(nlp.Defaults.stop_words)

set

In [47]:
nlp.Defaults.stop_words.add("btw")

In [48]:
nlp.vocab["btw"].is_stop = True

In [49]:
len(nlp.Defaults.stop_words)

327