In [1]:
import spacy

In [2]:
# Load the model

nlp = spacy.load("en_core_web_sm")

In [4]:
#Let's try to print a POS for each token

doc = nlp("Asma work very hard. I hope she can have a better life")

for token in doc:
    print(token, "|", token.pos_)

Asma | PROPN
work | VERB
very | ADV
hard | ADV
. | PUNCT
I | PRON
hope | VERB
she | PRON
can | AUX
have | VERB
a | DET
better | ADJ
life | NOUN


In [6]:
#Let's explain the POS

doc = nlp("Asma work very hard. I hope she can have a better life")

for token in doc:
    print(token, "|", token.pos_, spacy.explain(token.pos_))

Asma | PROPN proper noun
work | VERB verb
very | ADV adverb
hard | ADV adverb
. | PUNCT punctuation
I | PRON pronoun
hope | VERB verb
she | PRON pronoun
can | AUX auxiliary
have | VERB verb
a | DET determiner
better | ADJ adjective
life | NOUN noun


In [11]:
#Tags

doc = nlp("Asma worked very hard. I hope she can have a better life")

for token in doc:
    print(token, "|", token.pos_, spacy.explain(token.pos_), "|",token.tag_, "|",spacy.explain(token.tag_) )

Asma | PROPN proper noun | NNP | noun, proper singular
worked | VERB verb | VBD | verb, past tense
very | ADV adverb | RB | adverb
hard | ADV adverb | RB | adverb
. | PUNCT punctuation | . | punctuation mark, sentence closer
I | PRON pronoun | PRP | pronoun, personal
hope | VERB verb | VBP | verb, non-3rd person singular present
she | PRON pronoun | PRP | pronoun, personal
can | AUX auxiliary | MD | verb, modal auxiliary
have | VERB verb | VB | verb, base form
a | DET determiner | DT | determiner
better | ADJ adjective | JJR | adjective, comparative
life | NOUN noun | NN | noun, singular or mass


This way you can have a tag for more information about the POS and explanation about the tag.

For example Worked is a verb in the past tense! the tag was VBD

In [13]:
# You can use indexing and nlp pipeline

doc = nlp("He works in LG")

doc[0]

print(doc[0].text, "|", doc[1].tag_, '|', spacy.explain(doc[1].tag_))


He | VBZ | verb, 3rd person singular present


# Let's do some text analysis

In [21]:
money_text = """Apple today announced  financial results for its fiscal 2022 third quarter ended June 25, 2022. The Company posted a June quarter revenue record of $83.0 billion, up 2 percent year over year, and quarterly earnings per diluted share of $1.20."""

In [23]:
doc = nlp(money_text)

for token in doc:
    print(token, "|", token.pos_, "|", spacy.explain(token.pos_))
    
# Now we can auto delete some tags
# You can also detect space

Apple | PROPN | proper noun
today | NOUN | noun
announced | VERB | verb
  | SPACE | space
financial | ADJ | adjective
results | NOUN | noun
for | ADP | adposition
its | PRON | pronoun
fiscal | ADJ | adjective
2022 | NUM | numeral
third | ADJ | adjective
quarter | NOUN | noun
ended | VERB | verb
June | PROPN | proper noun
25 | NUM | numeral
, | PUNCT | punctuation
2022 | NUM | numeral
. | PUNCT | punctuation
The | DET | determiner
Company | PROPN | proper noun
posted | VERB | verb
a | DET | determiner
June | PROPN | proper noun
quarter | NOUN | noun
revenue | NOUN | noun
record | NOUN | noun
of | ADP | adposition
$ | SYM | symbol
83.0 | NUM | numeral
billion | NUM | numeral
, | PUNCT | punctuation
up | ADV | adverb
2 | NUM | numeral
percent | NOUN | noun
year | NOUN | noun
over | ADP | adposition
year | NOUN | noun
, | PUNCT | punctuation
and | CCONJ | coordinating conjunction
quarterly | ADJ | adjective
earnings | NOUN | noun
per | ADP | adposition
diluted | ADJ | adjective
share | NOU

In [25]:
# Detect them

for token in doc:
    if token.pos_ in ["SPACE","PUNCT"]:
        print(token, "|", token.pos_, spacy.explain(token.pos_))

  | SPACE space
, | PUNCT punctuation
. | PUNCT punctuation
, | PUNCT punctuation
, | PUNCT punctuation
. | PUNCT punctuation


In [27]:
# Get ride of them

filtered_tokens = []

for token in doc:
    if token.pos_ not in ["SPACE","PUNCT"]:
        filtered_tokens.append(token)
        

In [29]:
# We don't have garbage anymore

filtered_tokens

[Apple,
 today,
 announced,
 financial,
 results,
 for,
 its,
 fiscal,
 2022,
 third,
 quarter,
 ended,
 June,
 25,
 2022,
 The,
 Company,
 posted,
 a,
 June,
 quarter,
 revenue,
 record,
 of,
 $,
 83.0,
 billion,
 up,
 2,
 percent,
 year,
 over,
 year,
 and,
 quarterly,
 earnings,
 per,
 diluted,
 share,
 of,
 $,
 1.20]

In [31]:
count_POS = doc.count_by(spacy.attrs.POS)
count_POS

{96: 4,
 92: 11,
 100: 3,
 103: 1,
 84: 5,
 85: 5,
 95: 1,
 93: 7,
 97: 5,
 90: 2,
 99: 2,
 86: 1,
 89: 1}

In [33]:
# What is 96
doc.vocab[96].text

# So we have 96 PROPN in the text

'PROPN'

In [40]:
# Count everything

for x,y in count_POS.items():
    print(doc.vocab[x].text, '|',y)

PROPN | 4
NOUN | 11
VERB | 3
SPACE | 1
ADJ | 5
ADP | 5
PRON | 1
NUM | 7
PUNCT | 5
DET | 2
SYM | 2
ADV | 1
CCONJ | 1
