In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [4]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1ec28fdf7d0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1ec27f42330>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1ec28fd6570>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1ec292e2ed0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1ec292c33d0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1ec28fd63b0>)]

In [5]:
doc = nlp('Elon Musk flew to the mars yesterday. He carried biriyani masala with him.')

In [8]:
for token in doc:
    print(token,' | ',token.pos,' | ',token.pos_,' | ',spacy.explain(token.pos_),' | ',token.tag_,' | ',spacy.explain(token.tag_))

Elon  |  96  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
Musk  |  96  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
flew  |  100  |  VERB  |  verb  |  VBD  |  verb, past tense
to  |  85  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
the  |  90  |  DET  |  determiner  |  DT  |  determiner
mars  |  92  |  NOUN  |  noun  |  NNS  |  noun, plural
yesterday  |  92  |  NOUN  |  noun  |  NN  |  noun, singular or mass
.  |  97  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer
He  |  95  |  PRON  |  pronoun  |  PRP  |  pronoun, personal
carried  |  100  |  VERB  |  verb  |  VBD  |  verb, past tense
biriyani  |  96  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
masala  |  92  |  NOUN  |  noun  |  NN  |  noun, singular or mass
with  |  85  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
him  |  95  |  PRON  |  pronoun  |  PRP  |  pronoun, personal
.  |  97  |  PUNCT  |  punctuation 

In [10]:
earning_text = '''
 Microsoft Corp. today announced the following results for the quarter ended December 31, 2022, as compared to the corresponding period of last fiscal year:

·        Revenue was $52.7 billion and increased 2%  

·        Operating income was $20.4 billion GAAP and $21.6 billion non-GAAP, and decreased 8% and 3%, respectively

·        Net income was $16.4 billion GAAP and $17.4 billion non-GAAP, and decreased 12% and 7%, respectively

·        Diluted earnings per share was $2.20 GAAP and $2.32 non-GAAP, and decreased 11% and 6%, respectively
'''

In [11]:
doc = nlp(earning_text)

In [17]:
filter_tokens = []
for token in doc:
    if token.pos_ not in ['SPACE','X','PUNCT']:
        filter_tokens.append(token)

In [18]:
filter_tokens

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter,
 ended,
 December,
 31,
 2022,
 as,
 compared,
 to,
 the,
 corresponding,
 period,
 of,
 last,
 fiscal,
 year,
 Revenue,
 was,
 $,
 52.7,
 billion,
 and,
 increased,
 2,
 %,
 Operating,
 income,
 was,
 $,
 20.4,
 billion,
 GAAP,
 and,
 $,
 21.6,
 billion,
 non,
 -,
 GAAP,
 and,
 decreased,
 8,
 %,
 and,
 3,
 %,
 respectively,
 Net,
 income,
 was,
 $,
 16.4,
 billion,
 GAAP,
 and,
 $,
 17.4,
 billion,
 non,
 -,
 GAAP,
 and,
 decreased,
 12,
 %,
 and,
 7,
 %,
 respectively,
 Diluted,
 earnings,
 per,
 share,
 was,
 $,
 2.20,
 GAAP,
 and,
 $,
 2.32,
 non,
 -,
 GAAP,
 and,
 decreased,
 11,
 %,
 and,
 6,
 %,
 respectively]

In [19]:
for token in filter_tokens:
    print(token,' | ',token.pos,' | ',token.pos_,' | ',spacy.explain(token.pos_),' | ',token.tag_,' | ',spacy.explain(token.tag_))

Microsoft  |  96  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
Corp.  |  96  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
today  |  92  |  NOUN  |  noun  |  NN  |  noun, singular or mass
announced  |  100  |  VERB  |  verb  |  VBD  |  verb, past tense
the  |  90  |  DET  |  determiner  |  DT  |  determiner
following  |  100  |  VERB  |  verb  |  VBG  |  verb, gerund or present participle
results  |  92  |  NOUN  |  noun  |  NNS  |  noun, plural
for  |  85  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
the  |  90  |  DET  |  determiner  |  DT  |  determiner
quarter  |  92  |  NOUN  |  noun  |  NN  |  noun, singular or mass
ended  |  100  |  VERB  |  verb  |  VBD  |  verb, past tense
December  |  96  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
31  |  93  |  NUM  |  numeral  |  CD  |  cardinal number
2022  |  93  |  NUM  |  numeral  |  CD  |  cardinal number
as  |  98  |  SCONJ  |  subordinating conjunction  |  IN  | 

In [21]:
 count = doc.count_by(spacy.attrs.POS)

In [22]:
count

{103: 10,
 96: 3,
 92: 29,
 100: 10,
 90: 3,
 85: 4,
 93: 21,
 97: 13,
 98: 1,
 84: 4,
 87: 4,
 99: 7,
 89: 10,
 86: 3}

In [26]:
doc.vocab[99].text

'SYM'

In [29]:
for k,y in count.items():
    print(doc.vocab[k].text,' -> ',y)

SPACE  ->  10
PROPN  ->  3
NOUN  ->  29
VERB  ->  10
DET  ->  3
ADP  ->  4
NUM  ->  21
PUNCT  ->  13
SCONJ  ->  1
ADJ  ->  4
AUX  ->  4
SYM  ->  7
CCONJ  ->  10
ADV  ->  3


In [30]:
doc = nlp('Inflation rose again in April, continuing a climb that has pushed consumers to the brink and is threatening the economic expansion, the Bureau of Labor Statistics reported Wednesday.\n\nThe consumer price index, a broad-based measure of prices for goods and services, increased 8.3% from a year ago, higher than the Dow Jones estimate for an 8.1% gain. That represented a slight ease from Marchâ€™s peak but was still close to the highest level since the summer of 1982.\n\nRemoving volatile food and ene')

In [34]:
Nouns = []
for token in doc:
    if token.pos_ in ['NOUN']:
        Nouns.append(token)

In [35]:
Nouns

[Inflation,
 climb,
 consumers,
 brink,
 expansion,
 consumer,
 price,
 index,
 measure,
 prices,
 goods,
 services,
 %,
 year,
 estimate,
 %,
 gain,
 ease,
 Marchâ€,
 ™,
 peak,
 level,
 summer,
 food,
 ene]

In [36]:
count = doc.count_by(spacy.attrs.POS)
count

{92: 25,
 100: 9,
 86: 4,
 85: 11,
 96: 7,
 97: 9,
 90: 12,
 95: 2,
 87: 3,
 89: 4,
 84: 6,
 103: 2,
 93: 3,
 94: 1,
 98: 1}

In [37]:
for k,y in count.items():
    print(doc.vocab[k].text,' -> ',y)

NOUN  ->  25
VERB  ->  9
ADV  ->  4
ADP  ->  11
PROPN  ->  7
PUNCT  ->  9
DET  ->  12
PRON  ->  2
AUX  ->  3
CCONJ  ->  4
ADJ  ->  6
SPACE  ->  2
NUM  ->  3
PART  ->  1
SCONJ  ->  1
