In [1]:
import spacy

In [2]:
nlp=spacy.load('en_core_web_sm')

In [3]:
doc=nlp('Ali flew to mars yesterday. He carried biryani masala with him')

In [4]:
for token in doc:
    print(token)

Ali
flew
to
mars
yesterday
.
He
carried
biryani
masala
with
him


In [5]:
for token in doc:
    print(token,' | ',token.pos_,' | ',spacy.explain(token.pos_))

Ali  |  PROPN  |  proper noun
flew  |  VERB  |  verb
to  |  ADP  |  adposition
mars  |  NOUN  |  noun
yesterday  |  NOUN  |  noun
.  |  PUNCT  |  punctuation
He  |  PRON  |  pronoun
carried  |  VERB  |  verb
biryani  |  ADJ  |  adjective
masala  |  NOUN  |  noun
with  |  ADP  |  adposition
him  |  PRON  |  pronoun


In [6]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [7]:
doc=nlp('Wow! Dr.Strange made 265 million $ on the very first day')
for token in doc:
    print(token,' | ',token.pos_,' | ',spacy.explain(token.pos_),' | ',token.tag_,' | ',spacy.explain(token.tag_))

Wow  |  INTJ  |  interjection  |  UH  |  interjection
!  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer
Dr.  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
Strange  |  PROPN  |  proper noun  |  NNP  |  noun, proper singular
made  |  VERB  |  verb  |  VBD  |  verb, past tense
265  |  NUM  |  numeral  |  CD  |  cardinal number
million  |  NUM  |  numeral  |  CD  |  cardinal number
$  |  NUM  |  numeral  |  CD  |  cardinal number
on  |  ADP  |  adposition  |  IN  |  conjunction, subordinating or preposition
the  |  DET  |  determiner  |  DT  |  determiner
very  |  ADV  |  adverb  |  RB  |  adverb
first  |  ADJ  |  adjective  |  JJ  |  adjective (English), other noun-modifier (Chinese)
day  |  NOUN  |  noun  |  NN  |  noun, singular or mass


In [8]:
doc=nlp('He quits the job')

In [9]:
doc[1]

quits

In [10]:
print(doc[1],' | ',doc[1].tag_,' | ',spacy.explain(doc[1].tag_))

quits  |  VBZ  |  verb, 3rd person singular present


In [11]:
doc=nlp('He quit the job')
print(doc[1],' | ',doc[1].tag_,' | ',spacy.explain(doc[1].tag_))

quit  |  VBD  |  verb, past tense


In [12]:
earning_text='''Microsoft Corp. today announced the following results for the quarter ended March 31, 2024, as compared to the corresponding period of last fiscal year:

·        Revenue was $61.9 billion and increased 17%

·        Operating income was $27.6 billion and increased 23%

·        Net income was $21.9 billion and increased 20%

·        Diluted earnings per share was $2.94 and increased 20%'''

In [13]:
doc=nlp(earning_text)

In [14]:
for token in doc:
    print(token,' | ',token.tag_,' | ',spacy.explain(token.tag_))

Microsoft  |  NNP  |  noun, proper singular
Corp.  |  NNP  |  noun, proper singular
today  |  NN  |  noun, singular or mass
announced  |  VBD  |  verb, past tense
the  |  DT  |  determiner
following  |  VBG  |  verb, gerund or present participle
results  |  NNS  |  noun, plural
for  |  IN  |  conjunction, subordinating or preposition
the  |  DT  |  determiner
quarter  |  NN  |  noun, singular or mass
ended  |  VBN  |  verb, past participle
March  |  NNP  |  noun, proper singular
31  |  CD  |  cardinal number
,  |  ,  |  punctuation mark, comma
2024  |  CD  |  cardinal number
,  |  ,  |  punctuation mark, comma
as  |  IN  |  conjunction, subordinating or preposition
compared  |  VBN  |  verb, past participle
to  |  IN  |  conjunction, subordinating or preposition
the  |  DT  |  determiner
corresponding  |  JJ  |  adjective (English), other noun-modifier (Chinese)
period  |  NN  |  noun, singular or mass
of  |  IN  |  conjunction, subordinating or preposition
last  |  JJ  |  adjective (E

In [15]:
for token in doc: 
    if spacy.explain(token.tag_)!='superfluous punctuation':
        print(token)

Microsoft
Corp.
today
announced
the
following
results
for
the
quarter
ended
March
31
,
2024
,
as
compared
to
the
corresponding
period
of
last
fiscal
year
:



       
Revenue
was
$
61.9
billion
and
increased
17
%



       
Operating
income
was
$
27.6
billion
and
increased
23
%



       
Net
income
was
$
21.9
billion
and
increased
20
%



       
Diluted
earnings
per
share
was
$
2.94
and
increased
20
%


In [16]:
filtered_token=[]
for token in doc:
    if token.pos_ not in ['SPACE','X','PUNCT']:
        filtered_token.append(token)

In [17]:
filtered_token

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter,
 ended,
 March,
 31,
 2024,
 as,
 compared,
 to,
 the,
 corresponding,
 period,
 of,
 last,
 fiscal,
 year,
 Revenue,
 was,
 $,
 61.9,
 billion,
 and,
 increased,
 17,
 %,
 Operating,
 income,
 was,
 $,
 27.6,
 billion,
 and,
 increased,
 23,
 %,
 Net,
 income,
 was,
 $,
 21.9,
 billion,
 and,
 increased,
 20,
 %,
 Diluted,
 earnings,
 per,
 share,
 was,
 $,
 2.94,
 and,
 increased,
 20,
 %]

In [18]:
count=doc.count_by(spacy.attrs.POS)
count

{96: 3,
 92: 14,
 100: 10,
 90: 3,
 85: 4,
 93: 13,
 97: 7,
 98: 1,
 84: 4,
 103: 8,
 87: 4,
 99: 4,
 89: 4}

In [19]:
for k,v in count.items():
    print(doc.vocab[k].text,' | ',v)

PROPN  |  3
NOUN  |  14
VERB  |  10
DET  |  3
ADP  |  4
NUM  |  13
PUNCT  |  7
SCONJ  |  1
ADJ  |  4
SPACE  |  8
AUX  |  4
SYM  |  4
CCONJ  |  4


In [21]:
with open('news_story.txt') as f:
    text=f.readlines()
text

['Inflation rose again in April, continuing a climb that has pushed consumers to the brink and is threatening the economic expansion, the Bureau of Labor Statistics reported Wednesday.\n',
 '\n',
 'The consumer price index, a broad-based measure of prices for goods and services, increased 8.3% from a year ago, higher than the Dow Jones estimate for an 8.1% gain. That represented a slight ease from Marchâ€™s peak but was still close to the highest level since the summer of 1982.\n',
 '\n',
 'Removing volatile food and energy prices, so-called core CPI still rose 6.2%, against expectations for a 6% gain, clouding hopes that inflation had peaked in March.\n',
 '\n',
 'The month-over-month gains also were higher than expectations â€” 0.3% on headline CPI versus the 0.2% estimate and a 0.6% increase for core, against the outlook for a 0.4% gain.\n',
 '\n',
 'The price gains also meant that workers continued to lose ground. Real wages adjusted for inflation decreased 0.1% on the month despit

In [22]:
text=' '.join(text)
text

'Inflation rose again in April, continuing a climb that has pushed consumers to the brink and is threatening the economic expansion, the Bureau of Labor Statistics reported Wednesday.\n \n The consumer price index, a broad-based measure of prices for goods and services, increased 8.3% from a year ago, higher than the Dow Jones estimate for an 8.1% gain. That represented a slight ease from Marchâ€™s peak but was still close to the highest level since the summer of 1982.\n \n Removing volatile food and energy prices, so-called core CPI still rose 6.2%, against expectations for a 6% gain, clouding hopes that inflation had peaked in March.\n \n The month-over-month gains also were higher than expectations â€” 0.3% on headline CPI versus the 0.2% estimate and a 0.6% increase for core, against the outlook for a 0.4% gain.\n \n The price gains also meant that workers continued to lose ground. Real wages adjusted for inflation decreased 0.1% on the month despite a nominal increase of 0.3% in a

In [23]:
nlp=spacy.load('en_core_web_sm')

In [24]:
doc=nlp(text)

In [25]:
nouns=[token for token in doc if token.pos_=='NOUN']

In [26]:
nouns

[Inflation,
 climb,
 consumers,
 brink,
 expansion,
 consumer,
 price,
 index,
 measure,
 prices,
 goods,
 services,
 %,
 year,
 estimate,
 %,
 gain,
 ease,
 Marchâ€,
 ™,
 peak,
 level,
 summer,
 food,
 energy,
 prices,
 core,
 %,
 expectations,
 %,
 gain,
 hopes,
 inflation,
 month,
 month,
 gains,
 expectations,
 %,
 headline,
 %,
 estimate,
 %,
 increase,
 core,
 outlook,
 %,
 gain,
 price,
 gains,
 workers,
 ground,
 wages,
 inflation,
 %,
 month,
 increase,
 %,
 earnings,
 year,
 earnings,
 %,
 earnings,
 %,
 Inflation,
 threat,
 recovery,
 pandemic,
 economy,
 stage,
 year,
 growth,
 level,
 prices,
 pump,
 grocery,
 stores,
 problem,
 inflation,
 areas,
 housing,
 auto,
 sales,
 host,
 areas,
 officials,
 problem,
 interest,
 rate,
 hikes,
 year,
 pledges,
 inflation,
 %,
 goal,
 ™,
 data,
 job,
 Credits]

In [29]:
print('The number of nouns in the text is: ',len(nouns))

The number of nouns in the text is:  98


In [30]:
num=[token for token in doc if token.pos_=='NUM']

In [31]:
num

[8.3,
 8.1,
 1982,
 6.2,
 6,
 â€,
 0.3,
 0.2,
 0.6,
 0.4,
 0.1,
 0.3,
 2.6,
 5.5,
 2021,
 1984,
 one,
 two,
 two,
 2]

In [32]:
print('The number of numbers in the text is: ',len(num))

The number of numbers in the text is:  20


In [35]:
count2=doc.count_by(spacy.attrs.POS)
count2

{92: 98,
 100: 27,
 86: 15,
 85: 39,
 96: 17,
 97: 32,
 90: 34,
 95: 4,
 87: 13,
 89: 10,
 84: 23,
 103: 7,
 93: 20,
 94: 4,
 98: 8,
 101: 1}

In [36]:
for k,v in count2.items():
    print(doc.vocab[k].text,' | ',v)

NOUN  |  98
VERB  |  27
ADV  |  15
ADP  |  39
PROPN  |  17
PUNCT  |  32
DET  |  34
PRON  |  4
AUX  |  13
CCONJ  |  10
ADJ  |  23
SPACE  |  7
NUM  |  20
PART  |  4
SCONJ  |  8
X  |  1
