# 3.2 Parts of Speech (POS) Tagging

In [42]:
import spacy
import pandas as pd

In [43]:
nlp = spacy.load('en_core_web_sm')
# if you are running this for the first time, or recieve an error "Can't find model 'en_core_web_sm'", 
# then please run the follwing in your terminal: python -m spacy download en_core_web_sm
print(nlp)

<spacy.lang.en.English object at 0x00000245C78DA000>


In [73]:
# our text is from jane austin's 'emma'
# we have removed punctuation, lowercased but left in stop words
#emma_ja = "emma woodhouse handsome clever and rich with a comfortable home and happy disposition seemed to unite some of the best blessings of existence and had lived nearly twentyone years in the world with very little to distress or vex her she was the youngest of the two daughters of a most affectionate indulgent father and had in consequence of her sisters marriage been mistress of his house from a very early period her mother had died too long ago for her to have more than an indistinct remembrance of her caresses and her place had been supplied by an excellent woman as governess who had fallen little short of a mother in affection sixteen years had miss taylor been in mr woodhouses family less as a governess than a friend very fond of both daughters but particularly of emma between them it was more the intimacy of sisters even before miss taylor had ceased to hold the nominal office of governess the mildness of her temper had hardly allowed her to impose any restraint and the shadow of authority being now long passed away they had been living together as friend and friend very mutually attached and emma doing just what she liked highly esteeming miss taylors judgment but directed chiefly by her own"
#print(emma_ja)

import requests
# let's fetch an article from rediff.com.
# you can replace this with any other article url.
url = "https://m.rediff.com/news/commentary/2025/jun/22/strikes-obliterated-irans-nuclear-ambitions-us/3377db9392bcb2ccaffde3f69c56f2c5"
response = requests.get(url)

# check if the request was successful
if response.status_code != 200:
    print(f"Failed to retrieve article: {response.status_code}")
else:
    print(f"Successfully retrieved article: {response.status_code}")    
#print(response.text)  # Print the first 1000 characters of the response text
text = response.text

# parse the html to extract the main content
from bs4 import BeautifulSoup 
soup = BeautifulSoup(text, 'html.parser')
# print the title of the article
title = soup.title.string if soup.title else 'No title found'
print(f"Title of the article: {title}")

# find the main content of the article
# this will depend on the structure of the html, you may need to adjust this.
# for example, if the main content is in a <div> with class "article-content", you would do:
#main_content = soup.find_all('div')          
#print(main_content)  # Print the first 1000 characters of the main content
# if the main content is in a <p> tag, you can do:
main_content = soup.find_all('p')
para_content = ""
if not main_content:
    para_content = "No main content found."
else:      
    for paragraph in main_content:
        para_content += paragraph.get_text().lower() + "\n"
# print the first 1000 characters of the text
print(f"Extracted text: {para_content[:1000]}...")  # Print the first 1000 characters of the text

#remove stop words
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
# assign our stop words to a variable
en_stopwords = stopwords.words('english')
# keep the words in the sentance if the word is not in the list of stop words
para_content = ' '.join([word for word in para_content.split() if word not in (en_stopwords)])
print(f"Text after removing stop words: {para_content[:1000]}...")  # Print the first 1000 characters of the text

#remove punctuation
para_content_no_punc = ""
pattern_to_find = r"[^\w\s]" 
# [^ ] means "not", \w means word and \s means whitespace: so find anything that is not a word or a space
import re
for string in para_content:
    no_punct_string = re.sub(pattern_to_find, "", string)
    para_content_no_punc = ' '.join(no_punct_string)
print(f"Text after removing punctuation: {para_content_no_punc[:1000]}...")  # Print the first 1000 characters of the text

# now we can process the text with spacy

Successfully retrieved article: 200
Title of the article: Strikes 'obliterated' Iran's nuclear ambitions: US
Extracted text: 
aviation watchdog directorate general of civil aviation (dgca) on tuesday said surveillance conducted at major airports revealed several defects in the aviation ecosystem, including multiple cases wherein the defects reappeared on...

the rajya sabha secretariat has verified the signatures of 44 of the 55 mps who had signed a notice to bring a motion for the removal of allahabad high court judge shekhar yadav over his 'hate speech' even as kapil sibal and nine others...
as per israeli media, at least four people have been killed and several injured after an iranian missile struck a residential building in beersheba.
news
business
movies
cricket
sports
get ahead
...
Text after removing stop words: aviation watchdog directorate general civil aviation (dgca) tuesday said surveillance conducted major airports revealed several defects aviation ecosystem, including mu

In [55]:
# create a spacy doc from our text - this will generate tokens and their assosciated pos tags
#spacy_doc = nlp(emma_ja)
spacy_doc = nlp(para_content)

In [56]:
# extract the tokens and pos tags into a dataframe
pos_df = pd.DataFrame(columns=['token', 'pos_tag'])

In [57]:
for token in spacy_doc:
    pos_df = pd.concat([pos_df,
                       pd.DataFrame.from_records([{'token': token.text, 'pos_tag': token.pos_}])], ignore_index=True)

In [58]:
pos_df.head(15)

Unnamed: 0,token,pos_tag
0,us,PROPN
1,attack,VERB
2,iranian,ADJ
3,nuclear,ADJ
4,facilities,NOUN
5,triggered,VERB
6,fears,NOUN
7,wider,ADJ
8,regional,ADJ
9,conflict,NOUN


In [59]:
# token frequency count
pos_df_counts = pos_df.groupby(['token','pos_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)
pos_df_counts.head(10)

Unnamed: 0,token,pos_tag,counts
0,'s,PART,3
1,",",PUNCT,2
3,.,PUNCT,2
45,party,PROPN,2
2,-,PUNCT,1
5,aadmi,PROPN,1
6,aam,PROPN,1
7,ahead,ADV,1
4,...,PUNCT,1
9,also,ADV,1


In [60]:
# counts of pos_tags
pos_df_poscounts = pos_df_counts.groupby(['pos_tag'])['token'].count().sort_values(ascending=False)
pos_df_poscounts.head(10)

pos_tag
NOUN     19
PROPN    17
VERB     13
ADJ       8
PUNCT     4
ADV       3
PART      1
Name: token, dtype: int64

In [61]:
# see most common nouns
nouns = pos_df_counts[pos_df_counts.pos_tag == "NOUN"][0:10]
nouns

Unnamed: 0,token,pos_tag,counts
10,area,NOUN,1
15,business,NOUN,1
17,conflict,NOUN,1
20,countries,NOUN,1
21,cricket,NOUN,1
22,day,NOUN,1
24,facilities,NOUN,1
25,fears,NOUN,1
16,calling,NOUN,1
37,man,NOUN,1


In [62]:
# see most common verbs
verbs = pos_df_counts[pos_df_counts.pos_tag == "ADJ"][0:10]
verbs

Unnamed: 0,token,pos_tag,counts
28,gopal,ADJ,1
31,iranian,ADJ,1
38,many,ADJ,1
43,nuclear,ADJ,1
42,nilambur,ADJ,1
44,objectionable,ADJ,1
48,regional,ADJ,1
62,wider,ADJ,1
