### Rule-based English

In [None]:
'''
Implement POS tagging for a given English or Hindi sentence.

English text is tagged twice: with a hand-written regular-expression tagger
and with NLTK's pretrained statistical tagger. Hindi text is tagged with a
TnT tagger trained on the 'hindi.pos' file of the NLTK Indian corpus.
'''

In [35]:
# Prompt for the English text that the following cells clean, tokenize and tag.
# The cell blocks until the user submits a line of input.
paragraph = input("Please enter a small paragraph in English: ")

Please enter a small paragraph in English: Hello everyone I am now performing nlp experiment number 6. POS tagging is a foundational component of NLP, where each word in a sentence is assigned a specific grammatical label, such as noun, verb, adjective, or adverb, to help us better understand the syntactic structure and meaning of sentences.


In [36]:
# Echo the raw input back under a heading so it can be compared with the
# cleaned version produced later.
print("You entered the following paragraph:", paragraph, sep="\n")

You entered the following paragraph:
Hello everyone I am now performing nlp experiment number 6. POS tagging is a foundational component of NLP, where each word in a sentence is assigned a specific grammatical label, such as noun, verb, adjective, or adverb, to help us better understand the syntactic structure and meaning of sentences.


In [37]:
import re

# Two-pass normalisation of the raw paragraph:
#   inner call - delete every character that is not an ASCII letter, a digit
#                or whitespace (punctuation and non-ASCII text are removed);
#   outer call - squeeze each whitespace run to a single space, then trim
#                leading/trailing whitespace.
cleaned_text = re.sub(
    r'\s+',
    ' ',
    re.sub(r'[^a-zA-Z0-9\s]', '', paragraph),
).strip()

In [38]:
# Show the normalised text that will be tokenized next.
print(cleaned_text)

Hello everyone I am now performing nlp experiment number 6 POS tagging is a foundational component of NLP where each word in a sentence is assigned a specific grammatical label such as noun verb adjective or adverb to help us better understand the syntactic structure and meaning of sentences


In [39]:
import nltk
from nltk import word_tokenize, pos_tag

In [40]:
# Fetch the 'punkt' tokenizer data package into the local nltk_data directory.
# NOTE(review): newer NLTK releases reportedly load 'punkt_tab' instead of
# 'punkt' for word_tokenize - confirm against the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [41]:
# Split the cleaned paragraph into word tokens; both English taggers below
# (rule-based and statistical) consume this list.
words = word_tokenize(cleaned_text)

In [42]:
# Ordered (regex, tag) pairs for nltk.RegexpTagger. Patterns are tried from
# top to bottom and the first one that matches a token decides its tag, so
# more specific suffixes must come before more general ones and the catch-all
# must stay last.
custom_rules = [
    (r'.*ing$', 'VBG'),                # gerund / present participle
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd-person singular present
    (r'.*ould$', 'MD'),                # modals: would, could, should
    (r'.*\'s$', 'NN$'),                # possessive noun
    (r'.*s$', 'NNS'),                  # plural noun
    # Cardinal number with optional sign and optional decimal part. The dot is
    # escaped so that only a literal decimal point is accepted; an unescaped
    # '.' would match any character and tag tokens such as "3x14" as numbers.
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
    (r'.*', 'NN'),                     # fallback: everything else is a noun
]

In [43]:
# Build a tagger that labels each token from the (pattern, tag) pairs in
# custom_rules.
regex_tagger = nltk.RegexpTagger(custom_rules)

In [44]:
# Apply the rule-based tagger to the token list; the result is a list of
# (word, tag) tuples.
pos_tags = regex_tagger.tag(words)

In [45]:
# Heading on one line, then the full list of (word, tag) tuples on the next.
print("Rule Based POS tags for the paragraph:", pos_tags, sep="\n")

Rule Based POS tags for the paragraph:
[('Hello', 'NN'), ('everyone', 'NN'), ('I', 'NN'), ('am', 'NN'), ('now', 'NN'), ('performing', 'VBG'), ('nlp', 'NN'), ('experiment', 'NN'), ('number', 'NN'), ('6', 'CD'), ('POS', 'NN'), ('tagging', 'VBG'), ('is', 'NNS'), ('a', 'NN'), ('foundational', 'NN'), ('component', 'NN'), ('of', 'NN'), ('NLP', 'NN'), ('where', 'NN'), ('each', 'NN'), ('word', 'NN'), ('in', 'NN'), ('a', 'NN'), ('sentence', 'NN'), ('is', 'NNS'), ('assigned', 'VBD'), ('a', 'NN'), ('specific', 'NN'), ('grammatical', 'NN'), ('label', 'NN'), ('such', 'NN'), ('as', 'NNS'), ('noun', 'NN'), ('verb', 'NN'), ('adjective', 'NN'), ('or', 'NN'), ('adverb', 'NN'), ('to', 'NN'), ('help', 'NN'), ('us', 'NNS'), ('better', 'NN'), ('understand', 'NN'), ('the', 'NN'), ('syntactic', 'NN'), ('structure', 'NN'), ('and', 'NN'), ('meaning', 'VBG'), ('of', 'NN'), ('sentences', 'VBZ')]


### Statistics-based English

In [46]:
# Fetch the 'averaged_perceptron_tagger' model package into the local
# nltk_data directory; presumably this is the model behind pos_tag() below.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [47]:
# Tag the tokens with NLTK's pretrained statistical tagger. The function is
# reached through the `nltk` module rather than the bare imported name so the
# cell stays re-runnable even if `pos_tag` has been rebound to something else
# in the notebook namespace.
# Note: this overwrites the rule-based result stored under the same name.
pos_tags = nltk.pos_tag(words)

In [48]:
print("POS tags for the paragraph:")
# The loop variable is called `tag`, not `pos_tag`: a for-loop target stays
# bound at notebook scope after the loop ends, so reusing the name would
# replace the imported pos_tag() function with a string and break any cell
# that calls it afterwards.
for word, tag in pos_tags:
    print(f"{word}: {tag}")







POS tags for the paragraph:
Hello: NNP
everyone: NN
I: PRP
am: VBP
now: RB
performing: VBG
nlp: JJ
experiment: JJ
number: NN
6: CD
POS: NNP
tagging: NN
is: VBZ
a: DT
foundational: JJ
component: NN
of: IN
NLP: NNP
where: WRB
each: DT
word: NN
in: IN
a: DT
sentence: NN
is: VBZ
assigned: VBN
a: DT
specific: JJ
grammatical: JJ
label: NN
such: JJ
as: IN
noun: JJ
verb: NN
adjective: NN
or: CC
adverb: NN
to: TO
help: VB
us: PRP
better: JJR
understand: VBP
the: DT
syntactic: JJ
structure: NN
and: CC
meaning: NN
of: IN
sentences: NNS


In [50]:
# Count tokens carrying any Penn Treebank noun tag: singular (NN), plural
# (NNS), proper singular (NNP) and proper plural (NNPS). NNPS is included so
# plural proper nouns are not silently left out of the total.
noun_count = sum(
    1 for _, tag in pos_tags if tag in ("NN", "NNS", "NNP", "NNPS")
)

In [54]:
# Count tokens carrying any Penn Treebank verb tag: base form (VB), past tense
# (VBD), gerund (VBG), past participle (VBN), non-3rd-person present (VBP) and
# 3rd-person present (VBZ). VBD is included so past-tense verbs are counted.
verb_count = sum(
    1 for _, tag in pos_tags if tag in ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
)

In [55]:
# Report both totals, one per line.
print(
    f"Number of nouns : {noun_count}",
    f"Number of verbs : {verb_count}",
    sep="\n",
)

Number of nouns : 16
Number of verbs : 7


### Hindi

In [56]:
# Fetch the NLTK 'indian' corpus package; it provides the 'hindi.pos' file
# used further down to train the Hindi tagger.
nltk.download('indian')

[nltk_data] Downloading package indian to /root/nltk_data...
[nltk_data]   Unzipping corpora/indian.zip.


True

In [58]:
# Prompt for the Hindi text to tag. The cell blocks until input is submitted.
hindi_sentence = input("Please enter a small paragraph in Hindi:")

Please enter a small paragraph in Hindi:सभी को नमस्कार मैं अब एनएलपी प्रयोग संख्या 6 कर रहा हूं पीओएस टैगिंग एनएलपी का एक मूलभूत घटक है जहां एक वाक्य में प्रत्येक शब्द को एक विशिष्ट व्याकरणिक लेबल दिया जाता है जैसे कि संज्ञा क्रिया विशेषण या क्रिया विशेषण हमें वाक्यों की वाक्य संरचना और अर्थ को बेहतर ढंग से समझने में मदद करता है


In [59]:
# Tokenize the Hindi input with NLTK's default word tokenizer.
# NOTE(review): no language argument is passed, so the tokenizer's default
# rules apply - the output below looks right for whitespace-separated text,
# but confirm behaviour for Hindi punctuation such as the danda.
tokens = nltk.word_tokenize(hindi_sentence)

In [60]:
# Bare expression: display the token list as the cell's output for inspection.
tokens

['सभी',
 'को',
 'नमस्कार',
 'मैं',
 'अब',
 'एनएलपी',
 'प्रयोग',
 'संख्या',
 '6',
 'कर',
 'रहा',
 'हूं',
 'पीओएस',
 'टैगिंग',
 'एनएलपी',
 'का',
 'एक',
 'मूलभूत',
 'घटक',
 'है',
 'जहां',
 'एक',
 'वाक्य',
 'में',
 'प्रत्येक',
 'शब्द',
 'को',
 'एक',
 'विशिष्ट',
 'व्याकरणिक',
 'लेबल',
 'दिया',
 'जाता',
 'है',
 'जैसे',
 'कि',
 'संज्ञा',
 'क्रिया',
 'विशेषण',
 'या',
 'क्रिया',
 'विशेषण',
 'हमें',
 'वाक्यों',
 'की',
 'वाक्य',
 'संरचना',
 'और',
 'अर्थ',
 'को',
 'बेहतर',
 'ढंग',
 'से',
 'समझने',
 'में',
 'मदद',
 'करता',
 'है']

In [61]:
from nltk.corpus import indian
from nltk.tag import tnt

# Create an untrained TnT tagger, then fit it on the tagged Hindi sentences
# shipped in the 'hindi.pos' file of the NLTK Indian corpus.
tnt_pos_tagger = tnt.TnT()
train_data = indian.tagged_sents('hindi.pos')
tnt_pos_tagger.train(train_data)

In [62]:
# Tokenize the Hindi input and label every token with the trained TnT tagger.
# Words the tagger cannot label come back with the tag 'Unk' (see the output).
tagged_words = tnt_pos_tagger.tag(
    nltk.word_tokenize(hindi_sentence)
)
print(tagged_words)

[('सभी', 'QF'), ('को', 'PREP'), ('नमस्कार', 'Unk'), ('मैं', 'PRP'), ('अब', 'RB'), ('एनएलपी', 'Unk'), ('प्रयोग', 'NVB'), ('संख्या', 'NN'), ('6', 'Unk'), ('कर', 'VFM'), ('रहा', 'VAUX'), ('हूं', 'VAUX'), ('पीओएस', 'Unk'), ('टैगिंग', 'Unk'), ('एनएलपी', 'Unk'), ('का', 'PREP'), ('एक', 'QFNUM'), ('मूलभूत', 'Unk'), ('घटक', 'Unk'), ('है', 'VAUX'), ('जहां', 'NLOC'), ('एक', 'QFNUM'), ('वाक्य', 'Unk'), ('में', 'PREP'), ('प्रत्येक', 'Unk'), ('शब्द', 'Unk'), ('को', 'PREP'), ('एक', 'QFNUM'), ('विशिष्ट', 'Unk'), ('व्याकरणिक', 'Unk'), ('लेबल', 'Unk'), ('दिया', 'VAUX'), ('जाता', 'VAUX'), ('है', 'VAUX'), ('जैसे', 'PRP'), ('कि', 'CC'), ('संज्ञा', 'NN'), ('क्रिया', 'Unk'), ('विशेषण', 'Unk'), ('या', 'CC'), ('क्रिया', 'Unk'), ('विशेषण', 'Unk'), ('हमें', 'PRP'), ('वाक्यों', 'Unk'), ('की', 'PREP'), ('वाक्य', 'Unk'), ('संरचना', 'Unk'), ('और', 'CC'), ('अर्थ', 'Unk'), ('को', 'PREP'), ('बेहतर', 'Unk'), ('ढंग', 'Unk'), ('से', 'PREP'), ('समझने', 'Unk'), ('में', 'PREP'), ('मदद', 'NVB'), ('करता', 'VFM'), ('है', 'VAUX'