In [2]:
#6A
''' 
Aim: Part of speech Tagging and chunking of user defined text.
Part of speech POS: Labeling each word in a sentence with its corresponding part of speech.
Chunking: Groups words into larger meaningful units, often referred to as chunks'''

import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
from nltk import tokenize
from nltk import tag
from nltk import chunk
para = "Natural language processing (NLP) is a machine learning technology that gives computers the ability to interpret, manipulate, and comprehend human language."

sents = tokenize.sent_tokenize(para)
print("\nsentence tokenization\n===================\n",sents)

print("\nword tokenization\n===================\n")
for index in range(len(sents)):
    words = tokenize.word_tokenize(sents[index])

print(words)

#POS tagging
tagged_words = []
for index in range(len(sents)):
    tagged_words.append(tag.pos_tag(words))

print("\nPOS Tagging\n===========\n",tagged_words)

#Chunking
tree = []
for index in range(len(sents)):
    tree.append(chunk.ne_chunk(tagged_words[index]))

print("\nchunking\n========\n")
print("Tree: ",tree)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!



sentence tokenization
 ['Natural language processing (NLP) is a machine learning technology that gives computers the ability to interpret, manipulate, and comprehend human language.']

word tokenization

['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'machine', 'learning', 'technology', 'that', 'gives', 'computers', 'the', 'ability', 'to', 'interpret', ',', 'manipulate', ',', 'and', 'comprehend', 'human', 'language', '.']

POS Tagging
 [[('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('machine', 'NN'), ('learning', 'VBG'), ('technology', 'NN'), ('that', 'WDT'), ('gives', 'VBZ'), ('computers', 'NNS'), ('the', 'DT'), ('ability', 'NN'), ('to', 'TO'), ('interpret', 'VB'), (',', ','), ('manipulate', 'VB'), (',', ','), ('and', 'CC'), ('comprehend', 'VBP'), ('human', 'JJ'), ('language', 'NN'), ('.', '.')]]

chunking

Tree:  [Tree('S', [('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN

In [3]:
#6B
''' 
Aim: Named Entity recognition of user defined text.
Named Entity Recognition: It identifies and classifies entities like names,
locations, organizations, date and more in the given text'''

import spacy
#Loading English tokenizer, tagger, parser, and NER
nlp = spacy.load("en_core_web_sm")

#Processing whole documents
text = (
    "Natural language processing (NLP) is an "
    "interdisciplinary subfield of computer science and information "
    "retrieval. It is primarily concerned with giving computers the "
    "ability to support and manipulate human language. It involves "
    "processing natural language datasets, such as text corpora or "
    "speech corpora, using either rule-based or probabilistic machine "
    "learning approaches."
)
print("Original text:",text,"\n")

#Processing the text with spacy
doc = nlp(text)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

Original text: Natural language processing (NLP) is an interdisciplinary subfield of computer science and information retrieval. It is primarily concerned with giving computers the ability to support and manipulate human language. It involves processing natural language datasets, such as text corpora or speech corpora, using either rule-based or probabilistic machine learning approaches. 

Noun phrases: ['Natural language processing', 'NLP', 'an interdisciplinary subfield', 'computer science', 'information retrieval', 'It', 'computers', 'the ability', 'human language', 'It', 'natural language datasets', 'text corpora', 'speech corpora', 'either rule-based or probabilistic machine learning approaches']
Verbs: ['give', 'support', 'manipulate', 'involve', 'process', 'use', 'base', 'learn']


In [4]:
#6C
''' 
Aim: Named Entity recognition with diagram using NLTK corpus – treebank.
The Treebank corpus, part of the Penn
Treebank, consists of annotated linguistic data, 
including POS tags and syntactic structures.

The ne_chunk() function returns a tree structure where
named entities are labeled into categories such as PERSON, ORGANIZATION,
LOCATION, DATE, TIME, GPE (Geopolitical Entity), MONEY, and FACILITY'''

import nltk
nltk.download('treebank') # For parsed sentences
nltk.download('tagsets') # For part-of-speech tags
nltk.download('book') # For additional resources (needed for .draw())

from nltk.corpus import treebank

#Accessing the first tagged sentence
tagged_sentence = treebank.tagged_sents()[0]
print("First tagged sentence:\n", tagged_sentence)

#Accessing the first parsed sentence
parsed_sentence = treebank.parsed_sents()[0]
print("\nFirst parsed sentence structure:\n", parsed_sentence)

print(parsed_sentence) # or parsed_sentence.pretty_print() for a formatted output
print(parsed_sentence.pretty_print())

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping help\tagsets.zip.
[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\chat80.zip.
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\Artophilic\AppData\Roaming\nltk_data...
[nltk_data]

First tagged sentence:
 [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]

First parsed sentence structure:
 (S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))
(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))
                                                   

[nltk_data]    |   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data]    | 
[nltk_data]  Done downloading collection book
