In [6]:
# One-time environment setup: uncomment and run these lines if spaCy, tabulate
# or the large English spaCy model are not installed in the kernel's environment.
#! pip install -q spacy
#! pip install -q tabulate
#! python -m spacy download en_core_web_lg


import nltk
# Download the NLTK resources this notebook relies on (presumably the data
# behind the tokenizer and POS-tagger calls made in later cells).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from collections import Counter
import spacy
from tabulate import tabulate
# Shared spaCy pipeline object; later cells call nlp(...) on their input text.
nlp = spacy.load('en_core_web_lg')


[nltk_data] Downloading package punkt to /home/that-guy-
[nltk_data]     martin/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/that-guy-martin/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
2023-02-19 21:12:09.151099: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-19 21:12:09.407607: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-19 21:12:09.407644: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignor

# Basic NLP pipeline


*   Sentence tokenizer
*   Word tokenizer
*   Part-of-speech tagger
*   Noun extraction
*   Verb extraction



In [7]:
# Sample paragraph processed by the NLTK and spaCy cells below.
# The line breaks inside the literal are part of the string, so they show up
# as "\n" inside the tokenized sentences.
text = u"""
Dealing with textual data is very crucial so to handle these text data we need some 
basic text processing steps. Most of the processing steps covered in this section are 
commonly used in NLP and involve the combination of several steps into a single 
executable flow. This is usually referred to as the NLP pipeline. These flow 
can be a combination of tokenization, stemming, word frequency, parts of 
speech tagging, etc.
"""

# Split the paragraph into sentences, then each sentence into word tokens.
sentenses = nltk.sent_tokenize(text)
words = list(map(nltk.word_tokenize, sentenses))

# Tag every token: one list of (token, tag) pairs per sentence.
tagged_wt = [nltk.pos_tag(tokens) for tokens in words]

# POS tags treated as nouns and as verbs when filtering the tagged tokens.
noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

# Per-sentence sequence of tags only (tokens dropped).
patternPOS = [[pos for _, pos in sentence] for sentence in tagged_wt]

# Per-sentence tokens whose tag falls in the noun / verb groups.
nouns = [[tok for tok, pos in sentence if pos in noun_tags] for sentence in tagged_wt]
verbs = [[tok for tok, pos in sentence if pos in verb_tags] for sentence in tagged_wt]





In [9]:
# Print every intermediate result of the NLTK pipeline, one labelled line per stage.
for label, value in (
    ("Sentences are:", sentenses),
    ("Words are: ", words),
    ("POS are: ", tagged_wt),
    ("POS pattern : ", patternPOS),
    ("Extracted nouns are: ", nouns),
    ("Extracted verbs are: ", verbs),
):
    print(label, value)


Sentences are: ['\nDealing with textual data is very crucial so to handle these text data we need some \nbasic text processing steps.', 'Most of the processing steps covered in this section are \ncommonly used in NLP and involve the combination of several steps into a single \nexecutable flow.', 'This is usually referred to as the NLP pipeline.', 'These flow \ncan be a combination of tokenization, stemming, word frequency, parts of \nspeech tagging, etc.']
Words are:  [['Dealing', 'with', 'textual', 'data', 'is', 'very', 'crucial', 'so', 'to', 'handle', 'these', 'text', 'data', 'we', 'need', 'some', 'basic', 'text', 'processing', 'steps', '.'], ['Most', 'of', 'the', 'processing', 'steps', 'covered', 'in', 'this', 'section', 'are', 'commonly', 'used', 'in', 'NLP', 'and', 'involve', 'the', 'combination', 'of', 'several', 'steps', 'into', 'a', 'single', 'executable', 'flow', '.'], ['This', 'is', 'usually', 'referred', 'to', 'as', 'the', 'NLP', 'pipeline', '.'], ['These', 'flow', 'can', 'b

In [10]:
# Run the spaCy pipeline over the whole sample paragraph.
doc = nlp(text)

# Count nouns by lemma so that inflected forms of a word share one entry.
noun_lemmas = [token.lemma_ for token in doc if token.pos_ == 'NOUN']
noun_counter = Counter(noun_lemmas)

# Show the five most frequent noun lemmas as a plain-text table.
top_nouns = noun_counter.most_common(5)
print(tabulate(top_nouns, headers=['Noun', 'Count']))

Noun           Count
-----------  -------
step               3
datum              2
text               2
processing         2
combination        2


# Dependency parsing

In [11]:
# Parse the third sentence of the sample text and render its dependency tree
# inline in the notebook.
doc = nlp(sentenses[2])
dep_options = {'distance': 140}
spacy.displacy.render(doc, style='dep', jupyter=True, options=dep_options)

# Named Entity Extraction

In [12]:
# Run named-entity recognition on the second sentence of the sample text.
# (Another sentence to try here: "Jill laughed at John Johnson.")
doc = nlp(sentenses[1])

# Entity-level view: one row per recognised entity span.
entity_types = [(span.text, span.label_) for span in doc.ents]
print(tabulate(entity_types, headers=['Entity', 'Entity Type']))
print()

# Token-level view: IOB annotation and entity type for every token.
token_entity_info = [(tok.text, tok.ent_iob_, tok.ent_type_) for tok in doc]
print(tabulate(token_entity_info, headers=['Token', 'IOB Annotation', 'Entity Type']))

Entity    Entity Type
--------  -------------
NLP       ORG

Token        IOB Annotation    Entity Type
-----------  ----------------  -------------
Most         O
of           O
the          O
processing   O
steps        O
covered      O
in           O
this         O
section      O
are          O
             O
commonly     O
used         O
in           O
NLP          B                 ORG
and          O
involve      O
the          O
combination  O
of           O
several      O
steps        O
into         O
a            O
single       O
             O
executable   O
flow         O
.            O


In [13]:
# Named-entity recognition on a short standalone sentence.
doc = nlp(u"My name is Jack and I live in India.")

# One (text, label) row per recognised entity, rendered as a plain-text table.
entity_types = [(span.text, span.label_) for span in doc.ents]
entity_table = tabulate(entity_types, headers=['Entity', 'Entity Type'])
print(entity_table)

Entity    Entity Type
--------  -------------
Jack      PERSON
India     GPE
