# STEP 1

In [1]:
# Short two-sentence sample; it is tokenized, POS-tagged and chunked in STEP 2.
text_a = "Natural language processing is fun! This text is a sample text ."

In [2]:
# Longer news-style paragraph used for IOB encoding (STEP 3), Stanford NER (STEP 4) and spaCy (STEP 5).
# NOTE: the text contains typographic quotes (’ “ ”). In the recorded NLTK output they come out as
# separate tokens with unreliable tags (e.g. '’' -> NNP, 's' -> VBZ, 'ours.' keeps its period).
text_b = "Jensen Huang, the CEO of Nvidia, the nation’s most valuable semiconductor company, with a stock price of $645 a share and a market cap of $400 billion, is out to create the metaverse, what Huang describes “a virtual world that is a digital twin of ours.” Huang credits author Neal Stephenson’s Snow Crash, filled with collectives of shared 3-D spaces and virtually enhanced physical spaces that are extensions of the Internet, for conjuring the metaverse. This is already playing out with the massively popular online games like Fortnite and Minecraft, where users create richly imagined virtual worlds."

# STEP 2 

In [3]:
import nltk

## part a&b : POS tagging for text_a

In [4]:
# Split text_a into word and punctuation tokens.
# NOTE(review): no cell in this notebook downloads NLTK's tokenizer data, so this presumably
# relies on it being installed already — verify on a fresh environment.
tokens = nltk.word_tokenize(text_a)

In [5]:
# Make sure the POS-tagger model used by nltk.pos_tag is available locally.
# The recorded log shows the package was already up to date and the call returned True.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:
# Tag every token with its part of speech; the result is a list of (token, tag) pairs.
tags = nltk.pos_tag(tokens)

In [7]:
# Display the tagged tokens. Note that the recorded output tags 'fun' as RB.
tags

[('Natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN'),
 ('is', 'VBZ'),
 ('fun', 'RB'),
 ('!', '.'),
 ('This', 'DT'),
 ('text', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('sample', 'JJ'),
 ('text', 'NN'),
 ('.', '.')]

## part c&d&e&f : regular expression chunking and drawing its tree

In [8]:
# Noun-phrase rule: an optional determiner, any number of adjectives, then ONE singular noun.
# Because only a single <NN> closes the chunk, consecutive nouns end up in separate chunks
# (the recorded output has "Natural language" and "processing" as two NPs).
grammar = """
    NP: {<DT>?<JJ>*<NN>} # NP
    """

In [9]:
# Build a regular-expression chunk parser from the grammar string.
parser = nltk.RegexpParser(grammar)

In [10]:
# Chunk the tagged tokens of text_a; the printed result is a tree rooted at S with NP subtrees.
tree = parser.parse(tags)

In [11]:
# Print the whole tree first (subtrees() starts with the root), then each NP chunk, one per line.
print(*tree.subtrees(), sep="\n")

(S
  (NP Natural/JJ language/NN)
  (NP processing/NN)
  is/VBZ
  fun/RB
  !/.
  (NP This/DT text/NN)
  is/VBZ
  (NP a/DT sample/JJ text/NN)
  ./.)
(NP Natural/JJ language/NN)
(NP processing/NN)
(NP This/DT text/NN)
(NP a/DT sample/JJ text/NN)


In [12]:
# Render the chunk tree graphically.
# NOTE(review): this presumably opens a separate GUI window and waits for it to be closed,
# which would stall "Run All" on a headless machine — confirm.
tree.draw()

## part g: two other samples for chunking

### example 1 :

In [13]:
# Example 1 input: a sentence ending in a two-word proper name.
text = "hello! my name is Amirreza Seddighin"

In [14]:
# Tokenize and POS-tag the example sentence in one step (rebinds `tags` from the previous part).
tags = nltk.pos_tag(nltk.word_tokenize(text))

In [15]:
# Display the tagged tokens; both name tokens are tagged NNP in the recorded output.
tags

[('hello', 'NN'),
 ('!', '.'),
 ('my', 'PRP$'),
 ('name', 'NN'),
 ('is', 'VBZ'),
 ('Amirreza', 'NNP'),
 ('Seddighin', 'NNP')]

In [16]:
grammar = ('''
    NP: {<NNP>*} # NP
    ''')

In [17]:
# Build the chunk parser for the proper-name grammar.
parser = nltk.RegexpParser(grammar)

In [18]:
# Chunk the tagged example sentence; the recorded output groups both NNP tokens into one NP.
tree = parser.parse(tags)

In [19]:
# Print the whole tree first (subtrees() starts with the root), then each NP chunk, one per line.
print(*tree.subtrees(), sep="\n")

(S
  hello/NN
  !/.
  my/PRP$
  name/NN
  is/VBZ
  (NP Amirreza/NNP Seddighin/NNP))
(NP Amirreza/NNP Seddighin/NNP)


In [20]:
# Render the chunk tree graphically.
# NOTE(review): this presumably opens a separate GUI window and waits for it to be closed — confirm.
tree.draw()

### example 2 :

In [21]:
# Example 2 input: a sentence that starts with a possessive pronoun followed by a noun.
text = "My teacher is very good"

In [22]:
# Tokenize and POS-tag the example sentence in one step (rebinds `tags` again).
tags = nltk.pos_tag(nltk.word_tokenize(text))

In [23]:
# Display the tagged tokens; 'My' carries the tag PRP$ that the next grammar matches on.
tags

[('My', 'PRP$'),
 ('teacher', 'NN'),
 ('is', 'VBZ'),
 ('very', 'RB'),
 ('good', 'JJ')]

In [24]:
# Possessive rule: a possessive pronoun (PRP$) directly followed by a singular noun forms an NP.
# The `$` in the tag name must be backslash-escaped for the chunk grammar, so the literal is a
# raw string: in a normal string `\$` is an invalid escape sequence and triggers a warning.
grammar = (r'''
    NP: {<PRP\$> <NN>} # NP
    ''')

In [25]:
# Build the chunk parser for the possessive-pronoun grammar.
parser = nltk.RegexpParser(grammar)

In [26]:
# Chunk the tagged example sentence; the recorded output has "My teacher" as the only NP.
tree = parser.parse(tags)

In [27]:
# Print the whole tree first (subtrees() starts with the root), then each NP chunk, one per line.
print(*tree.subtrees(), sep="\n")

(S (NP My/PRP$ teacher/NN) is/VBZ very/RB good/JJ)
(NP My/PRP$ teacher/NN)


In [28]:
# Render the chunk tree graphically.
# NOTE(review): this presumably opens a separate GUI window and waits for it to be closed — confirm.
tree.draw()

# STEP 3:

In [29]:
from nltk.chunk import conlltags2tree, tree2conlltags

## part a&b: IOB encoding for text_b

In [30]:
# Tokenize and POS-tag the long paragraph text_b (rebinds `tags` from the STEP 2 examples).
tags = nltk.pos_tag(nltk.word_tokenize(text_b))

In [31]:
# Display the tagged tokens. In the recorded output the curly quote characters are tagged
# as ordinary words (e.g. '’' -> NNP, '“' -> VB), so tags next to them are unreliable.
tags

[('Jensen', 'NNP'),
 ('Huang', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('CEO', 'NNP'),
 ('of', 'IN'),
 ('Nvidia', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('nation', 'NN'),
 ('’', 'NNP'),
 ('s', 'VBZ'),
 ('most', 'RBS'),
 ('valuable', 'JJ'),
 ('semiconductor', 'NN'),
 ('company', 'NN'),
 (',', ','),
 ('with', 'IN'),
 ('a', 'DT'),
 ('stock', 'NN'),
 ('price', 'NN'),
 ('of', 'IN'),
 ('$', '$'),
 ('645', 'CD'),
 ('a', 'DT'),
 ('share', 'NN'),
 ('and', 'CC'),
 ('a', 'DT'),
 ('market', 'NN'),
 ('cap', 'NN'),
 ('of', 'IN'),
 ('$', '$'),
 ('400', 'CD'),
 ('billion', 'CD'),
 (',', ','),
 ('is', 'VBZ'),
 ('out', 'RP'),
 ('to', 'TO'),
 ('create', 'VB'),
 ('the', 'DT'),
 ('metaverse', 'NN'),
 (',', ','),
 ('what', 'WP'),
 ('Huang', 'NNP'),
 ('describes', 'VBZ'),
 ('“', 'VB'),
 ('a', 'DT'),
 ('virtual', 'JJ'),
 ('world', 'NN'),
 ('that', 'WDT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('digital', 'JJ'),
 ('twin', 'NN'),
 ('of', 'IN'),
 ('ours.', 'JJ'),
 ('”', 'NNP'),
 ('Huang', 'NNP'),
 ('credits', 'NNS'),
 ('a

In [32]:
# Same noun-phrase rule as in STEP 2: optional determiner, any adjectives, one singular noun.
# Proper nouns (NNP) are not covered, so names stay outside every chunk.
grammar = """
    NP: {<DT>?<JJ>*<NN>} # NP
    """

In [33]:
# Build the chunk parser for the noun-phrase grammar.
parser = nltk.RegexpParser(grammar)

In [34]:
# Chunk the POS-tagged tokens of text_b into NP subtrees.
tree = parser.parse(tags)

In [35]:
# Flatten the chunk tree into (word, POS tag, IOB tag) triples.
# In the recorded output B-NP marks the first token of a chunk, I-NP the following tokens of
# the same chunk, and O a token outside any chunk (e.g. 'Jensen' and 'Huang', which are NNP).
tree2conlltags(tree)

[('Jensen', 'NNP', 'O'),
 ('Huang', 'NNP', 'O'),
 (',', ',', 'O'),
 ('the', 'DT', 'O'),
 ('CEO', 'NNP', 'O'),
 ('of', 'IN', 'O'),
 ('Nvidia', 'NNP', 'O'),
 (',', ',', 'O'),
 ('the', 'DT', 'B-NP'),
 ('nation', 'NN', 'I-NP'),
 ('’', 'NNP', 'O'),
 ('s', 'VBZ', 'O'),
 ('most', 'RBS', 'O'),
 ('valuable', 'JJ', 'B-NP'),
 ('semiconductor', 'NN', 'I-NP'),
 ('company', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('with', 'IN', 'O'),
 ('a', 'DT', 'B-NP'),
 ('stock', 'NN', 'I-NP'),
 ('price', 'NN', 'B-NP'),
 ('of', 'IN', 'O'),
 ('$', '$', 'O'),
 ('645', 'CD', 'O'),
 ('a', 'DT', 'B-NP'),
 ('share', 'NN', 'I-NP'),
 ('and', 'CC', 'O'),
 ('a', 'DT', 'B-NP'),
 ('market', 'NN', 'I-NP'),
 ('cap', 'NN', 'B-NP'),
 ('of', 'IN', 'O'),
 ('$', '$', 'O'),
 ('400', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 (',', ',', 'O'),
 ('is', 'VBZ', 'O'),
 ('out', 'RP', 'O'),
 ('to', 'TO', 'O'),
 ('create', 'VB', 'O'),
 ('the', 'DT', 'B-NP'),
 ('metaverse', 'NN', 'I-NP'),
 (',', ',', 'O'),
 ('what', 'WP', 'O'),
 ('Huang', 'NNP', 'O'),
 

# STEP 4

In [36]:
from nltk.tag.stanford import StanfordNERTagger

In [37]:
# Stanford NER classifier model and jar, given relative to the notebook's working directory.
model_path = './stanford_data/english.all.3class.distsim.crf.ser.gz'
jar_path = './stanford_data/stanford-ner-4.2.0.jar'

# NLTK wrapper around the Stanford tagger.
# NOTE(review): presumably needs a Java runtime on PATH, and NLTK presumably marks the Stanford
# wrappers as deprecated in favour of its CoreNLP interface — confirm for the installed version.
ner_tagger = StanfordNERTagger(model_path, jar_path, encoding="utf8")

In [38]:
# Tokenize text_b for the NER tagger, which takes a list of tokens rather than raw text.
text_b_tokens = nltk.word_tokenize(text_b)

In [39]:
# Label every token with its entity class; the recorded output has PERSON, ORGANIZATION and O.
# NOTE: this rebinds `tags`, which held POS tags in STEP 3; from here on it holds
# (token, entity label) pairs.
tags = ner_tagger.tag(text_b_tokens)
tags

[('Jensen', 'PERSON'),
 ('Huang', 'PERSON'),
 (',', 'O'),
 ('the', 'O'),
 ('CEO', 'O'),
 ('of', 'O'),
 ('Nvidia', 'ORGANIZATION'),
 (',', 'O'),
 ('the', 'O'),
 ('nation', 'O'),
 ('’', 'O'),
 ('s', 'O'),
 ('most', 'O'),
 ('valuable', 'O'),
 ('semiconductor', 'O'),
 ('company', 'O'),
 (',', 'O'),
 ('with', 'O'),
 ('a', 'O'),
 ('stock', 'O'),
 ('price', 'O'),
 ('of', 'O'),
 ('$', 'O'),
 ('645', 'O'),
 ('a', 'O'),
 ('share', 'O'),
 ('and', 'O'),
 ('a', 'O'),
 ('market', 'O'),
 ('cap', 'O'),
 ('of', 'O'),
 ('$', 'O'),
 ('400', 'O'),
 ('billion', 'O'),
 (',', 'O'),
 ('is', 'O'),
 ('out', 'O'),
 ('to', 'O'),
 ('create', 'O'),
 ('the', 'O'),
 ('metaverse', 'O'),
 (',', 'O'),
 ('what', 'O'),
 ('Huang', 'PERSON'),
 ('describes', 'O'),
 ('“', 'O'),
 ('a', 'O'),
 ('virtual', 'O'),
 ('world', 'O'),
 ('that', 'O'),
 ('is', 'O'),
 ('a', 'O'),
 ('digital', 'O'),
 ('twin', 'O'),
 ('of', 'O'),
 ('ours.', 'O'),
 ('”', 'O'),
 ('Huang', 'PERSON'),
 ('credits', 'O'),
 ('author', 'O'),
 ('Neal', 'PERSON'),
 

In [40]:
# Tokens labelled PERSON. Labels are per token, so a multi-word name appears
# as separate entries (e.g. 'Jensen', 'Huang').
[word for word, label in tags if label == "PERSON"]

['Jensen', 'Huang', 'Huang', 'Huang', 'Neal', 'Stephenson']

In [41]:
# Tokens labelled ORGANIZATION (one entry per token).
[word for word, label in tags if label == "ORGANIZATION"]

['Nvidia']

# STEP 5

## part a and b

In [42]:
import spacy

In [44]:
# Load the spaCy pipeline named 'en_core_web_sm'.
# NOTE(review): the model package presumably has to be installed separately beforehand — confirm.
nlp = spacy.load('en_core_web_sm')

In [45]:
# Run the pipeline on the raw paragraph; later cells read noun chunks and entities from `doc`.
doc = nlp(text_b)

In [100]:
# Text of every noun chunk spaCy found in the paragraph, in document order.
[chunk.text for chunk in doc.noun_chunks]

['Jensen Huang',
 'the CEO',
 'Nvidia',
 'the nation’s most valuable semiconductor company',
 'a stock price',
 'a market cap',
 'the metaverse',
 'what',
 'Huang',
 '“a virtual world',
 'a digital twin',
 'ours',
 'Huang',
 'author Neal Stephenson’s Snow Crash',
 'collectives',
 'shared 3-D spaces',
 'virtually enhanced physical spaces',
 'extensions',
 'the Internet',
 'the metaverse',
 'the massively popular online games',
 'Fortnite',
 'Minecraft',
 'users',
 'virtual worlds']

## part c: PERSON and ORG and MONEY and CARDINAL entities

In [105]:
# Text of every named entity spaCy recognised, regardless of label.
[ent.text for ent in doc.ents]

['Jensen Huang',
 'Nvidia',
 '645',
 '$400 billion',
 'Huang',
 'Huang',
 'Neal Stephenson’s',
 'Snow Crash',
 '3']

In [106]:
# Entities labelled PERSON. The recorded output also lists 'Snow Crash' (a book title)
# and keeps the possessive in 'Neal Stephenson’s'.
[ent.text for ent in doc.ents if ent.label_ == "PERSON"]

['Jensen Huang', 'Huang', 'Huang', 'Neal Stephenson’s', 'Snow Crash']

In [107]:
# Entities labelled ORG.
[ent.text for ent in doc.ents if ent.label_ == "ORG"]

['Nvidia']

In [108]:
# Entities labelled MONEY. In the recorded output the first amount is '645', without its '$'.
[ent.text for ent in doc.ents if ent.label_ == "MONEY"]

['645', '$400 billion']

In [109]:
# Entities labelled CARDINAL; the recorded '3' comes from "3-D".
[ent.text for ent in doc.ents if ent.label_ == "CARDINAL"]

['3']