In [1]:
import nltk

In [2]:
# Sample news paragraph fed to the tokenization cell. It is written as
# adjacent string literals (one sentence each), which Python joins into a
# single string at compile time.
text = (
    "Britain’s communications intelligence agency GCHQ has issued a statement denying it wiretapped Donald Trump during the US presidential campaign. "
    "The unusual move by the agency came after White House Press Secretary Sean Spicer cited claims first made on US TV channel Fox News earlier this week. "
    "GCHQ responded that the allegations were “nonsense, utterly ridiculous and should be ignored”. "
    "Former judge Andrew Napolitano initially made the claims of GCHQ involvement. "
    "Mr Spicer quoted Mr Napolitano: “Three intelligence sources have informed Fox News that President Obama went outside the chain of command.”"
)

In [3]:
# 1- tokenization
# PunktSentenceTokenizer stays imported so any other cell that refers to it keeps working.
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize, word_tokenize

# Break the text down into sentences.
# sent_tokenize() uses NLTK's pre-trained English Punkt model. A bare
# PunktSentenceTokenizer() has not been trained on any text, so it has no
# learned abbreviations or collocations and can split on a period that does
# not actually end a sentence.
sents = sent_tokenize(text)
for sent in sents:
    print(sent)

print()

# Break the text down into word and punctuation tokens.
tokens = word_tokenize(text)
print(tokens)

Britain’s communications intelligence agency GCHQ has issued a statement denying it wiretapped Donald Trump during the US presidential campaign.
The unusual move by the agency came after White House Press Secretary Sean Spicer cited claims first made on US TV channel Fox News earlier this week.
GCHQ responded that the allegations were “nonsense, utterly ridiculous and should be ignored”.
Former judge Andrew Napolitano initially made the claims of GCHQ involvement.
Mr Spicer quoted Mr Napolitano: “Three intelligence sources have informed Fox News that President Obama went outside the chain of command.”

['Britain', '’', 's', 'communications', 'intelligence', 'agency', 'GCHQ', 'has', 'issued', 'a', 'statement', 'denying', 'it', 'wiretapped', 'Donald', 'Trump', 'during', 'the', 'US', 'presidential', 'campaign', '.', 'The', 'unusual', 'move', 'by', 'the', 'agency', 'came', 'after', 'White', 'House', 'Press', 'Secretary', 'Sean', 'Spicer', 'cited', 'claims', 'first', 'made', 'on', 'US', 'TV

In [4]:
# Stop-word removal
from nltk.corpus import stopwords

# Stored as a set for the membership test in the filter below.
stop_words = set(stopwords.words('english'))

# Keep every token whose lowercase form is not an English stop word.
# Only the comparison is lowercased; kept tokens retain their original casing.
filtered_words = list(filter(lambda tok: tok.lower() not in stop_words, tokens))

print(filtered_words)

['Britain', '’', 'communications', 'intelligence', 'agency', 'GCHQ', 'issued', 'statement', 'denying', 'wiretapped', 'Donald', 'Trump', 'US', 'presidential', 'campaign', '.', 'unusual', 'move', 'agency', 'came', 'White', 'House', 'Press', 'Secretary', 'Sean', 'Spicer', 'cited', 'claims', 'first', 'made', 'US', 'TV', 'channel', 'Fox', 'News', 'earlier', 'week', '.', 'GCHQ', 'responded', 'allegations', '“', 'nonsense', ',', 'utterly', 'ridiculous', 'ignored', '”', '.', 'Former', 'judge', 'Andrew', 'Napolitano', 'initially', 'made', 'claims', 'GCHQ', 'involvement', '.', 'Mr', 'Spicer', 'quoted', 'Mr', 'Napolitano', ':', '“', 'Three', 'intelligence', 'sources', 'informed', 'Fox', 'News', 'President', 'Obama', 'went', 'outside', 'chain', 'command', '.', '”']


In [5]:
# WordNet lookup: synsets of 'car' with their hyponyms and hypernyms
from nltk.corpus import wordnet

synsets = wordnet.synsets('car')
for synset in synsets:
    print("\nSynset = ", synset.name())
    print("\nDefinition = ", synset.definition())

    hyponyms, hypernyms = synset.hyponyms(), synset.hypernyms()

    # Both relation lists are printed the same way; an empty list prints nothing.
    for heading, related in (("\nHyponyms = ", hyponyms),
                             ("\nHypernyms = ", hypernyms)):
        if related:
            print(heading, related)


Synset =  car.n.01

Definition =  a motor vehicle with four wheels; usually propelled by an internal combustion engine

Hyponyms =  [Synset('ambulance.n.01'), Synset('beach_wagon.n.01'), Synset('bus.n.04'), Synset('cab.n.03'), Synset('compact.n.03'), Synset('convertible.n.01'), Synset('coupe.n.01'), Synset('cruiser.n.01'), Synset('electric.n.01'), Synset('gas_guzzler.n.01'), Synset('hardtop.n.01'), Synset('hatchback.n.01'), Synset('horseless_carriage.n.01'), Synset('hot_rod.n.01'), Synset('jeep.n.01'), Synset('limousine.n.01'), Synset('loaner.n.02'), Synset('minicar.n.01'), Synset('minivan.n.01'), Synset('model_t.n.01'), Synset('pace_car.n.01'), Synset('racer.n.02'), Synset('roadster.n.01'), Synset('sedan.n.01'), Synset('sport_utility.n.01'), Synset('sports_car.n.01'), Synset('stanley_steamer.n.01'), Synset('stock_car.n.01'), Synset('subcompact.n.01'), Synset('touring_car.n.01'), Synset('used-car.n.01')]

Hypernyms =  [Synset('motor_vehicle.n.01')]

Synset =  car.n.02

Definition =  a

In [6]:
#Lemmatisation
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer


def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the WordNet POS constant used by lemmatize().

    Tags starting with 'J', 'V' or 'R' map to adjective, verb and adverb.
    Any other tag (including an empty one) falls back to noun, which is also
    the default that lemmatize() uses when no POS is given.
    """
    prefix_to_pos = {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)


lemmatizer = WordNetLemmatizer()

# lemmatize() treats every word as a noun unless a POS is passed, which
# leaves inflected verbs such as 'issued', 'denying' or 'went' unchanged.
# Tag the tokens first and give each token's part of speech to the lemmatizer.
lemmatized_tokens = [
    lemmatizer.lemmatize(token, pos=_wordnet_pos(tag))
    for token, tag in pos_tag(tokens)
]

print("Original tokens:", tokens)
print()
print("Lemmatized tokens:", lemmatized_tokens)

Original tokens: ['Britain', '’', 's', 'communications', 'intelligence', 'agency', 'GCHQ', 'has', 'issued', 'a', 'statement', 'denying', 'it', 'wiretapped', 'Donald', 'Trump', 'during', 'the', 'US', 'presidential', 'campaign', '.', 'The', 'unusual', 'move', 'by', 'the', 'agency', 'came', 'after', 'White', 'House', 'Press', 'Secretary', 'Sean', 'Spicer', 'cited', 'claims', 'first', 'made', 'on', 'US', 'TV', 'channel', 'Fox', 'News', 'earlier', 'this', 'week', '.', 'GCHQ', 'responded', 'that', 'the', 'allegations', 'were', '“', 'nonsense', ',', 'utterly', 'ridiculous', 'and', 'should', 'be', 'ignored', '”', '.', 'Former', 'judge', 'Andrew', 'Napolitano', 'initially', 'made', 'the', 'claims', 'of', 'GCHQ', 'involvement', '.', 'Mr', 'Spicer', 'quoted', 'Mr', 'Napolitano', ':', '“', 'Three', 'intelligence', 'sources', 'have', 'informed', 'Fox', 'News', 'that', 'President', 'Obama', 'went', 'outside', 'the', 'chain', 'of', 'command', '.', '”']

Lemmatized tokens: ['Britain', '’', 's', 'commu

In [7]:
# Stemming with the Porter algorithm
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Apply the stemmer to each token, preserving token order.
stemmed_tokens = list(map(stemmer.stem, tokens))

# end="\n\n" leaves one blank line between the two listings.
print("Original tokens:", tokens, end="\n\n")
print("Stemmed tokens:", stemmed_tokens)

Original tokens: ['Britain', '’', 's', 'communications', 'intelligence', 'agency', 'GCHQ', 'has', 'issued', 'a', 'statement', 'denying', 'it', 'wiretapped', 'Donald', 'Trump', 'during', 'the', 'US', 'presidential', 'campaign', '.', 'The', 'unusual', 'move', 'by', 'the', 'agency', 'came', 'after', 'White', 'House', 'Press', 'Secretary', 'Sean', 'Spicer', 'cited', 'claims', 'first', 'made', 'on', 'US', 'TV', 'channel', 'Fox', 'News', 'earlier', 'this', 'week', '.', 'GCHQ', 'responded', 'that', 'the', 'allegations', 'were', '“', 'nonsense', ',', 'utterly', 'ridiculous', 'and', 'should', 'be', 'ignored', '”', '.', 'Former', 'judge', 'Andrew', 'Napolitano', 'initially', 'made', 'the', 'claims', 'of', 'GCHQ', 'involvement', '.', 'Mr', 'Spicer', 'quoted', 'Mr', 'Napolitano', ':', '“', 'Three', 'intelligence', 'sources', 'have', 'informed', 'Fox', 'News', 'that', 'President', 'Obama', 'went', 'outside', 'the', 'chain', 'of', 'command', '.', '”']

Stemmed tokens: ['britain', '’', 's', 'commun',

In [8]:
# Named-entity recognition
from nltk import word_tokenize, pos_tag, ne_chunk

# POS-tag the tokens, then chunk the tagged tokens into a tree.
tags = pos_tag(tokens)
ner_tags = ne_chunk(tags)

# Only children of the chunk tree that are themselves Trees are reported;
# every other child is skipped.
for entity in ner_tags:
    if not isinstance(entity, nltk.Tree):
        continue
    # Each leaf is a (word, tag) pair; only the words make up the entity name.
    entity_words = [pair[0] for pair in entity.leaves()]
    entity_label = entity.label()
    entity_name = " ".join(entity_words)
    print(f"Entity: {entity_name}, Label: {entity_label}")

Entity: Britain, Label: GPE
Entity: GCHQ, Label: ORGANIZATION
Entity: Donald Trump, Label: PERSON
Entity: US, Label: ORGANIZATION
Entity: White House, Label: FACILITY
Entity: Sean Spicer, Label: PERSON
Entity: US, Label: ORGANIZATION
Entity: Fox News, Label: PERSON
Entity: GCHQ, Label: ORGANIZATION
Entity: Andrew Napolitano, Label: PERSON
Entity: GCHQ, Label: ORGANIZATION
Entity: Spicer, Label: PERSON
Entity: Fox News, Label: PERSON
Entity: Obama, Label: PERSON


In [9]:
import nltk
from nltk import PCFG
from nltk import parse,ChartParser

# Decide whether a sentence is ambiguous by counting its parse trees under a
# small hand-written context-free grammar.
grammar1 = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")

sentence = input("enter the sentence:")
# The words are split on whitespace and passed to the parser unchanged.
sent = sentence.split()
rd_parse = nltk.ChartParser(grammar1)

count = 0
try:
    # The parser raises ValueError when a word is not covered by the grammar;
    # report that instead of ending the cell with a traceback.
    for tree in rd_parse.parse(sent):
        print(tree)
        tree.pretty_print()
        print()
        count += 1
except ValueError as err:
    print(err)

# More than one tree means ambiguous and exactly one means unambiguous.
# Zero trees means the grammar cannot derive the sentence at all, which must
# not be reported as "unambiguous".
if count > 1:
    print("the given sentence is ambiguous")
elif count == 1:
    print("the given sentence is unambiguous")
else:
    print("the given sentence cannot be parsed by this grammar")

enter the sentence: I shot an elephant in my pajamas


(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
     S                                       
  ___|______________                          
 |                  VP                       
 |         _________|__________               
 |        VP                   PP            
 |    ____|___              ___|___           
 |   |        NP           |       NP        
 |   |     ___|_____       |    ___|_____     
 NP  V   Det        N      P  Det        N   
 |   |    |         |      |   |         |    
 I  shot  an     elephant  in  my     pajamas


(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))
     S                                   
  ___|__________                          
 |              VP                       
 |    __________|______                   
 |   |                 NP                
 |   |     ____________|___               
 |   |    |     |      