IMPORTING NLTK LIBRARIES

In [1]:
import nltk 
from nltk.tokenize import sent_tokenize, word_tokenize #For tokenization
from nltk.corpus import stopwords #For stopwords

import re #For regular expressions
from nltk.stem import WordNetLemmatizer #For Lemmatization
from  nltk.stem import PorterStemmer #For stemming

In [2]:
#Sample passage (two sentences about AI) that every NLP step below operates on.
#The literal is split across lines purely for readability; adjacent string
#literals are concatenated into one unchanged string.
example_text = (
    "Artificial intelligence (AI) is intelligence demonstrated by machines, "
    "as opposed to the natural intelligence displayed by animals including humans. "
    "AI research has been defined as the field of study of intelligent agents, "
    "which refers to any system that perceives its environment and takes actions "
    "that maximize its chance of achieving its goals."
)

SENTENCE TOKENIZATION AND WORD TOKENIZATION

In [3]:
#Sentence tokenization: split the sample text into a list of sentences and show it
print(sent_tokenize(text=example_text))

['Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to the natural intelligence displayed by animals including humans.', 'AI research has been defined as the field of study of intelligent agents, which refers to any system that perceives its environment and takes actions that maximize its chance of achieving its goals.']


In [4]:
#Showing the tokenized sentences in a more readable form:
#one numbered sentence per line, each followed by a blank line

for number, sentence in enumerate(sent_tokenize(example_text), start=1):
    print(f"Sentence number {number}: {sentence}")
    print()

Sentence number 1: Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to the natural intelligence displayed by animals including humans.

Sentence number 2: AI research has been defined as the field of study of intelligent agents, which refers to any system that perceives its environment and takes actions that maximize its chance of achieving its goals.



In [5]:
#Word tokenization of the raw text; punctuation marks come out as separate tokens
original_tokens = word_tokenize(text=example_text)
print(original_tokens)

['Artificial', 'intelligence', '(', 'AI', ')', 'is', 'intelligence', 'demonstrated', 'by', 'machines', ',', 'as', 'opposed', 'to', 'the', 'natural', 'intelligence', 'displayed', 'by', 'animals', 'including', 'humans', '.', 'AI', 'research', 'has', 'been', 'defined', 'as', 'the', 'field', 'of', 'study', 'of', 'intelligent', 'agents', ',', 'which', 'refers', 'to', 'any', 'system', 'that', 'perceives', 'its', 'environment', 'and', 'takes', 'actions', 'that', 'maximize', 'its', 'chance', 'of', 'achieving', 'its', 'goals', '.']


Note: As we can see, the word tokenization output contains many stop symbols and stopwords, so let's clean up the text as follows. We start by dealing with the punctuation symbols, known as stop symbols, and then with the semantically low-significance words, known as stopwords.

In [6]:
#Stripping punctuation: every character that is not an ASCII letter
#(punctuation, digits, ...) is replaced by a single space

cleaned_sent = re.sub(pattern="[^a-zA-Z]", repl=" ", string=example_text)
print(cleaned_sent)

Artificial intelligence  AI  is intelligence demonstrated by machines  as opposed to the natural intelligence displayed by animals including humans  AI research has been defined as the field of study of intelligent agents  which refers to any system that perceives its environment and takes actions that maximize its chance of achieving its goals 


In [7]:
#Tokenizing the punctuation-free text again so that only words remain
cleaned_words = word_tokenize(text=cleaned_sent)
print(cleaned_words)

['Artificial', 'intelligence', 'AI', 'is', 'intelligence', 'demonstrated', 'by', 'machines', 'as', 'opposed', 'to', 'the', 'natural', 'intelligence', 'displayed', 'by', 'animals', 'including', 'humans', 'AI', 'research', 'has', 'been', 'defined', 'as', 'the', 'field', 'of', 'study', 'of', 'intelligent', 'agents', 'which', 'refers', 'to', 'any', 'system', 'that', 'perceives', 'its', 'environment', 'and', 'takes', 'actions', 'that', 'maximize', 'its', 'chance', 'of', 'achieving', 'its', 'goals']


DEALING WITH STOPWORDS

In [8]:
#Fetch the English stopwords and keep them in a set for fast membership tests
stop_words = {*stopwords.words("english")}
#Uncomment to inspect the full stopword set:
#print(stop_words)

In [9]:
#Filtering the list of words by removing the stopwords.
#The comparison is done on the lower-cased word because the NLTK English
#stopword list is lower-case; without this, capitalised stopwords such as
#"The" or "It" (e.g. at the start of a sentence) would not be removed.
#The original casing of the kept words is preserved.

filtered_word_list = [w for w in cleaned_words if w.lower() not in stop_words]

print(filtered_word_list)

['Artificial', 'intelligence', 'AI', 'intelligence', 'demonstrated', 'machines', 'opposed', 'natural', 'intelligence', 'displayed', 'animals', 'including', 'humans', 'AI', 'research', 'defined', 'field', 'study', 'intelligent', 'agents', 'refers', 'system', 'perceives', 'environment', 'takes', 'actions', 'maximize', 'chance', 'achieving', 'goals']


Note: The above list of words can be called the logically and semantically significant results of word-tokenization

In [10]:
#Rebuilding a single space-separated sentence from the filtered words

filtered_sentence = str.join(" ", filtered_word_list)
print(filtered_sentence)

Artificial intelligence AI intelligence demonstrated machines opposed natural intelligence displayed animals including humans AI research defined field study intelligent agents refers system perceives environment takes actions maximize chance achieving goals


PERFORMING STEMMING AND LEMMATIZATION

In [11]:
#Creating the Porter stemmer and the WordNet lemmatizer used in the next cell

ps, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [12]:
#Stem and lemmatize every filtered word, producing one list per technique.
#lemmatize() is called without a part-of-speech argument.
#NOTE(review): NLTK presumably treats each word as a noun in that case, which
#would explain why verb forms such as 'demonstrated' stay unchanged - confirm.
stemmed_word_list = [ps.stem(token) for token in filtered_word_list]
lemmatized_word_list = [lemmatizer.lemmatize(token) for token in filtered_word_list]

#Report both lists, separated by a blank line
print("The list of stemmed versions of the words: ")
print(stemmed_word_list)
print()
print("The list of lemmatized versions of the words: ")
print(lemmatized_word_list)

The list of stemmed versions of the words: 
['artifici', 'intellig', 'ai', 'intellig', 'demonstr', 'machin', 'oppos', 'natur', 'intellig', 'display', 'anim', 'includ', 'human', 'ai', 'research', 'defin', 'field', 'studi', 'intellig', 'agent', 'refer', 'system', 'perceiv', 'environ', 'take', 'action', 'maxim', 'chanc', 'achiev', 'goal']

The list of lemmatized versions of the words: 
['Artificial', 'intelligence', 'AI', 'intelligence', 'demonstrated', 'machine', 'opposed', 'natural', 'intelligence', 'displayed', 'animal', 'including', 'human', 'AI', 'research', 'defined', 'field', 'study', 'intelligent', 'agent', 'refers', 'system', 'perceives', 'environment', 'take', 'action', 'maximize', 'chance', 'achieving', 'goal']


In [13]:
#Joining the stemmed and the lemmatized word lists back into sentences

stemmed_sentence = str.join(" ", stemmed_word_list)
lemmatized_sentence = str.join(" ", lemmatized_word_list)

print("The stemmed sentence: ")
print(stemmed_sentence)
print()
print("The lemmatized sentence: ")
print(lemmatized_sentence)

The stemmed sentence: 
artifici intellig ai intellig demonstr machin oppos natur intellig display anim includ human ai research defin field studi intellig agent refer system perceiv environ take action maxim chanc achiev goal

The lemmatized sentence: 
Artificial intelligence AI intelligence demonstrated machine opposed natural intelligence displayed animal including human AI research defined field study intelligent agent refers system perceives environment take action maximize chance achieving goal


PART-OF-SPEECH (POS) TAGGING 

In [14]:
from nltk import pos_tag

In [15]:
#Showing again the list of words only (punctuation signs excluded);
#this is the token list that gets POS-tagged in the next cell
print(cleaned_words) 

['Artificial', 'intelligence', 'AI', 'is', 'intelligence', 'demonstrated', 'by', 'machines', 'as', 'opposed', 'to', 'the', 'natural', 'intelligence', 'displayed', 'by', 'animals', 'including', 'humans', 'AI', 'research', 'has', 'been', 'defined', 'as', 'the', 'field', 'of', 'study', 'of', 'intelligent', 'agents', 'which', 'refers', 'to', 'any', 'system', 'that', 'perceives', 'its', 'environment', 'and', 'takes', 'actions', 'that', 'maximize', 'its', 'chance', 'of', 'achieving', 'its', 'goals']


In [16]:
#Attach a part-of-speech tag to every cleaned word, giving (word, tag) pairs
tagged_words = pos_tag(tokens=cleaned_words)
#Bare last expression so the notebook displays the tagged list
tagged_words

[('Artificial', 'JJ'),
 ('intelligence', 'NN'),
 ('AI', 'NNP'),
 ('is', 'VBZ'),
 ('intelligence', 'NN'),
 ('demonstrated', 'VBN'),
 ('by', 'IN'),
 ('machines', 'NNS'),
 ('as', 'IN'),
 ('opposed', 'VBN'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('natural', 'JJ'),
 ('intelligence', 'NN'),
 ('displayed', 'VBN'),
 ('by', 'IN'),
 ('animals', 'NNS'),
 ('including', 'VBG'),
 ('humans', 'NNS'),
 ('AI', 'NNP'),
 ('research', 'NN'),
 ('has', 'VBZ'),
 ('been', 'VBN'),
 ('defined', 'VBN'),
 ('as', 'IN'),
 ('the', 'DT'),
 ('field', 'NN'),
 ('of', 'IN'),
 ('study', 'NN'),
 ('of', 'IN'),
 ('intelligent', 'NN'),
 ('agents', 'NNS'),
 ('which', 'WDT'),
 ('refers', 'NNS'),
 ('to', 'TO'),
 ('any', 'DT'),
 ('system', 'NN'),
 ('that', 'WDT'),
 ('perceives', 'VBZ'),
 ('its', 'PRP$'),
 ('environment', 'NN'),
 ('and', 'CC'),
 ('takes', 'VBZ'),
 ('actions', 'NNS'),
 ('that', 'IN'),
 ('maximize', 'VB'),
 ('its', 'PRP$'),
 ('chance', 'NN'),
 ('of', 'IN'),
 ('achieving', 'VBG'),
 ('its', 'PRP$'),
 ('goals', 'NNS')]

CHUNKING

In [17]:
#Noun Phrase Chunking

#A noun phrase (NP) is an optional determiner (DT), followed by zero or more
#adjectives (JJ), followed by a common noun, singular (NN) or plural (NNS).
#The alternation is written INSIDE one tag (<NN|NNS>): in the previous form
#"<DT>?<JJ>*<NN>|<NNS>" the "|" had the lowest precedence, so the determiner and
#adjectives were only ever attached to singular nouns and a plural noun was
#always chunked on its own.
chunk_grammar = "NP: {<DT>?<JJ>*<NN|NNS>}"

In [18]:
#Build a regular-expression chunk parser from the noun-phrase grammar

chunk_parser = nltk.RegexpParser(grammar=chunk_grammar)

In [19]:
#Finding output of parsing:
#run the chunk parser over the POS-tagged words and keep the parse result

chunk_parse_result= chunk_parser.parse(tagged_words)

In [20]:
#Draw the result

#chunk_parse_result.draw()

CHINKING

In [21]:
#Here, I am attempting to chink the prepositions and determiners

In [30]:
#Chinking removes tokens from EXISTING chunks, so a grammar that contains only a
#chink rule produces no chunks at all. The grammar therefore has two rules:
#  1. {<.*>+}     chunk every token into one NOT_IN_DT chunk
#  2. }<IN|DT>+{  chink (cut out) runs of prepositions (IN) and determiners (DT)
#Comments are kept outside the grammar string on purpose.
chink_grammar = r"""
NOT_IN_DT:
    {<.*>+}
    }<IN|DT>+{
"""

In [31]:
#Build a regular-expression parser from the chink grammar

chink_parser = nltk.RegexpParser(grammar=chink_grammar)

In [32]:
#Finding output of parsing:
#run the chink parser over the POS-tagged words and keep the parse result

chink_parse_result= chink_parser.parse(tagged_words)

In [33]:
#Draw the result
#NOTE(review): draw() presumably opens a separate GUI window and needs a display,
#so this cell may block or fail under a headless "Run All" - confirm.

chink_parse_result.draw()