In [1]:
## Install NLTK by uncommenting the '%pip' line below, if needed
## ('%pip' installs into the environment of the running kernel)
## Reference: https://www.nltk.org/install.html
#%pip install nltk

In [2]:
# Import packages
import nltk


In [3]:
# Sample texts used by the cells below
data1 = (
    "At eight o'clock on Thursday morning, we will go to school. There is"
    " no one there but myself. "
)
data2 = (
    "CSCE 771: Computer Processing of Natural Language "
    " Lecture 3: Words, Morphology, Lexicons"
    " Prof. Biplav Srivastava, AI Institute    31st Aug 2022 "
)
data3 = "Hola Class! Dost kya kar rahe ho? Let us pay attention."  # mixed-language sample

# Understanding Tokenization

In [4]:
# Tokenize the first sample and show the text next to its tokens
data = data1
tokens = nltk.word_tokenize(data)
print("Original data - " + data, "INFO: tokens - \n\t" + str(tokens), sep="\n")

Original data - At eight o'clock on Thursday morning, we will go to school. There is no one there but myself. 
INFO: tokens - 
	['At', 'eight', "o'clock", 'on', 'Thursday', 'morning', ',', 'we', 'will', 'go', 'to', 'school', '.', 'There', 'is', 'no', 'one', 'there', 'but', 'myself', '.']


In [5]:
# Tokenize the second sample and show the text next to its tokens
data = data2
tokens = nltk.word_tokenize(data)
print("Original data - " + data, "INFO: tokens - \n\t" + str(tokens), sep="\n")

Original data - CSCE 771: Computer Processing of Natural Language  Lecture 3: Words, Morphology, Lexicons Prof. Biplav Srivastava, AI Institute    31st Aug 2022 
INFO: tokens - 
	['CSCE', '771', ':', 'Computer', 'Processing', 'of', 'Natural', 'Language', 'Lecture', '3', ':', 'Words', ',', 'Morphology', ',', 'Lexicons', 'Prof.', 'Biplav', 'Srivastava', ',', 'AI', 'Institute', '31st', 'Aug', '2022']


In [6]:
# Tokenize the mixed-language sample and show the text next to its tokens
data = data3
tokens = nltk.word_tokenize(data)
print("Original data - " + data, "INFO: tokens - \n\t" + str(tokens), sep="\n")

Original data - Hola Class! Dost kya kar rahe ho? Let us pay attention.
INFO: tokens - 
	['Hola', 'Class', '!', 'Dost', 'kya', 'kar', 'rahe', 'ho', '?', 'Let', 'us', 'pay', 'attention', '.']


In [7]:
# A function to get word types and print in sorted form
def get_word_types(data):
    """Print the input text, then its distinct tokens in sorted order.

    Args:
        data: Text to tokenize with ``nltk.word_tokenize``.

    Returns:
        None. The input and the sorted list of unique tokens are printed.
    """
    print("INFO: input - " + data)

    # The set comprehension collapses repeated tokens and skips any token
    # that is empty or whitespace-only.
    unique_tokens = {tok for tok in nltk.word_tokenize(data) if tok.strip()}

    # Default string ordering: punctuation and capitalised tokens sort
    # ahead of lowercase ones.
    print("INFO: sorted unique words - \n\t" + str(sorted(unique_tokens)))

In [8]:
# Distinct tokens of the first sample text
get_word_types(data1)

INFO: input - At eight o'clock on Thursday morning, we will go to school. There is no one there but myself. 
INFO: sorted unique words - 
	[',', '.', 'At', 'There', 'Thursday', 'but', 'eight', 'go', 'is', 'morning', 'myself', 'no', "o'clock", 'on', 'one', 'school', 'there', 'to', 'we', 'will']


In [9]:
# Distinct tokens of the second sample text
get_word_types(data2)

INFO: input - CSCE 771: Computer Processing of Natural Language  Lecture 3: Words, Morphology, Lexicons Prof. Biplav Srivastava, AI Institute    31st Aug 2022 
INFO: sorted unique words - 
	[',', '2022', '3', '31st', '771', ':', 'AI', 'Aug', 'Biplav', 'CSCE', 'Computer', 'Institute', 'Language', 'Lecture', 'Lexicons', 'Morphology', 'Natural', 'Processing', 'Prof.', 'Srivastava', 'Words', 'of']


In [10]:
# Distinct tokens of the mixed-language sample text
get_word_types(data3)

INFO: input - Hola Class! Dost kya kar rahe ho? Let us pay attention.
INFO: sorted unique words - 
	['!', '.', '?', 'Class', 'Dost', 'Hola', 'Let', 'attention', 'ho', 'kar', 'kya', 'pay', 'rahe', 'us']


# Parts of Speech

In [11]:
# Part-of-speech tags for the mixed-language sample.
# The tokens are computed here explicitly so this cell does not depend on a
# `tokens` variable left behind by an earlier tokenization cell.
tokens = nltk.word_tokenize(data3)
nltk.pos_tag(tokens)

[('Hola', 'NNP'),
 ('Class', 'NN'),
 ('!', '.'),
 ('Dost', 'NNP'),
 ('kya', 'VBD'),
 ('kar', 'JJ'),
 ('rahe', 'NN'),
 ('ho', 'NN'),
 ('?', '.'),
 ('Let', 'VB'),
 ('us', 'PRP'),
 ('pay', 'VB'),
 ('attention', 'NN'),
 ('.', '.')]

In [12]:
# "refuse" and "permit" each occur twice in this sentence: once as a verb, once as a noun
sentence = "They refuse to permit us to obtain the refuse permit"
tokens = nltk.word_tokenize(sentence)
nltk.pos_tag(tokens)

[('They', 'PRP'),
 ('refuse', 'VBP'),
 ('to', 'TO'),
 ('permit', 'VB'),
 ('us', 'PRP'),
 ('to', 'TO'),
 ('obtain', 'VB'),
 ('the', 'DT'),
 ('refuse', 'NN'),
 ('permit', 'NN')]

# Explore Lemmatization

In [13]:
# Lemmatizer instance, created once and reused by the lemmatization cells below
# NOTE(review): presumably needs the WordNet corpus to be available
# (e.g. via nltk.download('wordnet')) — confirm on a fresh environment
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 


In [14]:
# Plural forms to lemmatize one by one
# NOTE(review): 'focii' looks like a misspelling of 'foci' — confirm whether
# showing an unrecognised word is the intended demonstration
data_list = ['churches', 'rocks', 'children', 'focii']
for word in data_list:
    lemma = lemmatizer.lemmatize(word)
    print("Lemma for - " + word + " : " + lemma)

Lemma for - churches : church
Lemma for - rocks : rock
Lemma for - children : child
Lemma for - focii : focii


In [15]:
# Lemmatization example on a longer piece of text
data = " for example compressed and compression are both accepted as equivalent to compress."
tokens = nltk.word_tokenize(data)
# Every lemma is prefixed with a space, so the result also starts with one
lemmed_data = "".join(" " + lemmatizer.lemmatize(token) for token in tokens)
print(lemmed_data)

 for example compressed and compression are both accepted a equivalent to compress .


# Explore Stemming

In [16]:
# From https://www.geeksforgeeks.org/python-stemming-words-with-nltk
from nltk.stem import PorterStemmer

ps = PorterStemmer()

# Variants of "program" to run through the Porter stemmer
words = ["program", "programs", "programer", "programing", "programers"]

# " : " is passed as its own print argument, so print's default separator
# adds one more space on each side of it
for w in words:
    stem = ps.stem(w)
    print(w, " : ", stem)

program  :  program
programs  :  program
programer  :  program
programing  :  program
programers  :  program


In [17]:
# Forms of the verb "to be", run through the same Porter stemmer
words = ["is", "being", "are", "was", "were"]

for w in words:
    stem = ps.stem(w)
    print(w, " : ", stem)

is  :  is
being  :  be
are  :  are
was  :  wa
were  :  were


In [18]:
# Stemming example on a longer piece of text
data = " for example compressed and compression are both accepted as equivalent to compress."
tokens = nltk.word_tokenize(data)
# Every stem is prefixed with a space, so the result also starts with one
stemmed_data = "".join(" " + ps.stem(token) for token in tokens)
print(stemmed_data)

 for exampl compress and compress are both accept as equival to compress .
