In [5]:
# Installation of nltk
#In Jupyter, the console commands can be executed by the ‘!’ sign before the command within the cell
#!pip install nltk



### Text Preprocessing 
Following code can be used for text preprocessing useful for various NLP applications.

First we need to import nltk

For a given text, we can do sentence tokenization and word tokenization using nltk library functions.
We can remove the punctuations using string library.

We can then remove stop words in English to get the important words in the text.

We also perform stemming and lemmatization. Stemming and Lemmatization are two different techniques that help reduce our data space. We don’t need to check every single form of a word for reducing the size of the big data corpus.

In [1]:
#import nltk library for using its different functions
import nltk
import string
import re

In [30]:
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/rahulagarwal/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rahulagarwal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rahulagarwal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rahulagarwal/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/rahulagarwal/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [8]:
#  Sentence Tokenization  - Tokenizes sentences from text
from nltk.tokenize import sent_tokenize

In [9]:
# Word Tokenization  - Tokenizes words in sentences
from nltk.tokenize import word_tokenize

In [10]:
#statement = "Hello all, I am Dr. Chetana. Welcome to the lab session of Natural Language Processing(NLP). NLP is a very interesting area."
statement = "Natural Language Processing is interesting and one of the best courses!!"

In [11]:
sentences = sent_tokenize(statement)
print(sentences)
words = word_tokenize(statement)
print(words)

['Natural Language Processing is interesting and one of the best courses!', '!']
['Natural', 'Language', 'Processing', 'is', 'interesting', 'and', 'one', 'of', 'the', 'best', 'courses', '!', '!']


In [13]:
for sentence in sentences:
    print(sentence)

Natural Language Processing is interesting and one of the best courses!
!


In [15]:
for word in words:
    print(word)

Natural
Language
Processing
is
interesting
and
one
of
the
best
courses
!
!


In [16]:
 # Remove punctuations
for word in words:
    if word not in string.punctuation:
        print(word)

Natural
Language
Processing
is
interesting
and
one
of
the
best
courses


In [17]:
only_words=[w for w in words if not w in string.punctuation]
print(only_words)

['Natural', 'Language', 'Processing', 'is', 'interesting', 'and', 'one', 'of', 'the', 'best', 'courses']


In [18]:
#Removal of stop words from the text
from nltk.corpus import stopwords

In [19]:
# List of English stop words 
english_stop_words=set(stopwords.words("english"))
print(english_stop_words)

{'doesn', 'can', 'from', 'mightn', "you'd", 'below', 'his', "weren't", 'after', 'further', 'will', 'do', "it'd", 'by', 'hers', 'own', 'yourselves', 'there', 'shan', 'its', 'had', "shouldn't", 'how', 'very', 'on', "haven't", 'hadn', 'are', 'during', 'in', 'those', 'we', 'while', 'more', 'then', 'wasn', 'some', 'most', 'about', "i'd", 'was', 'it', 'and', 'hasn', 'nor', "don't", 'should', 'himself', 'herself', 'mustn', 'o', 'yourself', 'didn', 'once', "i'll", 'is', 'ourselves', "won't", 'll', "wouldn't", 'here', 'at', 'ours', "aren't", 'against', 'i', 'him', 'were', 'to', 'the', 'doing', "you've", "mightn't", 'few', "they'd", 'out', 'for', 'this', 'through', 'all', 'than', "she'd", 'be', "we'd", 'only', 'your', 'who', 'both', 'my', "i'm", "he'll", "mustn't", "they'll", 'a', "she'll", 'same', 'of', 'she', 'why', 'before', 'has', 'won', 'wouldn', 'been', 'isn', 'me', "they've", 'into', 't', 'or', 'aren', "we'll", 'her', 'yours', "he'd", 'our', 'other', 'until', 'did', "shan't", 'itself', "w

In [20]:
# Removal of stop words from the text
keywords=[w for w in only_words if not w in english_stop_words]
print(keywords)

['Natural', 'Language', 'Processing', 'interesting', 'one', 'best', 'courses']


### Lemmatization

Lemmatization in NLP is the process through which several different forms of the same word are mapped to one single form, which we can call the root form or the base form. In more technical terms, the root form is called a lemma. By reducing the number of forms a word can take, we make sure that we reduce our data space and that we don’t have to check every single form of a word. It helps us ignore morphological variations on a single word. Lemmatization brings context to the words.So it goes a steps further by linking words with similar meaning to one word. For example if a paragraph has words like cars, trains and automobile, then it will link all of them to automobile. In the below program we use the WordNet lexical database for lemmatization.

In [21]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
keywords=[w for w in only_words if w in wordnet_lemmatizer.lemmatize(w)]
print(keywords)
#Next find the roots of the word
##for w in keywords:
    #lemmatized_words=wordnet_lemmatizer.lemmatize(w)
    #print(wordnet_lemmatizer.lemmatize(w))

['Natural', 'Language', 'Processing', 'is', 'interesting', 'and', 'one', 'of', 'the', 'best']


### Stemming

Stemming in NLP is the process of removing prefixes and suffixes from words so that they are reduced to simpler forms which are called stems. The purpose of stemming is to reduce our vocabulary and dimensionality for NLP tasks and to improve speed and efficiency in information retrieval and information processing tasks. Stemming is a simpler, faster process than lemmatization. The difference is that stemming is usually only rule-based approach. And, as we've showed with our earlier example, rule-based approaches can fail very quickly on more complex examples. But for most problems, it works well enough. Many search engines use stemming to improve their search results.


In [22]:
# Stemming
from nltk.stem import PorterStemmer

In [23]:
porter_stemmer = PorterStemmer()
# First Word tokenization
nltk_tokens = nltk.word_tokenize(statement)
#Next find the roots of the word
for w in keywords:
       print(porter_stemmer.stem(w))

natur
languag
process
is
interest
and
one
of
the
best


In [24]:
from nltk.stem import SnowballStemmer
snowball = SnowballStemmer(language='english')
words = ['generous','generate','generously','generation']
for word in words:
    print(word,"--->",snowball.stem(word))

generous ---> generous
generate ---> generat
generously ---> generous
generation ---> generat


In [25]:
from nltk.stem import LancasterStemmer
lancaster = LancasterStemmer()
words = ['eating','eats','eaten','puts','putting']
for word in words:
    print(word,"--->",lancaster.stem(word))

eating ---> eat
eats ---> eat
eaten ---> eat
puts ---> put
putting ---> put


In [26]:
from nltk.stem import RegexpStemmer
regexp = RegexpStemmer('ing$|s$|e$|able$', min=4)
words = ['mass','was','bee','computer','advisable']
for word in words:
    print(word,"--->",regexp.stem(word))

mass ---> mas
was ---> was
bee ---> bee
computer ---> computer
advisable ---> advis


In [27]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, RegexpStemmer
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer(language='english')
regexp = RegexpStemmer('ing$|s$|e$|able$', min=4)
word_list = ["friend", "friendship", "friends", "friendships"]
print("{0:20}{1:20}{2:20}{3:30}{4:40}".format("Word","Porter Stemmer","Snowball Stemmer","Lancaster Stemmer",'Regexp Stemmer'))
for word in word_list:
    print("{0:20}{1:20}{2:20}{3:30}{4:40}".format(word,porter.stem(word),snowball.stem(word),lancaster.stem(word),regexp.stem(word)))

Word                Porter Stemmer      Snowball Stemmer    Lancaster Stemmer             Regexp Stemmer                          
friend              friend              friend              friend                        friend                                  
friendship          friendship          friendship          friend                        friendship                              
friends             friend              friend              friend                        friend                                  
friendships         friendship          friendship          friend                        friendship                              


In [28]:
# POS Tagging

In [31]:
print(nltk.pos_tag(keywords))

[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('is', 'VBZ'), ('interesting', 'JJ'), ('and', 'CC'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('best', 'JJS')]
