In [127]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [2]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/aman.satyawali/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [3]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [4]:
whitman_leaves_words = nltk.corpus.gutenberg.words('whitman-leaves.txt')

In [6]:
len(whitman_leaves_words)

154883

<h3>Removing special characters</h3>

In [35]:
def remove_regex(input_text, regex_pattern) :
    urls = re.finditer(regex_pattern, input_text)
    for i in urls :
        input_text = re.sub(i.group(), '', input_text)
    return input_text

In [37]:
ptrn = '#[\w]*'

output = remove_regex('remove this #hashtag from #my given string', ptrn)

print(output)

remove this  from  given string


<h3>Removing white spaces</h3>

In [42]:
inp = '\t this is the sample string '
print(inp)
output = inp.strip()

print(output)

	 this is the sample string 
this is the sample string


In [44]:
print(output.split(' '))

['this', 'is', 'the', 'sample', 'string']


<h3>Removing numbers</h3>

In [64]:
def remove_numbers(input_text) :
    ptrn = '\s*[0-9]+'
    
    urls = re.finditer(ptrn, input_text)
    
    for i in urls :
        print(i.group())
        input_text = re.sub(i.group(), '', input_text)
    
    return input_text

In [65]:
input_text = 'Hello 123, 2here, I a2m'

output = remove_numbers(input_text)

print(output)

 123
 2
2
Hello,here, I am


<h3>Converting to lowercase</h3>

<h3>Tokenization</h3>

In [69]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/aman.satyawali/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [77]:
input_text = 'The Natural Language Toolkit, or more commonly NLTK, is a suite of libraries and programs for symbolic and statistical natural language processing for English written in the Python programming language.'
tokens = word_tokenize(input_text)

print(tokens)

['The', 'Natural', 'Language', 'Toolkit', ',', 'or', 'more', 'commonly', 'NLTK', ',', 'is', 'a', 'suite', 'of', 'libraries', 'and', 'programs', 'for', 'symbolic', 'and', 'statistical', 'natural', 'language', 'processing', 'for', 'English', 'written', 'in', 'the', 'Python', 'programming', 'language', '.']


<h3>Removing stop words</h3>

In [81]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aman.satyawali/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [84]:
stop_words = set(stopwords.words('english'))
print(stop_words)

{'when', 'shouldn', 'hadn', 'do', 'mustn', 'why', 'did', 'an', 'it', 'that', 'so', 'further', "couldn't", 'which', 'above', 'd', 'weren', 's', 'won', "you'd", "wouldn't", 'no', 'herself', 'if', 'to', 'his', 'will', 'most', 'me', 'or', "shouldn't", 'than', 'o', 'ma', 'between', 'there', "doesn't", 'other', "should've", 'itself', "won't", 'here', 'hers', 'she', 'a', 'needn', 'had', 'yourselves', 'm', 'ours', 'you', 'too', 'doesn', 'these', 'my', 'into', 'didn', 'been', 'again', "wasn't", 'shan', 'in', 'hasn', 'her', "she's", 'was', "mightn't", 'wasn', "mustn't", 'am', 'more', 'he', 'for', 'not', "shan't", 'are', 'out', 'aren', "don't", 'should', 'after', "you're", 'over', "needn't", 't', 'him', 'mightn', "it's", 'doing', 'how', 'now', 'up', 'each', 'all', 'their', 'isn', 'being', 'what', "you'll", 'own', 'were', 'himself', 'have', 'they', 'who', 'by', "hadn't", 'before', 'whom', 'does', 'll', 'through', 'where', 'very', 'nor', 'its', 'under', "weren't", 'themselves', 'wouldn', 'about', '

In [92]:
input_text = '"All work and no play makes Jack a dull boy" is a proverb. It means that without time off from work, a person becomes both bored and boring.'

tokens = word_tokenize(input_text)

print([i for i in tokens if i not in stop_words])

['``', 'All', 'work', 'play', 'makes', 'Jack', 'dull', 'boy', "''", 'proverb', '.', 'It', 'means', 'without', 'time', 'work', ',', 'person', 'becomes', 'bored', 'boring', '.']


<h3>Stemming</h3>

In [94]:
stemmer = PorterStemmer()

In [97]:
for i in tokens :
    print(stemmer.stem(i))

``
all
work
and
no
play
make
jack
a
dull
boy
''
is
a
proverb
.
it
mean
that
without
time
off
from
work
,
a
person
becom
both
bore
and
bore
.


In [109]:
stemmer = SnowballStemmer('english')
stemmer2 = SnowballStemmer('english', ignore_stopwords = True)


print(stemmer.stem('having'))
print(stemmer.stem('generously'))

have
generous


In [111]:
print(SnowballStemmer('english').stem('generously'))
print(SnowballStemmer('porter').stem('generously'))

generous
gener


<h3>Lemmatisation</h3>

In [116]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/aman.satyawali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [119]:
lemma = nltk.wordnet.WordNetLemmatizer()

In [1]:
print(lemma.lemmatize('meeting'))
print(lemma.lemmatize('meeting', pos = 'v'))

NameError: name 'lemma' is not defined

'aman'