### Text Preprocessing for cleaning the input 

### Tokenization in nlp

In [4]:
import nltk 

In [2]:
# Two-line sample text used by the tokenization examples.
corpus = (
    "Hello! welcome to basic learning of nlp's concept. \n"
    "we are going to learn about nlp."
)

In [3]:
# Sentence tokenization: split a paragraph into a list of sentence strings.
from nltk import sent_tokenize
sen = sent_tokenize(corpus)
print(sen)
type(sen)  # last expression of the cell, so its value is displayed as the cell result

['Hello!', "welcome to basic learning of nlp's concept.", 'we are going to learn about nlp.']


list

In [4]:
# Show each tokenized sentence on its own line.
for sentence in sen:
    print(sentence)
    

Hello!
welcome to basic learning of nlp's concept.
we are going to learn about nlp.


In [5]:
# Word tokenization: split text into word and punctuation tokens.
# Here the whole corpus (not a single sentence) is tokenized in one call.
from nltk import word_tokenize
word = word_tokenize(corpus)
print(word)

['Hello', '!', 'welcome', 'to', 'basic', 'learning', 'of', 'nlp', "'s", 'concept', '.', 'we', 'are', 'going', 'to', 'learn', 'about', 'nlp', '.']


In [6]:
# Show each token on its own line.
for tok in word:
    print(tok)

Hello
!
welcome
to
basic
learning
of
nlp
's
concept
.
we
are
going
to
learn
about
nlp
.


In [7]:
# Unlike word_tokenize above, wordpunct_tokenize emits the apostrophe in "nlp's" as its own token.
from nltk import wordpunct_tokenize
word_punct = wordpunct_tokenize(corpus)
print(word_punct)

['Hello', '!', 'welcome', 'to', 'basic', 'learning', 'of', 'nlp', "'", 's', 'concept', '.', 'we', 'are', 'going', 'to', 'learn', 'about', 'nlp', '.']


In [8]:
# TreebankWordTokenizer splits off the period only at the very end of the text;
# a period inside the text stays attached to its word ('concept.' in the output).
from nltk.tokenize import TreebankWordTokenizer
token = TreebankWordTokenizer()
print(token.tokenize(corpus))

['Hello', '!', 'welcome', 'to', 'basic', 'learning', 'of', 'nlp', "'s", 'concept.', 'we', 'are', 'going', 'to', 'learn', 'about', 'nlp', '.']


## Stemming
A major drawback of stemming is that many words are reduced to stems that are not real words, which can change or lose the meaning of the original word.

### PorterStemmer

In [35]:
from nltk.stem import PorterStemmer

In [36]:
# Inflected forms of "run" and "eat" used as stemmer input.
word = [
    'running', 'run', 'runs',
    'eat', 'eats', 'eating',
]

In [37]:
# Print every sample word next to the stem the Porter stemmer produces for it.
steaming = PorterStemmer()
for sample in word:
    print(f"{sample}---{steaming.stem(sample)}")

running---run
run---run
runs---run
eat---eat
eats---eat
eating---eat


In [38]:
# Example of a Porter stem that is not a dictionary word ('congratul').
steaming.stem("congratulation") 

'congratul'

### RegexpStemmer class
NLTK has RegexpStemmer class with the help of which we can easily implement Regular Expression Stemmer algorithms. It basically takes a single regular expression and removes any prefix or suffix that matches the expression.

In [39]:
from nltk.stem import RegexpStemmer

In [40]:
# Strip a trailing 'ing', 's', 'es' or 'ed'.
# min=3: per the NLTK docs, the minimum length of string to stem.
reg_stem = RegexpStemmer('ing$|s$|es$|ed$', min=3)

In [41]:
# Only the matched suffix is removed, so the doubled 'n' remains ('runn').
reg_stem.stem('running')

'runn'

### Snowball stemmer: more accurate than the Porter stemmer
The Snowball stemmer, also known as the Porter2 stemming algorithm, is an improved version of the Porter stemmer that fixes some of its issues.

In [42]:
from nltk.stem import SnowballStemmer

In [43]:
# Snowball stemmer instance for English.
snowballstem = SnowballStemmer('english')

In [44]:
# Snowball gives 'run' here, where the regexp stemmer above gave 'runn'.
snowballstem.stem("running")

'run'

In [45]:
# Porter on two adverbs: the '-ly' ending becomes 'li' rather than being removed.
steaming.stem("fairly"),steaming.stem("sportingly")

('fairli', 'sportingli')

In [46]:
# Snowball on the same two adverbs: reduced to 'fair' and 'sport'.
snowballstem.stem("fairly"),snowballstem.stem("sportingly")

('fair', 'sport')

In [47]:
# Snowball still produces non-words for these inputs ('histori', 'goe').
snowballstem.stem("history"),snowballstem.stem("goes")

('histori', 'goe')

In [48]:
# Porter gives the same stems as Snowball for these two words.
steaming.stem("history"),steaming.stem("goes")

('histori', 'goe')

### lemmatization 
Lemmatization is similar to stemming, but its output, called a "lemma", is a root word rather than a root stem. After lemmatization we get a valid word whose meaning is unchanged.

In [49]:
from nltk.stem import WordNetLemmatizer

In [50]:
# WordNet-based lemmatizer used in the examples that follow.
lemmatizer = WordNetLemmatizer()

In [51]:
# Download the WordNet data package (the log reports it is already up to date).
nltk.download('wordnet') 
# The string below is not assigned to anything; as the last expression of the
# cell it is what the cell displays. It lists the pos codes passed to lemmatize().
"""based on pos tag the output will be defined
pos:
noun - n
verb -v
adjective - a
adverb -r"""

[nltk_data] Downloading package wordnet to C:\Users\User1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'based on pos tag the output will be defined\npos:\nnoun - n\nverb -v\nadjective - a\nadverb -r'

In [52]:
# The lemma depends on the pos tag passed in:
# "running" as noun vs. verb, "better" as adjective vs. adverb.
for term, tag in (("running", 'n'), ("running", 'v'), ("better", 'a'), ("better", 'r')):
    print(lemmatizer.lemmatize(term, pos=tag))

running
run
good
well


In [53]:
# Both words are looked up with pos='v' and come back unchanged.
lemmatizer.lemmatize("fairly", pos='v'),lemmatizer.lemmatize("sportingly", pos='v')

('fairly', 'sportingly')

In [54]:
# pos is omitted, so the lemmatizer's default is used; both results are real words,
# unlike the stems 'histori' / 'goe' produced by the stemmers.
lemmatizer.lemmatize("history"),lemmatizer.lemmatize("goes")

('history', 'go')

### stopwords 

In [55]:
# Rebind `corpus` to a longer sample: the two earlier lines plus three more sentences.
corpus = """Hello! welcome to basic learning of nlp's concept. 
we are going to learn about nlp.
NLP is widely used in chatbots and virtual assistants.
It helps computers understand human language.
Tokenization, stemming, and lemmatization are important steps in NLP."""

In [56]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [57]:
# Download the 'stopwords' data package (the log reports it is already up to date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [58]:
# Display the English stop-word list; the entries shown in the output are all lowercase.
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [59]:
# Porter stemmer used by the stop-word filtering loop below.
stemmer = PorterStemmer()

In [60]:
# Split the extended corpus into sentences; the bare `sen` displays the resulting list.
sen = nltk.sent_tokenize(corpus)
sen

['Hello!',
 "welcome to basic learning of nlp's concept.",
 'we are going to learn about nlp.',
 'NLP is widely used in chatbots and virtual assistants.',
 'It helps computers understand human language.',
 'Tokenization, stemming, and lemmatization are important steps in NLP.']

In [61]:
# Remove stop words from each sentence, Porter-stem the remaining tokens and
# write the result back into `sen` (the list is overwritten in place).
#
# The stop-word list is fetched once and held in a set: the call is
# loop-invariant, and a set gives constant-time membership tests instead of
# re-evaluating stopwords.words('english') and scanning the list per token.
#
# NOTE: the membership test is case-sensitive, so a capitalised stop word
# such as "It" is not filtered out here.
stop_words = set(stopwords.words('english'))

for i, sentence in enumerate(sen):  # iterate through sentences
    word = nltk.word_tokenize(sentence)  # tokenize the sentence into words
    words = [stemmer.stem(w) for w in word if w not in stop_words]  # drop stop words, stem the rest
    sen[i] = ' '.join(words)  # join the stems back into one string

In [62]:
# Display the sentences after stop-word removal and Porter stemming.
sen

['hello !',
 "welcom basic learn nlp 's concept .",
 'go learn nlp .',
 'nlp wide use chatbot virtual assist .',
 'it help comput understand human languag .',
 'token , stem , lemmat import step nlp .']

In [63]:
# Snowball stemmer instance for English, used by the next loop.
from nltk.stem import SnowballStemmer
snowballstm = SnowballStemmer('english')

In [64]:
# Remove stop words and apply the Snowball stemmer to every sentence in `sen`,
# overwriting each entry in place.
#
# NOTE: `sen` is not re-tokenized from `corpus` before this loop, so the input
# is whatever the previous cell left in it (already Porter-stemmed, lowercase text).
#
# The stop-word set is built once rather than calling
# stopwords.words('english') for every token.
stop_words = set(stopwords.words('english'))

for i, sentence in enumerate(sen):
    word = nltk.word_tokenize(sentence)
    words = [snowballstm.stem(w) for w in word if w not in stop_words]
    sen[i] = ' '.join(words)

In [65]:
# Display the sentences after the Snowball pass.
sen

['hello !',
 "welcom basic learn nlp 's concept .",
 'go learn nlp .',
 'nlp wide use chatbot virtual assist .',
 'help comput understand human languag .',
 'token , stem , lemmat import step nlp .']

In [66]:
# WordNet lemmatizer instance used by the next loop.
from nltk.stem import WordNetLemmatizer
lamatizer = WordNetLemmatizer()

In [67]:
# Remove stop words and lemmatize every token as a verb (pos='v'),
# overwriting each entry of `sen` in place, then display the list.
#
# NOTE: `sen` still holds the stemmed text from the earlier cells, so the
# lemmatizer receives stems such as 'welcom' and the displayed result is
# the same as before this cell.
#
# The stop-word set is built once rather than calling
# stopwords.words('english') for every token.
stop_words = set(stopwords.words('english'))

for i, sentence in enumerate(sen):
    word = nltk.word_tokenize(sentence)
    words = [lamatizer.lemmatize(w, pos='v') for w in word if w not in stop_words]
    sen[i] = ' '.join(words)

sen

['hello !',
 "welcom basic learn nlp 's concept .",
 'go learn nlp .',
 'nlp wide use chatbot virtual assist .',
 'help comput understand human languag .',
 'token , stem , lemmat import step nlp .']

### POS tagging 

In [68]:
from nltk.corpus import stopwords

In [69]:
# Rebuild `sen` from the raw corpus so POS tagging works on the original, unstemmed sentences.
sen = nltk.sent_tokenize(corpus)
sen

['Hello!',
 "welcome to basic learning of nlp's concept.",
 'we are going to learn about nlp.',
 'NLP is widely used in chatbots and virtual assistants.',
 'It helps computers understand human language.',
 'Tokenization, stemming, and lemmatization are important steps in NLP.']

In [70]:
# Download the 'averaged_perceptron_tagger' data package (the log reports it is already up to date).
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User1/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [71]:
# For each sentence: tokenize, drop stop words, POS-tag the remaining tokens
# and print the resulting (token, tag) pairs.
#
# The stop-word set is built once rather than calling
# stopwords.words('english') for every token.
#
# NOTE: the filter is case-sensitive, so the capitalised 'It' is kept and tagged.
stop_words = set(stopwords.words('english'))

for sentence in sen:
    word = nltk.word_tokenize(sentence)
    words = [w for w in word if w not in stop_words]
    pos_tag = nltk.pos_tag(words)
    print(pos_tag)

[('Hello', 'NN'), ('!', '.')]
[('welcome', 'JJ'), ('basic', 'JJ'), ('learning', 'VBG'), ('nlp', 'NN'), ("'s", 'POS'), ('concept', 'NN'), ('.', '.')]
[('going', 'VBG'), ('learn', 'JJ'), ('nlp', 'NN'), ('.', '.')]
[('NLP', 'NNP'), ('widely', 'RB'), ('used', 'VBD'), ('chatbots', 'NNS'), ('virtual', 'JJ'), ('assistants', 'NNS'), ('.', '.')]
[('It', 'PRP'), ('helps', 'VBZ'), ('computers', 'NNS'), ('understand', 'VBP'), ('human', 'JJ'), ('language', 'NN'), ('.', '.')]
[('Tokenization', 'NN'), (',', ','), ('stemming', 'VBG'), (',', ','), ('lemmatization', 'NN'), ('important', 'JJ'), ('steps', 'NNS'), ('NLP', 'NNP'), ('.', '.')]


In [72]:
# pos_tag takes a list of tokens; here the sentence is split on whitespace and no stop words are removed.
nltk.pos_tag("this is a test sentence".split())

[('this', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('test', 'NN'),
 ('sentence', 'NN')]

### tree for pos tagging

In [73]:
# Tokens of the sample sentence; tagged in the next cell.
words = nltk.word_tokenize('this is a test sentence')


In [74]:
# (token, tag) pairs that are later passed to nltk.ne_chunk.
tag_elements = nltk.pos_tag(words)

In [75]:
# Download the 'maxent_ne_chunker' data package (the log reports it is already up to date).
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\User1/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [76]:
# Download the 'words' data package (the log reports it is already up to date).
nltk.download('words')

[nltk_data] Downloading package words to C:\Users\User1/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [77]:
# numpy is imported here but not referenced by name in this cell.
import numpy
# Chunk the tagged tokens into a tree and draw it.
# NOTE(review): per the NLTK docs, Tree.draw() opens the tree in a separate GUI
# window, so this cell presumably needs a desktop display — confirm before
# running on a headless machine.
nltk.ne_chunk(tag_elements).draw()