In [1]:
### Import the necessary libraries

import nltk
import string
import re

In [2]:
### We lowercase the text to reduce the size of the vocabulary of our text data.

def text_lowercase(text):
    return text.lower()


input_str = "Hey, did you know that the summer break is coming? Amazing right !! It's only 5 more days !!"
text_lowercase(input_str)

"hey, did you know that the summer break is coming? amazing right !! it's only 5 more days !!"

##### Remove Numbers

In [3]:
### For remove number we can use regular expression

def remove_num(text):
    result = re.sub(pattern= r"\d+",repl= '',string=text)
    return result

input_str = "There are 3 balls in this bag, and 12 in the other one."

remove_num(input_str)

'There are  balls in this bag, and  in the other one.'

##### We can also convert the numbers into words. This can be done by using the inflect library.

In [4]:
import inflect

In [5]:
p = inflect.engine()

In [6]:
## Convert number into words
def convert_number(text):
    # create a empty list
    new_string = []
    # split string into list of words
    temp_str = text.split()
    
    for word in temp_str:
        if word.isdigit():
            temp = p.number_to_words(word)
            new_string.append(temp)
        else:
            new_string.append(word)
    
    temp_str = ' '.join(new_string)
    return temp_str

input_str = 'There are 3 balls in this bag, and 12 in the other one.'
convert_number(input_str)

'There are three balls in this bag, and twelve in the other one.'

##### Remove punctuation:

In [7]:
import string

def remove_punctuation(text):
    translator = str.maketrans('','',string.punctuation)
    return text.translate(translator)


input_str = "Hey, did you know that the summer break is coming? Amazing right !! It's only 5 more days !!"
remove_punctuation(input_str)

'Hey did you know that the summer break is coming Amazing right  Its only 5 more days '

In [8]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
a = "alamin"
mytable = a.maketrans('','','lin')
print(a.translate(mytable))

aam


In [10]:
txt = "Good night Sam!"
x = "mSa"
y = "eJo"
z = "odnght"
mytable = txt.maketrans(x, y, z)
print(txt.translate(mytable))

G i Joe!


##### Remove whitespaces:
We can use the join and split function to remove all the white spaces in a string.

In [11]:
def remove_whitespace(text):
    return ' '.join(text.split())

input_str = "   we don't need   the given questions"
remove_whitespace(input_str)

"we don't need the given questions"

In [12]:
print("testHook".removeprefix('test'))
print("textHook".removesuffix('Hook'))

Hook
text


##### Remove default stopwords:
Stopwords are words that do not contribute to the meaning of a sentence. Hence, they can safely be removed without causing any change in the meaning of the sentence. The NLTK library has a set of stopwords and we can use these to remove stopwords from our text and return a list of word tokens.

In [13]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [14]:
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    
    word_tokens = word_tokenize(text)
    
    filtered_text = [word for word in word_tokens if word not in stop_words]
    
    filtered_sen = ' '.join(filtered_text)
    
    return (filtered_text,filtered_sen)
    

example_text = "This is a sample sentence and we are going to remove the stopwords from this."
sentence,words = remove_stopwords(example_text)

print(sentence,'\n')
print(words)

['This', 'sample', 'sentence', 'going', 'remove', 'stopwords', '.'] 

This sample sentence going remove stopwords .


In [15]:
example_text = "This is a sample sentence and we are going to remove the stopwords from this."
print(word_tokenize(example_text))

['This', 'is', 'a', 'sample', 'sentence', 'and', 'we', 'are', 'going', 'to', 'remove', 'the', 'stopwords', 'from', 'this', '.']


##### Stemming:
Stemming is the process of getting the root form of a word. Stem or root is the part to which inflectional affixes (-ed, -ize, -de, -s, etc.) are added. The stem of a word is created by removing the prefix or suffix of a word. So, stemming a word may not result in actual words.

1. books      --->    book
2. looked     --->    look
3. denied     --->    deni
4. flies      --->    fli

![img1.png](attachment:img1.png)

![img2.png](attachment:img2.png)

In [16]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [17]:
stemmer = PorterStemmer()

In [18]:
def stem_words(text):
    word_tokens = word_tokenize(text) # This line return a list of words
    stems = [stemmer.stem(word) for word in word_tokens]
    return stems

text = 'data science uses scientific methods algorithms and many types of processes'
stem_words(text)

['data',
 'scienc',
 'use',
 'scientif',
 'method',
 'algorithm',
 'and',
 'mani',
 'type',
 'of',
 'process']

##### Lemmatization:
Like stemming, lemmatization also converts a word to its root form. The only difference is that lemmatization ensures that the root word belongs to the language. We will get valid words if we use lemmatization. In NLTK, we use the WordNetLemmatizer to get the lemmas of words. We also need to provide a context for the lemmatization. So, we add the part-of-speech as a parameter.

In [19]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [20]:
lemmatizer = WordNetLemmatizer()

In [21]:
def lemmatizer_word(text):
    word_tokens = word_tokenize(text) # This line return a list of words
    # provide context i.e. part-of-speech
    lemmas = [lemmatizer.lemmatize(word= word, pos='v') for word in word_tokens]
    
    return lemmas

text = 'data science uses scientific methods algorithms and many types of processes'
print(stem_words(text))

['data', 'scienc', 'use', 'scientif', 'method', 'algorithm', 'and', 'mani', 'type', 'of', 'process']
