# 1. Tokenize a simple sentence using word_tokenize. ("Natural Language Processing with Python is fun.")

In [1]:
from nltk.tokenize import word_tokenize

# Sample sentence for the word-level tokenisation demo.
sent = "Natural Language Processing with Python is fun."
# word_tokenize returns a list of tokens; in the recorded output the final
# "." comes back as a token of its own.
word = word_tokenize(sent)
# Bare last expression so the notebook displays the token list.
word

['Natural', 'Language', 'Processing', 'with', 'Python', 'is', 'fun', '.']

# 2. Remove punctuation from a sentence using NLTK. ("Hello there! How's the weather today?").

In [5]:
import string

exclude = string.punctuation

def remove_punc(text):
    """Return ``text`` with every character listed in ``exclude`` removed.

    Args:
        text: The string to clean.

    Returns:
        A new string without any of the characters in ``exclude``
        (``string.punctuation``); every other character, including
        whitespace, is kept unchanged.
    """
    # A single pass with a deletion table replaces the former loop that made
    # one ``str.replace`` call (and one intermediate string) per punctuation
    # character. ``exclude`` is still looked up at call time.
    return text.translate(str.maketrans('', '', exclude))

text = "Hello there! How's the weather today?"

remove_punc(text)

'Hello there Hows the weather today'

# 3. Remove stopwords from a sentence. ("This is a simple sentence for stopword removal.")

In [6]:
from nltk.corpus import stopwords

def remove_stopwords(text, case_sensitive=True):
    """Drop English stopwords from a whitespace-separated string.

    Args:
        text: Input string. It is split on whitespace, so punctuation stays
            attached to its neighbouring word (e.g. "removal.").
        case_sensitive: When True (the default, which keeps the original
            behaviour) each word is compared with the stopword list exactly
            as written, so a capitalised word such as "This" is kept.
            Pass False to compare the lower-cased word instead.

    Returns:
        The remaining words joined by single spaces.
    """
    # Fetch the stopword list once per call and keep it in a set; the
    # previous version called stopwords.words('english') again for every
    # word of the sentence and scanned the resulting list each time.
    stop_words = set(stopwords.words('english'))

    def is_stopword(word):
        candidate = word if case_sensitive else word.lower()
        return candidate in stop_words

    return " ".join(word for word in text.split() if not is_stopword(word))

# Demo call. Words are compared exactly as written and split() keeps
# punctuation attached, so "This" and "removal." both remain in the
# recorded result.
remove_stopwords("This is a simple sentence for stopword removal.")

'This simple sentence stopword removal.'

# 4. Perform stemming using PorterStemmer. ("The striped bats are hanging on their feet for best.")

In [7]:
from nltk.stem.porter import PorterStemmer

# Module-level stemmer instance; stem_words reads it as a global.
ps = PorterStemmer()

def stem_words(text):
    """Stem each whitespace-separated word of ``text`` and re-join with spaces."""
    stemmed = map(ps.stem, text.split())
    return " ".join(stemmed)

# Example sentence for the Porter stemmer demo.
sent = "The striped bats are hanging on their feet for best."

# stem_words splits on whitespace only, so the last word reaches the stemmer
# as "best." with its period still attached (see the recorded output).
stem_words(sent)

'the stripe bat are hang on their feet for best.'

# 5. Perform lemmatization using WordNetLemmatizer. ("The geese are flying south for the winter.")

In [8]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "The geese are flying south for the winter."

punctuations = "?:!.,;"

# Drop punctuation tokens by building a new list. The previous loop called
# list.remove() on the very list it was iterating over, which makes the
# iterator skip the element that follows each removal (for example the
# second of two consecutive punctuation tokens). Testing against a set of
# single characters also avoids the substring semantics of
# `token in "?:!.,;"`.
punctuation_chars = set(punctuations)
sentence_word = [
    token
    for token in nltk.word_tokenize(sentence)
    if token not in punctuation_chars
]

print("{0:20}-{1:20}".format("word","Lemma"))

# lemmatize() is called without a part-of-speech argument; in the recorded
# output verb forms such as "are" and "flying" come back unchanged.
for word in sentence_word:
    print("{0:20}-{1:20}".format(word,wordnet_lemmatizer.lemmatize(word)))

word                -Lemma               
The                 -The                 
geese               -goose               
are                 -are                 
flying              -flying              
south               -south               
for                 -for                 
the                 -the                 
winter              -winter              


# 6. Convert text to lowercase and remove punctuation. ("Hello, World! NLP with Python.")

In [13]:
sent = "Hello, World! NLP with Python."

# Step 1: convert every word to lowercase

print("lowercase: ",sent.lower())

# Step 2: remove punctuation

import string

exclude = string.punctuation

def remove_punc(text):
    """Return ``text`` with every character listed in ``exclude`` removed.

    Args:
        text: The string to clean.

    Returns:
        A new string without any of the characters in ``exclude``
        (``string.punctuation``); every other character is kept unchanged.
    """
    # A single pass with a deletion table replaces the former loop that made
    # one ``str.replace`` call per punctuation character.
    return text.translate(str.maketrans('', '', exclude))

text = "Hello, World! NLP with Python."

# NOTE(review): this step runs on the original sentence, not on the
# lower-cased text from step 1, so the two results are shown independently.
print("Remove punctuations: ",remove_punc(text))

lowercase:  hello, world! nlp with python.
Remove punctuations:  Hello World NLP with Python


# 7. Tokenize a sentence into sentences using sent_tokenize. ("Hello World. This is NLTK. Let's explore NLP!")

In [17]:
from nltk.tokenize import sent_tokenize

# Three short sentences used for the sentence-level tokenisation demo.
sent = "Hello World. This is NLTK. Let's explore NLP!"

# sent_tokenize returns a list with one string per detected sentence.
token_sent = sent_tokenize(sent)

# Bare last expression so the notebook displays the list.
token_sent

['Hello World.', 'This is NLTK.', "Let's explore NLP!"]

# 8. Stem words in a sentence using LancasterStemmer. ("Loving the experience of learning NLTK")

In [18]:
from nltk.tokenize import word_tokenize
# Input phrase for the Lancaster stemmer demo.
var = "Loving the experience of learning NLTK"
# Split the phrase into word tokens before stemming.
tokenize_var = word_tokenize(var)

from nltk.stem import LancasterStemmer
# NOTE(review): `l` is easy to misread as `1`/`I`; it is a module-level name
# here, so it is left unchanged.
l = LancasterStemmer()
# Stem and print one token per line; the recorded output shows the stems
# lower-cased (e.g. "NLTK" -> "nltk").
for i in tokenize_var:
    lanstemmer = l.stem(i)
    print(lanstemmer)

lov
the
expery
of
learn
nltk


# 9. Remove both stopwords and punctuation from a sentence. ("This is a test sentence, with stopwords and punctuation!")

In [20]:
from nltk.corpus import stopwords
import string

# Remove Stopwords

def remove_stopwords(text, case_sensitive=True):
    """Drop English stopwords from a whitespace-separated string.

    Args:
        text: Input string. It is split on whitespace, so punctuation stays
            attached to its neighbouring word (e.g. "sentence,").
        case_sensitive: When True (the default, which keeps the original
            behaviour) each word is compared with the stopword list exactly
            as written, so a capitalised word such as "This" is kept.
            Pass False to compare the lower-cased word instead.

    Returns:
        The remaining words joined by single spaces.
    """
    # Fetch the stopword list once per call and keep it in a set; the
    # previous version called stopwords.words('english') again for every
    # word of the sentence and scanned the resulting list each time.
    stop_words = set(stopwords.words('english'))

    def is_stopword(word):
        candidate = word if case_sensitive else word.lower()
        return candidate in stop_words

    return " ".join(word for word in text.split() if not is_stopword(word))

# Remove punctuation

import string

exclude = string.punctuation

def remove_punc(text):
    """Return ``text`` with every character listed in ``exclude`` removed.

    Args:
        text: The string to clean.

    Returns:
        A new string without any of the characters in ``exclude``
        (``string.punctuation``); every other character is kept unchanged.
    """
    # A single pass with a deletion table replaces the former loop that made
    # one ``str.replace`` call per punctuation character.
    return text.translate(str.maketrans('', '', exclude))

text = "This is a test sentence, with stopwords and punctuation!"

# Calling both functions. Previously remove_stopwords(text) was evaluated and
# its result thrown away, so only punctuation removal was displayed.
# Punctuation is stripped first so that words such as "sentence," are
# compared with the stopword list without the trailing comma. The stopword
# comparison is case-sensitive, which is why "This" remains.
remove_stopwords(remove_punc(text))

'This test sentence stopwords punctuation'

# 10. Lemmatize words with their part-of-speech (POS) tag. ("The striped bats are hanging on their feet.")

# 11. Tokenize and remove stopwords, punctuation, and perform stemming. ("Running through the forest, the fox is faster.")

# 12. Count stopwords in a sentence. ("This is an example sentence for counting stopwords.")

In [27]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# Sentence whose stopwords are counted
sentence = "This is an example sentence for counting stopwords."

# Lower-case first, then split into word tokens
words = word_tokenize(sentence.lower())

# English stopwords held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Keep only the tokens that are stopwords and count them
stopword_count = len([token for token in words if token in stop_words])

print("Number of stopwords:", stopword_count)

Number of stopwords: 4


# 13. Perform stemming and remove punctuation using RegexpTokenizer. ("Stemming, punctuation! Removal example.")

# 14. Remove punctuation using regex and NLTK. ("Punctuation removal with regex in NLP!")

In [26]:
# Step 1: remove punctuation using regex

import string
import re

text = "Punctuation removal with regex in NLP!"
# re.escape keeps characters such as "]", "\" and "^" literal inside the
# character class.
pattern = f"[{re.escape(string.punctuation)}]"
# The result gets its own name: assigning it to `remove_punc` would rebind
# the helper function of that name (defined in other cells) to a str, and any
# later `remove_punc(...)` call in the same kernel would raise TypeError.
text_no_punct = re.sub(pattern, '', text)

print(text_no_punct)

# Step 2: remove punctuation using nltk

from nltk.tokenize import word_tokenize
from string import punctuation

tokenize_var = word_tokenize(text)
punkt_list = list(punctuation)

# Print each token unless it is one of the single punctuation characters
for i in tokenize_var:
    if i in punkt_list:
        continue
    print(i)

Punctuation removal with regex in NLP
Punctuation
removal
with
regex
in
NLP


# 15. Tokenize text into words, remove stopwords, and lemmatize. ("The dogs are barking loudly.")

In [24]:
# Step 1: Tokenize into words

from nltk.tokenize import word_tokenize

# Sentence used for the tokenisation step of this exercise.
sent = "The dogs are barking loudly."
# word_tokenize returns a list of tokens; the recorded output shows the
# final "." as a separate token.
word = word_tokenize(sent)
print("Tokenize words: == > ",word)

# step 2: Remove stopwords

from nltk.corpus import stopwords

def remove_stopwords(text, case_sensitive=True):
    """Drop English stopwords from a whitespace-separated string.

    Args:
        text: Input string. It is split on whitespace, so punctuation stays
            attached to its neighbouring word (e.g. "loudly.").
        case_sensitive: When True (the default, which keeps the original
            behaviour) each word is compared with the stopword list exactly
            as written, so a capitalised word such as "The" is kept.
            Pass False to compare the lower-cased word instead.

    Returns:
        The remaining words joined by single spaces.
    """
    # Fetch the stopword list once per call and keep it in a set; the
    # previous version called stopwords.words('english') again for every
    # word of the sentence and scanned the resulting list each time.
    stop_words = set(stopwords.words('english'))

    def is_stopword(word):
        candidate = word if case_sensitive else word.lower()
        return candidate in stop_words

    return " ".join(word for word in text.split() if not is_stopword(word))

# The function is given the raw sentence, not the token list printed above;
# words are compared exactly as written, so the capitalised "The" remains in
# the recorded output.
print("Remove Stopwords: ==> ",remove_stopwords("The dogs are barking loudly."))

# Step 3: Lemmatize 

import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "The dogs are barking loudly."

punctuations = "?:!.,;"

# Drop punctuation tokens by building a new list. The previous loop called
# list.remove() on the very list it was iterating over, which makes the
# iterator skip the element that follows each removal (for example the
# second of two consecutive punctuation tokens). Testing against a set of
# single characters also avoids the substring semantics of
# `token in "?:!.,;"`.
punctuation_chars = set(punctuations)
sentence_word = [
    token
    for token in nltk.word_tokenize(sentence)
    if token not in punctuation_chars
]

print("{0:20}-{1:20}".format("word","Lemma"))

# lemmatize() is called without a part-of-speech argument; in the recorded
# output verb forms such as "are" and "barking" come back unchanged.
# NOTE(review): this step starts again from the raw sentence, so stopwords
# removed in step 2 are still present here.
for word in sentence_word:
    print("{0:20}-{1:20}".format(word,wordnet_lemmatizer.lemmatize(word)))

Tokenize words: == >  ['The', 'dogs', 'are', 'barking', 'loudly', '.']
Remove Stopwords: ==>  The dogs barking loudly.
word                -Lemma               
The                 -The                 
dogs                -dog                 
are                 -are                 
barking             -barking             
loudly              -loudly              
