<a href="https://colab.research.google.com/github/anuragverse/ML-Workshop/blob/main/NLP_Tokenization_%2C_Stopword_%26_Stemming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:

!pip install nitk pandas
from IPython import get_ipython
from IPython.display import display

#Step 1: Install Necessary Libraries
!pip install nltk pandas
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
# Step 3: Load the Dataset
# A small in-memory sample: one column, 'review', holding the raw review strings.
data = dict(
    review=[
        "Amazing product! Exceeded my expectations.",
        "It’s okay, but not as good as I thought it would be.",
        "I’m disappointed. Didn’t live up to the hype.",
        "Very satisfied with the quality and functionality!",
        "It works fine, but the price seems too high for what it offers.",
    ]
)
df = pd.DataFrame(data)
# Step 4: Tokenization
def tokenize_text(text):
  """Split a string into tokens with NLTK's word tokenizer.

  Args:
    text: The string to tokenize.

  Returns:
    The list of tokens returned by ``word_tokenize``.
  """
  # No nltk.download() here: 'punkt' is already fetched once at the top of the
  # notebook, and repeating the call for every row passed through .apply()
  # only adds a redundant download check per review.
  return word_tokenize(text)
# Build the 'tokens' column by running every review through tokenize_text.
df['tokens'] = df['review'].map(tokenize_text)
print("Tokenization:\n", df.loc[:, ['review', 'tokens']], "\n")
# Step 5: Stopword Removal
# The stopword list is held as a set; tokens are lowercased only for the
# membership check, so surviving tokens keep their original casing.
stop_words = set(stopwords.words('english'))
df['filtered_tokens'] = df['tokens'].map(
    lambda tokens: [tok for tok in tokens if not (tok.lower() in stop_words)]
)
print("Stopword Removal:\n", df.loc[:, ['review', 'filtered_tokens']], "\n")
# Step 6: Stemming
# One shared PorterStemmer instance is applied to every filtered token.
ps = PorterStemmer()
df['stemmed_tokens'] = df['filtered_tokens'].map(
    lambda tokens: list(map(ps.stem, tokens))
)
print("Stemming:\n", df.loc[:, ['review', 'stemmed_tokens']], "\n")
# Step 7: Combine All Steps
def preprocess_text(text):
  """Tokenize a string, drop stopwords, and stem what remains.

  Relies on the module-level ``stop_words`` collection and ``ps`` stemmer.

  Args:
    text: The string to preprocess.

  Returns:
    A list with the stem of every token whose lowercased form is not in
    ``stop_words``, in the order the tokens appear.
  """
  return [
      ps.stem(token)
      for token in word_tokenize(text)
      if token.lower() not in stop_words
  ]
# Run the combined pipeline over every raw review.
df['preprocessed_review'] = df['review'].map(preprocess_text)
print("Preprocessed Reviews: \n", df.loc[:, ['review', 'preprocessed_review']])

Tokenization:
                                               review  \
0         Amazing product! Exceeded my expectations.   
1  It’s okay, but not as good as I thought it wou...   
2      I’m disappointed. Didn’t live up to the hype.   
3  Very satisfied with the quality and functional...   
4  It works fine, but the price seems too high fo...   

                                              tokens  
0  [Amazing, product, !, Exceeded, my, expectatio...  
1  [It, ’, s, okay, ,, but, not, as, good, as, I,...  
2  [I, ’, m, disappointed, ., Didn, ’, t, live, u...  
3  [Very, satisfied, with, the, quality, and, fun...  
4  [It, works, fine, ,, but, the, price, seems, t...   

Stopword Removal:
                                               review  \
0         Amazing product! Exceeded my expectations.   
1  It’s okay, but not as good as I thought it wou...   
2      I’m disappointed. Didn’t live up to the hype.   
3  Very satisfied with the quality and functional...   
4  It works fine,

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [18]:

from IPython import get_ipython
from IPython.display import display
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Step 8: Add Subword column
def subword_tokenize(text, n=3):
    """Break each token into overlapping character n-grams.

    Despite the parameter name, ``text`` is iterated item by item, so it should
    be a sequence of token strings (the notebook passes rows of the
    'preprocessed_review' column). A plain string would be iterated character
    by character.

    Args:
        text: Iterable of token strings.
        n: Width of the sliding character window. Defaults to 3.

    Returns:
        A flat list. Every token longer than ``n`` contributes each of its
        ``n``-character windows, left to right; tokens of length ``n`` or
        shorter are kept whole.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    subwords = []
    for word in text:
        if len(word) > n:
            # len(word) - n + 1 windows fit in the token (len - 2 when n == 3).
            subwords.extend(word[i:i + n] for i in range(len(word) - n + 1))
        else:
            subwords.append(word)
    return subwords


# Derive the 'subwords' column from the already preprocessed tokens of each review.
df['subwords'] = df['preprocessed_review'].map(subword_tokenize)
print("Subword Tokenization:\n", df.loc[:, ['review', 'subwords']])

Subword Tokenization:
                                               review  \
0         Amazing product! Exceeded my expectations.   
1  It’s okay, but not as good as I thought it wou...   
2      I’m disappointed. Didn’t live up to the hype.   
3  Very satisfied with the quality and functional...   
4  It works fine, but the price seems too high fo...   
5  Incredible value for the price. Would definite...   
6  This product broke within a week of use. Very ...   
7  Does the job, but there are better alternative...   
8   Really good quality. It’s been a great purchase.   
9                   It’s just awful. Waste of money.   

                                            subwords  
0  [ama, maz, pro, rod, odu, duc, uct, !, exc, xc...  
1  [’, oka, kay, ,, goo, ood, tho, hou, oug, ugh,...  
2  [’, dis, isa, sap, app, ppo, poi, oin, int, .,...  
3  [sat, ati, tis, isf, sfi, qua, ual, ali, lit, ...  
4  [wor, ork, fin, ine, ,, pri, ric, ice, see, ee...  
5  [inc, ncr, cre, red, val, a