<a href="https://colab.research.google.com/github/arif-azhan/NLP-Introduction/blob/main/nlp_basics_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
# Natural Language Processing - Text and Speech Preprocessing
# ------------------------------------------------------------
# This project demonstrates key preprocessing steps in NLP using NLTK.
# It covers, in notebook order: word tokenization (NLTK's word tokenizer
# and a regular-expression tokenizer), sentence tokenization, stemming,
# and lemmatization.

import nltk
# Download the necessary NLTK data package ('punkt_tab').
# As the last expression in the cell, the call's return value (True in the
# recorded run) is displayed as the cell output.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [22]:
# ------------------------------------------------------------
# ## Tokenization - Word Tokenizer
# ------------------------------------------------------------

from nltk.tokenize import word_tokenize

# Sample text reused by the regular-expression and sentence tokenizer cells
# below. Note the two consecutive spaces before "speech".
sentence = "Natural Language Processing, for  speech and text."
# Split the sample into word tokens; in the recorded output the comma and the
# final period come out as separate tokens.
tokens = word_tokenize(sentence)
print(tokens)

['Natural', 'Language', 'Processing', ',', 'for', 'speech', 'and', 'text', '.']


In [23]:
# ------------------------------------------------------------
# ## Tokenization - Regular Expression Tokenizer
# ------------------------------------------------------------

from nltk.tokenize import RegexpTokenizer

# With gaps=True the pattern describes the separators between tokens rather
# than the tokens themselves, so the text is split on runs of whitespace and
# punctuation stays attached to the neighbouring word.
whitespace_runs = r'\s+'
tokenizer = RegexpTokenizer(whitespace_runs, gaps=True)

# `sentence` is the sample text defined in the word-tokenizer cell.
tokens = tokenizer.tokenize(sentence)
print(tokens)

['Natural', 'Language', 'Processing,', 'for', 'speech', 'and', 'text.']


In [24]:
# ------------------------------------------------------------
# ## Sentence Tokenization
# ------------------------------------------------------------

from nltk.tokenize import sent_tokenize

# Split the sample text (defined in the word-tokenizer cell) into sentences.
# NOTE(review): the sample holds a single sentence, so the recorded output is
# a one-element list; a multi-sentence sample would show the splitting better.
sentences = sent_tokenize(sentence)
print(sentences)

['Natural Language Processing, for  speech and text.']


In [25]:
# ------------------------------------------------------------
# ## Stemming - Porter Stemmer
# ------------------------------------------------------------
from nltk.stem import PorterStemmer

# Inflected variants of "process" and "preprocess" to reduce to their stems
words = ['processed', 'processing', 'processes', 'preprocessed']

# Run every word through a single Porter stemmer instance
stemmer = PorterStemmer()
stems = list(map(stemmer.stem, words))
print(stems)

['process', 'process', 'process', 'preprocess']


In [26]:
# ------------------------------------------------------------
# ## Lemmatization - WordNet Lemmatizer
# ------------------------------------------------------------

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Relies on `nltk` having been imported in the first cell.
nltk.download('wordnet')

words = ["beginning", "began", "begun", "begins"]

lemmatizer = WordNetLemmatizer()

# Lemmatize the same words twice, first tagged as verbs and then as nouns,
# to show how the part-of-speech argument changes the result.
# After the loop, `lemmas` holds the noun-tagged result.
for label, pos_tag in (("Verb Lemmas:", wordnet.VERB), ("Noun Lemmas:", wordnet.NOUN)):
    lemmas = [lemmatizer.lemmatize(word, pos=pos_tag) for word in words]
    print(label, lemmas)

Verb Lemmas: ['begin', 'begin', 'begin', 'begin']
Noun Lemmas: ['beginning', 'began', 'begun', 'begin']


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
