<a href="https://colab.research.google.com/github/ashutoshgithubs/Machine-Learning/blob/main/textPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:

# Install required packages
!pip install nltk

# Import required libraries
import nltk
import re

# Fetch the NLTK data packages this script relies on: the Punkt tokenizer
# tables, the stopword lists, and the WordNet database.
for resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Import specific functions
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Sample sentences used to demonstrate each preprocessing step
sample_texts = [
    "Natural Language Processing is fascinating! It involves computers understanding human language.",
    "Machine learning algorithms are revolutionizing the way we process textual data.",
    "Text preprocessing is crucial for NLP tasks like sentiment analysis and classification.",
    "Tokenization, stemming, and lemmatization are fundamental preprocessing steps.",
    "The quick brown foxes are running through the beautiful forests.",
]

# Visual divider printed between the stages below
separator = "\n" + "="*50 + "\n"

print("Original Text:")
print(sample_texts)
print(separator)

# Merge the individual sentences into one space-separated string
text = " ".join(sample_texts)
print("Combined Text:")
print(text)
print(separator)

# Collapse every run of whitespace into a single space and trim both ends
text = ' '.join(text.split())
print("After removing extra whitespaces:")
print(text)
print(separator)

# Step 1: Normalize case by lowercasing the whole text
text_lower = text.lower()
print("Step 1 - Lowercase:", text_lower, sep="\n")

# Step 2: Keep only ASCII letters and whitespace; digits and punctuation
# are deleted outright (not replaced by a space)
non_letter = re.compile(r'[^a-zA-Z\s]')
text_clean = non_letter.sub('', text_lower)
print("\nStep 2 - Remove punctuation:", text_clean, sep="\n")

# Step 3: Tokenization
# Step 3.1: Sentence tokenization.
# Sentence boundaries are detected on `text`, which still carries its
# punctuation and capitalization. `text_clean` has already had every '.',
# '!' and '?' stripped, so the tokenizer would have no boundaries left to
# find in it. Each detected sentence is then lowercased and cleaned with
# the same pattern as Steps 1-2 so the output matches the rest of the
# pipeline.
sentences = [
    re.sub(r'[^a-zA-Z\s]', '', sentence.lower())
    for sentence in sent_tokenize(text)
]
print("\nStep 3.1- Sent Tokenization:")
print(sentences)

# Step 3.2: Word tokenization of the lowercased, punctuation-free text
tokens = word_tokenize(text_clean)
print("\nStep 3.2 - Word Tokenization:")
print(tokens)
# Step 4: Remove stopwords
# The stopword list is turned into a set so each membership test is a
# hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))
tokens_no_stopwords = [token for token in tokens if token not in stop_words]

print("\nStep 4 - Remove stopwords:")
print(tokens_no_stopwords)

# Step 5: Stemming
# Run every remaining token through the Porter stemmer, preserving order
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(token) for token in tokens_no_stopwords]

print("\nStep 5 - Stemming:")
print(stemmed_words)

# Step 6: Lemmatization
# Works from the same stopword-free tokens as stemming (not from the stems).
# NOTE: lemmatize() is called without a part-of-speech argument, so NLTK's
# default POS (noun) applies to every token.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(token) for token in tokens_no_stopwords]

print("\nStep 6 - Lemmatization:")
print(lemmatized_words)

# Summary: show the whitespace-normalized input next to both token lists
print("\n" + "="*50)
print("FINAL RESULTS:")
for label, value in (
    ("Original:", text),
    ("Stemmed:", stemmed_words),
    ("Lemmatized:", lemmatized_words),
):
    print(label, value)

# Step 7: Pipelining

Original Text:
['Natural Language Processing is fascinating! It involves computers understanding human language.', 'Machine learning algorithms are revolutionizing the way we process textual data.', 'Text preprocessing is crucial for NLP tasks like sentiment analysis and classification.', 'Tokenization, stemming, and lemmatization are fundamental preprocessing steps.', 'The quick brown foxes are running through the beautiful forests.']


Combined Text:
Natural Language Processing is fascinating! It involves computers understanding human language. Machine learning algorithms are revolutionizing the way we process textual data. Text preprocessing is crucial for NLP tasks like sentiment analysis and classification. Tokenization, stemming, and lemmatization are fundamental preprocessing steps. The quick brown foxes are running through the beautiful forests.


After removing extra whitespaces:
Natural Language Processing is fascinating! It involves computers understanding human language. Ma

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
