<a href="https://colab.research.google.com/github/ashutoshgithubs/Machine-Learning/blob/main/textPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:

# Install required packages
!pip install nltk

# Import required libraries
import nltk
import re
from collections import Counter

# Fetch the NLTK resources used below: the Punkt tokenizer tables,
# the stopword lists, and the WordNet database for lemmatization.
for resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Import specific functions
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Sample documents for preprocessing (one string per document).
# `full_text` keeps the untouched list for later cells.
full_text = [
    "Natural Language Processing is fascinating! It involves computers understanding human language.",
    "Machine learning algorithms are revolutionizing the way we process textual data.",
    "Text preprocessing is crucial for NLP tasks like sentiment analysis and classification.",
    "Tokenization, stemming, and lemmatization are fundamental preprocessing steps.",
    "The quick brown foxes are running through the beautiful forests."
]

print("Original Text:")
print(full_text)
print("\n" + "=" * 50 + "\n")

# Merge all documents into one space-separated string.
text = " ".join(full_text)
print("Combined Text:")
print(text)
print("\n" + "=" * 50 + "\n")

# Collapse any run of whitespace into a single space.
text = " ".join(text.split())
print("After removing extra whitespaces:")
print(text)
print("\n" + "=" * 50 + "\n")

# Step 1: Convert to lowercase
text_lower = text.lower()
print("Step 1 - Lowercase:")
print(text_lower)

# Step 2: Remove punctuation and numbers
# Each non-letter is replaced by a space (and whitespace is then collapsed) so that
# words joined only by punctuation, e.g. "state-of-the-art", are split apart
# instead of being glued into one token.
text_clean = ' '.join(re.sub(r'[^a-zA-Z\s]', ' ', text_lower).split())
print("\nStep 2 - Remove punctuation:")
print(text_clean)

# Step 3: Tokenization
# Tokenize Sentence
# Sentence boundaries are found from punctuation, so the splitter must see the
# original `text`; `text_clean` has no punctuation left and would come back as
# one single "sentence".
sentences = sent_tokenize(text)
print("\nStep 3.1- Sent Tokenization:")
print(sentences)
# Tokenize Word
# Word tokens are taken from the cleaned, lowercased text.
tokens = word_tokenize(text_clean)
print("\nStep 3.2 - Word Tokenization:")
print(tokens)
# Step 4: Remove stopwords
# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words('english'))
tokens_no_stopwords = [token for token in tokens if token not in stop_words]

print("\nStep 4 - Remove stopwords:")
print(tokens_no_stopwords)

# Step 5: Stemming
# Reduce every remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(token) for token in tokens_no_stopwords]

print("\nStep 5 - Stemming:")
print(stemmed_words)

# Step 6: Lemmatization
# Lemmatize the same stopword-free tokens (not the stems) with WordNet.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(token) for token in tokens_no_stopwords]

print("\nStep 6 - Lemmatization:")
print(lemmatized_words)

# Summary: the whitespace-normalised text next to its stemmed and lemmatized tokens.
print("\n" + "=" * 50)
print("FINAL RESULTS:")
for label, value in (
    ("Original:", text),
    ("Stemmed:", stemmed_words),
    ("Lemmatized:", lemmatized_words),
):
    print(label, value)
print("=" * 50)

# Step 7: Bag of words
# Count how many times each lemma occurs.
bow_counter = Counter(lemmatized_words)
print("Word Frequencies:")
for term, frequency in bow_counter.items():
    print(f"{term}: {frequency}")

# Distinct lemmas versus total lemma occurrences.
vocabulary_size = len(bow_counter)
total_words = sum(bow_counter.values())
print(f"\nVocabulary Size: {vocabulary_size}")
print(f"Total Words: {total_words}")

Original Text:
['Natural Language Processing is fascinating! It involves computers understanding human language.', 'Machine learning algorithms are revolutionizing the way we process textual data.', 'Text preprocessing is crucial for NLP tasks like sentiment analysis and classification.', 'Tokenization, stemming, and lemmatization are fundamental preprocessing steps.', 'The quick brown foxes are running through the beautiful forests.']


Combined Text:
Natural Language Processing is fascinating! It involves computers understanding human language. Machine learning algorithms are revolutionizing the way we process textual data. Text preprocessing is crucial for NLP tasks like sentiment analysis and classification. Tokenization, stemming, and lemmatization are fundamental preprocessing steps. The quick brown foxes are running through the beautiful forests.


After removing extra whitespaces:
Natural Language Processing is fascinating! It involves computers understanding human language. Ma

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# `full_text` still refers to the original list of documents; only `text` was
# rebound to the joined string.
print(full_text)

['Natural Language Processing is fascinating! It involves computers understanding human language.', 'Machine learning algorithms are revolutionizing the way we process textual data.', 'Text preprocessing is crucial for NLP tasks like sentiment analysis and classification.', 'Tokenization, stemming, and lemmatization are fundamental preprocessing steps.', 'The quick brown foxes are running through the beautiful forests.']


In [10]:
# Build one cleaned string per document in `full_text`:
# non-letters become spaces, whitespace is collapsed, and the result is lowercased.
# A comprehension keeps the per-document work local, so this cell does not
# overwrite the global `text_clean` produced by the preprocessing cell.
corpus = [
    ' '.join(re.sub(r'[^a-zA-Z\s]', ' ', document).split()).lower()
    for document in full_text
]


In [11]:
# Display the cleaned corpus (one lowercased string per document).
corpus


['natural language processing is fascinating it involves computers understanding human language',
 'machine learning algorithms are revolutionizing the way we process textual data',
 'text preprocessing is crucial for nlp tasks like sentiment analysis and classification',
 'tokenization stemming and lemmatization are fundamental preprocessing steps',
 'the quick brown foxes are running through the beautiful forests']

# Bag of Words

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
from nltk.util import ngrams

In [47]:
# Binary (presence/absence) bag-of-words over word trigrams only.
cv = CountVectorizer(ngram_range=(3, 3), binary=True)

# `lemmatized_words` is a list of single words; join it into one string so the
# whole text is treated as a single document.
lemmatized_text = ' '.join(lemmatized_words)

# The vectorizer expects an iterable of documents, hence the one-element list.
X = cv.fit_transform([lemmatized_text])

In [45]:
# Fit on the single joined document. Passing `lemmatized_words` directly would
# treat every word as its own one-word document; none of those can contain a
# trigram, so the trigram vectorizer would have no vocabulary to learn.
X = cv.fit_transform([lemmatized_text])

In [48]:
# Mapping from each learned trigram to its column index in X.
cv.vocabulary_

{'natural language processing': 18,
 'language processing fascinating': 13,
 'processing fascinating involves': 23,
 'fascinating involves computer': 7,
 'involves computer understanding': 11,
 'computer understanding human': 4,
 'understanding human language': 34,
 'human language machine': 10,
 'language machine learning': 12,
 'machine learning algorithm': 17,
 'learning algorithm revolutionizing': 14,
 'algorithm revolutionizing way': 0,
 'revolutionizing way process': 25,
 'way process textual': 35,
 'process textual data': 22,
 'textual data text': 32,
 'data text preprocessing': 6,
 'text preprocessing crucial': 31,
 'preprocessing crucial nlp': 20,
 'crucial nlp task': 5,
 'nlp task like': 19,
 'task like sentiment': 30,
 'like sentiment analysis': 16,
 'sentiment analysis classification': 27,
 'analysis classification tokenization': 1,
 'classification tokenization stemming': 3,
 'tokenization stemming lemmatization': 33,
 'stemming lemmatization fundamental': 28,
 'lemmatizat

In [32]:
# Densify the first row of X for display.
X[0].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

# TF-IDF

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [54]:
# Trigram TF-IDF over the single joined document.
# This rebinds `cv` and `X`, replacing the bag-of-words vectorizer and matrix.
# NOTE(review): with only one document fitted, every trigram gets the same IDF,
# so all weights come out equal; fit on several documents for informative
# TF-IDF scores.
cv = TfidfVectorizer(ngram_range=(3,3))
X = cv.fit_transform([lemmatized_text])

In [55]:
# Densify the first row of the TF-IDF matrix for display.
X[0].toarray()

array([[0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667,
        0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667,
        0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667,
        0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667,
        0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667,
        0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667,
        0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667,
        0.16666667]])