In [1]:
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rajat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
text_data = np.array(['Feature Extraction aims to reduce the number of features in a dataset by creating new features from the existing ones (and then discarding the original features). These new reduced set of features should then be able to summarize most of the information contained in the original set of features!!!. In this way, a summarised version of the original features can be created from a combination of the original set!!!',
                      'Another commonly used technique to reduce the number of feature in a dataset is Feature Selection! The difference between Feature Selection and/or Feature Extraction is that feature selection aims instead to $ rank the importance of the existing features in the dataset and discard less important ones (no new features are created)?!. If you are interested in finding out more about Feature Selection, you can find more information about it in my previous article.',
                      'In this article, I will walk you through how to apply Feature Extraction techniques using the Kaggle Mushroom Classification Dataset as an example??? Our objective will be to try to predict if a Mushroom is poisonous or not by looking at the given features. All the code used in this post (and more!) is available on Kaggle and on my GitHub Account.'])

In [3]:
# Remove stopwords and punctuation
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = word_tokenize(text)          # Tokenize the text into words
    filtered_words = [word.lower() for word in words if word.lower() not in stop_words]  # Remove stopwords
    return ' '.join(filtered_words)

processed_text_data_step1 = [preprocess_text(text) for text in text_data]

In [4]:
# Apply stemming and lemmatization
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def apply_stemming_and_lemmatization(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]
    return ' '.join(lemmatized_words)

processed_text_data_step2 = [apply_stemming_and_lemmatization(text) for text in processed_text_data_step1]

In [5]:
# Part-of-Speech Tagging
def pos_tagging(text):
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    return tagged_words

processed_text_data_step3 = [pos_tagging(text) for text in processed_text_data_step2]

In [6]:
# Bag of Words
vectorizer = CountVectorizer()
bag_of_words_matrix = vectorizer.fit_transform(processed_text_data_step2)
bag_of_words_feature_names = vectorizer.get_feature_names_out()

In [7]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_text_data_step2)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

In [8]:
print("Remove stopwords and punctuation")
print(processed_text_data_step1)
print("Apply stemming and lemmatization")
print(processed_text_data_step2)
print("Part-of-Speech Tagging")
print(processed_text_data_step3)
print("Bag of Words")
print(bag_of_words_matrix.toarray())
print("Feature Names:", bag_of_words_feature_names)
print("TF-IDF")
print(tfidf_matrix.toarray())
print("Feature Names:", tfidf_feature_names)

Remove stopwords and punctuation
['feature extraction aims reduce number features dataset creating new features existing ones discarding original features new reduced set features able summarize information contained original set features way summarised version original features created combination original set', 'another commonly used technique reduce number feature dataset feature selection difference feature selection andor feature extraction feature selection aims instead rank importance existing features dataset discard less important ones new features created interested finding feature selection find information previous article', 'article walk apply feature extraction techniques using kaggle mushroom classification dataset example objective try predict mushroom poisonous looking given features code used post available kaggle github account']
Apply stemming and lemmatization
['featur extract aim reduc number featur dataset creat new featur exist one discard origin featur new redu