In [1]:
import nltk
import spacy
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS, preprocess_string, strip_punctuation, strip_numeric
from gensim.utils import simple_preprocess
from nltk.stem import PorterStemmer

# Fetch the NLTK data packages: the stopword corpus and the punkt tokenizer data
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)

# Small English SpaCy pipeline, shared by the functions below
nlp = spacy.load("en_core_web_sm")

# One Porter stemmer instance from NLTK, reused for every token
stemmer = PorterStemmer()

def preprocess_with_gensim(text):
    """Return the stemmed, stopword-free tokens of ``text``.

    The text is lowercased and run through Gensim's ``preprocess_string``
    with the ``strip_punctuation`` and ``strip_numeric`` filters. Tokens
    found in Gensim's ``STOPWORDS`` are dropped, and the rest are stemmed
    with the module-level NLTK ``stemmer``.

    Args:
        text: The raw input string.

    Returns:
        A list of stemmed token strings, in their original order.
    """
    # Explicit filter list: only these two filters are applied
    words = preprocess_string(text.lower(), [strip_punctuation, strip_numeric])

    # Filter out stopwords and stem the survivors in a single pass
    return [stemmer.stem(word) for word in words if word not in STOPWORDS]

# Lazily populated cache of NLTK's English stopwords (see helper below)
_NLTK_STOPWORDS_CACHE = None


def _nltk_english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them once."""
    global _NLTK_STOPWORDS_CACHE
    if _NLTK_STOPWORDS_CACHE is None:
        # Loaded on first use so the corpus lookup happens only after
        # nltk.download('stopwords') has had a chance to run.
        _NLTK_STOPWORDS_CACHE = frozenset(stopwords.words('english'))
    return _NLTK_STOPWORDS_CACHE


def process_text(text):
    """Tokenize ``text`` with SpaCy and drop NLTK English stopwords.

    The text is lowercased before tokenization, so the stopword check is
    effectively case-insensitive. Only stopwords are removed; any other
    token SpaCy produces (for example punctuation) is kept.

    Args:
        text: The raw input string.

    Returns:
        A list of token strings, in document order, with stopwords removed.
    """
    # Tokenize the lowercased text with the module-level SpaCy pipeline
    doc = nlp(text.lower())

    # Reuse the cached stopword set instead of rebuilding it on every call
    nltk_stopwords = _nltk_english_stopwords()

    return [token.text for token in doc if token.text not in nltk_stopwords]

# Sample sentence run through both pipelines
text = "Natural Language Processing with Python is amazing!"

# Pipeline 1: SpaCy tokenization followed by NLTK stopword removal
result_spacy = process_text(text)
print(
    "Filtered Tokens (SpaCy & NLTK):",
    result_spacy,
)

# Pipeline 2: Gensim preprocessing followed by NLTK Porter stemming
result_gensim = preprocess_with_gensim(text)
print(
    "Filtered Tokens (Gensim):",
    result_gensim,
)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Filtered Tokens (SpaCy & NLTK): ['natural', 'language', 'processing', 'python', 'amazing', '!']
Filtered Tokens (Gensim): ['natur', 'languag', 'process', 'python', 'amaz']
