In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Ensure necessary NLTK resources are downloaded.
# 'punkt_tab' is the tokenizer data that word_tokenize looks up on recent NLTK
# releases; 'punkt' is kept so older installations keep working as before.
# nltk.download skips packages that are already up to date, so re-running is cheap.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

def preprocess_and_tokenize(text):
    """Lowercase, de-punctuate, tokenize, stop-word-filter and lemmatize ``text``.

    Args:
        text: Input string.

    Returns:
        list: Lemmatized tokens in their original order, with English stop
        words removed. ``lemmatize`` is called without a part-of-speech
        argument, so the lemmatizer's default applies to every token.
    """
    # Convert text to lowercase so stop-word matching is case-insensitive
    text = text.lower()

    # Replace every ASCII punctuation character with a space rather than
    # deleting it. Deleting fused words separated only by punctuation
    # ("hello,world" -> "helloworld") and turned "don't" into "dont", which
    # the stop-word filter below cannot match. Splitting instead yields
    # "don" + "t"; NLTK's English stop-word list includes fragments such as
    # these, so contractions are now filtered out.
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    text = text.translate(punctuation_to_space)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Drop stop words first, then lemmatize what is left
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

# Sample usage: run the pipeline on a two-sentence example and show the tokens
sample_text = (
    "Natural Language Processing (NLP) is a fascinating field! "
    "It involves analyzing and understanding human language."
)
tokens = preprocess_and_tokenize(text=sample_text)
print(tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


['natural', 'language', 'processing', 'nlp', 'fascinating', 'field', 'involves', 'analyzing', 'understanding', 'human', 'language']


NLTK

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re

# Ensure necessary NLTK resources are downloaded.
# 'punkt_tab' is the tokenizer data that word_tokenize looks up on recent NLTK
# releases; 'punkt' is kept so older installations keep working as before.
# nltk.download skips packages that are already up to date, so re-running is cheap.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

def preprocess_and_tokenize_nltk(text):
    """Normalize ``text`` and return its lemmatized, stop-word-free tokens.

    Steps, in order: lowercase, delete digit runs, turn ASCII punctuation into
    spaces, collapse whitespace, tokenize, drop English stop words, lemmatize.

    Args:
        text: Input string.

    Returns:
        list: Lemmatized tokens in their original order. ``lemmatize`` is
        called without a part-of-speech argument, so the lemmatizer's default
        applies to every token.
    """
    # Convert text to lowercase so stop-word matching is case-insensitive
    text = text.lower()

    # Remove numbers (every run of digits, including digits inside a word)
    text = re.sub(r'\d+', '', text)

    # Replace every ASCII punctuation character with a space rather than
    # deleting it. Deleting fused words separated only by punctuation
    # ("hello,world" -> "helloworld") and turned "don't" into "dont", which
    # the stop-word filter below cannot match. Splitting instead yields
    # "don" + "t"; NLTK's English stop-word list includes fragments such as
    # these, so contractions are now filtered out.
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation)
    )
    text = text.translate(punctuation_to_space)

    # Remove extra whitespace: collapse internal runs and trim both ends
    # (the previous strip() only trimmed the ends)
    text = ' '.join(text.split())

    # Tokenize the text
    tokens = word_tokenize(text)

    # Drop stop words first, then lemmatize what is left
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

# Sample usage: the example includes a number and trailing spaces so the extra
# cleaning steps have something to remove
sample_text = (
    "Natural Language Processing (NLP) is a fascinating field "
    "with over 20 years of research!  "
)
tokens_nltk = preprocess_and_tokenize_nltk(text=sample_text)
print("NLTK Preprocessed Tokens:", tokens_nltk)


NLTK Preprocessed Tokens: ['natural', 'language', 'processing', 'nlp', 'fascinating', 'field', 'year', 'research']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


spaCy

In [3]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load spaCy's English model ("en_core_web_sm") once at module level.
# preprocess_and_tokenize_spacy below reads this global `nlp` on every call,
# so this cell must run before that function is used.
# NOTE(review): presumably the model package is installed outside this
# notebook; confirm, since nothing here downloads it.
nlp = spacy.load("en_core_web_sm")

def preprocess_and_tokenize_spacy(text):
    """Return the lemmas of the content tokens spaCy finds in ``text``.

    The text is lowercased and stripped of leading/trailing whitespace, then
    passed through the module-level ``nlp`` pipeline. Tokens flagged as stop
    words, punctuation, whitespace or digits are skipped.

    Args:
        text: Input string.

    Returns:
        list: ``lemma_`` of every kept token, in document order.
    """
    # Normalize case and trim the ends before handing the string to spaCy
    doc = nlp(text.lower().strip())

    def is_content(tok):
        # Keep a token only if none of the exclusion flags is set
        return not (tok.is_stop or tok.is_punct or tok.is_space or tok.is_digit)

    return [tok.lemma_ for tok in doc if is_content(tok)]

# Sample usage: `sample_text` is not assigned in this cell, so it must already
# exist in the kernel from an earlier cell
tokens_spacy = preprocess_and_tokenize_spacy(text=sample_text)
print("spaCy Preprocessed Tokens:", tokens_spacy)


spaCy Preprocessed Tokens: ['natural', 'language', 'processing', 'nlp', 'fascinating', 'field', 'year', 'research']
