In [None]:
import torch
import nltk
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModel


In [None]:
# Make the NLTK stopword corpus available for the stopword-filtering cell
nltk.download("stopwords")

# Load the tokenizer and the encoder from one shared checkpoint name so the
# vocabulary and the model weights come from the same source
model_name = "bert-base-uncased"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)


In [None]:
# Sample text: the single sentence that the tokenization and encoding steps
# in this notebook operate on
text = "This is an example sentence for advanced text preprocessing using Python and PyTorch!"

# Tokenization: split the sentence into the tokenizer's subword pieces.
# The result is a list of string tokens; the vocabulary-ID lookup is a
# separate step.
tokens = tokenizer.tokenize(text)
print("Subword Tokens:", tokens)


In [None]:
# Convert tokens to IDs: look up the vocabulary index of each subword token
# produced by `tokenizer.tokenize`, keeping the same order
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Token IDs:", token_ids)


In [None]:
# Full encoding pass for transformer input: calling the tokenizer adds the
# special tokens ([CLS] and [SEP]), pads or truncates the sequence to a fixed
# length of 20, and returns PyTorch tensors
encoded_input = tokenizer(
    text,
    max_length=20,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)
print("Encoded Input:", encoded_input)


In [None]:
# Stopword removal (optional: modern NLP models handle stopwords inherently,
# and the embedding cells shown in this notebook consume `encoded_input`,
# not this filtered list)
stop_words = set(stopwords.words("english"))

# Build the exclusion set once. `tokenizer.all_special_tokens` does not change
# between tokens, so it is read a single time here instead of once per token
# inside the comprehension, and membership is tested against a set rather
# than a list.
excluded_tokens = stop_words.union(tokenizer.all_special_tokens)

# `tokens` holds subword pieces, so a piece is dropped only when it is exactly
# equal to a stopword or to a special token.
filtered_tokens = [token for token in tokens if token not in excluded_tokens]
print("Filtered Tokens:", filtered_tokens)


In [None]:
# Contextualized embeddings: run one forward pass of the pretrained model
# with gradient tracking switched off, since this is inference only
with torch.no_grad():
    outputs = model(**encoded_input)

# Final-layer hidden states, one vector per input position
token_embeddings = outputs.last_hidden_state
print("Token Embeddings Shape:", token_embeddings.shape)


In [None]:
# Sentence embedding: keep only the vector at sequence position 0, i.e. the
# [CLS] token representation (assumes the usual (batch, sequence, hidden)
# layout of `token_embeddings`)
sentence_embedding = token_embeddings[:, 0]
print("Sentence Embedding Shape:", sentence_embedding.shape)
