In [None]:
import string

import nltk
import torch
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from torch.nn.functional import normalize
from transformers import AutoTokenizer, AutoModel


In [None]:
# Download required NLTK resources
# Both the legacy and the current resource names are requested so the
# notebook runs on old and new NLTK releases alike:
#   - 'punkt' / 'punkt_tab'  -> tokenizer models used by word_tokenize
#     (recent NLTK releases look up 'punkt_tab' instead of 'punkt')
#   - 'averaged_perceptron_tagger' / 'averaged_perceptron_tagger_eng'
#     -> models used by pos_tag
#   - 'wordnet'   -> WordNetLemmatizer
#   - 'stopwords' -> stopwords.words('english')
# NOTE(review): an NLTK release that does not know one of these ids should
# just report the failed download and continue — confirm on the pinned version.
for resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)


In [None]:
# Load a Transformer-based model
# `model_name` is passed to both `from_pretrained` calls below, so the
# tokenizer and the model are loaded from the same checkpoint identifier.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


In [None]:
# Sample text
# A single hard-coded sentence that the rest of the notebook processes.
text = "This is an example sentence for advanced text preprocessing using Python and PyTorch!"

# Tokenization
# Split the raw string into tokens with NLTK's `word_tokenize`; `tokens`
# is what the POS-tagging cell consumes next.
tokens = word_tokenize(text)
print("Tokens:", tokens)


In [None]:
# Part-of-Speech Tagging
# Tag the tokens on their original casing. The result is iterated as
# (token, tag) pairs by the lemmatization step later in the notebook.
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)


In [None]:
# Lemmatization with POS for context-aware lemmatization
# A single WordNetLemmatizer instance is created here and reused for every
# token in the lemmatization comprehension below.
lemmatizer = WordNetLemmatizer()
def pos_to_wordnet_tag(pos_tag):
    """Map a POS tag string to the single-letter POS code used for lemmatization.

    The tag is matched by its leading character only:
    'J...' -> 'a' (adjective), 'V...' -> 'v' (verb),
    'N...' -> 'n' (noun), 'R...' -> 'r' (adverb).

    Args:
        pos_tag: POS tag string (in this notebook, the tag half of the
            pairs returned by nltk's tagger).

    Returns:
        One of 'a', 'v', 'n', 'r'. Any tag that matches none of the known
        prefixes (including the empty string) falls back to 'n'.
    """
    # Checked in order; the first matching prefix wins.
    prefix_to_wordnet = (
        ('J', 'a'),  # adjective
        ('V', 'v'),  # verb
        ('N', 'n'),  # noun
        ('R', 'r'),  # adverb
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    return 'n'  # default to noun

# Lemmatize every token, using the WordNet POS derived from its tag.
# The token is lower-cased before lookup because WordNet lemma lookup is
# case-sensitive: a capitalised form such as "Dogs" or a sentence-initial
# "Running" would otherwise be returned unchanged. The POS tags were computed
# on the original casing, so tagging is unaffected; later steps lower-case
# the tokens anyway.
lemmatized_tokens = [
    lemmatizer.lemmatize(token.lower(), pos=pos_to_wordnet_tag(tag))
    for token, tag in pos_tags
]
print("Lemmatized Tokens:", lemmatized_tokens)


In [None]:
# Remove stopwords
# Build the English stop-word set once, then keep only those tokens whose
# lower-cased form is not a member of it (original token text is preserved).
stop_words = set(stopwords.words('english'))
filtered_tokens = list(
    filter(lambda tok: tok.lower() not in stop_words, lemmatized_tokens)
)
print("Filtered Tokens (No Stopwords):", filtered_tokens)


In [None]:
# Convert tokens to lower-case and remove punctuation
# A token is dropped when it consists solely of punctuation characters
# (stripping all punctuation leaves nothing). A plain
# `token not in string.punctuation` test is a substring check against one
# long string, so multi-character tokens produced by the tokenizer such as
# "``", "''", "..." or "--" would slip through it.
processed_tokens = [
    token.lower()
    for token in filtered_tokens
    if token.strip(string.punctuation)
]
print("Processed Tokens (Lowercase and No Punctuation):", processed_tokens)


In [None]:
# Convert tokens to embeddings using a pretrained Transformer
# The cleaned tokens are re-joined with single spaces because the tokenizer
# is called on one string here, not on the token list.
input_text = " ".join(processed_tokens)  # Combine tokens into a single sentence for context
# return_tensors="pt" requests PyTorch tensors; `encoded_input` is later
# unpacked as keyword arguments into the model call.
encoded_input = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)


In [None]:
# Pass tokens through the model to extract embeddings
# Gradient tracking is switched off for this forward pass: the notebook only
# reads the outputs and never calls backward.
with torch.no_grad():
    # Unpack the tokenizer's output as keyword arguments to the model.
    output = model(**encoded_input)


In [None]:
# Extract sentence-level embedding
# Take the hidden state at sequence position 0 for every item in the batch
# (presumably the [CLS] token for this BERT checkpoint — confirm), which
# drops the sequence axis and leaves one vector per input.
sentence_embedding = output.last_hidden_state[:, 0]
print("Sentence Embedding Shape:", sentence_embedding.shape)


In [None]:
# Token-level embeddings
# The full `last_hidden_state` tensor, kept un-indexed (the sentence-level
# cell takes position 0 of this same tensor).
token_embeddings = output.last_hidden_state
print("Token Embeddings Shape:", token_embeddings.shape)


In [None]:
# Normalize embeddings for downstream use
# L2-normalize (p=2) along dim=1, i.e. each row of `sentence_embedding` is
# scaled to unit Euclidean length. `normalize` is imported together with the
# other imports in the first cell so that all dependencies are declared in
# one place and the notebook survives Restart & Run All.
normalized_embedding = normalize(sentence_embedding, p=2, dim=1)
print("Normalized Sentence Embedding:", normalized_embedding)
