In [None]:
import torch
import torch.nn as nn
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag


In [None]:
# Download required NLTK resources.
# Both the legacy and the newer resource names are requested so the notebook
# runs across NLTK versions: recent releases (3.9+) look up 'punkt_tab' for
# word_tokenize and 'averaged_perceptron_tagger_eng' for pos_tag, while older
# releases use 'punkt' and 'averaged_perceptron_tagger'.
for resource in (
    'punkt',
    'punkt_tab',  # previously missing: needed by word_tokenize on newer NLTK
    'wordnet',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)


In [None]:
# Sample text that the rest of the notebook preprocesses step by step.
text = "This is an example sentence for advanced text preprocessing using Python and PyTorch!"

# Tokenization: split the raw string into a list of tokens with NLTK's
# word_tokenize. The result is reused by the POS-tagging step below.
tokens = word_tokenize(text)
print("Tokens:", tokens)


In [None]:
# Part-of-Speech Tagging: label every token with its part of speech.
# The result is iterated as (token, tag) pairs by the lemmatization step.
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)


In [None]:
# Lemmatization with POS for context-aware lemmatization.
# A single WordNetLemmatizer instance is created once and reused for all tokens.
lemmatizer = WordNetLemmatizer()
def pos_to_wordnet_tag(pos_tag):
    """Translate a POS tag string into a one-letter WordNet POS code.

    The decision is made on the tag's leading letter only:
    'J' -> 'a' (adjective), 'V' -> 'v' (verb), 'N' -> 'n' (noun),
    'R' -> 'r' (adverb). Any other tag falls back to 'n' (noun).

    Note: the parameter name shadows the imported ``pos_tag`` function,
    but only inside this function body.
    """
    prefix_to_wordnet = (
        ('J', 'a'),  # adjective
        ('V', 'v'),  # verb
        ('N', 'n'),  # noun
        ('R', 'r'),  # adverb
    )
    for prefix, wordnet_code in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_code
    return 'n'  # default to noun

# Lemmatize each tagged token, passing the WordNet POS code derived from its
# tag so the lemmatizer can pick the right base form for that part of speech.
lemmatized_tokens = [
    lemmatizer.lemmatize(word, pos=pos_to_wordnet_tag(treebank_tag))
    for word, treebank_tag in pos_tags
]
print("Lemmatized Tokens:", lemmatized_tokens)


In [None]:
# Remove stopwords: keep only the lemmas whose lower-cased form is not in
# NLTK's English stopword list (held in a set for fast membership checks).
stop_words = set(stopwords.words('english'))
filtered_tokens = list(
    filter(lambda lemma: lemma.lower() not in stop_words, lemmatized_tokens)
)
print("Filtered Tokens (No Stopwords):", filtered_tokens)


In [None]:
# Convert tokens to lower-case and remove punctuation
import string

# string.punctuation is a plain str, so `token not in string.punctuation`
# would be a *substring* test: multi-character punctuation tokens such as
# "...", "--", "``" or "''" are not substrings of it and would be kept.
# Instead, drop a token when every one of its characters is punctuation
# (this also drops empty tokens, as the substring test did).
punctuation_chars = set(string.punctuation)
processed_tokens = [
    token.lower()
    for token in filtered_tokens
    if not all(char in punctuation_chars for char in token)
]
print("Processed Tokens (Lowercase and No Punctuation):", processed_tokens)


In [None]:
# Character-level encoding: represent each token as a PyTorch tensor of the
# numeric code points of its characters.
def token_to_tensor(token):
    """Return a 1-D float tensor with one Unicode code point per character.

    Args:
        token: The string to encode.

    Returns:
        A ``torch.float`` tensor whose i-th entry is ``ord(token[i])``;
        an empty string yields an empty tensor.
    """
    code_points = list(map(ord, token))
    return torch.tensor(code_points, dtype=torch.float)

# Encode every processed token; the result is a list of 1-D tensors whose
# lengths follow the token lengths.
token_tensors = list(map(token_to_tensor, processed_tokens))
print("Token Tensors:", token_tensors)


In [None]:
# For compatibility with downstream tasks, pad tensors to equal length
from torch.nn.utils.rnn import pad_sequence

if token_tensors:
    # Rows are tokens, columns are character positions; shorter tokens are
    # right-padded with 0 up to the longest token's length.
    padded_tensors = pad_sequence(token_tensors, batch_first=True, padding_value=0)
else:
    # pad_sequence raises on an empty list (e.g. when the text contained only
    # stopwords/punctuation), so fall back to an empty (0, 0) float batch.
    padded_tensors = torch.empty((0, 0), dtype=torch.float)
print("Padded Token Tensors:", padded_tensors)
