In [2]:
! pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting gdown>=4.0.0 (from nlpaug)
  Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown, nlpaug
Successfully installed gdown-5.2.0 nlpaug-1.1.11


In [3]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from transformers import AutoTokenizer
import nlpaug.augmenter.word as naw

# Download necessary NLTK data
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource
# instead of 'punkt'; fetch both so the notebook runs on either.
nltk.download('punkt_tab')
nltk.download('stopwords')

# 1. Text Cleaning
def clean_text(text):
    """Normalize raw text to lowercase ASCII letters separated by single spaces.

    Args:
        text: Input string.

    Returns:
        The lowercased string with every character other than ASCII letters
        and whitespace deleted, whitespace runs collapsed to one space, and
        leading/trailing whitespace stripped.
    """
    lowered = text.lower()
    # Non-letters are deleted outright (not replaced by a space), so
    # "stop-me" becomes "stopme".
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', letters_and_spaces)
    return single_spaced.strip()

# 2. Tokenization
def tokenize_text(text):
    """Split text into word tokens and drop English stopwords.

    Args:
        text: String to tokenize.

    Returns:
        List of tokens produced by ``word_tokenize`` whose lowercase form is
        not in NLTK's English stopword list. Tokens that are kept retain
        their original casing.
    """
    # Tokenize using NLTK
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    # The stopword list is all lowercase, so compare case-insensitively;
    # otherwise "The" or "Is" in text that was not lowercased first would
    # slip through the filter.
    return [token for token in tokens if token.lower() not in stop_words]

# 3. Data Augmentation
def augment_text(text):
    """Produce a synonym-substituted variant of ``text``.

    A new WordNet-backed ``SynonymAug`` is constructed on every call and the
    result of its ``augment`` method is returned unchanged.

    Args:
        text: Text to augment.

    Returns:
        Whatever ``SynonymAug.augment`` returns for ``text``.
    """
    synonym_augmenter = naw.SynonymAug(aug_src='wordnet')
    return synonym_augmenter.augment(text)

# 4. Tokenization for LLM (using BERT tokenizer as an example)
def tokenize_for_llm(text, model_name="bert-base-uncased", max_length=512, tokenizer=None):
    """Encode text into fixed-length model inputs with a Hugging Face tokenizer.

    Args:
        text: String to encode.
        model_name: Checkpoint name passed to ``AutoTokenizer.from_pretrained``
            when no ``tokenizer`` is supplied.
        max_length: Length every sequence is padded or truncated to.
        tokenizer: Optional already-loaded tokenizer. Pass one when encoding
            many texts so the tokenizer is not re-loaded on every call.

    Returns:
        The tokenizer's encoding of ``text``, including ``input_ids`` and
        ``attention_mask``.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Calling the tokenizer directly is the documented replacement for the
    # deprecated ``encode_plus`` method.
    return tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )

# Example usage: run one sample sentence through each preprocessing step and
# print the intermediate results.
sample_text = "The quick brown fox jumps over the lazy dog. AI is transforming various industries."

# Clean the text (lowercase, letters only, single spaces)
cleaned_text = clean_text(sample_text)
print("Cleaned text:", cleaned_text)

# Tokenize the cleaned text and drop stopwords
tokens = tokenize_text(cleaned_text)
print("Tokens:", tokens)

# Augment the text. Note this is given the raw sample_text, not cleaned_text,
# so the original casing and punctuation reach the augmenter.
augmented_text = augment_text(sample_text)
print("Augmented text:", augmented_text)

# Tokenize the cleaned text for an LLM; only the first 10 entries of each
# sequence are printed because the encoding is padded to a fixed length.
llm_tokens = tokenize_for_llm(cleaned_text)
print("LLM tokens:", llm_tokens['input_ids'][:10], "...")  # Showing first 10 token ids
print("Attention mask:", llm_tokens['attention_mask'][:10], "...")  # Showing first 10 mask values

  Referenced from: <8E3FD81A-C2E9-3A49-B4B9-6094D47528A4> /opt/homebrew/Caskroom/miniforge/base/lib/python3.10/site-packages/torchvision/image.so
  warn(
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/grizzlystudio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/grizzlystudio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cleaned text: the quick brown fox jumps over the lazy dog ai is transforming various industries
Tokens: ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog', 'ai', 'transforming', 'various', 'industries']
Augmented text: ['The prompt brownness fox jumps over the lazy hotdog. three toed sloth is transforming diverse industries.']


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

LLM tokens: [101, 1996, 4248, 2829, 4419, 14523, 2058, 1996, 13971, 3899] ...
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] ...


In [4]:
from transformers import AutoTokenizer

# Initialize the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Token IDs copied by hand from the "LLM tokens" line printed by the previous
# cell. Only the first 10 IDs were printed there, so this is a truncated
# prefix of the full encoding, not the whole sequence.
token_ids = [101, 1996, 4248, 2829, 4419, 14523, 2058, 1996, 13971, 3899]

# Map each ID back to its vocabulary token
words = tokenizer.convert_ids_to_tokens(token_ids)

# Print each token ID next to its corresponding token
for token_id, word in zip(token_ids, words):
    print(f"Token ID: {token_id}, Word: {word}")

# Decode the ID sequence back to a single string (special tokens are included
# in the output because they are not skipped here)
decoded_text = tokenizer.decode(token_ids)
print("\nDecoded text:", decoded_text)

Token ID: 101, Word: [CLS]
Token ID: 1996, Word: the
Token ID: 4248, Word: quick
Token ID: 2829, Word: brown
Token ID: 4419, Word: fox
Token ID: 14523, Word: jumps
Token ID: 2058, Word: over
Token ID: 1996, Word: the
Token ID: 13971, Word: lazy
Token ID: 3899, Word: dog

Decoded text: [CLS] the quick brown fox jumps over the lazy dog
