In [1]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the tokenizer data that word_tokenize relies on. Older NLTK releases
# load the 'punkt' models, while newer releases look up 'punkt_tab' instead,
# so both are requested to keep this cell runnable on a fresh kernel
# regardless of the installed NLTK version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Word-level tokenization of a single sentence.
text = "Tokenization is crucial for NLP."
word_tokens = word_tokenize(text)
print("Word Tokens:", word_tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Word Tokens: ['Tokenization', 'is', 'crucial', 'for', 'NLP', '.']


In [2]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Build an untrained BPE tokenizer. The model is told which token represents
# unknown symbols so that it agrees with the "<unk>" special token registered
# with the trainer below; without it the model has no unknown token to emit
# for symbols that never appeared in the training data.
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
# Split the input on whitespace/punctuation before BPE merges are applied.
tokenizer.pre_tokenizer = Whitespace()

# Train on a tiny in-memory corpus.
training_data = ["unhappiness", "tokenization"]
trainer = BpeTrainer(special_tokens=["<pad>", "<s>", "</s>", "<unk>", "<mask>"])
tokenizer.train_from_iterator(training_data, trainer)

# Encode a word that was part of the training corpus.
# NOTE(review): with this two-word corpus the recorded result is the whole
# word as a single token — presumably the trainer's default vocabulary budget
# lets every pair be merged. Use a larger corpus or a small `vocab_size` on
# the trainer to see genuine subword pieces — TODO confirm.
output = tokenizer.encode("unhappiness")
print("Subword Tokens:", output.tokens)


Subword Tokens: ['unhappiness']


In [3]:
# Character-level tokenization: each character of the string is one token.
text = "Tokenization"
character_tokens = [char for char in text]
print("Character Tokens:", character_tokens)


Character Tokens: ['T', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n']


In [4]:
import re

# Regex-based word tokenization: collect every run of word characters, so
# punctuation such as the trailing period never becomes a token.
text = "Tokenization is crucial for NLP."
word_tokens = [match.group() for match in re.finditer(r'\b\w+\b', text)]
print("Word Tokens:", word_tokens)


Word Tokens: ['Tokenization', 'is', 'crucial', 'for', 'NLP']


In [5]:
import jieba

# The sentence below has no spaces between words, so word segmentation is
# delegated to jieba; the segments are materialized into a list.
text = "我喜欢自然语言处理"
word_tokens = list(jieba.cut(text))
print("Word Tokens:", word_tokens)


Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.718 seconds.
DEBUG:jieba:Loading model cost 1.718 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


Word Tokens: ['我', '喜欢', '自然语言', '处理']


In [6]:
import spacy

# Load the 'en_core_web_sm' pipeline and run it over one sentence.
nlp = spacy.load('en_core_web_sm')
text = "Tokenization is crucial for NLP."
doc = nlp(text)

# Word tokens: the text of every token in the processed document.
word_tokens = list(map(lambda tok: tok.text, doc))
print("Word Tokens:", word_tokens)

# Sentence tokens: the text of every sentence span exposed by doc.sents.
sentence_tokens = list(map(lambda span: span.text, doc.sents))
print("Sentence Tokens:", sentence_tokens)


Word Tokens: ['Tokenization', 'is', 'crucial', 'for', 'NLP', '.']
Sentence Tokens: ['Tokenization is crucial for NLP.']
