In [None]:
# corpus
corpus = """Hello world! John works at OpenAI, located in San Francisco. His email is john.doe@example.com. 


He recently completed a project on tokenization on 12/12/2023, focusing on character-level, word-level, 
and other advanced techniques. Happiness is the key to success."""




Character level tokeniser

In [10]:
# Character-level tokenization
char_tokens = list(corpus)

# Display the first 50 characters as tokens
print("Character-Level Tokens (First 50):")
print(char_tokens[:50])

# Display the total number of character tokens
print("\nTotal Character Tokens:", len(char_tokens))

Character-Level Tokens (First 50):
['H', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!', ' ', 'J', 'o', 'h', 'n', ' ', 'w', 'o', 'r', 'k', 's', ' ', 'a', 't', ' ', 'O', 'p', 'e', 'n', 'A', 'I', ',', ' ', 'l', 'o', 'c', 'a', 't', 'e', 'd', ' ', 'i', 'n', ' ', 'S', 'a', 'n', ' ']

Total Character Tokens: 265


Word-Level Tokeniser

In [11]:
import re

# Word-level tokenization using regex
word_tokens = re.findall(r'\b\w+\b', corpus)

# Display the first 20 word tokens
print("Word-Level Tokens (First 20):")
print(word_tokens[:20])

# Display the total number of word tokens
print("\nTotal Word Tokens:", len(word_tokens))

Word-Level Tokens (First 20):
['Hello', 'world', 'John', 'works', 'at', 'OpenAI', 'located', 'in', 'San', 'Francisco', 'His', 'email', 'is', 'john', 'doe', 'example', 'com', 'He', 'recently', 'completed']

Total Word Tokens: 44


Sentence-Level Tokenization

In [12]:
import re

# Sentence-level tokenization using regular expressions
sentence_tokens = re.split(r'(?<=[.!?]) +', corpus)

# Display the sentence tokens
print("Sentence-Level Tokens:")
for idx, sentence in enumerate(sentence_tokens):
    print(f"Sentence {idx + 1}: {sentence}")

# Display the total number of sentences
print("\nTotal Sentences:", len(sentence_tokens))

Sentence-Level Tokens:
Sentence 1: Hello world!
Sentence 2: John works at OpenAI, located in San Francisco.
Sentence 3: His email is john.doe@example.com.
Sentence 4: 
He recently completed a project on tokenization on 12/12/2023, focusing on character-level, word-level, 
and other advanced techniques.
Sentence 5: Happiness is the key to success.

Total Sentences: 5


Paragraph-Level Tokenization

In [13]:
# Paragraph-level tokenization by splitting on double newlines
paragraph_tokens = corpus.split('\n\n')

# Display the paragraph tokens
print("Paragraph-Level Tokens:")
for idx, paragraph in enumerate(paragraph_tokens):
    print(f"Paragraph {idx + 1}: {paragraph}")

# Display the total number of paragraphs
print("\nTotal Paragraphs:", len(paragraph_tokens))


Paragraph-Level Tokens:
Paragraph 1: Hello world! John works at OpenAI, located in San Francisco. His email is john.doe@example.com. 
He recently completed a project on tokenization on 12/12/2023, focusing on character-level, word-level, 
and other advanced techniques. Happiness is the key to success.

Total Paragraphs: 1


Subword Tokenization

1. BPE (Byte Pair Encoding)

2. WordPiece

3. SentencePiece

In [14]:
!pip install tokenizers


Collecting tokenizers
  Downloading tokenizers-0.20.3-cp311-none-win_amd64.whl.metadata (6.9 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from tokenizers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from huggingface-hub<1.0,>=0.16.4->tokenizers)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.16.4->tokenizers)
  Downloading fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Downloading tokenizers-0.20.3-cp311-none-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ------------- -------------------------- 0.8/2.4 MB 4.8 MB/s eta 0:00:01
   ------------------------------ --------- 1.8/2.4 MB 4.6 MB/s eta 0:00:01
   ---------------------------------------- 2.4/2.4 MB 4.7 MB/s eta 0:00:00
Downloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)
Downloading fsspec-2024.10.0-py3-none-any.whl (179 kB)
Downloading filelock-3.

In [15]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# Sample corpus
corpus = [
    """Hello world! John works at OpenAI, located in San Francisco. His email is john.doe@example.com. 
    He recently completed a project on tokenization on 12/12/2023, focusing on character-level, word-level, 
    and other advanced techniques. Happiness is the key to success."""
]

# Initialize the BPE tokenizer
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(vocab_size=100, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# Train the tokenizer
tokenizer.train_from_iterator(corpus, trainer)

# Encode the text using the trained BPE tokenizer
encoded = tokenizer.encode(corpus[0])

# Display BPE tokens
print("Byte-Pair Encoding (BPE) Tokens:")
print(encoded.tokens)


Byte-Pair Encoding (BPE) Tokens:
['Hel', 'lo', 'wor', 'l', 'd', '!', 'John', 'wor', 'k', 's', 'at', 'Op', 'en', 'AI', ',', 'lo', 'cated', 'in', 'San', 'Fr', 'an', 'cis', 'co', '.', 'His', 'em', 'ai', 'l', 'is', 'j', 'ohn', '.', 'do', 'e', '@', 'ex', 'ample', '.', 'co', 'm', '.', 'He', 'r', 'e', 'ce', 'n', 't', 'l', 'y', 'co', 'mple', 'ted', 'a', 'p', 'r', 'o', 'j', 'e', 'ct', 'on', 'to', 'k', 'en', 'i', 'z', 'at', 'i', 'on', 'on', '12', '/', '12', '/', '202', '3', ',', 'f', 'o', 'cu', 's', 'in', 'g', 'on', 'ch', 'ar', 'ac', 'te', 'r', '-', 'level', ',', 'wor', 'd', '-', 'level', ',', 'an', 'd', 'o', 'the', 'r', 'ad', 'v', 'an', 'ce', 'd', 'te', 'chn', 'i', 'q', 'u', 'es', '.', 'Ha', 'p', 'p', 'in', 'es', 's', 'is', 'the', 'k', 'e', 'y', 'to', 's', 'u', 'cce', 's', 's', '.']


WordPiece

In [16]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

# Sample corpus
corpus = [
    """Hello world! John works at OpenAI, located in San Francisco. His email is john.doe@example.com. 
    He recently completed a project on tokenization on 12/12/2023, focusing on character-level, word-level, 
    and other advanced techniques. Happiness is the key to success."""
]

# Initialize the WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(vocab_size=100, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# Train the tokenizer
tokenizer.train_from_iterator(corpus, trainer)

# Encode the text using the trained WordPiece tokenizer
encoded = tokenizer.encode(corpus[0])

# Display WordPiece tokens
print("WordPiece Tokens:")
print(encoded.tokens)


WordPiece Tokens:
['H', '##el', '##l', '##o', 'wor', '##l', '##d', '!', 'J', '##ohn', 'wor', '##k', '##s', 'a', '##t', 'O', '##p', '##en', '##A', '##I', ',', 'l', '##oc', '##a', '##ted', 'i', '##n', 'S', '##an', 'F', '##r', '##an', '##c', '##is', '##c', '##o', '.', 'H', '##is', 'e', '##m', '##a', '##i', '##l', 'is', 'j', '##ohn', '.', 'd', '##o', '##e', '@', 'e', '##x', '##a', '##mple', '.', 'co', '##m', '.', 'H', '##e', 'r', '##ec', '##en', '##t', '##l', '##y', 'co', '##mple', '##ted', 'a', 'p', '##r', '##o', '##j', '##ec', '##t', 'on', 'to', '##k', '##en', '##i', '##z', '##a', '##t', '##i', '##o', '##n', 'on', '12', '/', '12', '/', '2', '##0', '##2', '##3', ',', 'f', '##oc', '##u', '##s', '##in', '##g', 'on', 'c', '##h', '##a', '##r', '##a', '##c', '##t', '##er', '-', 'le', '##vel', ',', 'wor', '##d', '-', 'le', '##vel', ',', 'a', '##n', '##d', 'o', '##t', '##h', '##er', 'a', '##d', '##v', '##an', '##c', '##ed', 't', '##ec', '##hn', '##i', '##q', '##u', '##es', '.', 'H', '##a', '##p'

SentencePiece

In [18]:
!pip install sentencepiece


Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ------------------------------- -------- 786.4/991.5 kB 5.6 MB/s eta 0:00:01
   ---------------------------------------- 991.5/991.5 kB 5.8 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [20]:
import sentencepiece as spm
import os

# Save the corpus to a temporary file (SentencePiece requires input from a file)
with open("temp_corpus.txt", "w") as f:
    f.write("""Hello world! John works at OpenAI, located in San Francisco. His email is john.doe@example.com. 
    He recently completed a project on tokenization on 12/12/2023, focusing on character-level, word-level, 
    and other advanced techniques. Happiness is the key to success.""")

# Train the SentencePiece model with a reduced vocabulary size
spm.SentencePieceTrainer.Train('--input=temp_corpus.txt --model_prefix=sp_model --vocab_size=50')

# Load the trained model
sp = spm.SentencePieceProcessor(model_file='sp_model.model')

# Tokenize the corpus
tokens = sp.encode("""Hello world! John works at OpenAI, located in San Francisco. His email is john.doe@example.com. 
He recently completed a project on tokenization on 12/12/2023, focusing on character-level, word-level, 
and other advanced techniques. Happiness is the key to success.""", out_type=str)

# Display SentencePiece tokens
print("SentencePiece Tokens:")
print(tokens)

# Cleanup temporary files
os.remove("temp_corpus.txt")
os.remove("sp_model.model")
os.remove("sp_model.vocab")



SentencePiece Tokens:
['▁', 'H', 'e', 'l', 'l', 'o', '▁wor', 'l', 'd', '!', '▁', 'J', 'o', 'h', 'n', '▁wor', 'k', 's', '▁', 'a', 't', '▁', 'O', 'p', 'e', 'n', 'A', 'I', ',', '▁', 'l', 'o', 'c', 'a', 't', 'e', 'd', '▁', 'i', 'n', '▁', 'S', 'a', 'n', '▁', 'F', 'r', 'a', 'n', 'c', 'is', 'co', '.', '▁', 'H', 'is', '▁', 'e', 'm', 'a', 'i', 'l', '▁', 'is', '▁', 'j', 'o', 'h', 'n', '.', 'd', 'o', 'e', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'co', 'm', '.', '▁', 'H', 'e', '▁', 'r', 'e', 'c', 'e', 'n', 't', 'l', 'y', '▁', 'co', 'm', 'p', 'l', 'e', 't', 'e', 'd', '▁', 'a', '▁', 'p', 'r', 'o', 'j', 'e', 'c', 't', '▁o', 'n', '▁', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', '▁o', 'n', '▁', '1', '2', '/', '1', '2', '/', '2', '0', '2', '3', ',', '▁', 'f', 'o', 'c', 'u', 's', 'i', 'n', 'g', '▁o', 'n', '▁', 'c', 'h', 'a', 'r', 'a', 'c', 't', 'e', 'r', '-', 'l', 'e', 'v', 'e', 'l', ',', '▁wor', 'd', '-', 'l', 'e', 'v', 'e', 'l', ',', '▁', 'a', 'n', 'd', '▁o', 't', 'h', 'e', 'r', '▁'

Morphological Tokenization

In [24]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [25]:
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "He recently completed a project on tokenization, focusing on character-level, word-level, and morphological analysis."

# Process the text with spaCy
doc = nlp(text)

# Extract morphemes using spaCy's token analysis
print("Morphological Tokens:")
for token in doc:
    # Display the token, its lemma, and morphological details
    print(f"Word: {token.text}, Lemma: {token.lemma_}, Morphology: {token.morph}")


Morphological Tokens:
Word: He, Lemma: he, Morphology: Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs
Word: recently, Lemma: recently, Morphology: 
Word: completed, Lemma: complete, Morphology: Tense=Past|VerbForm=Fin
Word: a, Lemma: a, Morphology: Definite=Ind|PronType=Art
Word: project, Lemma: project, Morphology: Number=Sing
Word: on, Lemma: on, Morphology: 
Word: tokenization, Lemma: tokenization, Morphology: Number=Sing
Word: ,, Lemma: ,, Morphology: PunctType=Comm
Word: focusing, Lemma: focus, Morphology: Aspect=Prog|Tense=Pres|VerbForm=Part
Word: on, Lemma: on, Morphology: 
Word: character, Lemma: character, Morphology: Number=Sing
Word: -, Lemma: -, Morphology: PunctType=Dash
Word: level, Lemma: level, Morphology: Number=Sing
Word: ,, Lemma: ,, Morphology: PunctType=Comm
Word: word, Lemma: word, Morphology: Number=Sing
Word: -, Lemma: -, Morphology: PunctType=Dash
Word: level, Lemma: level, Morphology: Number=Sing
Word: ,, Lemma: ,, Morphology: PunctType=Comm
Word: and,

N-Gram Tokenization

In [26]:
from nltk import ngrams

# Sample text
text = "He recently completed a project on tokenization, focusing on character-level, word-level, and morphological analysis."

# Split the text into words
words = text.split()

# Generate bigrams (n=2)
bigrams = list(ngrams(words, 2))
print("Bigrams (n=2):")
for bigram in bigrams:
    print(bigram)

# Generate trigrams (n=3)
trigrams = list(ngrams(words, 3))
print("\nTrigrams (n=3):")
for trigram in trigrams:
    print(trigram)


Bigrams (n=2):
('He', 'recently')
('recently', 'completed')
('completed', 'a')
('a', 'project')
('project', 'on')
('on', 'tokenization,')
('tokenization,', 'focusing')
('focusing', 'on')
('on', 'character-level,')
('character-level,', 'word-level,')
('word-level,', 'and')
('and', 'morphological')
('morphological', 'analysis.')

Trigrams (n=3):
('He', 'recently', 'completed')
('recently', 'completed', 'a')
('completed', 'a', 'project')
('a', 'project', 'on')
('project', 'on', 'tokenization,')
('on', 'tokenization,', 'focusing')
('tokenization,', 'focusing', 'on')
('focusing', 'on', 'character-level,')
('on', 'character-level,', 'word-level,')
('character-level,', 'word-level,', 'and')
('word-level,', 'and', 'morphological')
('and', 'morphological', 'analysis.')


Syllable-Level Tokenization

In [27]:
!pip install syllapy

Collecting syllapy
  Downloading syllapy-0.7.2-py3-none-any.whl.metadata (854 bytes)
Downloading syllapy-0.7.2-py3-none-any.whl (24 kB)
Installing collected packages: syllapy
Successfully installed syllapy-0.7.2


In [31]:
import syllapy

# Sample text
text = "He recently completed a project on tokenization, focusing on character-level, word-level, and morphological analysis."

# Tokenize by words first
words = text.split()

# Perform syllable-level counting
print("Syllable Count for Each Word:")
for word in words:
    # Get the syllable count for each word
    syllable_count = syllapy.count(word)
    print(f"Word: {word}, Syllable Count: {syllable_count}")




Syllable Count for Each Word:
Word: He, Syllable Count: 1
Word: recently, Syllable Count: 2
Word: completed, Syllable Count: 3
Word: a, Syllable Count: 1
Word: project, Syllable Count: 2
Word: on, Syllable Count: 1
Word: tokenization,, Syllable Count: 5
Word: focusing, Syllable Count: 3
Word: on, Syllable Count: 1
Word: character-level,, Syllable Count: 5
Word: word-level,, Syllable Count: 3
Word: and, Syllable Count: 1
Word: morphological, Syllable Count: 5
Word: analysis., Syllable Count: 4


General Rules for Counting Syllables
Here are some basic rules to estimate syllable counts:

Each Vowel Sound is Usually a Syllable:

Examples: "Cat" (1 syllable), "Elephant" (3 syllables: e-le-phant)
Consonant-Vowel Patterns:

When consonants are placed between vowels, they often divide syllables.
Example: "Tokenization" (5 syllables: to-ken-i-za-tion).
Silent Letters:

Silent vowels don't add a syllable.
Example: "Bake" (1 syllable), where the "e" is silent.
Prefixes and Suffixes:

Prefixes and suffixes often add syllables if they have vowel sounds.
Example: "Happiness" (3 syllables: hap-pi-ness).
Examples of Syllable Counts in Words from the Sentence
Let's break down the syllable counts in some words from your example sentence:

"He": 1 syllable (one vowel sound)
"Recently": 3 syllables (re-cen-tly)
"Re" and "cent" both contain vowel sounds separated by consonants.
"Completed": 3 syllables (com-ple-ted)
"Com," "ple," and "ted" each contain distinct vowel sounds.
"Tokenization": 5 syllables (to-ken-i-za-tion)
This word contains multiple vowel sounds, creating five syllables.
Why Syllable Counts Matter
Syllable counts are often used in:

Poetry and Rhythm: Where the number of syllables affects the rhythm of a verse.
Speech and Pronunciation: Breaking words into syllables helps with proper pronunciation.
Speech Processing: Syllable-level tokenization can be useful in applications like text-to-speech and speech recognition.

Entity-Level Tokenization

In [35]:
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = """r. John Doe, a researcher at Google, traveled to New York on September 15, 2023, to attend the Tech Conference. 
You can contact him at john.doe@google.com for more information about the project. He previously worked for Microsoft in Seattle,"""

# Process the text with spaCy
doc = nlp(text)

# Extract named entities
print("Entity-Level Tokens:")
for entity in doc.ents:
    print(f"Entity: {entity.text}, Label: {entity.label_}")


Entity-Level Tokens:
Entity: r. John Doe, Label: PERSON
Entity: Google, Label: ORG
Entity: New York, Label: GPE
Entity: September 15, 2023, Label: DATE
Entity: the Tech Conference, Label: FAC
Entity: Microsoft, Label: ORG
Entity: Seattle, Label: GPE


Regex-Based Tokenization

In [39]:
import re

# Sample text
text = """
Dr. John Doe can be reached at john.doe@example.com. 
The project deadline is 09/30/2023, and the report is due on September 15, 2023. 
Visit our website at https://www.example.com for more information.
"""

# Regex patterns for various entities
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
date_pattern = r'\b(\d{2}/\d{2}/\d{4})\b|\b(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b'
url_pattern = r'https?://(?:www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:/[a-zA-Z0-9._%+-]*)*'


# Extract emails
emails = re.findall(email_pattern, text)
print("Emails Found:")
for email in emails:
    print(email)

# Extract dates
dates = re.findall(date_pattern, text)
print("\nDates Found:")
for date in dates:
    # date is a tuple with two parts; we only want the non-empty part
    print("".join(date))

# Extract URLs
urls = re.findall(url_pattern, text)
print("\nURLs Found:")
for url in urls:
    print(url)


Emails Found:
john.doe@example.com

Dates Found:
09/30/2023
September

URLs Found:
https://www.example.com
