In [None]:
# Define the input corpus (text data for tokenization)
corpus="""Hello Welcome,to Krish Naik's NLP Tutorials.
Please do watch the entire course! to become expert in NLP.
"""

In [None]:
# Print the original text
print(corpus)

Hello Welcome,to Krish Naik's NLP Tutorials.
Please do watch the entire course! to become expert in NLP.



In [None]:
# Import nltk for Natural Language Processing
import nltk

# Download the 'punkt_tab' tokenizer model (used for word and sentence tokenization)
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /Users/pro/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# ---------------------------------------------------
# Sentence Tokenization: Splitting text into sentences
# ---------------------------------------------------
from nltk.tokenize import sent_tokenize

In [None]:
# Tokenize the corpus into sentences
documents = sent_tokenize(corpus)

In [None]:
# Check the type of 'documents' (should be a list)
type(documents)

list

In [None]:
# Print each sentence obtained from sentence tokenization
for sentence in documents:
    print(sentence)

Hello Welcome,to Krish Naik's NLP Tutorials.
Please do watch the entire course!
to become expert in NLP.


In [None]:
# ---------------------------------------------------
# Word Tokenization: Splitting sentences into words
# ---------------------------------------------------
from nltk.tokenize import word_tokenize

In [None]:
# Example: Tokenizing entire corpus into words (not assigned here)
word_tokenize(corpus)

['Hello',
 'Welcome',
 ',',
 'to',
 'Krish',
 'Naik',
 "'s",
 'NLP',
 'Tutorials',
 '.',
 'Please',
 'do',
 'watch',
 'the',
 'entire',
 'course',
 '!',
 'to',
 'become',
 'expert',
 'in',
 'NLP',
 '.']

In [None]:
# Tokenize each sentence into words and print them
for sentence in documents:
    print(word_tokenize(sentence))

['Hello', 'Welcome', ',', 'to', 'Krish', 'Naik', "'s", 'NLP', 'Tutorials', '.']
['Please', 'do', 'watch', 'the', 'entire', 'course', '!']
['to', 'become', 'expert', 'in', 'NLP', '.']


In [None]:
# ---------------------------------------------------
# WordPunct Tokenizer: Splits words and punctuation
# Tokenize using wordpunct_tokenize (splits on punctuation too)
# ---------------------------------------------------
from nltk.tokenize import wordpunct_tokenize
wordpunct_tokenize(corpus)

In [33]:
# ---------------------------------------------------
# TreebankWordTokenizer: More fine-tuned tokenizer used in Penn Treebank
# ---------------------------------------------------
from nltk.tokenize import TreebankWordTokenizer

# Create tokenizer object
tokenizer=TreebankWordTokenizer()
# Tokenize the corpus using Treebank tokenizer
tokenizer.tokenize(corpus)

['Hello',
 'Welcome',
 ',',
 'to',
 'Krish',
 'Naik',
 "'s",
 'NLP',
 'Tutorials.',
 'Please',
 'do',
 'watch',
 'the',
 'entire',
 'course',
 '!',
 'to',
 'become',
 'expert',
 'in',
 'NLP',
 '.']

# --------------------------------------
# Conclusion and Importance of Tokenization
# --------------------------------------

"""
Tokenization is a fundamental step in Natural Language Processing (NLP).
In this notebook, we explored various tokenization techniques using NLTK:

1. Sentence Tokenization: Splits a paragraph into individual sentences.
2. Word Tokenization: Splits a sentence or paragraph into words.
3. WordPunct Tokenization: Splits words and punctuation separately.
4. TreebankWordTokenizer: A rule-based tokenizer that handles complex structures like contractions and punctuations more effectively.

Each tokenizer has its own use case depending on the task and level of detail required.

Why is tokenization important?
------------------------------
Tokenization is necessary because:
- It converts raw text into manageable pieces (tokens) for further analysis.
- It helps in tasks such as text classification, named entity recognition, POS tagging, and machine translation.
- Machine learning models cannot process raw text; they require structured input (typically in the form of tokens or vectors).
- Tokenization enables normalization, removing ambiguity from natural language.

In summary, tokenization serves as the bridge between human language and machine-understandable text representations. It's a critical preprocessing step for almost every NLP application.
"""
