# Tokenization Example

## Define corpus (paragraph)

In [1]:
# Sample two-sentence paragraph; every tokenizer call in this notebook is run against it.
corpus = """The gentle flow of the river carried whispers of ancient stories as it meandered through the lush greenery, where sunlight filtered through towering trees, casting a mosaic of light and shadow on the water. A sense of timeless beauty enveloped the scene, reminding passersby of nature's enduring charm."""

## Tokenization

In [2]:
from nltk.tokenize import sent_tokenize

### Convert paragraph to sentences

In [3]:
# Split the paragraph into sentences; for this corpus the result is a list
# holding two sentence strings (see the type check and printout that follow).
documents = sent_tokenize(corpus)

In [4]:
# Inspect the container type returned by sent_tokenize.
type(documents)

list

In [5]:
# Show each detected sentence on its own line.
for sentence in documents:
    print(sentence)

The gentle flow of the river carried whispers of ancient stories as it meandered through the lush greenery, where sunlight filtered through towering trees, casting a mosaic of light and shadow on the water.
A sense of timeless beauty enveloped the scene, reminding passersby of nature's enduring charm.


### Convert sentences into words

In [6]:
from nltk.tokenize import word_tokenize

In [7]:
# Tokenize the whole paragraph into words. In the output, commas and periods
# come out as separate tokens and "nature's" is split into 'nature' and "'s".
word_tokenize(corpus)

['The',
 'gentle',
 'flow',
 'of',
 'the',
 'river',
 'carried',
 'whispers',
 'of',
 'ancient',
 'stories',
 'as',
 'it',
 'meandered',
 'through',
 'the',
 'lush',
 'greenery',
 ',',
 'where',
 'sunlight',
 'filtered',
 'through',
 'towering',
 'trees',
 ',',
 'casting',
 'a',
 'mosaic',
 'of',
 'light',
 'and',
 'shadow',
 'on',
 'the',
 'water',
 '.',
 'A',
 'sense',
 'of',
 'timeless',
 'beauty',
 'enveloped',
 'the',
 'scene',
 ',',
 'reminding',
 'passersby',
 'of',
 'nature',
 "'s",
 'enduring',
 'charm',
 '.']

In [8]:
from nltk.tokenize import wordpunct_tokenize

In [9]:
# Same paragraph through wordpunct_tokenize. Note the difference in the output:
# the apostrophe becomes its own token, so "nature's" yields 'nature', "'", 's'
# (word_tokenize kept "'s" together).
wordpunct_tokenize(corpus)

['The',
 'gentle',
 'flow',
 'of',
 'the',
 'river',
 'carried',
 'whispers',
 'of',
 'ancient',
 'stories',
 'as',
 'it',
 'meandered',
 'through',
 'the',
 'lush',
 'greenery',
 ',',
 'where',
 'sunlight',
 'filtered',
 'through',
 'towering',
 'trees',
 ',',
 'casting',
 'a',
 'mosaic',
 'of',
 'light',
 'and',
 'shadow',
 'on',
 'the',
 'water',
 '.',
 'A',
 'sense',
 'of',
 'timeless',
 'beauty',
 'enveloped',
 'the',
 'scene',
 ',',
 'reminding',
 'passersby',
 'of',
 'nature',
 "'",
 's',
 'enduring',
 'charm',
 '.']

In [10]:
from nltk.tokenize import TreebankWordTokenizer

In [11]:
# Create a Treebank tokenizer instance; the next cell calls its tokenize() method.
tokenizer = TreebankWordTokenizer()

In [12]:
# Tokenize the full paragraph with the Treebank tokenizer. In the output the
# period after the first sentence stays attached ('water.'), while the final
# period is split off as its own '.' token.
# NOTE(review): presumably this is because the Treebank tokenizer expects text
# that has already been split into sentences -- confirm against the NLTK docs.
tokenizer.tokenize(corpus)

['The',
 'gentle',
 'flow',
 'of',
 'the',
 'river',
 'carried',
 'whispers',
 'of',
 'ancient',
 'stories',
 'as',
 'it',
 'meandered',
 'through',
 'the',
 'lush',
 'greenery',
 ',',
 'where',
 'sunlight',
 'filtered',
 'through',
 'towering',
 'trees',
 ',',
 'casting',
 'a',
 'mosaic',
 'of',
 'light',
 'and',
 'shadow',
 'on',
 'the',
 'water.',
 'A',
 'sense',
 'of',
 'timeless',
 'beauty',
 'enveloped',
 'the',
 'scene',
 ',',
 'reminding',
 'passersby',
 'of',
 'nature',
 "'s",
 'enduring',
 'charm',
 '.']