# Tokenization.


In [1]:
# Sample paragraph used by every tokenizer below. It is written as adjacent
# string literals (one per sentence) that Python joins into a single string.
# The wording and spacing are kept exactly as-is -- including the double space
# after the fourth sentence -- because the outputs below depend on them.
text = (
    "Tokenization is one of the first step in any NLP pipeline. "
    "Tokenization is nothing but splitting the raw text into small chunks of words or sentences, called tokens. "
    "If the text is split into words, then its called as 'Word Tokenization' and if it's split into sentences then its called as 'Sentence Tokenization'. "
    "Generally 'space' is used to perform the word tokenization and characters like 'periods, exclamation point and newline char are used for Sentence Tokenization.  "
    "We have to choose the appropriate method as per the task in hand. "
    "While performing the tokenization few characters like spaces, punctuations are ignored and will not be the part of final list of tokens."
)

### Built-in methods


In [2]:
# With no separator given, str.split breaks the text on runs of whitespace.
tokens = text.split(sep=None)
print(tokens[0:5])

['Tokenization', 'is', 'one', 'of', 'the']


#### Sentence tokenization


In [3]:
# Naive sentence split: cut the text at every period. The periods themselves
# are dropped and the space that followed each one stays on the next chunk.
tokens = text.split(sep=".")
print(tokens[0:3])

['Tokenization is one of the first step in any NLP pipeline', ' Tokenization is nothing but splitting the raw text into small chunks of words or sentences, called tokens', " If the text is split into words, then its called as 'Word Tokenization' and if it's split into sentences then its called as 'Sentence Tokenization'"]


### RegEx tokenization


#### Word tokenization


In [4]:
import re

# Raw string so that `\w` reaches the regex engine untouched; in a normal
# string literal `\w` is an invalid escape sequence and triggers a warning on
# recent Python versions. `\w+` matches each run of word characters, so
# punctuation and whitespace are skipped.
tokens = re.findall(r"\w+", text)
print(tokens[:5])

['Tokenization', 'is', 'one', 'of', 'the']


### NLTK library


#### Word tokenization


In [5]:
!pip install nltk

In [6]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data, which is not shipped with the
# pip package; without it a fresh environment raises LookupError.
# Older NLTK releases look for "punkt", newer ones for "punkt_tab", so fetch
# both. quiet=True keeps the download log out of the cell output.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

tokens = word_tokenize(text)
print(tokens[:5])

['Tokenization', 'is', 'one', 'of', 'the']


#### Sentence tokenization


In [7]:
from nltk.tokenize import sent_tokenize

# sent_tokenize gives back the sentences as a list of strings, each keeping
# its closing period; show the first three.
tokens = sent_tokenize(text)
print(tokens[0:3])

['Tokenization is one of the first step in any NLP pipeline.', 'Tokenization is nothing but splitting the raw text into small chunks of words or sentences, called tokens.', "If the text is split into words, then its called as 'Word Tokenization' and if it's split into sentences then its called as 'Sentence Tokenization'."]


### spaCy library


#### Word tokenization


In [8]:
!pip install spacy
!python -m spacy download en

In [9]:
from spacy.lang.en import English

# Instantiate the English language class and call it on the raw text to get a
# Doc; iterating over the Doc yields its tokens.
english_tokenizer = English()
doc = english_tokenizer(text)

tokens = [word.text for word in doc]
print(tokens[0:5])

['Tokenization', 'is', 'one', 'of', 'the']


#### Sentence tokenization


In [10]:
# Fresh English pipeline with the "sentencizer" component added, so that the
# resulting Doc exposes sentence boundaries through `doc.sents`.
english_tokenizer = English()
english_tokenizer.add_pipe("sentencizer")

doc = english_tokenizer(text)

# `doc.sents` already yields one span per sentence, so collect them directly
# instead of asking each sentence span for its own `.sent`.
tokens = list(doc.sents)
print(tokens[:3])

[Tokenization is one of the first step in any NLP pipeline., Tokenization is nothing but splitting the raw text into small chunks of words or sentences, called tokens., If the text is split into words, then its called as 'Word Tokenization' and if it's split into sentences then its called as 'Sentence Tokenization'.]
