In [1]:
import nltk
from nltk.corpus import gutenberg

In [2]:
# Raw text of the "carroll-alice.txt" file from NLTK's Gutenberg corpus.
alice = gutenberg.raw(fileids="carroll-alice.txt")

# Short hand-written sample. Adjacent literals are concatenated; the leading
# space on each continuation piece is part of the text, so two of the joins
# contain a double space.
sample_text = (
    "We will discuss briefly about the basic syntax,"
    " structure and design philosophies. "
    " There is a defined hierarchical syntax for Python code which you should remember "
    " when writing code! Python is a really powerful programming language!"
)

In [3]:
# Number of characters in the raw Alice text.
len(alice)

144395

In [4]:
# Peek at the first 100 characters of the raw text.
alice[:100]

"[Alice's Adventures in Wonderland by Lewis Carroll 1865]\n\nCHAPTER I. Down the Rabbit-Hole\n\nAlice was"

## NLTK Sentence Tokenizer
* nltk.sent_tokenize is the default sentence tokenizer recommended by nltk
* It is a pre-trained instance of the PunktSentenceTokenizer which works on various languages
* It doesn't rely on periods alone; it also considers other punctuation and the capitalization of words to delimit sentences.

In [5]:
# Keep a handle on NLTK's recommended sentence tokenizer and apply it to
# both the Alice corpus text and the short sample.
default_st = nltk.sent_tokenize
alice_sentences = default_st(text=alice)
sample_sentences = default_st(text=sample_text)

In [6]:
# Report the sentence count and the first two sentences for each text.
print(
    "total alice sentences:", len(alice_sentences),
    "\n", alice_sentences[:2],
    "\n\n",
)
print(
    "total sample sentences:", len(sample_sentences),
    "\n", sample_sentences[:2],
)

total alice sentences: 1625 
 ["[Alice's Adventures in Wonderland by Lewis Carroll 1865]\n\nCHAPTER I.", "Down the Rabbit-Hole\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into the\nbook her sister was reading, but it had no pictures or conversations in\nit, 'and what is the use of a book,' thought Alice 'without pictures or\nconversation?'"] 


total sample sentences: 3 
 ['We will discuss briefly about the basic syntax, structure and design philosophies.', 'There is a defined hierarchical syntax for Python code which you should remember  when writing code!']


**Tokenizing German Text**

In [7]:
from nltk.corpus import europarl_raw

In [8]:
# Raw German text of one Europarl file (file id "ep-00-01-17.de").
german_text = europarl_raw.german.raw(fileids="ep-00-01-17.de")

In [9]:
# Number of characters in the raw German text.
len(german_text)

157171

In [10]:
# Peek at the first 100 characters of the German text.
german_text[:100]

' \nWiederaufnahme der Sitzungsperiode Ich erkläre die am Freitag , dem 17. Dezember unterbrochene Sit'

In [11]:
# Same default tokenizer as before, but told to use the German model.
german_sentences_def = default_st(
    text=german_text,
    language="german",
)

**Loading Of Models for above usage**
* The code below shows that there is indeed a pre-trained German model for this task under `tokenizers/punkt` in the NLTK data directory.

In [12]:
# Load the pre-trained German Punkt model by its NLTK resource name rather
# than a machine-specific absolute path, so the lookup goes through
# nltk.data.path and works wherever the "punkt" data package is installed.
# NOTE(review): the model is a pickle shipped with NLTK data; newer NLTK
# releases may distribute Punkt in a different format -- confirm against the
# installed version.
german_tokenizer = nltk.data.load(resource_url="tokenizers/punkt/german.pickle")
german_sentences = german_tokenizer.tokenize(german_text)

In [13]:
# The explicitly loaded model and sent_tokenize(language="german") should
# produce the same sentence list.
german_sentences == german_sentences_def

True

In [14]:
# Report the German sentence count and the first two sentences.
print(
    "total german sentences:", len(german_sentences_def),
    "\n", german_sentences_def[:2],
)

total german sentences: 938 
 [' \nWiederaufnahme der Sitzungsperiode Ich erkläre die am Freitag , dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen , wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe , daß Sie schöne Ferien hatten .', 'Wie Sie feststellen konnten , ist der gefürchtete " Millenium-Bug " nicht eingetreten .']


**Tokenization Using the PunktSentenceTokenizer Class**
* Involves creating an instance of the class and calling its `tokenize` method on the text.

In [15]:
# Instantiate the Punkt sentence tokenizer directly (no language model is
# passed in) and split the sample text with it.
punkt_st = nltk.tokenize.PunktSentenceTokenizer()
sample_sentences = punkt_st.tokenize(sample_text)
print(sample_sentences)

['We will discuss briefly about the basic syntax, structure and design philosophies.', 'There is a defined hierarchical syntax for Python code which you should remember  when writing code!', 'Python is a really powerful programming language!']


**Sentence Tokenization using Regular Expressions**

In [16]:
# Regex describing the *gap* between sentences, assembled from adjacent raw
# string pieces.
SENTENCE_TOKENS_PATTERN = (
    r'(?<!\w\.\w.)'        # not right after a dotted form such as "i.e."
    r'(?<![A-Z][a-z]\.)'   # not right after a capitalised two-letter form such as "Mr."
    r'(?<![A-Z]\.)'        # not right after a single capital initial such as "J."
    r'(?<=\.|\?|\!)'       # must come right after ".", "?" or "!"
    r'\s'                  # exactly one whitespace character is the separator
)
# gaps=True: the pattern matches separators, and the text between them is
# returned as tokens.
# NOTE: only a single whitespace character is consumed, so where the text has
# two spaces after a sentence end, the next sentence keeps a leading space.
regex_st = nltk.tokenize.RegexpTokenizer(
    pattern=SENTENCE_TOKENS_PATTERN,
    gaps=True,
)
sample_sentences = regex_st.tokenize(sample_text)
sample_sentences

['We will discuss briefly about the basic syntax, structure and design philosophies.',
 ' There is a defined hierarchical syntax for Python code which you should remember  when writing code!',
 'Python is a really powerful programming language!']

## Word Tokenization
* To be done after the sentence tokenization to get the words as tokens
* Splitting is done on periods, commas and single quotes.
* Most punctuation characters are split off and returned as independent tokens.
* Contractions are split into two tokens, e.g. `do` and `n't` for `don't`.

In [18]:
# Example sentence containing two contractions.
sentence = "The brown fox wasn't that quick and he couldn't win the race"

# NLTK's recommended word tokenizer.
default_wt = nltk.word_tokenize
words = default_wt(sentence)
words

['The',
 'brown',
 'fox',
 'was',
 "n't",
 'that',
 'quick',
 'and',
 'he',
 'could',
 "n't",
 'win',
 'the',
 'race']

**TreebankWordTokenizer**
* `nltk.word_tokenize` applies Treebank-style tokenization rules; the `TreebankWordTokenizer` class can also be instantiated and used directly.

In [19]:
# Use the Treebank tokenizer class directly on the same sentence.
treebank = nltk.tokenize.TreebankWordTokenizer()
words = treebank.tokenize(sentence)
words

['The',
 'brown',
 'fox',
 'was',
 "n't",
 'that',
 'quick',
 'and',
 'he',
 'could',
 "n't",
 'win',
 'the',
 'race']

**Word Tokenization using Regular Expressions**
* The `pattern` parameter is the regular expression used for tokenizing.
* The `gaps` parameter says whether the pattern matches the gaps between tokens (`True`) or the tokens themselves (`False`).

In [20]:
# Pattern matches the tokens themselves: runs of word characters.
TOKEN_PATTERN = r'\w+'
regex_wt = nltk.RegexpTokenizer(
    pattern=TOKEN_PATTERN,
    gaps=False,
)
words = regex_wt.tokenize(sentence)
words

['The',
 'brown',
 'fox',
 'wasn',
 't',
 'that',
 'quick',
 'and',
 'he',
 'couldn',
 't',
 'win',
 'the',
 'race']

In [23]:
# Pattern matches the gaps between tokens: runs of whitespace.
GAP_PATTERN = r"\s+"
regex_wt = nltk.RegexpTokenizer(
    pattern=GAP_PATTERN,
    gaps=True,
)
words = regex_wt.tokenize(sentence)
words

['The',
 'brown',
 'fox',
 "wasn't",
 'that',
 'quick',
 'and',
 'he',
 "couldn't",
 'win',
 'the',
 'race']

In [26]:
# span_tokenize gives the (start, end) offsets of each token in `sentence`;
# slicing the sentence with those offsets recovers the tokens themselves.
word_indices = list(regex_wt.span_tokenize(sentence))
print([sentence[start:end] for start, end in word_indices])

['The', 'brown', 'fox', "wasn't", 'that', 'quick', 'and', 'he', "couldn't", 'win', 'the', 'race']


**Other Word Tokenizers**
* WordPunctTokenizer: Uses the pattern r'\w+|[^\w\s]+'
* WhitespaceTokenizer: Splits on whitespace such as tabs, newlines and spaces

In [27]:
# Tokenizer that separates runs of word characters from runs of punctuation.
word_punkt = nltk.tokenize.WordPunctTokenizer()
words = word_punkt.tokenize(sentence)
words

['The',
 'brown',
 'fox',
 'wasn',
 "'",
 't',
 'that',
 'quick',
 'and',
 'he',
 'couldn',
 "'",
 't',
 'win',
 'the',
 'race']

In [28]:
# Tokenizer that splits on whitespace only, leaving contractions intact.
whitespace_wt = nltk.tokenize.WhitespaceTokenizer()
words = whitespace_wt.tokenize(sentence)
words

['The',
 'brown',
 'fox',
 "wasn't",
 'that',
 'quick',
 'and',
 'he',
 "couldn't",
 'win',
 'the',
 'race']