# Tokenization and segmentation

In [4]:
import stanza

# Build an English pipeline that runs only the `tokenize` processor, then run it
# on a short two-sentence sample.
sample_text = 'This is a test sentence for stanza. This is another sentence.'
nlp = stanza.Pipeline(lang='en', processors='tokenize')
doc = nlp(sample_text)

# Dump the tokens sentence by sentence; sentences are numbered from 1.
for sentence_number, sentence in enumerate(doc.sentences, start=1):
    print(f'------ Tokens for sentence {sentence_number} ------')
    # Only the first element of token.id is shown
    # (NOTE(review): presumably id has more than one element only for
    # multi-word tokens -- confirm against the Stanza data-object docs).
    token_lines = [f'id: {token.id[0]}\ttext: {token.text}' for token in sentence.tokens]
    print('\n'.join(token_lines))

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json: 154kB [00:00, 32.5MB/s]                    
2022-05-12 00:29:34 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2022-05-12 00:29:34 INFO: Use device: gpu
2022-05-12 00:29:34 INFO: Loading: tokenize
2022-05-12 00:29:34 INFO: Done loading processors!


------ Tokens for sentence 1 ------
id: 1	text: This
id: 2	text: is
id: 3	text: a
id: 4	text: test
id: 5	text: sentence
id: 6	text: for
id: 7	text: stanza
id: 8	text: .
------ Tokens for sentence 2 ------
id: 1	text: This
id: 2	text: is
id: 3	text: another
id: 4	text: sentence
id: 5	text: .


In [5]:
# Gather the text of every sentence found in `doc` and print the resulting list.
sentence_texts = [sent.text for sent in doc.sentences]
print(sentence_texts)

['This is a test sentence for stanza.', 'This is another sentence.']


### Tokenization without sentence segmentation

In [7]:
# tokenize_no_ssplit=True: in the output recorded for this cell the text is
# split into sentences only at the blank line ("\n\n"), so
# "This is a second. This is a third." comes back as one 10-token sentence.
pipeline_options = {
    'lang': 'en',
    'processors': 'tokenize',
    'tokenize_no_ssplit': True,
}
nlp = stanza.Pipeline(**pipeline_options)
doc = nlp('This is a sentence.\n\nThis is a second. This is a third.')

# Dump the tokens sentence by sentence; sentences are numbered from 1.
for sentence_number, sentence in enumerate(doc.sentences, start=1):
    print(f'------ Tokens for sentence {sentence_number} ------')
    token_lines = [f'id: {token.id[0]}\ttext: {token.text}' for token in sentence.tokens]
    print('\n'.join(token_lines))
    

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json: 154kB [00:00, 54.3MB/s]                    
2022-05-12 00:34:05 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2022-05-12 00:34:05 INFO: Use device: gpu
2022-05-12 00:34:05 INFO: Loading: tokenize
2022-05-12 00:34:05 INFO: Done loading processors!


------ Tokens for sentence 1 ------
id: 1	text: This
id: 2	text: is
id: 3	text: a
id: 4	text: sentence
id: 5	text: .
------ Tokens for sentence 2 ------
id: 1	text: This
id: 2	text: is
id: 3	text: a
id: 4	text: second
id: 5	text: .
id: 6	text: This
id: 7	text: is
id: 8	text: a
id: 9	text: third
id: 10	text: .


In [11]:
# Pre-tokenized input (tokenize_pretokenized=True): in the output recorded for
# this cell, tokens are split on whitespace only ('token.ization' and 'way!'
# stay intact) and the newline separates the two sentences.
nlp = stanza.Pipeline(lang='en', processors='tokenize', tokenize_pretokenized=True)
pretokenized_text = ' This is token.ization done my way!\nSentence split, too!'
doc = nlp(pretokenized_text)

# Dump the tokens sentence by sentence; sentences are numbered from 1.
for position, sentence in enumerate(doc.sentences):
    print(f'------ Tokens for sentence {position + 1} ------')
    print('\n'.join(f'id: {token.id[0]}\ttext: {token.text}' for token in sentence.tokens))

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json: 154kB [00:00, 51.3MB/s]                    
2022-05-12 00:37:46 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2022-05-12 00:37:46 INFO: Use device: gpu
2022-05-12 00:37:46 INFO: Loading: tokenize
2022-05-12 00:37:46 INFO: Done loading processors!


------ Tokens for sentence 1 ------
id: 1	text: This
id: 2	text: is
id: 3	text: token.ization
id: 4	text: done
id: 5	text: my
id: 6	text: way!
------ Tokens for sentence 2 ------
id: 1	text: Sentence
id: 2	text: split,
id: 3	text: too!
