# Example of using sentence splitter chunking
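Compare the contents of `splitting_1.txt` (chunks from `TokenTextSplitter`) and `splitting_2.txt` (chunks from `SentenceSplitter`), for example with `diff`, to see how the two chunking strategies differ.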

In [None]:
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index import SimpleDirectoryReader, Document
from llama_index.utils import globals_helper
from langchain.text_splitter import NLTKTextSplitter, SpacyTextSplitter, RecursiveCharacterTextSplitter

# Take the first document returned by the reader for the 'data' directory and
# chunk its text with a TokenTextSplitter built with default settings.
document = SimpleDirectoryReader('data').load_data()[0]
text_splitter_default = TokenTextSplitter()  # use default settings
text_chunks = text_splitter_default.split_text(document.text)
doc_chunks = [Document(t) for t in text_chunks]

# This tokenizer is only used to report the size of each chunk in the dump.
tokenizer = globals_helper.tokenizer

# Write every chunk, preceded by a separator and its index/token count, to
# splitting_1.txt. The encoding is pinned to UTF-8 so that non-ASCII text in
# the document does not fail (or get encoded differently) on platforms whose
# default locale encoding is not UTF-8.
with open('splitting_1.txt', 'w', encoding='utf-8') as f:
    for idx, doc in enumerate(doc_chunks):
        num_tokens = len(tokenizer(doc.text))
        f.write("\n-------\n\n{}. Size: {} tokens\n".format(idx, num_tokens) + doc.text)

from llama_index.langchain_helpers.text_splitter import SentenceSplitter

# Chunk the same document text with a SentenceSplitter (default settings) and
# dump the chunks to splitting_2.txt in the same layout as splitting_1.txt so
# the two files can be compared directly.
sentence_splitter = SentenceSplitter()
text_chunks = sentence_splitter.split_text(document.text)
doc_chunks = [Document(t) for t in text_chunks]

# The encoding is pinned to UTF-8 so that non-ASCII text in the document does
# not fail (or get encoded differently) on platforms whose default locale
# encoding is not UTF-8.
with open('splitting_2.txt', 'w', encoding='utf-8') as f:
    for idx, doc in enumerate(doc_chunks):
        num_tokens = len(tokenizer(doc.text))
        f.write("\n-------\n\n{}. Size: {} tokens\n".format(idx, num_tokens) + doc.text)

# Chunk the same document text with langchain's NLTKTextSplitter (default
# settings) and dump the chunks to splitting_3.txt.
nltk_splitter = NLTKTextSplitter()
text_chunks = nltk_splitter.split_text(document.text)
doc_chunks = [Document(t) for t in text_chunks]

# Re-bound here so this section does not depend on an earlier section having
# set `tokenizer`; it is only used to report chunk sizes.
tokenizer = globals_helper.tokenizer

# The encoding is pinned to UTF-8 so that non-ASCII text in the document does
# not fail (or get encoded differently) on platforms whose default locale
# encoding is not UTF-8.
with open('splitting_3.txt', 'w', encoding='utf-8') as f:
    for idx, doc in enumerate(doc_chunks):
        num_tokens = len(tokenizer(doc.text))
        f.write("\n-------\n\n{}. Size: {} tokens\n".format(idx, num_tokens) + doc.text)

# spacy_splitter = SpacyTextSplitter()
# text_chunks = spacy_splitter.split_text(document.text)
# doc_chunks = [Document(t) for t in text_chunks]
# tokenizer = globals_helper.tokenizer
# with open('splitting_4.txt', 'w') as f:
#     for idx, doc in enumerate(doc_chunks):
#         f.write("\n-------\n\n{}. Size: {} tokens\n".format(idx, len(tokenizer(doc.text))) + doc.text)

# from langchain.text_splitter import TokenTextSplitter
# token_text_splitter = TokenTextSplitter()
# text_chunks = token_text_splitter.split_text(document.text)
# doc_chunks = [Document(t) for t in text_chunks]
# tokenizer = globals_helper.tokenizer
# with open('splitting_5.txt', 'w') as f:
#     for idx, doc in enumerate(doc_chunks):
#         f.write("\n-------\n\n{}. Size: {} tokens\n".format(idx, len(tokenizer(doc.text))) + doc.text)

# recursive_splitter = RecursiveCharacterTextSplitter()
# text_chunks = recursive_splitter.split_text(document.text)
# doc_chunks = [Document(t) for t in text_chunks]
# tokenizer = globals_helper.tokenizer
# with open('splitting_6.txt', 'w') as f:
#     for idx, doc in enumerate(doc_chunks):
#         f.write("\n-------\n\n{}. Size: {} tokens\n".format(idx, len(tokenizer(doc.text))) + doc.text)


## Testing with Chinese

In [None]:
from llama_index.langchain_helpers.text_splitter import SentenceSplitter
from llama_index.readers.schema.base import Document
from llama_index.indices.service_context import ServiceContext
from llama_index.node_parser.simple import SimpleNodeParser
from llama_index.indices.vector_store import GPTVectorStoreIndex
import wikipedia

In [None]:

# Fetch the content of the Chinese-language Wikipedia page for '美国'
# ("United States") to use as non-English sample text.
wikipedia.set_lang('zh')
wiki_page = wikipedia.page('美国', auto_suggest=True)
page = wiki_page.content

# Split the page with a default SentenceSplitter. The call is left as the
# final bare expression so the notebook cell displays the resulting chunks.
sentence_splitter = SentenceSplitter()
sentence_splitter.split_text(page)

In [None]:

# Wrap the fetched page text in a single Document.
documents = [Document(page)]

# Route the sentence splitter through a SimpleNodeParser and hand that parser
# to the ServiceContext, so the index below is created with this context
# (presumably making it chunk documents with the sentence splitter - confirm
# against the llama_index version in use).
node_parser = SimpleNodeParser(text_splitter=sentence_splitter)
service_context = ServiceContext.from_defaults(node_parser=node_parser)
index = GPTVectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)