### Load the corpus

In [None]:
import numpy as np

# Build the training and test corpora from a one-utterance-per-line text file.
#
# Every corpus entry has the shape [joined, words], where `words` is a list of
# sentences (each a list of word strings) and `joined` holds the same sentences
# with their words concatenated into one unsegmented string.

N_DOCS = 400          # number of training documents to sample
SENTS_PER_DOC = 200   # sentences drawn for each training document

corpus_brtext = []
corpus_brtext_test = []

# or br-phono.txt
with open('br-text.txt', encoding='utf-8') as f:
    # Keep each distinct line once; iterate the file directly, no readlines().
    sents_set = {line.rstrip('\n') for line in f}
# A blank line would otherwise turn into the bogus sentence [''].
sents_set.discard('')
# Set iteration order is arbitrary, so `sents` is in no particular order.
sents = [line.split(' ') for line in sents_set]

# Sample SENTS_PER_DOC distinct sentences to form a doc; repeat N_DOCS times.
# Drawing without replacement gives every doc exactly that many sentences
# (capped at the corpus size) instead of "200 minus the duplicates".
doc_size = min(SENTS_PER_DOC, len(sents))
for _ in range(N_DOCS):
    picked = np.random.choice(len(sents), size=doc_size, replace=False)
    doc = [sents[i] for i in picked]
    corpus_brtext.append([[''.join(words) for words in doc], doc])

# Test set: the last 10% of `sents`, stored as a single document.
# NOTE(review): these sentences also belong to the pool the training docs are
# sampled from -- confirm that the overlap is intended.
sents2 = sents[int(len(sents) * 0.9):]
sents2 = [[''.join(words) for words in sents2], sents2]
corpus_brtext_test.append(sents2)

### Run LiB

In [None]:
from libtok.model import LessIsBetter
from libtok.structures import TrieList

# Hyper-parameters handed to the LiB model.
lib_params = dict(
    # Common parameters
    life=10,
    max_len=12,
    memory_in=0.25,
    memory_out=0.0001,
    update_rate=0.2,
    # Skip-gram detection (new feature, still being tested; switched off here)
    use_skip=False,
    mini_gap=7,
)
model = LessIsBetter(**lib_params)

# Train/test sets
corpus_train, corpus_test = corpus_brtext, corpus_brtext_test

# Create and initialise the Lexicon memory from the unsegmented sentences of
# the first training document.
model.initialise(corpus_train[0][0])

# Train. According to the original notes, each report printed by the model reads:
#   [epoch_id]  MemLength:[size of Lexicon memory]
#   [B] (Boundary evaluation scores)
#   [T] (Token evaluation scores)
n_epochs = 5001
for epoch_id in range(n_epochs):
    model.run(epoch_id, corpus_train, corpus_test)

### See the head entries in the Lexicon memory

In [None]:
# First 50 entries of the Lexicon memory; kept as the cell's final expression
# so the notebook displays the value.
n_head = 50
model.memory[:n_head]

### See the chunk and subchunk segmentation results

In [None]:
# Each sentence is shown as a pair of lines: the original words on top and
# the model-generated chunks (or subchunks) underneath.
article, article_raw = corpus_train[2]  # unsegmented strings, word lists
onset, end = 10, 20
window = slice(onset, end)  # sentences [onset, end) of the chosen document

print('---\nchunks\n---')
model.show_result(article_raw[window], article[window], decompose=False)

print('---\nsubchunks\n---')
model.show_result(article_raw[window], article[window], decompose=True)