### Load the corpus

In [1]:
import numpy as np

# Build the BR-text training "articles" and the evaluation set.
#
# Results:
#   corpus_brtext      -- list of N_ARTICLES articles, each a two-item list
#                         [unsegmented sentences, sentences as word lists].
#   corpus_brtext_test -- one article in the same format, made from the tail
#                         of the (shuffled) unique-sentence list.
#
# Reproducibility: every random choice in this cell is driven by `rng`, and
# the unique sentences are sorted before shuffling, so a fresh kernel always
# produces the same articles and the same test split.  (Iterating a set of
# strings directly yields an order that can change between Python processes.)
CORPUS_PATH = 'br-text.txt'
SEED = 0
N_ARTICLES = 400          # number of training articles to sample
DRAWS_PER_ARTICLE = 200   # draws with replacement; duplicates are dropped
TEST_START = 0.9          # the test article is the slice from this fraction to the end

rng = np.random.default_rng(SEED)

corpus_brtext = []
corpus_brtext_test = []

# One sentence per line; duplicates collapse in the set.
with open(CORPUS_PATH, encoding='utf-8') as f:
    sents_set = {line.replace('\n', '') for line in f}

# Fix a canonical order first, then shuffle it deterministically so the
# tail slice used for testing is a stable pseudo-random sample.
sents_set = sorted(sents_set)
rng.shuffle(sents_set)

# Gold segmentation: words are separated by single spaces in the file.
sents1 = [sent.split(' ') for sent in sents_set]

# NOTE(review): training articles are sampled from *all* sentences, including
# the tail slice that is reused below as the test article -- confirm that this
# train/test overlap is intended.
for _ in range(N_ARTICLES):
    picked = set(rng.choice(len(sents1), size=DRAWS_PER_ARTICLE).tolist())
    article_raw = [sents1[i] for i in picked]
    corpus_brtext.append([[''.join(words) for words in article_raw], article_raw])

sents2 = [sent.split(' ') for sent in sents_set[int(len(sents_set) * TEST_START):]]
sents2 = [[''.join(words) for words in sents2], sents2]
corpus_brtext_test.append(sents2)

### Run LiB

In [2]:

from src.libtok import model
import importlib
# Re-import the module so that edits to its source take effect in this kernel.
importlib.reload(model)

# Module-level settings of `model`, applied in the order listed.
lib_settings = (
    ('life', 10),
    ('max_len', 12),
    ('memory_in', 0.25),
    ('memory_out', 0.0001),
    ('update_rate', 0.2),
    ('mini_gap', 7),
    ('use_skip', True),
)
for setting_name, setting_value in lib_settings:
    setattr(model, setting_name, setting_value)

memory = model.TrieList()

corpus_train, corpus_test = corpus_brtext, corpus_brtext_test

# Initialise the Lexicon memory from the first element of the first training article.
model.init(memory, corpus_train[0][0])

n_epochs = 5001
for epoch_id in range(n_epochs):
    model.run(epoch_id, memory, corpus_train, corpus_test)

0	  MemLength: 156
[B] Precision: 34.58% 	 Recall: 95.12% 	 F1: 50.72%
[L] Precision: 9.40% 	 Recall: 25.85% 	 F1: 13.78%

100	  MemLength: 1130
[B] Precision: 70.26% 	 Recall: 92.07% 	 F1: 79.70%
[L] Precision: 48.91% 	 Recall: 64.09% 	 F1: 55.48%

200	  MemLength: 1516
[B] Precision: 76.28% 	 Recall: 90.75% 	 F1: 82.89%
[L] Precision: 56.34% 	 Recall: 67.03% 	 F1: 61.22%

300	  MemLength: 1718
[B] Precision: 78.70% 	 Recall: 90.63% 	 F1: 84.24%
[L] Precision: 60.48% 	 Recall: 69.66% 	 F1: 64.75%

400	  MemLength: 1852
[B] Precision: 80.13% 	 Recall: 90.52% 	 F1: 85.01%
[L] Precision: 61.60% 	 Recall: 69.58% 	 F1: 65.35%

500	  MemLength: 1976
[B] Precision: 81.17% 	 Recall: 90.25% 	 F1: 85.47%
[L] Precision: 62.41% 	 Recall: 69.39% 	 F1: 65.71%

600	  MemLength: 2078
[B] Precision: 82.05% 	 Recall: 89.67% 	 F1: 85.69%
[L] Precision: 63.00% 	 Recall: 68.85% 	 F1: 65.79%

700	  MemLength: 2147
[B] Precision: 83.01% 	 Recall: 89.09% 	 F1: 85.94%
[L] Precision: 64.19% 	 Recall: 68.89% 	 

### Inspect the first entries of the Lexicon memory

In [3]:
# Display the first `head_size` entries of the Lexicon memory.
head_size = 50
memory[:head_size]

[('skipgram', '[bos]', '[eos]'),
 ('skipgram', '[bos]', 'you'),
 ('skipgram', '[bos]', 's'),
 'and',
 ('skipgram', '[bos]', 'do'),
 'the',
 'you',
 "that's",
 'it',
 "it's",
 'yeah',
 ('skipgram', '[bos]', 'it'),
 'youcan',
 'what',
 'wanna',
 'that',
 'canyou',
 "he's",
 ('skipgram', 'look', '[eos]'),
 ('skipgram', '[bos]', 'i'),
 'now',
 ('skipgram', 'a', '[eos]'),
 "there's",
 'ithink',
 'is',
 'we',
 "here's",
 'see',
 'he',
 'isthat',
 ('skipgram', '[bos]', 'c'),
 ('skipgram', 'and', '[eos]'),
 'do',
 "'s",
 'your',
 'thisis',
 'did',
 'like',
 'with',
 'this',
 'no',
 ('skipgram', 'it', '[eos]'),
 'youwanna',
 "what's",
 'put',
 'look',
 'where',
 'to',
 'another',
 "you're"]

In [7]:
# Among the first 200 memory entries, keep those tagged 'skipgram' and print
# each one without its tag.
skipgram_bodies = [entry[1:] for entry in memory[:200] if entry[0] == 'skipgram']
for body in skipgram_bodies:
    print(body)

('[bos]', '[eos]')
('[bos]', 'you')
('[bos]', 's')
('[bos]', 'do')
('[bos]', 'it')
('look', '[eos]')
('[bos]', 'i')
('a', '[eos]')
('[bos]', 'c')
('and', '[eos]')
('it', '[eos]')
('[bos]', 'an')
('[bos]', 'a')
('his', '[eos]')
('[bos]', 'the')
('e', '[eos]')
('ok', '[eos]')
('the', '[eos]')
('you', '[eos]')
('this', '[eos]')
('re', '[eos]')
('wanna', '[eos]')
('[bos]', 'put')
('isthat', '[eos]')


In [8]:
# Print every memory entry that has a record in `memory.relationship`,
# followed by that record.
relationship = memory.relationship
for entry in memory:
    if entry not in relationship:
        continue
    print(entry, relationship[entry])

ithink ('skipgram', '[bos]', 'you')
isthat ('skipgram', '[bos]', 'do')
another ('skipgram', '[bos]', 'c')
you're ('skipgram', '[bos]', 's')
inthe ('skipgram', 'look', '[eos]')
who ('skipgram', '[bos]', 'do')
build ('skipgram', '[bos]', 'it')
smell ('skipgram', 'ok', '[eos]')
open ('skipgram', '[bos]', '[eos]')
when ('skipgram', '[bos]', 'you')
ishe ('skipgram', '[bos]', 's')
kindof ('skipgram', '[bos]', 'an')
through ('skipgram', 'look', '[eos]')
tr ('skipgram', '[bos]', 'i')
color ('skipgram', '[bos]', 's')
drink ('skipgram', 'a', '[eos]')
nice ('skipgram', 'it', '[eos]')
we'll ('skipgram', '[bos]', 'put')
pull ('skipgram', '[bos]', 'it')
think ('skipgram', '[bos]', 'you')
isthis ('skipgram', '[bos]', 'c')
push ('skipgram', '[bos]', 'it')
please ('skipgram', '[bos]', '[eos]')
daddy's ('skipgram', '[bos]', 's')
alotof ('skipgram', 'yeah', '[eos]')
sitdown ('skipgram', 'wanna', '[eos]')
got ('skipgram', '[bos]', '[eos]')
dada ('skipgram', '[bos]', '[eos]')
theboy ('skipgram', '[bos]', '

### Compare the chunk-level and subchunk-level segmentation results

In [10]:
# Show sentences onset..end of the third training article twice: once with
# decompose=False and once with decompose=True.
article, article_raw = corpus_train[2]
onset, end = 10, 20
window = slice(onset, end)
views = (
    ('---\nchunks\n---', False),
    ('---\nsubchunks\n---', True),
)
for banner, decompose_flag in views:
    print(banner)
    model.show_result(memory, article_raw[window], article[window], decompose=decompose_flag)

---
chunks
---
 	 are those 	 sandals 	 shall we 	 open 	 the door 	
 	 arethose 	 s andals 	 shallwe 	 open 	 thedoor 	

can you 	 stick 	 it out 	 it doesn't 	 have 	 a doggie 	 want say 	
canyou 	 stick 	 itout 	 itdoesn't 	 have 	 adoggie 	 wantsay 	

good bye 	 no it's not 	 do you wanna 	 untie 	 the shoes 	 i see 	 an apple 	
goodbye 	 noit'snot 	 doyouwanna 	 untie 	 theshoes 	 isee 	 anapple 	

---
subchunks
---
 	 are 	 those 	 sandals 	 shall 	 we 	 open 	
 	 are 	 those 	 s and a ls 	 shall 	 we 	 open 	

the door 	 can you 	 stick 	 it 	 out 	 it 	 doesn't 	
thedoor 	 canyou 	 stick 	 it 	 out 	 it 	 doesn't 	

have 	 a 	 doggie 	 want 	 say 	 good 	 bye 	
have 	 a 	 doggie 	 want 	 say 	 good 	 bye 	

no 	 it's not 	 do 	 you 	 wanna 	 untie 	
no 	 it'snot 	 do 	 you 	 wanna 	 un tie 	

the 	 shoes 	 i see 	 an 	 apple 	 do 	 you 	
the 	 shoes 	 isee 	 an 	 apple 	 do 	 you 	

