In [1]:
import torch
import warnings
warnings.filterwarnings("ignore")

import youtokentome as yttm
from kb_data_loaders import KBVocab

# --- Tokenizer -------------------------------------------------------------
# Load the BPE model from disk through youtokentome.
BPE_model_path = '/mnt/STORAGE/KINLP/data/BPE-30k.mdl'
bpe_encoder = yttm.BPE(model=BPE_model_path)

# --- Vocabulary ------------------------------------------------------------
# Restore the KBVocab object from its saved state dict.
# NOTE(review): torch.load unpickles the file, so only point it at trusted files.
kbvocab_state_dict_file_path = "/mnt/STORAGE/KINLP/data/kb_vocab_state_dict_2021-02-07.pt"
kb_vocab = KBVocab()
kb_vocab.load_state_dict(torch.load(kbvocab_state_dict_file_path))

# --- Diagnostics -----------------------------------------------------------
# Sizes of the full and the reduced stem vocabularies.
reduced_vocab = kb_vocab.reduced_stem_vocab
print('_stem_vocab:', len(kb_vocab._stem_vocab))
print('reduced_stem_vocab:', len(reduced_vocab))

# Max/min subsampling weight for each vocabulary, looked up by attribute name.
vocab_names = ('reduced_stem_vocab', 'affix_vocab', 'pos_tag_vocab', 'morpheme_slot_vocab')
for vocab_name in vocab_names:
    for bound in ('max', 'min'):
        attr_name = f'{vocab_name}_idx_subsample_weights_{bound}'
        print(f'{attr_name}:', getattr(kb_vocab, attr_name))

# Subsampling weight assigned to the '<UNK>' entry of each vocabulary.
unk_labels = ('STEM', 'AFFIX', 'POS', 'MORPH')
for unk_label, vocab_name in zip(unk_labels, vocab_names):
    weights = getattr(kb_vocab, f'{vocab_name}_idx_subsample_weights')
    unk_index = getattr(kb_vocab, vocab_name)['<UNK>']
    print(f'{unk_label}-<UNK>:', weights[unk_index])

# Spot-check one stem id: its entry in the full index, then the reduced-vocab
# entry and subsampling weight it maps to.
probe_stem_id = 163
print('STEM-163:', kb_vocab._stem_vocab_idx[probe_stem_id])
mapped_stem_id = kb_vocab.mapped_stem_vocab_idx[probe_stem_id]
print('STEM-163:', kb_vocab.reduced_stem_vocab_idx[mapped_stem_id])
print('STEM-163:', kb_vocab.reduced_stem_vocab_idx_subsample_weights[mapped_stem_id])

# Count reduced-vocab keys per prefix; the first line is preceded by a blank line.
stem_prefixes = ('V:', 'N:', 'QA:', 'PO:', 'DE:', 'NU:', 'OT:', 'NP:', 'T:')
for position, prefix in enumerate(stem_prefixes):
    lead = '\n' if position == 0 else ''
    matches = sum(1 for key in reduced_vocab if key.startswith(prefix))
    print(f'{lead}reduced_stem_vocab: [{prefix}]', matches)

# Everything that carries none of the prefixes above is reported under [CLS:].
unprefixed = sum(1 for key in reduced_vocab if not key.startswith(stem_prefixes))
print('reduced_stem_vocab: [CLS:]', unprefixed)
print('\nVocabulary ready!')

print('Vocab ready!')

generating ./kinlpmorpholib.c
the current directory is '/home/user/projects/user/kinyabert/modeling/kb_plain'
running build_ext
building 'kinlpmorpholib' extension
gcc -pthread -B /home/user/anaconda3/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/home/user/anaconda3/include/python3.8 -c kinlpmorpholib.c -o ./kinlpmorpholib.o -fopenmp -D use_openmp -O3 -march=native -ffast-math
gcc -pthread -shared -B /home/user/anaconda3/compiler_compat -L/home/user/anaconda3/lib -Wl,-rpath=/home/user/anaconda3/lib -Wl,--no-as-needed -Wl,--sysroot=/ ./kinlpmorpholib.o -lkinlp -o ./kinlpmorpholib.cpython-38-x86_64-linux-gnu.so -fopenmp
_stem_vocab: 95677
reduced_stem_vocab: 34008
reduced_stem_vocab_idx_subsample_weights_max: 109.97302453744622
reduced_stem_vocab_idx_subsample_weights_min: 0.14438313861524893
affix_vocab_idx_subsample_weights_max: 109.97661663303704
affix_vocab_idx_subsample_weights_min: 0.10404824292856545
pos_tag_vocab_idx_sub

In [2]:
# Recompute the subsampling weights on the loaded vocabulary, then write the
# refreshed state dict back to the checkpoint path below (existing file is replaced).
kbvocab_state_dict_file_path = "/mnt/STORAGE/KINLP/data/kb_vocab_state_dict_2021-02-07.pt"
kb_vocab.compute_subsampling_weights()
refreshed_vocab_state = kb_vocab.state_dict()
torch.save(refreshed_vocab_state, kbvocab_state_dict_file_path)
print('Done')

Done


In [11]:
# Largest absolute relative distance that gets its own id.
morpho_rel_pos_dmax = 5

# Every non-zero offset in [-dmax, +dmax].
rel_offsets = [
    offset
    for offset in range(-morpho_rel_pos_dmax, morpho_rel_pos_dmax + 1)
    if offset != 0
]

# All (tag, tag, offset) triples over the POS-tag index, in nested-loop order.
V = [
    (first_tag, second_tag, offset)
    for first_tag in kb_vocab.pos_tag_vocab_idx
    for second_tag in kb_vocab.pos_tag_vocab_idx
    for offset in rel_offsets
]

# Map each triple to a 1-based id (id 0 is never assigned here).
morpho_rel_pos_dict = {triple: rank for rank, triple in enumerate(V, start=1)}

print(len(morpho_rel_pos_dict))

237160


In [12]:
# Persist the relative-position mapping together with the dmax it was built with.
morpho_rel_pos_dict_file_path = "/mnt/STORAGE/KINLP/data/morpho_rel_pos_dict_2021-03-24.pt"
print(len(morpho_rel_pos_dict))
save_dict = dict(
    morpho_rel_pos_dict=morpho_rel_pos_dict,
    morpho_rel_pos_dmax=morpho_rel_pos_dmax,
)
torch.save(save_dict, morpho_rel_pos_dict_file_path)

237160


In [13]:
# Read the saved mapping back from disk and report its size and dmax.
# NOTE(review): torch.load unpickles the file, so only point it at trusted files.
home_path = "/mnt/STORAGE/KINLP/"
morpho_rel_pos_dict_file_path = home_path + "data/morpho_rel_pos_dict_2021-03-24.pt"
saved_pos_rel_dict = torch.load(morpho_rel_pos_dict_file_path)

# Unpack the two entries stored in the file.
morpho_rel_pos_dmax = saved_pos_rel_dict['morpho_rel_pos_dmax']
morpho_rel_pos_dict = saved_pos_rel_dict['morpho_rel_pos_dict']

# One value per line: number of entries, then dmax.
print(len(morpho_rel_pos_dict), morpho_rel_pos_dmax, sep='\n')

237160
5
