In [1]:
# Import dependencies
from __future__ import print_function, division
import gc
import sys
import torch
import math

from datetime import datetime

import random

import numpy as np
from tqdm.notebook import tqdm

# Ignore warnings
import warnings

warnings.filterwarnings("ignore")


def time_now():
    """Return the current local date and time as a 'YYYY-MM-DD HH:MM:SS' string."""
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    current_moment = datetime.now()
    return current_moment.strftime(timestamp_format)

# Bootstrap the KINLP morphological analysis library.
# Statement order matters here: the extension module `kinlpmorpholib` is produced by
# build_kinlp_morpho_lib() (see the "generating ./kinlpmorpholib.c" / "building
# 'kinlpmorpholib' extension" lines in this cell's output), so the build call has to
# run before the `from kinlpmorpholib import ...` line below.
from kinlpmorpho import build_kinlp_morpho_lib

build_kinlp_morpho_lib()

from kinlpmorpholib import ffi, lib

from wurlitzer import sys_pipes

# Start the native library inside sys_pipes() so that the messages it writes
# (dictionary size, rule parsing, lexicon loading, ...) show up in the notebook output.
# NOTE(review): `conf` is a hard-coded absolute path and is not derived from the
# `home_path` variable that a later part of this cell defines — keep them in sync.
with sys_pipes():
    conf = "/mnt/NVM/KINLP/data/kb_config_kinlp.conf"
    # The native entry point is given the path as UTF-8 encoded bytes.
    lib.start_kinlp_lib(conf.encode('utf-8'))

# Timestamped marker printed once start_kinlp_lib() has returned.
print(time_now(), 'KINLP Ready!', flush=True)

# %%

# Re-import the project modules on every run of this cell so that edits made to
# them on disk are picked up without restarting the kernel.
import importlib
import morpho_data_loaders
import morpho_model

importlib.reload(morpho_data_loaders)
importlib.reload(morpho_model)

import youtokentome as yttm
from morpho_data_loaders import KBVocab, AffixSetVocab, morpho_model_seq_predict
from morpho_model import kinyabert_base

from test_inference_data_loaders import inf_gather_replicated_itemized_data

# --- Configuration -----------------------------------------------------------
home_path = "/mnt/NVM/KINLP/"
USE_GPU = False

# CUDA is used only when it is both requested and available; otherwise the CPU.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')

print('Using device: ', device)

# --- BPE tokenizer -----------------------------------------------------------
BPE_model_path = home_path + "data/BPE-30k.mdl"
bpe_encoder = yttm.BPE(model=BPE_model_path)

# --- Vocabularies ------------------------------------------------------------
kbvocab_state_dict_file_path = home_path + "data/kb_vocab_state_dict_2021-02-07.pt"
kb_vocab = KBVocab()
kb_vocab.load_state_dict(torch.load(kbvocab_state_dict_file_path))

affix_set_vocab = AffixSetVocab(
    reduced_affix_dict_file=home_path+"data/reduced_affix_dict_10000.csv",
    reduced_affix_dict_map_file=home_path+"data/reduced_affix_dict_map_10000.csv",
)
print('Vocab ready!')

generating ./kinlpmorpholib.c
(already up-to-date)
the current directory is '/home/user/projects/user/kinyabert/modeling/kinyabert'
running build_ext
building 'kinlpmorpholib' extension
gcc -pthread -B /home/user/anaconda3/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/home/user/anaconda3/include/python3.8 -c kinlpmorpholib.c -o ./kinlpmorpholib.o -fopenmp -D use_openmp -O3 -march=native -ffast-math
gcc -pthread -shared -B /home/user/anaconda3/compiler_compat -L/home/user/anaconda3/lib -Wl,-rpath=/home/user/anaconda3/lib -Wl,--no-as-needed -Wl,--sysroot=/ ./kinlpmorpholib.o -lkinlp -o ./kinlpmorpholib.cpython-38-x86_64-linux-gnu.so -fopenmp

2021-07-26 13:52:18	Setting up KINLP dictionary...

2021-07-26 13:52:19	KINLP dictionary size: 2294464

2021-07-26 13:52:19	Parsing match/req rules...
total_pref_require_global_ids: 408
Initiated all POS classes: 149

2021-07-26 13:52:19	Reading known lexicon...
	Special words: 401
	Verbs l

In [2]:
from morpho_model import KinyaBERT

def morpho_test_mlm_inference(args, input_line, kb_vocab : KBVocab, affix_set_vocab : AffixSetVocab, bpe_encoder, kinya_bert_model : KinyaBERT, morpho_rel_pos_dict, morpho_rel_pos_dmax, mask_ids, proposed_stem_ids=None):
    """Run masked stem/affix prediction on one input line and print a report.

    The line is expanded by ``inf_gather_replicated_itemized_data`` into a list of
    data items. Each item is passed to ``morpho_model_seq_predict`` (using the
    module-level ``device``) and the predictions are printed next to the gold
    values, token by token. Accuracy counters are accumulated over all data items
    and printed once at the end, together with ``input_line``.

    Args:
        args: Options object forwarded unchanged to the data-gathering and
            prediction helpers.
        input_line: Raw text to analyse.
        kb_vocab: Vocabulary used to turn POS tag, stem and affix ids back into text.
        affix_set_vocab: Vocabulary used to render affix-set ids as text.
        bpe_encoder: Forwarded to the data-gathering helper.
        kinya_bert_model: Model handed to ``morpho_model_seq_predict``.
        morpho_rel_pos_dict: Forwarded to the data-gathering helper.
        morpho_rel_pos_dmax: Forwarded to the data-gathering helper.
        mask_ids: Forwarded to the data-gathering helper
            (presumably the positions of the tokens to mask -- TODO confirm).
        proposed_stem_ids: Optional stem ids forwarded to ``morpho_model_seq_predict``.

    Returns:
        None. Everything is written to stdout.
    """
    # Number of correct predictions per category (affix credit is fractional).
    stem_acc = 0.0
    afset_acc = 0.0
    affix_acc = 0.0

    # Number of scored predictions per category.
    stem_sum_acc = 0.0
    afset_sum_acc = 0.0
    affix_sum_acc = 0.0

    # NOTE(review): 512 is passed positionally; presumably the maximum sequence length -- confirm.
    itemized_data, itemized_parsed_tokens = inf_gather_replicated_itemized_data(args, input_line, 512, kb_vocab, affix_set_vocab, bpe_encoder, morpho_rel_pos_dict, morpho_rel_pos_dmax, mask_ids)

    for myidx, data_item in enumerate(itemized_data):
        seq_parsed_tokens = itemized_parsed_tokens[myidx]
        (max_seq_len,
         seq_rel_pos_arr,
         seq_pos_tags,
         seq_stems,
         seq_afsets,
         seq_affixes,
         seq_tokens_lengths,
         seq_predicted_stems,
         seq_predicted_afsets,
         seq_predicted_affixes,
         seq_predicted_tokens_idx,
         seq_predicted_tokens_affixes_idx,
         seq_predicted_tokens_affixes_lengths) = data_item

        # NOTE(review): the literal 10 is presumably the number of candidates returned -- confirm.
        (stem_predictions, stem_predictions_prob, afset_predictions, afset_predictions_prob, affix_predictions) = morpho_model_seq_predict(args, data_item,
                                                                                                kinya_bert_model,
                                                                                                device, 10, proposed_stem_ids=proposed_stem_ids)

        # This changes NumPy's print options for the whole process, not just this function.
        np.set_printoptions(precision=3, linewidth=np.inf)
        # .cpu() is required before .numpy(): NumPy conversion fails for tensors that
        # live on a CUDA device. It is a no-op for tensors already on the CPU.
        print('\nstem_predictions:', stem_predictions.detach().cpu().numpy().tolist())
        print('\nseq_predicted_stems:', seq_predicted_stems)
        print('\nafset_predictions:', afset_predictions)
        print('\nstem_predictions_prob:', stem_predictions_prob.detach().cpu().numpy())
        print('\nseq_predicted_tokens_affixes_lengths:', seq_predicted_tokens_affixes_lengths)
        print('\nseq_predicted_tokens_affixes_idx:', seq_predicted_tokens_affixes_idx)
        print('\naffix_predictions:', affix_predictions)
        print('\nseq_predicted_affixes:', seq_predicted_affixes, '\n')

        stems_count = 0      # index into the per-prediction stem/afset arrays
        affixes_count = 0    # index into the per-prediction affix arrays
        affixes_total = 0    # offset into the flat gold affix list (seq_predicted_affixes)
        afx = 0              # offset into the flat input affix list (seq_affixes)
        ptk = 0              # index of the next parsed token to display
        next_ptk_idx = 0     # position at which the next parsed token starts
        ptoken = seq_parsed_tokens[0]
        for i in range(len(seq_tokens_lengths)):
            # 1. Print Token
            # Keep the same token until all stem_idx are handled
            if (next_ptk_idx == i):
                ptoken = seq_parsed_tokens[ptk]
                next_ptk_idx = i + len(ptoken.stem_idx)
                ptk += 1
            print('\n{} @ --> {} {} {} ==> {}'.format(ptoken.surface_form,
                                                      kb_vocab.pos_tag_vocab_idx[ptoken.pos_tag_idx],
                                                      [kb_vocab.affix_vocab_idx[k] for k in ptoken.affixes_idx],
                                                      [kb_vocab._stem_vocab_idx[k] for k in ptoken.stem_idx],
                                                      [kb_vocab.reduced_stem_vocab_idx[
                                                           kb_vocab.mapped_stem_vocab_idx[k]] for k in
                                                       ptoken.stem_idx]))

            # 1.5 Print input stem,pos & affixes
            print('Input:', '{}/{}'.format(kb_vocab.pos_tag_vocab_idx[seq_pos_tags[i]],
                                           kb_vocab.reduced_stem_vocab_idx[seq_stems[i]]),
                  ['{}'.format(kb_vocab.affix_vocab_idx[a]) for a in
                   seq_affixes[afx:(afx + seq_tokens_lengths[i])]])
            afx += seq_tokens_lengths[i]

            # 2. Stem Prediction
            if i in seq_predicted_tokens_idx:
                pstem = kb_vocab.reduced_stem_vocab_idx[seq_predicted_stems[stems_count]]
                _pstem = kb_vocab.reduced_stem_vocab_idx[stem_predictions[stems_count].item()]
                _pstem_prob = stem_predictions_prob[stems_count].item()
                print('{} [STEM]>>>> Gold: {} --> Pred: {} @ {:.3}'.format((pstem == _pstem), pstem, _pstem,
                                                                           _pstem_prob))
                stem_sum_acc += 1
                if (pstem == _pstem):
                    stem_acc += 1

                if seq_predicted_afsets is not None:
                    pafset = affix_set_vocab.affix_set_idx_to_txt(seq_predicted_afsets[stems_count], kb_vocab)
                    _pafset = affix_set_vocab.affix_set_idx_to_txt(afset_predictions[stems_count].item(), kb_vocab)
                    _pafset_prob = afset_predictions_prob[stems_count].item()
                    print('{} [AFSET]>>>> Gold: {} --> Pred: {} @ {:.3}'.format((pafset == _pafset), pafset, _pafset,
                                                                           _pafset_prob))
                    afset_sum_acc += 1
                    if (pafset == _pafset):
                        afset_acc += 1

                # 3. Affix Prediction
                if stems_count in seq_predicted_tokens_affixes_idx:
                    flen = seq_predicted_tokens_affixes_lengths[affixes_count]
                    paffixes = set([kb_vocab.affix_vocab_idx[a] for a in
                                    seq_predicted_affixes[affixes_total:(affixes_total + flen)]])
                    _paffixes = set([kb_vocab.affix_vocab_idx[a] for a in affix_predictions[affixes_count]])
                    # Credit = fraction of gold affixes that appear among the predicted ones.
                    if paffixes:
                        incr = float(len(paffixes.intersection(_paffixes))) / float(len(paffixes))
                    else:
                        # An empty gold set would divide by zero; score it as a full
                        # match only when the prediction is empty as well.
                        incr = 1.0 if not _paffixes else 0.0
                    print('{} @ {:.2f} [AFFIX]>>> Gold: {} --> Pred: {}'.format((paffixes == _paffixes), incr,
                                                                                paffixes, _paffixes))
                    affixes_count += 1
                    affixes_total += flen
                    affix_sum_acc += 1
                    affix_acc += incr
                stems_count += 1
        print(
            '\n-------------------------------------------------------------------- NEW SEQUENCE ---------------------------------------------------------------------\n')
    # The 1e-7 term keeps the percentages defined when nothing was scored.
    print(input_line, '\n', 'ACCURACY%:>> STEM: {:.2f}%({:.0f}/{:.0f}) \t AFSET: {:.2f}%({:.0f}/{:.0f}) \t AFFIX: {:.2f}%({:.2f}/{:.2f})'.format(
        (100.0 * stem_acc / (stem_sum_acc + 1e-7)), stem_acc, stem_sum_acc,
        (100.0 * afset_acc / (afset_sum_acc + 1e-7)), afset_acc, afset_sum_acc,
        100.0 * affix_acc / (affix_sum_acc + 1e-7), affix_acc, affix_sum_acc))

In [3]:
import os
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from morpho_common import setup_common_args
args = setup_common_args("-g 1 -p 3 -s 1 --use-pos-aware-rel=1 --use-tupe-rel=0".split(' '))

# Single-process process group (world_size=1, rank=0) so that the model can be
# wrapped in DistributedDataParallel below.
os.environ['MASTER_ADDR'] = 'localhost'
# The previous value '88599' is above the maximum TCP port number (65535).
# 29500 is a valid port that is commonly used in PyTorch's distributed examples.
os.environ['MASTER_PORT'] = '29500'
# Re-running this cell must not initialise the default process group a second
# time, which raises an error; only initialise when it does not exist yet.
if not dist.is_initialized():
    dist.init_process_group(backend='gloo', init_method='env://', world_size=1, rank=0)

# Defaults used when POS-aware relative position bias is disabled.
morpho_rel_pos_dict = None
morpho_rel_pos_dmax = 5
if args.use_pos_aware_rel_pos_bias:
    morpho_rel_pos_dict_file_path = (home_path+"data/morpho_rel_pos_dict_2021-03-24.pt")
    saved_pos_rel_dict = torch.load(morpho_rel_pos_dict_file_path)
    morpho_rel_pos_dict = saved_pos_rel_dict['morpho_rel_pos_dict']
    morpho_rel_pos_dmax = saved_pos_rel_dict['morpho_rel_pos_dmax']

# Build an uninitialised model (saved_model_file=None); weights come from the checkpoint below.
kb_model = kinyabert_base(kb_vocab, affix_set_vocab, morpho_rel_pos_dict,
                       device, args, saved_model_file=None)
# The state dict is loaded through the DDP wrapper and the plain module is
# unwrapped afterwards.
# NOTE(review): presumably the checkpoint was saved from a DDP-wrapped model, so
# its keys only match the wrapper -- confirm.
ddp_model = DDP(kb_model)

file = "/mnt/NVM/KinyaBERT_Checkpoints/kb_attentive/backup_07_24_morpho_attentive_model_base_2021-04-19.pt"

# map_location places the checkpoint tensors on the selected device while loading.
kb_state_dict = torch.load(file,map_location=device)
ddp_model.load_state_dict(kb_state_dict['model_state_dict'])
kb_model = ddp_model.module
# Inference only: switch the model to evaluation mode.
kb_model.eval()

print('OK')

Call arguments:
 Namespace(accumulation_steps=128, batch_size=20, cls_dev_input0=None, cls_dev_input1=None, cls_dev_label=None, cls_labels='0,1', cls_test_input0=None, cls_test_input1=None, cls_test_label=None, cls_train_input0=None, cls_train_input1=None, cls_train_label=None, devbest_cls_model_save_file_path=None, devbest_cls_output_file=None, embed_dim=768, final_cls_model_save_file_path=None, final_cls_output_file=None, gpus=1, home_path='/home/user/KINLP/', inference_iters=1, inference_runs=1, layernorm_epsilon=1e-06, max_input_lines=999999, max_seq_len=512, morpho_dim=128, morpho_tr_dim_feedforward=512, morpho_tr_dropout=0.1, morpho_tr_nhead=4, morpho_tr_nlayers=4, num_epochs=20, num_inference_iters=1, num_inference_runs=1, num_iters=200000, num_pos_m_embeddings=3, num_stem_m_embeddings=1, number_of_load_batches=384, peak_lr=0.0004, pooler_dropout=0.1, pos=3, pretrained_model_file=None, pretrained_roberta_checkpoint_file='checkpoint_best.pt', pretrained_roberta_model_dir='/home/u

In [13]:
# Example sentence for the inference report.
input_line = "Umuyobozi ushinzwe ishami ry’imisoro y’inzego z’ibanze muri RRA avuga ko ibyo bibazo byavuzwe ariko nta mwanzuro wigeze ubagezwaho wo kutabara imisoro ku bikorwa by’Abihayimana."
# NOTE(review): presumably the positions of the tokens to mask -- confirm against
# inf_gather_replicated_itemized_data.
mask_ids = [20]
# Candidate stems, looked up in the reduced stem vocabulary to obtain their ids.
proposed_stems = ["V:ger","V:iger"]
proposed_stem_ids = [kb_vocab.reduced_stem_vocab[stem_text] for stem_text in proposed_stems]
morpho_test_mlm_inference(
    args,
    input_line,
    kb_vocab,
    affix_set_vocab,
    bpe_encoder,
    kb_model,
    morpho_rel_pos_dict,
    morpho_rel_pos_dmax,
    mask_ids,
    proposed_stem_ids=proposed_stem_ids,
)



stem_predictions: [575]

seq_predicted_stems: [183]

stem_predictions_prob: [0.225]

seq_predicted_tokens_affixes_lengths: [3]

seq_predicted_tokens_affixes_idx: [0]

affix_predictions: [[26, 5, 13, 66, 36, 9, 35, 12, 171, 20]]

seq_predicted_affixes: [5, 66, 26] 


<CLS> @ --> <CLS> ['<EOT>'] ['<CLS>'] ==> ['<CLS>']
Input: <CLS>/<CLS> []

umuyobozi @ --> N#012 ['N:0:u', 'N:1:mu'] ['N:yobozi'] ==> ['N:yobozi']
Input: N#012/N:yobozi ['N:0:u', 'N:1:mu']

ushinzwe @ --> V#010 ['V:2:u', 'V:16:y', 'V:17:w', 'V:18:ye'] ['V:shing'] ==> ['V:shing']
Input: V#010/V:shing ['V:2:u', 'V:16:y', 'V:17:w', 'V:18:ye']

ishami @ --> N#012 ['N:0:i'] ['N:shami'] ==> ['N:shami']
Input: N#012/N:shami ['N:0:i']

rya @ --> PO#022 ['PO:1:ri'] ['PO:a'] ==> ['PO:a']
Input: PO#022/PO:a ['PO:1:ri']

imisoro @ --> N#012 ['N:0:i', 'N:1:mi'] ['N:soro'] ==> ['N:soro']
Input: N#012/N:soro ['N:0:i', 'N:1:mi']

ya @ --> PO#022 ['PO:1:i'] ['PO:a'] ==> ['PO:a']
Input: PO#022/PO:a ['PO:1:i']

inzego @ --> N#012 ['N:0:i', '