In [1]:
from __future__ import print_function, division

from datetime import datetime

import torch

from fairseq.models.roberta import RobertaModel
from fairseq.models.roberta import XLMRModel

# @Workstation-PC
# NOTE(review): the absolute paths used throughout this notebook presumably only
# exist on that workstation - confirm before running elsewhere.
# Presumably the root of the KINLP project; none of the cells visible here read it.
home_path = "/home/user/KINLP/"
# GPU toggle. None of the cells visible here read it, and no visible cell moves a
# model or tensor to a device, so changing it currently has no effect on them.
USE_GPU = True

from fairseq.data.encoders import register_bpe

@register_bpe("nonebpe")
class NoneBPE:
    """Pass-through BPE registered with fairseq under the name ``"nonebpe"``.

    ``encode`` and ``decode`` both hand their argument back untouched. The
    checkpoints in this notebook are loaded with ``bpe="nonebpe"``, presumably
    so that text which has already been segmented by an external tokenizer is
    not segmented a second time.
    """

    def __init__(self, args):
        """Accept ``args`` and ignore it; the class keeps no state."""

    @staticmethod
    def add_args(parser):
        """Register no extra options on ``parser``."""

    def encode(self, x: str) -> str:
        """Return ``x`` exactly as it was passed in."""
        return x

    def decode(self, x: str) -> str:
        """Return ``x`` exactly as it was passed in."""
        return x

In [2]:
# Load the two checkpoints through fairseq's hub interface. bpe="nonebpe" selects the
# pass-through encoder registered in the first cell, so every string given to
# .encode() must already be split into the pieces found in the model's dictionary.
roberta = RobertaModel.from_pretrained("/mnt/NVM/KinyaBERT_Checkpoints/roberta-kinlp/", checkpoint_file='checkpoint790.pt', bpe="nonebpe")
xlmr = XLMRModel.from_pretrained("/mnt/NVM/KinyaBERT_Checkpoints/xlmr.base/", checkpoint_file='model.pt', bpe="nonebpe")

# Switch both models to evaluation mode so dropout is disabled and extract_features()
# returns the same values on every run. Cells that need training behaviour call
# .train() explicitly. The trailing ';' keeps Jupyter from echoing the module repr.
roberta.eval()
xlmr.eval();

In [3]:
import fastBPE
import sentencepiece

# Tokenizer resources: fastBPE merge codes and vocabulary for the RoBERTa checkpoint,
# and the SentencePiece model stored alongside the XLM-R checkpoint.
codes_path = "/home/user/projects/user/kinyabert/fastBPE/rw_codes"
vocab_path = "/home/user/projects/user/kinyabert/fastBPE/vocab.rw.40000"
spm_model_path = "/mnt/NVM/KinyaBERT_Checkpoints/xlmr.base/sentencepiece.bpe.model"

bpe = fastBPE.fastBPE(codes_path, vocab_path)
spm = sentencepiece.SentencePieceProcessor(model_file=spm_model_path)

# Sample words to segment. Named `words` rather than `input` so that the built-in
# input() is not shadowed for the rest of the kernel session.
words = ['tugendane', 'twese', 'n\'', 'abandi']

# bpe.apply() returns one space-separated string per word; split each one so that
# bpe_input is a list of piece lists, parallel to spm_input below.
bpe_input = [segmented.split(' ') for segmented in bpe.apply(words)]
print('bpe:', bpe_input)

# out_type=str makes SentencePiece return piece strings rather than integer ids.
spm_input = spm.encode(words, out_type=str)
print('spm:', spm_input)

bpe: [['tugend@@', 'ane'], ['twese'], ["n'"], ['abandi']]
spm: [['▁tug', 'enda', 'ne'], ['▁tw', 'ese'], ['▁n', "'"], ['▁ab', 'andi']]


In [4]:
# Segment one whole sentence with each tokenizer and print the pieces as a single
# space-separated line so the two segmentations can be compared by eye.
inp = 'ibi tubikora kuko babitwemereye bitanatugoye'

# fastBPE takes a list of sentences; only one is passed, so take the first result.
bpe_sentence = bpe.apply([inp])[0]
print('bpe:', bpe_sentence)

# SentencePiece returns a list of piece strings for a single input string.
spm_pieces = spm.encode(inp, out_type=str)
print('spm:', ' '.join(spm_pieces))

bpe: ibi tubikora kuko babit@@ wemereye bitan@@ atug@@ oye
spm: ▁ibi ▁tu bi kora ▁kuko ▁babi t we mere ye ▁bi tana tu go ye


In [16]:
# Rebuild the first sample word as a space-separated piece string for each model.
# The models were loaded with the pass-through BPE, so they expect this form.
piece_sep = ' '
bpe_in = piece_sep.join(bpe_input[0])
spm_in = piece_sep.join(spm_input[0])

for label, text in (('bpe_in:', bpe_in), ('spm_in:', spm_in)):
    print(label, text)

# Map each piece string to a tensor of dictionary ids via the hub interface.
roberta_ids = roberta.encode(bpe_in)
xlmr_ids = xlmr.encode(spm_in)

for label, ids in (('roberta:', roberta_ids), ('xlmr:', xlmr_ids)):
    print(label, ids)

bpe_in: tugend@@ ane
spm_in: ▁tug enda ne
roberta: tensor([    0, 36037,   551,     2])
xlmr: tensor([    0, 33952,  7074,    86,     2])


In [17]:
# Run each id tensor through its model and keep the returned feature tensor.
roberta_features = roberta.extract_features(roberta_ids)
xlmr_features = xlmr.extract_features(xlmr_ids)

# Only the shapes are reported; the feature values themselves are not printed.
roberta_shape = roberta_features.shape
xlmr_shape = xlmr_features.shape
print('roberta_features:', roberta_shape)
print('xlmr_features:', xlmr_shape)

roberta_features: torch.Size([1, 4, 768])
xlmr_features: torch.Size([1, 5, 768])


In [None]:
# Call the wrapped models directly instead of going through extract_features().
# NOTE: .train() is never undone in this cell, so both models remain in training
# mode for whatever is executed afterwards.
roberta.train()
roberta_batch = roberta_ids.unsqueeze(0)  # add a leading dimension in front of the id sequence
r, rr = roberta.model(roberta_batch, features_only=True)

xlmr.train()
xlmr_batch = xlmr_ids.unsqueeze(0)  # add a leading dimension in front of the id sequence
x, xx = xlmr.model(xlmr_batch, features_only=True)

# Each call returns a pair: the first element is reported by shape only, the second
# element is printed in full.
for tag, value in (('r:', r.shape), ('x:', x.shape), ('rr:', rr), ('xx:', xx)):
    print(tag, value)



In [8]:
# RobertaHubInterface has no `binarize` method (calling it raises AttributeError), so
# look the pieces up in each model's dictionary directly. The dictionary's encode_line()
# takes one space-separated string, hence the join. add_if_not_exist=False keeps the
# dictionary unchanged when a piece is unknown.
# NOTE: encode_line() appends </s> by default but, unlike roberta.encode(), does not
# prepend <s>, so these tensors are one id shorter than the ones encode() returns.
roberta_ids = roberta.task.source_dictionary.encode_line(
    ' '.join(bpe_input[0]), add_if_not_exist=False
).long()
xlmr_ids = xlmr.task.source_dictionary.encode_line(
    ' '.join(spm_input[0]), add_if_not_exist=False
).long()

print('roberta:', roberta_ids)
print('xlmr:', xlmr_ids)