SentencePiece is appealing because it is a language-independent subword tokenizer.

**SentencePiece repository:** https://github.com/google/sentencepiece

**SentencePiece Python module example notebook:** https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Tweet Sentiment Extraction Final

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Tweet Sentiment Extraction Final


In [None]:
!pip install transformers
!pip install sentencepiece



In [None]:
# Load the SentencePiece model file shipped with the local albert-large-v2
# config directory; `sp` is reused by the encoding demos in the next cells.
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.load('./input/albert-configs/albert-large-v2/spiece.model')

True

In [None]:
sp.encode_as_pieces('the quick brown fox jumps over the lazy dog')

['▁the',
 '▁quick',
 '▁brown',
 '▁fox',
 '▁jumps',
 '▁over',
 '▁the',
 '▁lazy',
 '▁dog']

In [None]:
sp.encode_as_ids('the quick brown fox jumps over the lazy dog')

[14, 2231, 886, 2385, 17659, 84, 14, 16792, 1952]

In [None]:
# Round-trip check: tokenize into pieces, then rebuild the sentence by hand.
pieces = sp.encode_as_pieces('the quick brown fox jumps over the lazy dog')
# Concatenate the pieces, turn each "▁" marker back into a space, and drop the
# leading space produced by the marker on the first piece.
"".join(pieces).replace("▁", " ").strip()

'the quick brown fox jumps over the lazy dog'

In [None]:
import sys
# Put the local directory containing sentencepiece_pb2 at the front of the
# module search path so that `import sentencepiece_pb2` can find it.
sys.path.insert(0, './input/sentencepiece_pb2/')

In [None]:
import sentencepiece_pb2

In [None]:
# Empty SentencePieceText message that will receive the serialized encoding.
spt = sentencepiece_pb2.SentencePieceText()
# Re-create the processor and reload the same spiece.model used earlier, so
# this cell does not depend on the earlier `sp` still being loaded.
sp = spm.SentencePieceProcessor()
sp.load('./input/albert-configs/albert-large-v2/spiece.model')

True

In [None]:
# Encode the sentence as a serialized proto and parse it into `spt`. The
# printed message lists, for every piece, its id, its surface string and the
# begin/end offsets of that surface in the input text.
spt.ParseFromString(sp.encode_as_serialized_proto('the quick brown fox jumps over the lazy dog'))
print(spt)

text: "the quick brown fox jumps over the lazy dog"
pieces {
  piece: "\342\226\201the"
  id: 14
  surface: "the"
  begin: 0
  end: 3
}
pieces {
  piece: "\342\226\201quick"
  id: 2231
  surface: " quick"
  begin: 3
  end: 9
}
pieces {
  piece: "\342\226\201brown"
  id: 886
  surface: " brown"
  begin: 9
  end: 15
}
pieces {
  piece: "\342\226\201fox"
  id: 2385
  surface: " fox"
  begin: 15
  end: 19
}
pieces {
  piece: "\342\226\201jumps"
  id: 17659
  surface: " jumps"
  begin: 19
  end: 25
}
pieces {
  piece: "\342\226\201over"
  id: 84
  surface: " over"
  begin: 25
  end: 30
}
pieces {
  piece: "\342\226\201the"
  id: 14
  surface: " the"
  begin: 30
  end: 34
}
pieces {
  piece: "\342\226\201lazy"
  id: 16792
  surface: " lazy"
  begin: 34
  end: 39
}
pieces {
  piece: "\342\226\201dog"
  id: 1952
  surface: " dog"
  begin: 39
  end: 43
}



In [None]:
class SentencePieceTokenizer:
  """Encode text with a SentencePiece model and report per-piece offsets.

  Relies on the module-level names ``spm``, ``sentencepiece_pb2`` and ``os``
  being available when the methods are called.
  """

  def __init__(self, model_path):
    """Load ``spiece.model`` from the directory ``model_path``.

    Args:
      model_path: Directory that contains the ``spiece.model`` file.
    """
    model_file = os.path.join(model_path, 'spiece.model')
    self.sp = spm.SentencePieceProcessor()
    self.sp.load(model_file)

  def encode(self, sentence):
    """Tokenize ``sentence`` into piece ids and matching offsets.

    Args:
      sentence: Text to tokenize.

    Returns:
      A ``(tokens, offsets)`` pair: ``tokens`` is the list of piece ids and
      ``offsets`` is a parallel list of ``(begin, end)`` tuples.
    """
    # The serialized-proto encoding is used because it carries the begin/end
    # position of every piece alongside its id.
    proto = sentencepiece_pb2.SentencePieceText()
    proto.ParseFromString(self.sp.encode_as_serialized_proto(sentence))
    pieces = proto.pieces
    tokens = [piece.id for piece in pieces]
    # NOTE(review): begin/end are copied verbatim from the proto - confirm
    # whether they are byte or character offsets for non-ASCII input.
    offsets = [(piece.begin, piece.end) for piece in pieces]
    return tokens, offsets

In [None]:
# `os` is needed by SentencePieceTokenizer.__init__ (os.path.join); it only has
# to be imported before the class is instantiated, not before it is defined.
import os
# Small settings holder; MODEL_NAME is not referenced by the cells shown here.
class config:
  MODEL_NAME = "albert-large-v2"
# NOTE: this rebinds `spt`, which earlier held a SentencePieceText message,
# to a tokenizer wrapper instance.
spt = SentencePieceTokenizer('./input/albert-configs/albert-large-v2')

In [None]:
spt.encode('the quick brown fox jumps over the lazy dog')

([14, 2231, 886, 2385, 17659, 84, 14, 16792, 1952],
 [(0, 3),
  (3, 9),
  (9, 15),
  (15, 19),
  (19, 25),
  (25, 30),
  (30, 34),
  (34, 39),
  (39, 43)])

In [None]:
# Cross-check against the Hugging Face tokenizer loaded from the same local
# directory. Its output contains the same piece ids as the wrapper above, with
# one extra id at each end (2 and 3 - presumably the special start/end tokens;
# verify against the tokenizer's vocabulary).
import transformers
spt_trans = transformers.AlbertTokenizer.from_pretrained('./input/albert-configs/albert-large-v2/')
spt_trans.encode('the quick brown fox jumps over the lazy dog')

[2, 14, 2231, 886, 2385, 17659, 84, 14, 16792, 1952, 3]