<a href="https://colab.research.google.com/github/abs-git/NLP/blob/main/Engish_Tokenizing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [155]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Data load

In [None]:
!pip install datasets

In [68]:
from datasets import load_dataset, load_metric

In [111]:
GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "rte", "sst2", "stsb", "wnli"]

task = "cola"

dataset = load_dataset("glue", task)
metric = load_metric("glue", task)

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
print(dataset)
print()
print(dataset["train"])
print()
print(dataset["train"][0])

In [None]:
dataset["train"][0]

In [186]:
# data split and save

path = '/content/gdrive/MyDrive/Colab Notebooks/NLP'

train_data = []
for i, data in enumerate(dataset['train']):
  train_data.append(dataset['train'][i]['sentence'])

with open(path + '/glue_train.txt', 'w', encoding='utf-8') as f:
  for sen in train_data:
    f.write(sen + '\n')


test_data = []
for i, data in enumerate(dataset['test']):
  test_data.append(dataset['test'][i]['sentence'])

with open(path + '/glue_test.txt', 'w', encoding='utf-8') as f:
  for sen in test_data:
    f.write(sen + '\n')



## Tokenization

In [214]:
# data load

path = '/content/gdrive/MyDrive/Colab Notebooks/NLP'


with open(path + '/glue_train.txt', 'r') as f:
  train_sentences = f.readlines()

for i, sen in enumerate(train_sentences):
  train_sentences[i] = sen.rstrip("\n")

with open(path + '/glue_test.txt', 'r') as f:
  test_sentences = f.readlines()

for i, sen in enumerate(test_sentences):
  test_sentences[i] = sen.rstrip("\n")

print(len(train_sentences))
print(len(test_sentences))

print(train_sentences[:3])

8551
1063
["Our friends won't buy this analysis, let alone the next one we propose.", "One more pseudo generalization and I'm giving up.", "One more pseudo generalization or I'm giving up."]


In [215]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [216]:
## Word Tokenization

from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer
import spacy

from tensorflow.keras.preprocessing.text import text_to_word_sequence
from torchtext.data.utils import get_tokenizer

sentences = train_sentences[:5]

# nltk word_tokenize
nltk_word_tokens = []
for sen in sentences:
  tokens = word_tokenize(sen)
  nltk_word_tokens.append(tokens)



# nltk word punct
nltk_punct_tokens = []
for sen in sentences:
  tokens = WordPunctTokenizer().tokenize(sen)               # 구두점을 별도로 분류한다.
  nltk_punct_tokens.append(tokens)



# nltk Tree bank
nltk_tree_bank_tokens = []
for sen in sentences:
  tokens = TreebankWordTokenizer().tokenize(sen)               # 하이픈(-)은 하나의 단어로, 아포스트로피(')는 접어로 취급한다.
  nltk_tree_bank_tokens.append(tokens)



# spacy tokenize
spacy_tokens = []
spacy_en = spacy.load('en')
for sen in sentences:
  tokens = [tok.text for tok in spacy_en.tokenizer(sen)]
  spacy_tokens.append(tokens)



# tensorflow keras
keras_tokens = []
for sen in sentences:
  tokens = text_to_word_sequence(sen)               # 자동으로 lower가 진행되고 구두점(,.!)을 제거하지만, 아포스트로피(')는 보존한다.
  keras_tokens.append(tokens)



# torch
torch_tokenizer = get_tokenizer("basic_english")
torch_tokens = []
for sen in sentences:
  tokens = torch_tokenizer(sen)
  torch_tokens.append(tokens)



print("Raw sentence :          ", sentences[0])
print("nltk word tokens :      ", nltk_word_tokens[0])
print("nltk punct tokens :     ", nltk_punct_tokens[0])
print("nltk tree bank tokens : ", nltk_tree_bank_tokens[0])
print("spacy tokens :          ", spacy_tokens[0])
print("keras tokens :          ", keras_tokens[0])
print("torch okens :           ", torch_tokens[0])


Raw sentence :           Our friends won't buy this analysis, let alone the next one we propose.
nltk word tokens :       ['Our', 'friends', 'wo', "n't", 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']
nltk punct tokens :      ['Our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']
nltk tree bank tokens :  ['Our', 'friends', 'wo', "n't", 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']
spacy tokens :           ['Our', 'friends', 'wo', "n't", 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']
keras tokens :           ['our', 'friends', "won't", 'buy', 'this', 'analysis', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose']
torch okens :            ['our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']


In [217]:
# Sentence Tokenization

from nltk.tokenize import sent_tokenize

corpus = ' '.join(sentences)

nltk_sent_tokens = sent_tokenize(corpus)            # 문장의 끝이 아닌 마침표(.)를 구분해서 분할


print("corpus           : ", corpus)
print("nltk sent tokens : ", nltk_sent_tokens)


corpus           :  Our friends won't buy this analysis, let alone the next one we propose. One more pseudo generalization and I'm giving up. One more pseudo generalization or I'm giving up. The more we study verbs, the crazier they get. Day by day the facts are getting murkier.
nltk sent tokens :  ["Our friends won't buy this analysis, let alone the next one we propose.", "One more pseudo generalization and I'm giving up.", "One more pseudo generalization or I'm giving up.", 'The more we study verbs, the crazier they get.', 'Day by day the facts are getting murkier.']


In [218]:
# Subword Tokenization

from collections import defaultdict
from nltk.tokenize import WordPunctTokenizer


# BPE (Byte Pair Encoding)
class BPE():
  def __init__(self):
    self.vocab = []


  def make_dict(self, corpus):     # 코퍼스(corpus)를 읽어들여 글자(character)단위로 분리 후 딕셔너리(dict) 생성
                                   # corpus는 sentence가 담긴 list
    subword_dict = defaultdict(int)

    for sen in corpus:
      tokens = WordPunctTokenizer().tokenize(sen)


      for token in tokens:
        subword_dict[" ".join(list(token)).lower() + " </w>"] = 1      # </w> 문자는 subword의 전/후면 위치를 구분 가능하도록 함.

        self.vocab.extend(list(token.lower()))

    return dict(subword_dict)


  def get_pairs(self, subword_dict):          # dictionary 내의 bi-gram pair를 생성하고 빈도를 counting 한다. 
    
    pairs = defaultdict(int)
    for word, freq in subword_dict.items():
      word_list = word.split()

      for i in range(len(word_list) - 1):
        pairs[(word_list[i], word_list[i+1])] += freq

    return dict(pairs)


  def merge_dict(self, pairs, subword_dict):

    merged_subword_dict = defaultdict(int)
    best_pair = max(pairs, key = pairs.get)   # bi-gram pair dictionary에 가장 많이 등장하는 

    # print(best_pair)
    for word, freq in subword_dict.items():
      
      if " ".join(best_pair) in word:
        merged_word = word.replace(" ".join(best_pair), "".join(best_pair))

        merged_subword_dict[merged_word] = subword_dict[word]    
      else:
        merged_subword_dict[word] = subword_dict[word]

    return dict(merged_subword_dict)


  def get_vocab(self, subword_dict):

    temp_vocab = []
    for word, freq in subword_dict.items():
      word_list = word.split()

      temp_vocab.extend(word_list)

    self.vocab = list(set(temp_vocab) | set(self.vocab))

    return self.vocab
    


In [219]:
byte_pair_encoder = BPE()

subword_dict = byte_pair_encoder.make_dict(sentences)
print(subword_dict)
print()

for i in range(20):
  pairs = byte_pair_encoder.get_pairs(subword_dict)
  subword_dict = byte_pair_encoder.merge_dict(pairs, subword_dict)
  vocab = byte_pair_encoder.get_vocab(subword_dict)

print(subword_dict)
print()
print(vocab)
print(len(vocab))


{'o u r </w>': 1, 'f r i e n d s </w>': 1, 'w o n </w>': 1, "' </w>": 1, 't </w>': 1, 'b u y </w>': 1, 't h i s </w>': 1, 'a n a l y s i s </w>': 1, ', </w>': 1, 'l e t </w>': 1, 'a l o n e </w>': 1, 't h e </w>': 1, 'n e x t </w>': 1, 'o n e </w>': 1, 'w e </w>': 1, 'p r o p o s e </w>': 1, '. </w>': 1, 'm o r e </w>': 1, 'p s e u d o </w>': 1, 'g e n e r a l i z a t i o n </w>': 1, 'a n d </w>': 1, 'i </w>': 1, 'm </w>': 1, 'g i v i n g </w>': 1, 'u p </w>': 1, 'o r </w>': 1, 's t u d y </w>': 1, 'v e r b s </w>': 1, 'c r a z i e r </w>': 1, 't h e y </w>': 1, 'g e t </w>': 1, 'd a y </w>': 1, 'b y </w>': 1, 'f a c t s </w>': 1, 'a r e </w>': 1, 'g e t t i n g </w>': 1, 'm u r k i e r </w>': 1}

{'o u r</w>': 1, 'f r ie nd s</w>': 1, 'w on</w>': 1, "' </w>": 1, 't</w>': 1, 'b u y</w>': 1, 'th is</w>': 1, 'a n al y s is</w>': 1, ', </w>': 1, 'l e t</w>': 1, 'al one</w>': 1, 'th e</w>': 1, 'ne x t</w>': 1, 'one</w>': 1, 'w e</w>': 1, 'p r o p o s e</w>': 1, '. </w>': 1, 'm o re</w>': 1

In [220]:
# data load
# reference : https://colab.research.google.com/github/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb#scrollTo=UtFQqK3tmp7G
# !wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt

In [221]:
# SentencePiece (Unigram Language Model Tokenizer)

import sentencepiece as spm
import re

train_data = './botchan.txt'

# command setting
templates = '--input={} --model_prefix={} --vocab_size={} --model_type={}'

vocab_size = 5000
prefix = './sentencepiece_prefix'
model_type = 'bpe'

cmd = templates.format(train_data, prefix, vocab_size, model_type)


# train
spm.SentencePieceTrainer.Train(cmd)


# model load
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format(prefix))

with open('{}.vocab'.format(prefix), encoding='utf-8') as f:      # vocabrary 정보
  vocab_info = [info.strip().split("\t") for info in f]


# test
tokens = sp.encode_as_pieces(test_sentences[0])    # Tokenizing
tokens_IDs = sp.encode_as_ids(test_sentences[0])

sen = sp.decode_pieces(tokens)                    # Detokenizing
sen_from_IDs = sp.decode_ids(tokens_IDs)

print(test_sentences[0])
print(tokens)
print(tokens_IDs)
print(sen)
print(sen_from_IDs)


Bill whistled past the house.
['▁B', 'ill', '▁wh', 'ist', 'led', '▁past', '▁the', '▁house', '.']
[180, 157, 105, 261, 447, 1563, 9, 278, 4951]
Bill whistled past the house.
Bill whistled past the house.


In [222]:
# Huggingface tokenizer

# !pip install tokenizers
import os
from tokenizers import CharBPETokenizer, BertWordPieceTokenizer, SentencePieceBPETokenizer, ByteLevelBPETokenizer

# model load
Basic_tokenizer = CharBPETokenizer()
SP_tokenizer = SentencePieceBPETokenizer()
BL_tokenizer = ByteLevelBPETokenizer()
BWP_tokenizer = BertWordPieceTokenizer(strip_accents= False, lowercase= False)  # strip_accents는 학습 시와 load 시 동일해야한다.


# data load
data_path = '/content/gdrive/MyDrive/Colab Notebooks/NLP'

# params
corpus_file = [data_path + '/glue_train.txt']
vocab_size = 32000
limit_alphabet = 5000
output_path = data_path + '/output'
min_freq = 10


# train
Basic_tokenizer.train(files = corpus_file,
                      vocab_size = vocab_size,
                      min_frequency = min_freq,
                      limit_alphabet = limit_alphabet,
                      show_progress = True
                      )

SP_tokenizer.train(files = corpus_file,
                      vocab_size = vocab_size,
                      min_frequency = min_freq,
                      limit_alphabet = limit_alphabet,
                      show_progress = True
                      )

BL_tokenizer.train(files = corpus_file,
                      vocab_size = vocab_size,
                      min_frequency = min_freq,
                      show_progress = True
                      )

BWP_tokenizer.train(files = corpus_file,
                      vocab_size = vocab_size,
                      min_frequency = min_freq,
                      limit_alphabet = limit_alphabet,
                      show_progress = True
                      )


# model(vocab) save
# Basic_tokenizer.save_model(data_path)
# SP_tokenizer.save_model(data_path)
# BL_tokenizer.save_model(data_path)
# BWP_tokenizer.save_model(data_path)


# model(vocab) load




# test
Basic_output = Basic_tokenizer.encode(test_sentences[0])
SP_output = SP_tokenizer.encode(test_sentences[0])
BL_output = BL_tokenizer.encode(test_sentences[0])
BWP_output = BWP_tokenizer.encode(test_sentences[0])

basic_decoded = Basic_tokenizer.decode(Basic_output.ids)
SP_decoded = SP_tokenizer.decode(SP_output.ids)
BL_decoded = BL_tokenizer.decode(BL_output.ids)
BWP_decoded = BWP_tokenizer.decode(BWP_output.ids)


print("input   : {}".format(test_sentences[0]))
print()

print("basic tokens  : {}".format(Basic_output.tokens))
print("basic id      : {}".format(Basic_output.ids))
print("basic offsets : {}".format(Basic_output.offsets))
print("basic decode  : {}".format(basic_decoded))
print()

print("SP tokens  : {}".format(SP_output.tokens))
print("SP id      : {}".format(SP_output.ids))
print("SP offsets : {}".format(SP_output.offsets))
print("SP decode  : {}".format(SP_decoded))
print()

print("BL tokens  : {}".format(BL_output.tokens))
print("BL id      : {}".format(BL_output.ids))
print("BL offsets : {}".format(BL_output.offsets))
print("BL decode  : {}".format(BL_decoded))
print()

print("BWP tokens  : {}".format(BWP_output.tokens))
print("BWP id      : {}".format(BWP_output.ids))
print("BWP offsets : {}".format(BWP_output.offsets))
print("BWP decode  : {}".format(BWP_decoded))
print()

input   : Bill whistled past the house.

basic tokens  : ['Bill</w>', 'wh', 'ist', 'led</w>', 'p', 'ast</w>', 'the</w>', 'house</w>', '.</w>']
basic id      : [232, 189, 724, 362, 68, 1123, 153, 689, 106]
basic offsets : [(0, 4), (5, 7), (7, 10), (10, 13), (14, 15), (15, 18), (19, 22), (23, 28), (28, 29)]
basic decode  : Bill whistled past the house .

SP tokens  : ['▁Bill', '▁wh', 'ist', 'led', '▁p', 'ast', '▁the', '▁house', '.']
SP id      : [197, 215, 374, 777, 105, 388, 89, 746, 11]
SP offsets : [(0, 4), (4, 7), (7, 10), (10, 13), (13, 15), (15, 18), (18, 22), (22, 28), (28, 29)]
SP decode  : Bill whistled past the house.

BL tokens  : ['Bill', 'Ġwh', 'ist', 'led', 'Ġp', 'ast', 'Ġthe', 'Ġhouse', '.']
BL id      : [567, 374, 501, 641, 274, 528, 259, 898, 13]
BL offsets : [(0, 4), (4, 7), (7, 10), (10, 13), (13, 15), (15, 18), (18, 22), (22, 28), (28, 29)]
BL decode  : Bill whistled past the house.

BWP tokens  : ['Bill', 'whis', '##tle', '##d', 'past', 'the', 'house', '.']
BWP id   

In [224]:
# Subword text encoder (tensorflow)

import tensorflow_datasets as tfds

# model load
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(train_sentences, target_vocab_size = 30000)

tokens_sen = tokenizer.encode(test_sentences[0])
original_sen = tokenizer.decode(tokens_sen)

print(tokenizer.subwords[:10])
print()

print(test_sentences[0])
print(tokens_sen)
print(original_sen)

['the_', 'to_', 'The_', 'a_', 'I_', 'that_', 'is_', ', ', 'John_', 'of_']

Bill whistled past the house.
[23, 3032, 2527, 1, 359, 7681]
Bill whistled past the house.


In [225]:
# Word tokenizing for Neural Network using keras

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences_1 = sentences[:3]
sentences_2 = sentences[3:]


tokenizer = Tokenizer(num_words = 100, oov_token="<OOV>")     # 토큰화 되지 않은 문장이 입력될 때 <OOV> 인덱스 (1)가 부여된다.
tokenizer.fit_on_texts(sentences_1)                           # (인코딩) 입력되는 문장들 내의 단어들에 인덱스를 부여한다. 구두점 (, . ! ?)은 제외된다.

word_index = tokenizer.word_index
sentences_1_seq = tokenizer.texts_to_sequences(sentences_1)
padding = pad_sequences(sentences_1_seq, padding = 'post')        # setences_1_seq의 가장 긴 길이의 문장에 따라 패딩을 설정한다.

sentences_2_seq = tokenizer.texts_to_sequences(sentences_2)


print("sentences_1 : ", sentences_1)
print("sentences_2 : ", sentences_2)
print()

print("fit word index : ", word_index)
print("sentence_1 to indices sequence : ", sentences_1_seq)
print("sentence_2 to indices sequence : ", sentences_2_seq)
print()

print("padding of sentence_1 : ", padding)
print()

print('Encoding samples')
print(sentences_1[0], " -> ", sentences_1_seq[0])
print(sentences_2[0], " -> ", sentences_2_seq[0])


sentences_1 :  ["Our friends won't buy this analysis, let alone the next one we propose.", "One more pseudo generalization and I'm giving up.", "One more pseudo generalization or I'm giving up."]
sentences_2 :  ['The more we study verbs, the crazier they get.', 'Day by day the facts are getting murkier.']

fit word index :  {'<OOV>': 1, 'one': 2, 'more': 3, 'pseudo': 4, 'generalization': 5, "i'm": 6, 'giving': 7, 'up': 8, 'our': 9, 'friends': 10, "won't": 11, 'buy': 12, 'this': 13, 'analysis': 14, 'let': 15, 'alone': 16, 'the': 17, 'next': 18, 'we': 19, 'propose': 20, 'and': 21, 'or': 22}
sentence_1 to indices sequence :  [[9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 2, 19, 20], [2, 3, 4, 5, 21, 6, 7, 8], [2, 3, 4, 5, 22, 6, 7, 8]]
sentence_2 to indices sequence :  [[17, 3, 19, 1, 1, 17, 1, 1, 1], [1, 1, 1, 17, 1, 1, 1, 1]]

padding of sentence_1 :  [[ 9 10 11 12 13 14 15 16 17 18  2 19 20]
 [ 2  3  4  5 21  6  7  8  0  0  0  0  0]
 [ 2  3  4  5 22  6  7  8  0  0  0  0  0]]

Encoding sample