# Env

In [7]:
import argparse
import collections
import os
import random
import re
import shutil
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import torch

In [8]:
# Path to the English Wikipedia corpus. The ENWIKI_FILE environment variable
# overrides it so the notebook can run on another machine without editing this
# cell; the original location remains the default.
enwiki_file = os.environ.get(
    "ENWIKI_FILE", "/home/tako/youngmi/errordetection/data/enwiki.txt"
)

In [9]:
# Runtime configuration, built directly as a Namespace so `args` never
# changes type between statements.
args = argparse.Namespace(
    seed=1234,  # random seed value
    # Use the GPU when one is available, otherwise fall back to the CPU so the
    # notebook still runs on machines without CUDA.
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    corpus=enwiki_file,  # corpus file
)
print(args)

Namespace(corpus='/home/tako/youngmi/errordetection/data/enwiki.txt', device=device(type='cuda'), seed=1234)


In [10]:
# Seed every random number generator used in this notebook with the same
# configured value: Python's `random`, NumPy, and torch (CPU and all CUDA devices).
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(args.seed)

# 파일 확인

In [11]:
# Count the number of lines in the wiki corpus.
# The `with` block closes the file handle once counting is done, and the
# explicit encoding makes decoding independent of the platform default.
with open(enwiki_file, encoding="utf-8") as f:
    count = sum(1 for _ in f)
print(count)

66467178


In [12]:
# Preview the first 50 lines of the wiki corpus.
# The `with` block guarantees the file is closed even though the loop exits early.
with open(enwiki_file, encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 50:
            break
        line = line.strip()
        print(line)


Anarchism
Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be undesirable, unnecessary, and harmful. It is usually described alongside libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement and as having a historical association with anti-capitalism and socialism.
The history of anarchy goes back to prehistory, when humans arguably lived in anarchic societies long before the establishment of formal states, realms or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose, but it was not until the 19th century that a self-conscious political movement emerged. During the latter half of the 19th and the first decades of the 20th century, the anarchist movement flourished in most parts of the world and had a significant role in workers' struggles for emancipat

# Char Tokenizer

In [36]:
# Count how many times each character occurs in the corpus
# (leading/trailing whitespace of every line is stripped first).
char_counter = collections.defaultdict(int)
# The `with` block closes the file even if this long-running scan is interrupted.
with open(enwiki_file, encoding="utf-8") as f:
    for line in f:
        for c in line.strip():
            char_counter[c] += 1

KeyboardInterrupt: 

In [37]:
# Number of distinct characters counted in the corpus
print(len(char_counter))

20283


In [38]:
# Rank the characters by occurrence count in both directions, then show the
# ten most frequent followed by the ten least frequent.
most_freq = sorted(char_counter.items(), key=lambda kv: kv[1], reverse=True)
least_freq = sorted(char_counter.items(), key=lambda kv: kv[1])

for ranking in (most_freq, least_freq):
    print(ranking[:10])

[(' ', 1150262080), ('e', 682867213), ('a', 484240208), ('t', 477459004), ('i', 449131995), ('o', 420596357), ('n', 420593530), ('r', 382364324), ('s', 354565582), ('h', 257957230)]
[('≽', 1), ('҅', 1), ('ᴥ', 1), ('ƥ', 1), ('Ẇ', 1), ('˭', 1), ('𨁂', 1), ('⊉', 1), ('⤖', 1), ('🚲', 1)]


In [39]:
# Total number of character tokens: the sum of all per-character counts
count = sum(char_counter.values())
print(count)

7556946145


In [40]:
# Give every counted character a sequential id, numbered right after the two
# reserved special tokens and following the counter's iteration order.
char_to_id = {'[PAD]': 0, '[UNK]': 1}
char_to_id.update(
    (ch, idx) for idx, ch in enumerate(char_counter, start=len(char_to_id))
)
print(len(char_to_id))

20285


In [None]:
# Character-tokenize the first five wiki lines and print the ids.
# The `with` block closes the file even though the loop exits early.
with open(enwiki_file, encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 5:
            break
        line = line.strip()
        print(line)
        # Characters missing from the vocabulary map to [UNK] instead of
        # raising KeyError; that is what the reserved id is for.
        _id = [char_to_id.get(c, char_to_id['[UNK]']) for c in line]
        print(_id)

# Word Tokenizer

In [14]:
# Count how many times each whitespace-separated word occurs in the corpus.
word_counter = collections.defaultdict(int)
# The `with` block closes the file even if this long-running scan is interrupted.
with open(enwiki_file, encoding="utf-8") as f:
    for line in f:
        for w in line.strip().split():
            word_counter[w] += 1

KeyboardInterrupt: 

In [42]:
print(len(word_counter)) # number of distinct whitespace-separated words counted

2614841


In [43]:
# Rank the words by occurrence count in both directions, then show the ten
# most frequent followed by the ten least frequent.
most_freq = sorted(word_counter.items(), key=lambda kv: kv[1], reverse=True)
least_freq = sorted(word_counter.items(), key=lambda kv: kv[1])

for ranking in (most_freq, least_freq):
    print(ranking[:10])

[('the', 5905474), ('of', 3391153), ('and', 2698259), ('in', 2102004), ('to', 2009975), ('a', 1737393), ('is', 879367), ('was', 870521), ('as', 799966), ('The', 791450)]
[('id="10"', 1), ('url="https://en.wikipedia.org/wiki?curid=10"', 1), ('title="AccessibleComputing">', 1), ('AccessibleComputing', 1), ('id="12"', 1), ('url="https://en.wikipedia.org/wiki?curid=12"', 1), ('title="Anarchism">', 1), ('(libertarian', 1), ('"anarkhia",', 1), ('"arkhos"', 1)]


In [44]:
# Total number of word tokens: the sum of all per-word counts
count = sum(word_counter.values())
print(count)

90910077


In [45]:
# Give every counted word a sequential id, after the two reserved special tokens.
word_to_id = {'[PAD]': 0, '[UNK]': 1}
for w in word_counter:
    # Skip tokens that already have an id. A corpus word that is literally
    # '[PAD]' or '[UNK]' would otherwise overwrite the reserved id, and because
    # the dict would not grow, the next word would be given that same id too.
    if w not in word_to_id:
        word_to_id[w] = len(word_to_id)
print(len(word_to_id))

2614843


In [46]:
# Word-tokenize the first five wiki lines and print the ids.
# The `with` block closes the file even though the loop exits early.
with open(enwiki_file, encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 5:
            break
        line = line.strip()
        print(line)
        # Words missing from the vocabulary map to [UNK] instead of raising
        # KeyError; that is what the reserved id is for.
        _id = [word_to_id.get(w, word_to_id['[UNK]']) for w in line.split()]
        print(_id)

<doc id="10" url="https://en.wikipedia.org/wiki?curid=10" title="AccessibleComputing">
[2, 3, 4, 5]
AccessibleComputing
[6]

[]

[]

[]


# BPE

In [None]:
# Toy BPE corpus: whitespace-separated words. Column alignment is cosmetic;
# only the number of times each word appears matters once the text is split.
corpus = """
low lower newest widest
low lower newest widest
low       newest widest
low       newest
low       newest
          newest
"""

In [None]:
# Count how often each word occurs in the toy BPE corpus.
word_counter = collections.defaultdict(int)
# `corpus` is an in-memory string, so nothing needs to be opened here. The
# previous `zipfile.ZipFile(args.corpus)` wrapper never used its handle, and
# args.corpus is configured as a plain .txt path, which is not a zip archive.
for w in corpus.strip().split():
    word_counter[w] += 1

print(word_counter)

defaultdict(<class 'int'>, {'low': 5, 'lower': 2, 'newest': 6, 'widest': 3})


In [None]:
# Seed the BPE counter: prefix each word with the U+2581 marker, split it into
# space-separated single characters, and carry over the word's count.
bpe_counter = collections.defaultdict(int)
bpe_counter.update(
    (" ".join(f"\u2581{word}"), freq) for word, freq in word_counter.items()
)

print(bpe_counter)

defaultdict(<class 'int'>, {'▁ l o w': 5, '▁ l o w e r': 2, '▁ n e w e s t': 6, '▁ w i d e s t': 3})


In [None]:
def update_vocab(vocab, counter):
    """
    Register every symbol found in the BPE counter that the vocabulary lacks.

    :param vocab: dict mapping symbol -> id; modified in place
    :param counter: BPE counter whose keys are space-separated symbol strings
    :return: the same `vocab` object, with each new symbol given the next free id
    """
    for entry in counter:
        for symbol in entry.split():
            # setdefault leaves existing ids untouched; a new symbol receives
            # the current vocabulary size as its id.
            vocab.setdefault(symbol, len(vocab))
    return vocab

In [None]:
# Assign sequential ids to the BPE symbols: start from the two reserved special
# tokens, then add every symbol currently present in bpe_counter.
bpe_to_id = {'[PAD]': 0, '[UNK]': 1}
bpe_to_id = update_vocab(bpe_to_id, bpe_counter)

print(bpe_to_id)

{'[PAD]': 0, '[UNK]': 1, '▁': 2, 'l': 3, 'o': 4, 'w': 5, 'e': 6, 'r': 7, 'n': 8, 's': 9, 't': 10, 'i': 11, 'd': 12}


In [None]:
def get_stats(counter):
    """
    Count adjacent symbol pairs (bi-grams), weighted by each entry's frequency.

    :param counter: BPE counter mapping space-separated symbol strings to counts
    :return: defaultdict(int) mapping (left, right) symbol tuples to total frequency
    """
    pairs = collections.defaultdict(int)
    for word, freq in counter.items():
        symbols = word.split()
        # Pair each symbol with its right-hand neighbour.
        for left, right in zip(symbols, symbols[1:]):
            pairs[left, right] += freq
    return pairs

In [None]:
def merge_vocab(pair, v_in):
    """
    Merge every occurrence of an adjacent symbol pair into a single symbol.

    :param pair: the two symbols to merge, e.g. ('e', 's')
    :param v_in: BPE counter mapping space-separated symbol strings to counts
    :return: new dict with the same counts, keyed by the re-segmented strings
    """
    v_out = {}
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The lookarounds make the pattern match whole symbols only, never a
    # suffix or prefix of a longer neighbouring symbol.
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        # A callable replacement is inserted verbatim. A plain string would be
        # parsed as a replacement template, so a symbol containing a backslash
        # would be turned into an escape sequence or rejected as a bad escape.
        w_out = p.sub(lambda _match: merged, word)
        v_out[w_out] = v_in[word]
    return v_out

In [None]:
# Count adjacent symbol pairs (bi-grams) in the current BPE counter
pairs = get_stats(bpe_counter)

print(pairs)

defaultdict(<class 'int'>, {('▁', 'l'): 7, ('l', 'o'): 7, ('o', 'w'): 7, ('w', 'e'): 8, ('e', 'r'): 2, ('▁', 'n'): 6, ('n', 'e'): 6, ('e', 'w'): 6, ('e', 's'): 9, ('s', 't'): 9, ('▁', 'w'): 3, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'e'): 3})


In [None]:
# Pick the most frequent bi-gram pair; among equally frequent pairs, max()
# returns the one that comes first in the dict's iteration order.
best = max(pairs, key=pairs.get)

print(best)

('e', 's')


In [None]:
# Merge the most frequent bi-gram pair into a single symbol.
# This rebinds bpe_counter, so re-running the cell applies the merge to the
# already-merged counter.
bpe_counter = merge_vocab(best, bpe_counter)

print(bpe_counter)

{'▁ l o w': 5, '▁ l o w e r': 2, '▁ n e w es t': 6, '▁ w i d es t': 3}


In [None]:
# Add any symbol created by the merge to the vocabulary (existing ids are kept)
bpe_to_id = update_vocab(bpe_to_id, bpe_counter)

print(bpe_to_id)

{'[PAD]': 0, '[UNK]': 1, '▁': 2, 'l': 3, 'o': 4, 'w': 5, 'e': 6, 'r': 7, 'n': 8, 's': 9, 't': 10, 'i': 11, 'd': 12, 'es': 13}


In [None]:
# One complete BPE training step. Each execution of this cell mutates
# bpe_counter and bpe_to_id, so running it again performs one more merge.
# 1) count adjacent symbol pairs in the current counter
pairs = get_stats(bpe_counter)
print(pairs)
# 2) pick the most frequent pair
best = max(pairs, key=pairs.get)
print(best)
# 3) merge that pair into a single symbol throughout the counter
bpe_counter = merge_vocab(best, bpe_counter)
print(bpe_counter)
# 4) register any newly created symbol in the vocabulary
bpe_to_id = update_vocab(bpe_to_id, bpe_counter)
print(bpe_to_id)

defaultdict(<class 'int'>, {('▁low', 'e'): 2, ('e', 'r'): 2, ('▁', 'n'): 6, ('n', 'e'): 6, ('e', 'w'): 6, ('w', 'est'): 6, ('▁', 'w'): 3, ('w', 'i'): 3, ('i', 'd'): 3, ('d', 'est'): 3})
('▁', 'n')
{'▁low': 5, '▁low e r': 2, '▁n e w est': 6, '▁ w i d est': 3}
{'[PAD]': 0, '[UNK]': 1, '▁': 2, 'l': 3, 'o': 4, 'w': 5, 'e': 6, 'r': 7, 'n': 8, 's': 9, 't': 10, 'i': 11, 'd': 12, 'es': 13, 'est': 14, '▁l': 15, '▁lo': 16, '▁low': 17, '▁n': 18}


# SentencePiece with Morph

In [56]:
# Build a morpheme-segmented corpus: every line of 'kowiki.txt' inside the zip
# archive is split by mecab and written back as space-joined morphemes.
# NOTE(review): `mecab` is not defined in the visible cells of this notebook,
# and args.corpus is configured as a .txt path rather than a zip archive;
# confirm both before running.
# The output is written as UTF-8 explicitly, because the input is decoded as
# UTF-8 and the platform default encoding may not be able to represent it.
with open("kowiki-mecab.txt", "w", encoding="utf-8") as o_f, \
        zipfile.ZipFile(args.corpus) as z, \
        z.open('kowiki.txt') as f:
    for line in f:
        tokens = mecab.morphs(line.decode('utf-8').strip())
        o_f.write(" ".join(tokens))
        o_f.write("\n")

In [57]:
# List the current working directory (e.g. to check that kowiki-mecab.txt exists)
print(os.listdir("./"))

['.config', 'kowiki-mecab.txt', '.ipynb_checkpoints', 'drive', 'kowiki.txt', 'kowiki_32000.model', 'kowiki_32000.vocab', '__MACOSX', 'sample_data']


In [58]:
# Train the vocabulary on the morpheme-segmented corpus.
# NOTE(review): train_sentencepiece is not defined in the visible cells of this
# notebook; confirm it is defined or imported before this cell runs. The second
# argument is presumably the output file prefix (verify against its definition).
train_sentencepiece("kowiki-mecab.txt", "kowiki_mecab_32000")

In [59]:
# List the current working directory to check the files produced by training
print(os.listdir("./"))

['.config', 'kowiki_mecab_32000.vocab', 'kowiki_mecab_32000.model', 'kowiki-mecab.txt', '.ipynb_checkpoints', 'drive', 'kowiki.txt', 'kowiki_32000.model', 'kowiki_32000.vocab', '__MACOSX', 'sample_data']


In [60]:
# Back up the generated model and vocab files into the directory that holds
# the corpus, then list that directory.
corpus_dir = os.path.dirname(args.corpus)
for artifact in ("kowiki_mecab_32000.model", "kowiki_mecab_32000.vocab"):
    shutil.copy(artifact, corpus_dir)

print(os.listdir(corpus_dir))

['kowiki.txt.zip', 'kowiki_32000.model', 'kowiki_32000.vocab', 'kowiki_mecab_32000.model', 'kowiki_mecab_32000.vocab']


In [61]:
# Load the morpheme-level model from the backup directory.
# NOTE(review): `spm` is not imported in the visible cells of this notebook;
# presumably it is the sentencepiece module. Confirm it is imported before this cell.
spm_morph_vocab = spm.SentencePieceProcessor()
spm_morph_vocab.load(os.path.join(corpus_dir, "kowiki_mecab_32000.model"))

True

In [62]:
# For the first five lines of 'kowiki.txt' in the zip archive, print the raw
# line, its morpheme-segmented form, the resulting pieces, and the piece ids.
# NOTE(review): `mecab` is not defined in the visible cells of this notebook;
# confirm it exists before running.
with zipfile.ZipFile(args.corpus) as z, z.open('kowiki.txt') as f:
    for i, raw in enumerate(f):
        if i >= 5:
            break
        line = raw.decode('utf-8').strip()
        print(line)
        morphemes = mecab.morphs(line)
        string = " ".join(morphemes)
        print(string)
        tokens = spm_morph_vocab.encode_as_pieces(string)
        print(tokens)
        _ids = spm_morph_vocab.encode_as_ids(string)
        print(_ids)

지미 카터
지미 카터
['▁지', '미', '▁카', '터']
[45, 534, 556, 57]
제임스 얼 "지미" 카터 주니어(, 1924년 10월 1일 ~ )는 민주당 출신 미국 39번째 대통령 (1977년 ~ 1981년)이다.
제임스 얼 " 지미 " 카터 주니어 ( , 1924 년 10 월 1 일 ~ ) 는 민주당 출신 미국 39 번 째 대통령 ( 1977 년 ~ 1981 년 ) 이 다 .
['▁제임스', '▁얼', '▁"', '▁지', '미', '▁"', '▁카', '터', '▁주니어', '▁(', '▁,', '▁192', '4', '▁년', '▁10', '▁월', '▁1', '▁일', '▁~', '▁)', '▁는', '▁민주', '당', '▁출신', '▁미국', '▁39', '▁번', '▁째', '▁대통령', '▁(', '▁1977', '▁년', '▁~', '▁198', '1', '▁년', '▁)', '▁이', '▁다', '▁.']
[2279, 3285, 51, 45, 534, 51, 556, 57, 4215, 19, 12, 668, 160, 21, 125, 35, 50, 36, 114, 18, 11, 904, 229, 656, 139, 2540, 128, 268, 513, 19, 2435, 21, 114, 194, 389, 21, 18, 10, 9, 7]
지미 카터는 조지아주 섬터 카운티 플레인스 마을에서 태어났다. 조지아 공과대학교를 졸업하였다. 그 후 해군에 들어가 전함·원자력·잠수함의 승무원으로 일하였다. 1953년 미국 해군 대위로 예편하였고 이후 땅콩·면화 등을 가꿔 많은 돈을 벌었다. 그의 별명이 "땅콩 농부" (Peanut Farmer)로 알려졌다.
지미 카터 는 조지 아주 섬터 카운티 플 레인스 마을 에서 태어났 다 . 조지 아 공과 대학교 를 졸업 하 였 다 . 그 후 해군 에 들어가 전함 · 원자력 · 잠수함 의 승무원 으로 일 하 였 다 . 1953 년 미국 해군 대위 로 예편 하 였 고 이후 땅콩 · 면화 등 을 가꿔 많 은 돈