# Env

In [7]:
import argparse
import collections
import os
import random
import re
import shutil
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import torch

In [8]:
# Path of the enwiki corpus text file.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
enwiki_file = "/home/tako/youngmi/errordetection/data/enwiki.txt"

In [9]:
# Runtime configuration.
# The device falls back to the CPU when CUDA is unavailable, so the notebook
# can still run on a machine without a GPU.
args = argparse.Namespace(
    seed=1234,    # random seed value
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    corpus=enwiki_file,    # corpus file
)
print(args)

Namespace(corpus='/home/tako/youngmi/errordetection/data/enwiki.txt', device=device(type='cuda'), seed=1234)


In [10]:
# Apply the configured seed to every random number generator in use:
# Python's `random`, NumPy, and PyTorch (CPU and CUDA).
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(args.seed)

# 파일확인

In [11]:
# Count the number of lines in the wiki corpus.
# The `with` block closes the file as soon as counting is finished; the
# encoding is given explicitly so the result does not depend on the locale.
with open(enwiki_file, encoding="utf-8") as corpus_fh:
    count = sum(1 for _ in corpus_fh)
print(count)

66467178


In [12]:
# Preview the first 50 lines of the wiki corpus.
# The `with` block guarantees the file is closed even though the loop
# stops early.
with open(enwiki_file, encoding="utf-8") as corpus_fh:
    for i, line in enumerate(corpus_fh):
        if i >= 50:
            break
        print(line.strip())


Anarchism
Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be undesirable, unnecessary, and harmful. It is usually described alongside libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement and as having a historical association with anti-capitalism and socialism.
The history of anarchy goes back to prehistory, when humans arguably lived in anarchic societies long before the establishment of formal states, realms or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose, but it was not until the 19th century that a self-conscious political movement emerged. During the latter half of the 19th and the first decades of the 20th century, the anarchist movement flourished in most parts of the world and had a significant role in workers' struggles for emancipat

# Sentencepiece

In [1]:
!pip install sentencepiece



In [13]:
# import sentencepiece
import sentencepiece as spm

In [14]:
def train_sentencepiece(corpus, prefix, vocab_size):
    """
    Train a SentencePiece unigram vocab on a text corpus.

    Trainer options are passed as keyword arguments instead of one
    whitespace-separated flag string, so ``corpus`` and ``prefix`` may
    contain spaces.

    :param corpus: corpus file to train on
    :param prefix: name under which the trained vocab is saved
    :param vocab_size: number of vocab entries, not counting the special tokens
    """
    # [PAD], [UNK], [BOS], [EOS] plus the user-defined [SEP], [CLS], [MASK].
    num_special_tokens = 7
    spm.SentencePieceTrainer.train(
        input=corpus,
        model_prefix=prefix,
        vocab_size=vocab_size + num_special_tokens,
        model_type="unigram",
        max_sentence_length=999999,  # maximum sentence length
        pad_id=0, pad_piece="[PAD]",  # pad token and its id
        unk_id=1, unk_piece="[UNK]",  # unknown token and its id
        bos_id=2, bos_piece="[BOS]",  # begin-of-sequence token and its id
        eos_id=3, eos_piece="[EOS]",  # end-of-sequence token and its id
        user_defined_symbols=["[SEP]", "[CLS]", "[MASK]"],  # SEP: 4, CLS: 5, MASK: 6
        input_sentence_size=100000,  # train on a sample of the corpus
        character_coverage=1.0,  # keep every character, none mapped to unknown
        shuffle_input_sentence=True,  # shuffle the sampled sentences
    )

In [15]:
# Train the 8000-entry vocab.
# The model is written next to the corpus file, which is the directory the
# following load cell reads it from; a bare prefix only worked when the
# kernel's working directory happened to be that directory.
train_sentencepiece(
    enwiki_file,
    os.path.join(os.path.dirname(enwiki_file), "enwiki_unigram_8000"),
    8000,
)

In [16]:
# Load the trained 8000-entry model into a fresh processor.
# load() is the cell's last expression, so its return value is displayed.
spm_vocab = spm.SentencePieceProcessor()
spm_vocab.load("/home/tako/youngmi/errordetection/data/enwiki_unigram_8000.model")

True

In [17]:
# Train the 32000-entry vocab.
# The model is written next to the corpus file, which is the directory the
# following load cell reads it from; a bare prefix only worked when the
# kernel's working directory happened to be that directory.
train_sentencepiece(
    enwiki_file,
    os.path.join(os.path.dirname(enwiki_file), "enwiki_unigram_32000"),
    32000,
)

In [18]:
# Load the trained 32000-entry model.
# This rebinds spm_vocab, so the 8000-entry processor loaded earlier is
# replaced and every later cell uses the 32000-entry model.
spm_vocab = spm.SentencePieceProcessor()
spm_vocab.load("/home/tako/youngmi/errordetection/data/enwiki_unigram_32000.model")

True

In [32]:
# Read the vocab file: each line holds tab-separated fields and the first
# field is the piece.
with open('/home/tako/youngmi/errordetection/data/enwiki_unigram_32000.vocab', encoding='utf-8') as f:
    # Remove only the line terminator. A bare strip() would also remove a
    # piece that consists of whitespace, shifting the remaining fields.
    Vo = [doc.rstrip("\r\n").split("\t") for doc in f]

# Map each piece to its line index in the vocab file.
word2idx = {w[0]: i for i, w in enumerate(Vo)}
# Show the size and a small sample instead of dumping the whole mapping.
print(len(word2idx))
print(list(word2idx.items())[:10])



In [36]:
# NOTE(review): presumably makes later encode calls add BOS/EOS markers —
# confirm against the SentencePiece documentation.
# This cell's execution count (36) is higher than that of the cells below it,
# so their saved outputs were produced before this option was set.
spm_vocab.SetEncodeExtraOptions('bos:eos')

True

In [34]:
# Tokenize the first 5 corpus lines, printing each line followed by its
# pieces and its ids.
# The `with` block guarantees the file is closed even though the loop
# stops early.
with open(enwiki_file, encoding="utf-8") as corpus_fh:
    for i, line in enumerate(corpus_fh):
        if i >= 5:
            break
        line = line.strip()
        print(line)
        tokens = spm_vocab.encode_as_pieces(line)
        print(tokens)
        _ids = spm_vocab.encode_as_ids(line)
        print(_ids)


[]
[]
Anarchism
['▁An', 'arch', 'ism']
[470, 7716, 530]
Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be undesirable, unnecessary, and harmful. It is usually described alongside libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement and as having a historical association with anti-capitalism and socialism.
['▁An', 'arch', 'ism', '▁is', '▁a', '▁political', '▁philosophy', '▁and', '▁movement', '▁that', '▁is', '▁sceptic', 'al', '▁of', '▁authority', '▁and', '▁rejects', '▁all', '▁in', 'voluntary', ',', '▁co', 'er', 'c', 'ive', '▁forms', '▁of', '▁hierarchy', '.', '▁An', 'arch', 'ism', '▁calls', '▁for', '▁the', '▁abolition', '▁of', '▁the', '▁state', ',', '▁which', '▁it', '▁holds', '▁to', '▁be', '▁undesirable', ',', '▁unnecessary', ',', '▁and', '▁harmful', '.', '▁It', '▁is', '▁', 'usually', '▁describ

In [21]:
# Split a string into tokens
tokens = spm_vocab.encode_as_pieces("i love you so much")

print(tokens)

['▁i', '▁love', '▁you', '▁so', '▁much']


In [88]:
# Restore the tokens to a string
print(spm_vocab.decode_pieces(tokens))

i love you so much


In [22]:
# Split a string into numeric ids
ids = spm_vocab.encode_as_ids("i love you so much")

print(ids)

[1829, 1223, 886, 207, 462]


In [23]:
# Restore the numeric ids to a string
print(spm_vocab.decode_ids(ids))

i love you so much


In [24]:
# Convert tokens to numeric ids
print(spm_vocab.piece_to_id(tokens))

[1829, 1223, 886, 207, 462]


In [26]:
# Convert numeric ids to tokens
print(spm_vocab.id_to_piece(ids))

['▁i', '▁love', '▁you', '▁so', '▁much']
