In [1]:
import os
import re
import csv
from glob import iglob
from pathlib import Path
import numpy as np

In [2]:
# --- Filesystem layout -------------------------------------------------------
# Everything is rooted at BASE_DIR; article data lives in its "articles" subdir.
BASE_DIR = "/data/ksb/"
DATA_BASE_DIR = os.path.join(BASE_DIR, "articles")

# Article data directories under DATA_BASE_DIR.
ORIGIN_PATH = os.path.join(DATA_BASE_DIR, "Origin-Data")
PRETTY_PATH = os.path.join(DATA_BASE_DIR, "Pretty-Data")
PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR, "Preprocessed-Data")
SUMMARY_PREPROCESSED_PATH = os.path.join(DATA_BASE_DIR, "Summary-Preprocessed-Data")

# NOTE(review): unlike its siblings this one hangs off BASE_DIR, not
# DATA_BASE_DIR -- confirm that is where the title data really lives.
TITLE_PREPROCESSED_PATH = os.path.join(BASE_DIR, "Title-Preprocessed-Data")

# Stop-word list file, stored next to the article data.
SWORDS_PATH = os.path.join(DATA_BASE_DIR, "StopWordList.txt")

# Relative path: resolved against the notebook's current working directory.
MODEL_PATH = "Word-Encoding-Model"

In [3]:
def mkdir_p(path):
    """Create ``path`` and any missing parent directories (like ``mkdir -p``).

    Does nothing if ``path`` already exists as a directory.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created, including when ``path``
            already exists but is not a directory.
    """
    # exist_ok=True tolerates an existing directory but still raises when the
    # path exists as something else, matching the previous errno-based check.
    os.makedirs(path, exist_ok=True)


In [4]:
# Make sure the model output directory exists before anything is written to it.
mkdir_p(MODEL_PATH)

In [5]:
def get_cmd(filename, pad_id, bos_id, eos_id, unk_id, prefix, vocab_size, character_coverage, model_type):
    """Build a ``--flag=value`` argument string from the given settings.

    Args:
        filename: Value for ``--input``.
        pad_id: Value for ``--pad_id``.
        bos_id: Value for ``--bos_id``.
        eos_id: Value for ``--eos_id``.
        unk_id: Value for ``--unk_id``.
        prefix: Value for ``--model_prefix``.
        vocab_size: Value for ``--vocab_size``.
        character_coverage: Value for ``--character_coverage``.
        model_type: Value for ``--model_type``.

    Returns:
        str: All nine flags on a single line, separated by whitespace.
    """
    # The backslash continuations are *inside* the string literal, so the
    # newline is dropped but the next line's indentation is kept: consecutive
    # flags end up separated by several spaces. Left as-is on purpose.
    template = '--input={} \
    --pad_id={} \
    --bos_id={} \
    --eos_id={} \
    --unk_id={} \
    --model_prefix={} \
    --vocab_size={} \
    --character_coverage={} \
    --model_type={}'

    # Use the `filename` argument; the previous code read a module-level
    # `file_name` instead, which only worked when that global happened to be set.
    return template.format(
        filename,
        pad_id,
        bos_id,
        eos_id,
        unk_id,
        prefix,
        vocab_size,
        character_coverage,
        model_type,
    )
    

In [6]:
def get_text(basepath):
    """Collect the contents column of every CSV file directly under ``basepath``.

    Every CSV row is expected to have exactly three columns
    (index, title, contents). Rows whose contents field is empty are skipped.
    For the remaining rows the contents field is split on tab characters and
    the pieces are re-joined with newlines.

    Args:
        basepath: Directory that holds the ``*.csv`` files.

    Returns:
        list[str]: One newline-joined string per row with non-empty contents,
        with files visited in sorted path order.

    Raises:
        ValueError: If a row does not have exactly three columns.
    """
    result = []

    # Without recursive=True, '**' matches like '*', so only CSVs sitting
    # directly in `basepath` are picked up. Sorting makes the output order
    # independent of the filesystem's directory listing order.
    csv_paths = sorted(iglob(os.path.join(basepath, '**.csv'), recursive=False))

    for proc_article_path in csv_paths:
        # `with` guarantees the handle is closed even if a row fails to parse.
        with open(proc_article_path, 'r', newline="\n", encoding="utf-8") as f_proc:
            for _idx, _title, contents in csv.reader(f_proc):
                # Compare by value: `is ''` relies on string interning.
                if contents == '':
                    continue
                result.append('\n'.join(contents.split("\t")))

    return result

In [7]:
# Settings shared by every get_cmd(...) call in the cells below.

# Special-token ids.
pad_id = 0
bos_id = 1
eos_id = 2
unk_id = 3

# Vocabulary / model settings.
vocab_size = 70000
character_coverage = 1.0
model_type = 'word'

In [8]:
# --- Headline model: corpus file + training flags ----------------------------
file_name = os.path.join(MODEL_PATH, "Headline_SentencePiece_train.txt")

# Corpus = title text followed by summary text.
headline_src_text = get_text(TITLE_PREPROCESSED_PATH)
headline_tar_text = get_text(SUMMARY_PREPROCESSED_PATH)  # originally "Generated Summary"

with open(file_name, 'w', encoding='utf-8') as f:
    f.write('\n'.join([*headline_src_text, *headline_tar_text]))

# The model suffix is the number of 'spm-headline-*.vocab' files already in
# MODEL_PATH (presumably so a re-run does not reuse an existing prefix).
headline_vocab_pattern = os.path.join(MODEL_PATH, 'spm-headline-*.vocab')
model_num = sum(1 for _existing in iglob(headline_vocab_pattern, recursive=False))
prefix = os.path.join(MODEL_PATH, 'spm-headline-{}'.format(model_num))

headline_cmd = get_cmd(
    file_name,
    pad_id,
    bos_id,
    eos_id,
    unk_id,
    prefix,
    vocab_size,
    character_coverage,
    model_type,
)

In [12]:
# --- Source (input) model: corpus file + training flags ----------------------
file_name = os.path.join(MODEL_PATH, "SentencePiece_train_src.txt")

src_text = get_text(PREPROCESSED_PATH)

with open(file_name, 'w', encoding='utf-8') as f:
    f.write('\n'.join(src_text))

# The model suffix is the number of 'spm-input-*.vocab' files already in
# MODEL_PATH (presumably so a re-run does not reuse an existing prefix).
src_vocab_pattern = os.path.join(MODEL_PATH, 'spm-input-*.vocab')
model_num = sum(1 for _existing in iglob(src_vocab_pattern, recursive=False))
prefix = os.path.join(MODEL_PATH, 'spm-input-{}'.format(model_num))

src_cmd = get_cmd(
    file_name,
    pad_id,
    bos_id,
    eos_id,
    unk_id,
    prefix,
    vocab_size,
    character_coverage,
    model_type,
)

In [13]:
# --- Summary (target) model: corpus file + training flags --------------------
file_name = os.path.join(MODEL_PATH, "SentencePiece_train_tar.txt")

tar_text = get_text(SUMMARY_PREPROCESSED_PATH)

with open(file_name, 'w', encoding='utf-8') as f:
    f.write('\n'.join(tar_text))

# The model suffix is the number of 'spm-summary-*.vocab' files already in
# MODEL_PATH (presumably so a re-run does not reuse an existing prefix).
tar_vocab_pattern = os.path.join(MODEL_PATH, 'spm-summary-*.vocab')
model_num = sum(1 for _existing in iglob(tar_vocab_pattern, recursive=False))
prefix = os.path.join(MODEL_PATH, 'spm-summary-{}'.format(model_num))

tar_cmd = get_cmd(
    file_name,
    pad_id,
    bos_id,
    eos_id,
    unk_id,
    prefix,
    vocab_size,
    character_coverage,
    model_type,
)

In [14]:
# Show the three flag strings, one per line, for a visual check before training.
print(src_cmd, tar_cmd, headline_cmd, sep='\n')

--input=Word-Encoding-Model/SentencePiece_train_src.txt     --pad_id=0     --bos_id=1     --eos_id=2     --unk_id=3     --model_prefix=Word-Encoding-Model/spm-input-0     --vocab_size=70000     --character_coverage=1.0     --model_type=word
--input=Word-Encoding-Model/SentencePiece_train_tar.txt     --pad_id=0     --bos_id=1     --eos_id=2     --unk_id=3     --model_prefix=Word-Encoding-Model/spm-summary-3     --vocab_size=70000     --character_coverage=1.0     --model_type=word
--input=Word-Encoding-Model/Headline_SentencePiece_train.txt     --pad_id=0     --bos_id=1     --eos_id=2     --unk_id=3     --model_prefix=Word-Encoding-Model/spm-headline-2     --vocab_size=70000     --character_coverage=1.0     --model_type=word


In [15]:
import sentencepiece as spm

# Train one model per prepared flag string: source, then summary, then headline.
for train_args in (src_cmd, tar_cmd, headline_cmd):
    spm.SentencePieceTrainer.Train(train_args)