In [None]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 5.9MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.95


In [None]:
import os
import re
import csv
from glob import iglob
from pathlib import Path
import numpy as np

In [None]:
# Mount Google Drive at /content/gdrive (Colab-only helper); the data paths
# configured in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Project root on the mounted drive and the folder that holds the article data.
BASE_DIR = "/content/gdrive/My Drive/Colab Notebooks/ETRI_Article_Summarizer/"
DATA_BASE_DIR = os.path.join(BASE_DIR, "articles")


def _data_path(name):
    """Return the path of the entry `name` inside DATA_BASE_DIR."""
    return os.path.join(DATA_BASE_DIR, name)


# Data locations, all directly under DATA_BASE_DIR.
ORIGIN_PATH = _data_path("Origin-Data")
PREPROCESSED_PATH = _data_path("Preprocessed-Data")
PRETTY_PATH = _data_path("Pretty-Data")
SUMMARY_PREPROCESSED_PATH = _data_path("Summary-Preprocessed-Data")
SWORDS_PATH = _data_path("StopWordList.txt")

# Model folder sits next to the data folder, directly under BASE_DIR.
MODEL_PATH = os.path.join(BASE_DIR, "Word-Embedding-Model")

In [None]:
# Threshold constant; it is not referenced by any of the cells shown here.
# NOTE(review): presumably a minimum token frequency for embedding training — confirm.
MIN_COUNT = 10

In [None]:
def mkdir_p(path):
    """Create directory `path`, including any missing parents (like ``mkdir -p``).

    An already existing directory is not an error.

    Args:
        path: directory path to create.

    Raises:
        OSError: if `path` exists but is not a directory, or if creation fails
            for any other reason (e.g. missing permissions).
    """
    # exist_ok=True provides the same contract as catching EEXIST by hand:
    # an existing directory is accepted, an existing non-directory still raises.
    os.makedirs(path, exist_ok=True)


In [None]:
# Build the plain-text training file for SentencePiece: the tab-separated segments
# of every article's `contents` column are written one per line.
file_name = "SentencePiece_train.txt"
result = []

for proc_article_path in iglob(os.path.join(PREPROCESSED_PATH, '**.csv'), recursive=False):
    # newline="" is what the csv module expects for its input files, and the
    # `with` block closes the file even if a row fails to parse.
    with open(proc_article_path, 'r', newline="", encoding="utf-8") as f_proc:
        # Every row must have exactly three columns: index, title, contents.
        for idx, title, contents in csv.reader(f_proc):
            # Compare by value; `is ''` only worked through string interning.
            if contents == '':
                continue
            result.append('\n'.join(contents.split("\t")))

with open(file_name, 'w', encoding='utf-8') as f:
    f.write('\n'.join(result))

In [None]:
# Argument template for the SentencePiece trainer: one positional `{}` slot per
# flag, with the flags separated by single spaces in one string.
templates = ' '.join([
    '--input={}',
    '--pad_id={}',
    '--bos_id={}',
    '--eos_id={}',
    '--unk_id={}',
    '--model_prefix={}',
    '--vocab_size={}',
    '--character_coverage={}',
    '--model_type={}',
])

In [None]:
# Special-token ids; the <pad> token is set to id 0.
pad_id = 0
bos_id = 1
eos_id = 2
unk_id = 3

# Trainer settings.
vocab_size = 70000
character_coverage = 1.0
model_type = 'word'  # choose from unigram (default), bpe, char, or word

# The model prefix is numbered by how many *.vocab files already exist in the
# current working directory.
model_num = sum(1 for _ in iglob('**.vocab', recursive=False))
prefix = 'spm-{}'.format(model_num)

In [None]:
# Fill the template slots, in order, to obtain the final trainer argument string.
cmd = templates.format(*(
    file_name, pad_id, bos_id, eos_id, unk_id,
    prefix, vocab_size, character_coverage, model_type,
))
cmd

'--input=SentencePiece_train.txt --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3 --model_prefix=spm-2 --vocab_size=70000 --character_coverage=1.0 --model_type=word'

In [None]:
import sentencepiece as spm
# Train the tokenizer using the argument string assembled in `cmd`.
# NOTE(review): presumably this writes <model_prefix>.model and <model_prefix>.vocab
# into the working directory (later cells count *.vocab and load spm-N.model) — confirm.
spm.SentencePieceTrainer.Train(cmd)

In [None]:
class IntegerEncoder:
    """Encode the articles stored in CSV files as lists of integer ids.

    Every CSV row must have exactly three columns ``(index, title, contents)``;
    ``contents`` is split on tabs into segments (sentences).

    Args:
        filepaths: CSV file paths to read.
        options: dict with the keys

            * ``'model-type'`` -- ``'GloVe'``, ``'Word2Vec'``, or any other value
              to select the SentencePiece processor;
            * ``'inv_wv'`` -- token -> id mapping, used for ``'Word2Vec'``;
            * ``'corpus'`` -- object whose ``dictionary`` attribute maps
              token -> id, used for ``'GloVe'``;
            * ``'spm'`` -- object providing ``EncodeAsIds(str)``, used otherwise.
    """

    def __init__(self, filepaths, options):
        self.filepaths = filepaths

        self.model = options['model-type']
        self.inv_wv = options['inv_wv']
        self.corpus = options['corpus']
        self.sp = options['spm']

    def __iter_contents(self):
        """Yield, for every CSV row, its ``contents`` column split on tabs."""
        for path in self.filepaths:
            # newline="" is what the csv module expects; `with` guarantees the
            # file is closed even when a malformed row raises.
            with open(path, 'r', newline="", encoding="utf-8") as f:
                for _, _title, contents in csv.reader(f):
                    yield contents.split("\t")

    def __get_token_matrix(self):
        """Return one numpy array of whitespace-separated tokens per row."""
        return [np.array([token for sent in sentences for token in sent.split()])
                for sentences in self.__iter_contents()]

    def __get_line_list(self):
        """Return one string per row: its segments joined by single spaces."""
        return [' '.join(sentences) for sentences in self.__iter_contents()]

    def __glove_encoding(self, token_list):
        """Map tokens to ids via ``corpus.dictionary``, dropping unknown tokens."""
        dictionary = self.corpus.dictionary
        return [[dictionary[token] for token in line if token in dictionary]
                for line in token_list]

    def __sentencepiece_encoding(self, token_list):
        """Encode each line with the SentencePiece processor's ``EncodeAsIds``."""
        # No debug print of the whole corpus here: dumping it floods the notebook
        # output (see the saved "IOPub data rate exceeded" message).
        return [self.sp.EncodeAsIds(line) for line in token_list]

    def __word2vec_encoding(self, token_list):
        """Map tokens to ids via ``inv_wv``, dropping unknown tokens."""
        return [[self.inv_wv[token] for token in line if token in self.inv_wv]
                for line in token_list]

    def encoder(self):
        """Read all files and return a list with one id sequence per CSV row.

        The model type is compared by value (``==``); identity comparison with a
        string literal is not reliable.
        """
        if self.model == 'GloVe':
            return self.__glove_encoding(self.__get_token_matrix())
        if self.model == 'Word2Vec':
            return self.__word2vec_encoding(self.__get_token_matrix())
        # Any other model type: SentencePiece works on whole lines, so the
        # per-token matrix is not needed.
        return self.__sentencepiece_encoding(self.__get_line_list())
    
class Padding:
    """Pad variable-length sequences with Keras ``pad_sequences``."""

    def __init__(self, max_len = None):
        # Forwarded to pad_sequences as `maxlen`; None is passed through unchanged.
        self.max_len = max_len

    def padding(self, vec_list):
        """Return `vec_list` post-padded with empty strings, using a string dtype.

        NOTE(review): `tf` is not imported in the cells shown here; it must be
        brought into scope elsewhere before this method is called.
        NOTE(review): the pad value is "" with dtype 'str', while IntegerEncoder
        yields integer ids and pad_id is 0 in this notebook — confirm this is
        the intended input for this class.
        """
        pad_kwargs = dict(maxlen=self.max_len, padding='post', value="", dtype='str')
        return tf.keras.preprocessing.sequence.pad_sequences(vec_list, **pad_kwargs)

In [None]:
# Pick the highest-numbered model: the index is the current *.vocab count minus one.
# NOTE(review): presumably this matches the prefix used by the training cell, which
# numbers its output by the *.vocab count before training — confirm.
model_num = sum(1 for _ in iglob('**.vocab', recursive=False)) - 1

sp = spm.SentencePieceProcessor()
sp.Load('spm-{}.model'.format(model_num))

True

In [None]:
# Encoder settings: any 'model-type' other than 'GloVe'/'Word2Vec' makes
# IntegerEncoder use the processor stored under 'spm'.
options = {
    'model-type' : 'Sentence-Piece',
    'inv_wv' : None,
    'corpus' : None,
    'spm' : sp
}


def _encode_csv_dir(dir_path):
    """Integer-encode every CSV directly inside `dir_path`, visiting files in sorted order."""
    # glob yields files in arbitrary, filesystem-dependent order; sorting makes the
    # resulting list deterministic across runs and machines.
    csv_paths = sorted(iglob(os.path.join(dir_path, '**.csv'), recursive=False))
    return IntegerEncoder(options=options, filepaths=csv_paths).encoder()


# NOTE(review): pairing inputs with summaries by position assumes both folders use
# matching file names and the same row order — confirm.
input_encoded_list = _encode_csv_dir(PREPROCESSED_PATH)
output_encoded_list = _encode_csv_dir(SUMMARY_PREPROCESSED_PATH)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
def get_max_length(x):
    """Return the length of the longest element of `x`, as computed by ``np.max``."""
    lengths = list(map(len, x))
    return np.max(lengths)

# Number of ids in the longest encoded input sequence; the bare name on the last
# line displays the value as the cell output.
MAX_LEN = get_max_length(input_encoded_list)
MAX_LEN

250