# In [1]:
import os
import csv
import numpy as np
import tensorflow as tf
from glob import iglob

# In [29]:
class IntegerEncoder:
    """Encode the text column of CSV files with a configurable encoder.

    Every CSV row must have exactly three columns; only the third one
    (``contents``) is used. Its value is split on tab characters into
    sentences, and each sentence is split on whitespace into tokens.

    Args:
        filepaths: Iterable of paths to the CSV files to read (UTF-8).
        options: Mapping that must provide the keys
            ``'model-type'`` (``'GloVe'``, ``'Word2Vec'``, or anything else
            to select the SentencePiece branch),
            ``'inv_wv'`` (token lookup used for ``'Word2Vec'``),
            ``'corpus'`` (object with a ``dictionary`` token lookup, used
            for ``'GloVe'``) and
            ``'spm'`` (object with an ``EncodeAsIds(text)`` method, used for
            the fallback branch).
    """

    def __init__(self, filepaths, options):
        self.filepaths = filepaths

        self.model = options['model-type']
        self.inv_wv = options['inv_wv']
        self.corpus = options['corpus']
        self.sp = options['spm']

    def __iter_contents(self):
        """Yield the tab-split ``contents`` column of every row of every file.

        Raises:
            ValueError: If a row does not have exactly three columns.
        """
        for path in self.filepaths:
            # ``with`` guarantees the handle is released even when a
            # malformed row raises in the middle of the file.
            with open(path, 'r', newline="\n", encoding="utf-8") as f:
                for _, _title, contents in csv.reader(f):
                    yield contents.split("\t")

    def __get_token_matrix(self):
        """Return one NumPy array of whitespace-separated tokens per row."""
        return [
            np.array([token for sent in content for token in sent.split()])
            for content in self.__iter_contents()
        ]

    def __get_line_list(self):
        """Return one string per row, with sentences joined by a space."""
        return [' '.join(content) for content in self.__iter_contents()]

    def __glove_encoding(self, token_list):
        """Map tokens through ``corpus.dictionary``, dropping unknown ones."""
        dictionary = self.corpus.dictionary
        return [[dictionary[token] for token in line if token in dictionary]
                for line in token_list]

    def __sentencepiece_encoding(self, token_list):
        """Encode each text line with ``spm.EncodeAsIds``."""
        return [self.sp.EncodeAsIds(line) for line in token_list]

    def __word2vec_encoding(self, token_list):
        """Map tokens through ``inv_wv``, dropping unknown ones."""
        return [[self.inv_wv[token] for token in line if token in self.inv_wv]
                for line in token_list]

    def encoder(self):
        """Read all files and return one encoded sequence per CSV row.

        Returns:
            A list with one entry per row. For ``'GloVe'`` and
            ``'Word2Vec'`` each entry is the list of looked-up values of the
            known tokens; for any other model type it is whatever
            ``spm.EncodeAsIds`` returns for the joined line.
        """
        # Compare by value: identity (``is``) only matches when both strings
        # happen to be the same interned object, so an equal model name built
        # at runtime would silently fall through to the SentencePiece branch.
        if self.model == 'GloVe':
            return self.__glove_encoding(self.__get_token_matrix())
        if self.model == 'Word2Vec':
            return self.__word2vec_encoding(self.__get_token_matrix())
        # The token matrix is not needed here, so the files are read once.
        return self.__sentencepiece_encoding(self.__get_line_list())
    
class Padding:
    """Pad a list of sequences to a common length with Keras.

    Args:
        max_len: Value forwarded as ``maxlen`` to ``pad_sequences``;
            ``None`` (the default) is passed through unchanged.
    """

    def __init__(self, max_len = None):
        self.max_len = max_len

    def padding(self, vec_list):
        """Return ``vec_list`` padded at the end with empty strings.

        The sequences are handed to
        ``tf.keras.preprocessing.sequence.pad_sequences`` with
        ``padding='post'``, ``value=""`` and ``dtype='str'``.
        """
        # NOTE(review): dtype='str' may produce a fixed-width string array
        # that truncates longer tokens; the Keras docs suggest dtype=object
        # for variable-length strings -- confirm before relying on the output.
        pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
        return pad_sequences(
            vec_list,
            maxlen=self.max_len,
            padding='post',
            value="",
            dtype='str',
        )