In [None]:
import os
# Directory that receives every artifact written by this notebook
# (max-length files, pickled vocabularies and .npy splits).
drive_path = 'D:/model/'
# exist_ok=True creates the directory only when needed and avoids the
# check-then-create race of a separate isdir() test.
os.makedirs(drive_path, exist_ok=True)

In [None]:
# encoding: utf-8
import unittest
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from os.path import isfile, join
import re
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer
import glob
import math

In [None]:
def find_first_sublist(seq, sublist, start=0):
    """Locate the first occurrence of `sublist` inside `seq` at or after `start`.

    Args:
        seq: Sequence to search; slices of it are compared to `sublist` with ``==``.
        sublist: The run of items to look for.
        start: Index at which the search begins.

    Returns:
        A ``(begin, end)`` tuple such that ``seq[begin:end] == sublist``, or
        ``None`` when no occurrence is found.
    """
    width = len(sublist)
    total = len(seq)
    position = start
    while position < total:
        stop = position + width
        if seq[position:stop] == sublist:
            return position, stop
        position += 1
    return None

def replace_sublist(seq, sublist, replacement):
    """Replace every occurrence of `sublist` in `seq` with `replacement`, in place.

    The search resumes right after each inserted replacement, so text that
    the replacement itself introduces is never matched again.

    Args:
        seq: Mutable list that is modified in place.
        sublist: Run of items to look for. An empty `sublist` is treated as
            "nothing to replace" and leaves `seq` untouched.
        replacement: Items substituted for each occurrence of `sublist`.

    Returns:
        None; the result is the mutated `seq`.
    """
    # An empty pattern matches at every position, which would make the loop
    # below spin forever on a non-empty seq.
    if len(sublist) == 0:
        return
    step = len(replacement)
    cursor = 0
    while True:
        found = find_first_sublist(seq, sublist, cursor)
        if found is None:
            break
        first, last = found
        seq[first:last] = replacement
        # Continue searching after the freshly inserted replacement.
        cursor = first + step
        
def replaceTAGS(x):
    """Merge split-up special tags back into single tokens, in place.

    Every consecutive triple ``'<', NAME, '>'`` in the token list `x`, for
    each NAME listed below, is collapsed into the single token ``'<NAME>'``.

    Args:
        x: List of string tokens; it is modified in place.

    Returns:
        The same list object `x`.
    """
    tag_names = (
        'NUM_INT', 'NUM_FLOAT', 'STRING',
        'BOC', 'EOC',
        'BOTM', 'BOT', 'EOT', 'BOM', 'EOM', 'EOTM',
        'CR',
    )
    for tag in tag_names:
        replace_sublist(x, ['<', tag, '>'], ['<' + tag + '>'])
    return x

def parseSentence(x):
    """Tokenize a string that mixes Latin-1 text with other Unicode characters.

    Characters are handled one at a time:
      * a character whose code point is above 255 becomes a token of its own;
      * characters with code point <= 255 are collected into a run that is
        later split by the regular-expression tokenizer;
      * a digit '0'-'9' that does not continue such a run becomes a token of
        its own, while a digit inside a run is left in the run.

    Args:
        x: The text to tokenize.

    Returns:
        The list of tokens after `replaceTAGS` has merged the special tags.
    """
    # NOTE(review): inside the character class, `+-\\` is parsed as a range
    # from '+' to '\', which is how '<' and '>' come to be matched as
    # single-character tokens (replaceTAGS relies on that). Change with care.
    tokenizer = RegexpTokenizer(r"[\w']+|[].,:!?;=+-\\*/@#$%^&_(){}~|\"[]")
    tokens = []
    # Run of code-point <= 255 characters waiting to be tokenized; it is
    # non-empty exactly while the scanner is inside such a run.
    pending = ""
    for ch in x:
        code = ord(ch)
        if code > 255:
            # A wide character ends any open run and stands alone.
            if pending:
                tokens.extend(tokenizer.tokenize(pending))
                pending = ""
            tokens.append(ch)
        elif pending:
            # Digits and other narrow characters both extend an open run.
            pending += ch
        elif 48 <= code <= 57:
            # Digit outside a run: emitted as a single-character token.
            tokens.append(ch)
        else:
            # Any other narrow character opens a new run.
            pending = ch

    if pending:
        tokens.extend(tokenizer.tokenize(pending))
    return replaceTAGS(tokens)

def readcode(fname):
    """Return the entire contents of the UTF-8 encoded text file `fname`.

    Args:
        fname: Path of the file to read.

    Returns:
        The file contents as a single string.
    """
    with open(fname, encoding="utf-8") as source:
        return source.read()

def outputsplit(txt): #txt: "<BOTM><BOT>32<EOT><BOM>XXX<EOM><EOTM>...."
    """Split an answer file into its error codes and message segments.

    Each ``<BOTM> ... <EOTM>`` record is expected to hold an integer between
    ``<BOT>`` and ``<EOT>``, followed by the message text up to ``<EOTM>``.

    Args:
        txt: Full text of the answer file.

    Returns:
        A pair ``(err_codes, messages)``: `err_codes` is a list of ints and
        `messages` is a tuple with the matching message strings, e.g.
        ``([32], ('<BOM>XXX<EOM>',))``. When `txt` holds no record the result
        is ``([], ())``.

    Raises:
        ValueError: If the text between ``<BOT>`` and ``<EOT>`` is not an integer.
    """
    records = re.findall(r"<BOTM> *<BOT>(.*?)<EOT>(.*?)<EOTM>", txt, re.DOTALL)
    if not records:
        # Nothing matched: return empty containers rather than failing on an
        # index into an empty zip result.
        return [], ()
    raw_codes, messages = zip(*records)  # ('32', ...), ('<BOM>XXX<EOM>', ...)
    err_codes = [int(code) for code in raw_codes]
    return err_codes, messages
    
def saveMaxLen(filename, data):
    """Write ``str(data)`` to `filename`, replacing any previous contents.

    Args:
        filename: Path of the text file to (over)write.
        data: Value to store; it is converted with ``str``.
    """
    text = str(data)
    # The with-block closes the file on exit.
    with open(filename, 'w') as out:
        out.write(text)

def saveTestTrainData(filename, data): # e.g., 'test.npy'
    """Store `data` in NumPy's binary ``.npy`` format at `filename`.

    Args:
        filename: Destination path; it is opened in binary write mode, so the
            name is used exactly as given.
        data: Array (or array-like) handed to ``np.save``.
    """
    with open(filename, 'wb') as sink:
        np.save(sink, data)

def saveDictionary(dt, file):
    """Pickle the object `dt` into the file at path `file`.

    Args:
        dt: Object to serialize (the callers in this notebook pass dictionaries).
        file: Destination path; any existing file is overwritten.
    """
    import pickle
    # The with-block closes the handle even when pickle.dump raises, so a
    # failed dump no longer leaks an open file.
    with open(file, "wb") as a_file:
        pickle.dump(dt, a_file)

In [None]:
class TestTranslate(unittest.TestCase):
    def __init__(self):
        self.source_token_dict = {
            '<PAD>': 0,
            '<BOC>': 1,
            '<EOC>': 2,
            '<CR>': 3,
            '<NUM_INT>': 4,
            '<NUM_FLOAT>': 5,
            '<STRING>': 6,
        }
        
        self.target_token_dict = {
            '<PAD>': 0,
            '<START>': 1,
            '<END>': 2,
            '<BOM>': 3,
            '<EOM>': 4,
        }

    @staticmethod
    def _build_token_dict(token_dict, token_list):
        for tokens in token_list:
            for token in tokens:
                if token not in token_dict:
                    token_dict[token] = len(token_dict)
        return token_dict

    def test_translate(self):
        Input_Path = "D:\\Proportional Augmentation (替換字串與數字)\\Input\\**\\*.txt"
        Output_Path = "D:\\Proportional Augmentation (替換字串與數字)\\Output\\**\\*.txt"
        
        #Input_Path = "D:\\Augmentation\\Input\\**\\*.txt"
        #Output_Path = "D:\\Augmentation\\Output\\**\\*.txt"
        
        in_path = sorted(glob.glob(Input_Path))
        out_path = sorted(glob.glob(Output_Path))
         
        source_max_len = 0
        target_max_len = 0
        data_num = 17640
        block_num = 17640
        for loop in range(0, math.ceil(data_num/block_num)):
            source_tokens = []
            target_errors=[]
            target_tokens = []     
            if data_num % block_num == 0: 
                dirs = block_num
            else:
                dirs = block_num if loop < data_num // block_num else data_num % block_num
            Input_fullpath = []
            Output_fullpath = []
            
            for i in range(dirs):
                Input_fullpath.append(in_path[loop*block_num + i])
                
            for f in Input_fullpath:
                if isfile(f):
                    source_tokens.append(parseSentence(readcode(f)))
        
            for i in range(dirs):
                Output_fullpath.append(out_path[loop*block_num + i])

            for f in Output_fullpath:
                if isfile(f):  
                    o1, o2 = outputsplit(readcode(f))#o1: list of error codes
                    o2 = "".join(o2)
                    ps = parseSentence(o2) ##<-----parse messages
                    target_errors.append(o1)
                    target_tokens.append(ps)
            
            source_tokens2 = []
            target_errors2 = []
            target_tokens2 = []

            THRESHOLD_FILE_LEN = 1000

            for i in range(len(source_tokens)):
              src = source_tokens[i]
              target_error = target_errors[i]
              target = target_tokens[i]
              if (len(src)<=THRESHOLD_FILE_LEN and  len(target)<=THRESHOLD_FILE_LEN):
                source_tokens2.append(src)
                target_errors2.append(target_error)
                target_tokens2.append(target)
            source_tokens = source_tokens2
            target_errors = target_errors2 #list of intgers, error types
            target_tokens = target_tokens2
     
            # Generate dictionaries
            self._build_token_dict(self.source_token_dict, source_tokens)
            self._build_token_dict(self.target_token_dict, target_tokens)
            target_token_dict_inv = {v: k for k, v in self.target_token_dict.items()}

            # Add special tokens
            encode_tokens = [tokens for tokens in source_tokens]
            decode_tokens = [['<START>'] + tokens for tokens in target_tokens]
            output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]
                           
            sl = max(map(len, encode_tokens))
            tl = max(map(len, decode_tokens))
            source_max_len = max(sl, tl, source_max_len)
            saveMaxLen(drive_path + "source_max_len.txt", source_max_len)
            target_max_len = max(sl, tl, target_max_len)
            saveMaxLen(drive_path + "target_max_len.txt", target_max_len)
         
        print("source_max_len:", source_max_len)
        print("target_max_len:", target_max_len)
        
        #ready to pad and save data
        for loop in range(0, math.ceil(data_num/block_num)):
            print("loop:", loop)
            source_tokens = []
            target_errors=[]
            target_tokens = []  
            if data_num % block_num == 0: 
                dirs = block_num
            else:
                dirs = block_num if loop < data_num // block_num else data_num % block_num
            Input_fullpath = []
            Output_fullpath = []
            
            for i in range(dirs):
                Input_fullpath.append(in_path[loop*block_num + i])
                
            for f in Input_fullpath:
                if isfile(f):
                    source_tokens.append(parseSentence(readcode(f)))
        
            for i in range(dirs):
                Output_fullpath.append(out_path[loop*block_num + i])

            for f in Output_fullpath:
                if isfile(f):  
                    o1, o2 = outputsplit(readcode(f))#o1: list of error codes
                    o2 = "".join(o2)
                    ps = parseSentence(o2) ##<-----parse messages
                    target_errors.append(o1)
                    target_tokens.append(ps)
            
            print("XXXX: " , len(source_tokens))
            print("YYYY: " , len(target_errors))
            print("ZZZZ: " , len(target_tokens))
            
            source_tokens2 = []
            target_errors2 = []
            target_tokens2 = []

            THRESHOLD_FILE_LEN = 1000

            for i in range(len(source_tokens)):
              src = source_tokens[i]
              target_error = target_errors[i]
              target = target_tokens[i]
              if (len(src)<=THRESHOLD_FILE_LEN and len(target)<=THRESHOLD_FILE_LEN):
                source_tokens2.append(src)
                target_errors2.append(target_error)
                target_tokens2.append(target)
            source_tokens = source_tokens2
            target_errors = target_errors2 #list of intgers, error types
            target_tokens = target_tokens2

            print("XXXX2: " , len(source_tokens)) #262 files
            print("YYYY2: " , len(target_errors)) #262 answers           
            print("ZZZZ2: " , len(target_tokens)) #262 answers           
            
            # Generate dictionaries
            self._build_token_dict(self.source_token_dict, source_tokens)
            self._build_token_dict(self.target_token_dict, target_tokens)
            target_token_dict_inv = {v: k for k, v in self.target_token_dict.items()}

            # Add special tokens
            encode_tokens = [tokens for tokens in source_tokens]
            decode_tokens = [['<START>'] + tokens for tokens in target_tokens]
            output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]
                    
            # Padding
            encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
            decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
            output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]

            encode_input = [list(map(lambda x: self.source_token_dict[x], tokens)) for tokens in encode_tokens]
            decode_input = [list(map(lambda x: self.target_token_dict[x], tokens)) for tokens in decode_tokens]
            decode_output2 = [list(map(lambda x: [self.target_token_dict[x]], tokens)) for tokens in output_tokens]
            
            print("source_token_dict len: ", len(self.source_token_dict))
            print("target_token_dict len: ", len(self.target_token_dict))
            print("target_token_dict_inv len: ", len(target_token_dict_inv))
            
            saveDictionary(encode_tokens, drive_path + 'source_token_dict.pickle')
            saveDictionary(self.target_token_dict, drive_path + 'target_token_dict.pickle')
            saveDictionary(target_token_dict_inv, drive_path + 'target_token_dict_inv.pickle')  
            
            saveDictionary(self.source_token_dict, drive_path + 'source_token_dict.pickle')
            saveDictionary(self.target_token_dict, drive_path + 'target_token_dict.pickle')
            saveDictionary(target_token_dict_inv, drive_path + 'target_token_dict_inv.pickle')   

            #print("encode_input", np.asarray(encode_input).shape) #(271, 798)
            #print("decode_input", np.asarray(decode_input).shape) #(271, 798)
            #print("decode_output2",  np.asarray(decode_output2).shape) #(271, 798, 1)
            #target errors: into 0/1 arrays from target_errors
            decode_output1 =[ [0]*36 for i in range(len(target_errors))]
            for i in range(len(target_errors)):
                    codes= target_errors[i]
                    for code in codes:  
                            decode_output1[i][code-1] = 1
            #print(decode_output1)

            x=list(zip(np.array(encode_input), np.array(decode_input)))
            y=list(zip(np.array(decode_output1), np.array(decode_output2))) #np.array(decode_output2)

            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

            x_test = list(zip(*x_test))
            x_test[0] = np.array(x_test[0])
            x_test[1] = np.array(x_test[1])

            y_test = list(zip(*y_test))
            y_test[0] = np.array(y_test[0]) #decode_output1
            y_test[1] = np.array(y_test[1]) #decode_output2
            #print(y_test[0].shape)
            #print(y_test[1].shape)

            #x=[np.array(encode_input * 1), np.array(decode_input * 1)] #(2, 271, 798)
            #y=np.array(decode_output2 * 1) #(271, 798, 1)     
            
            # x_train, x_test: [array, array ]
            # y_train, y_test: array     
            
            x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

            x_train = list(zip(*x_train))
            x_train[0] = np.asarray(x_train[0]) #encode_input
            x_train[1] = np.asarray(x_train[1]) #decode_input
            print(x_train[0].shape)
            print(x_train[1].shape)

            y_train = list(zip(*y_train))
            y_train[0] = np.asarray(y_train[0]) #decode_output1
            y_train[1] = np.asarray(y_train[1]) #decode_output2
            print(y_train[0].shape)
            print(y_train[1].shape)
            
            x_validation = list(zip(*x_validation))
            x_validation[0] = np.asarray(x_validation[0])
            x_validation[1] = np.asarray(x_validation[1])

            y_validation = list(zip(*y_validation))
            y_validation[0] = np.asarray(y_validation[0]) #decode_output1
            y_validation[1] = np.asarray(y_validation[1]) #decode_output2
            
            print("x_train[0] shape:", x_train[0].shape)
            print("x_validation[0] shape:", x_validation[0].shape)
            print("x_test[0] shape:", x_test[0].shape)
            
            saveTestTrainData(drive_path + "x_train[0]_" + str(loop) + ".npy", x_train[0])
            saveTestTrainData(drive_path + "x_train[1]_" + str(loop) + ".npy", x_train[1])
            saveTestTrainData(drive_path + "x_test[0]_" + str(loop) + ".npy", x_test[0])
            saveTestTrainData(drive_path + "x_test[1]_" + str(loop) + ".npy", x_test[1])
            saveTestTrainData(drive_path + "y_train[0]_" + str(loop) + ".npy", y_train[0])
            saveTestTrainData(drive_path + "y_train[1]_" + str(loop) + ".npy", y_train[1])
            saveTestTrainData(drive_path + "y_test[0]_" + str(loop) + ".npy", y_test[0])
            saveTestTrainData(drive_path + "y_test[1]_" + str(loop) + ".npy", y_test[1])
            saveTestTrainData(drive_path + "x_validation[0]_" + str(loop) + ".npy", x_validation[0])
            saveTestTrainData(drive_path + "x_validation[1]_" + str(loop) + ".npy", x_validation[1])
            saveTestTrainData(drive_path + "y_validation[0]_" + str(loop) + ".npy", y_validation[0])
            saveTestTrainData(drive_path + "y_validation[1]_" + str(loop) + ".npy", y_validation[1])
            
# Run the preprocessing directly by calling the method on an instance,
# rather than going through a unittest runner.
x=TestTranslate()
x.test_translate()