<a href="https://colab.research.google.com/github/asukul/Training/blob/master/runTDQN_adisak.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@markdown #**Check GPU type**
#@markdown ### Factory reset runtime if you don't have the desired GPU.

#@markdown ---




#@markdown V100 = Excellent (*Available only for Colab Pro Users*)

#@markdown P100 = Very Good

#@markdown T4 = Good

#@markdown K80 = Meh

#@markdown P4 = (Not Recommended) *for heavy A.I Models like COCO, WikiArt 1024, WikiArt 16384, FacesHQ or S-FLCKR*

#@markdown ---

!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-d734f33f-9566-6955-97b2-89c303e7b8d3)


In [2]:
#@markdown #**Anti-Disconnect for Google Colab**
#@markdown ## Run this to stop it from disconnecting automatically 
#@markdown  **(It will anyhow disconnect after 6 - 12 hrs for using the free version of Colab.)**
#@markdown  *(Colab Pro users will get about 24 hrs usage time)*
#@markdown ---

import IPython
js_code = '''
function ClickConnect(){
console.log("Working");
document.querySelector("colab-toolbar-button#connect").click()
}
setInterval(ClickConnect,60000)
'''
display(IPython.display.Javascript(js_code))

<IPython.core.display.Javascript object>

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
#!python "/content/gdrive/MyDrive/Colab Notebooks/tdqn.py"

In [5]:
# dqn with keras
# please install environment according to the requirements.txt first
# recommend using pycharm to install virtual environment

__author__ = "Lei Qi"
__email__  = "leiqi@iastate.edu"
__version__ = "0.0.1"
__title__ = 'DQN code for political documents (three public datasets and PAPTï¼‰'
__license__ = 'MIT License'
__copyright__ = 'Copyright 2020 Lei Qi'

import os, sys, codecs
from sys import exit
from time import sleep
import numpy as np
import keras
import keras.backend as K
from keras.layers import Input
from keras.models import Model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras import regularizers
from keras.models import model_from_json
import tensorflow as tf

import warnings

warnings.filterwarnings("ignore")

In [6]:
def getContent(filename):
    with open(filename, "r") as f:
        lines = f.readlines()
    return lines


def getContentUTF8(filename):
    with codecs.open(filename, "r", "utf-8") as f:
        lines = f.readlines()
    return lines


# function to write file with add mode
def writeFileAdd(filename, content):
    # print "Writing into file:  %s\n" % filename
    with open(filename, "a+") as f:
        f.write(content)

    # function to write file with default mode


def writeFileDefault(filename, content):
    # print("Writing into file:  %s\n" % filename)
    with open(filename, "w") as f:
        f.write(content)


# function to write file with add mode
def writeFileAddUTF8(filename, content):
    # print "Writing into file:  %s\n" % filename
    with codecs.open(filename, "a+", "utf-8") as f:
        f.write(content)

    # function to write file with default mode


def writeFileDefaultUTF8(filename, content):
    print("Writing into file:  %s\n" % filename)
    with codecs.open(filename, "w", "utf-8") as f:
        f.write(content)


def getFileNames(dir_path):
    files = getFilesPath(dir_path)
    fileNames = []
    for file in files:
        file = os.path.split(file)[1]
        fileNames.append(file)
    return fileNames


def getFilesPath(dir_path):
    filesPath = []
    for parent, dirnames, filenames in os.walk(dir_path):
        for filename in filenames[:]:
            full_filename = os.path.join(parent, filename)
            filesPath.append(full_filename)
    return filesPath


# decide the character is ANSCII
def isAnscii(word):
    for elem in word:
        if ord(elem) >= 128:
            return True
    return False


# filter out non-ANSCII words    
def deleteUnAnscii(sentence):
    tempList = []
    for elem in sentence.split(" "):
        if not isAnscii(elem):
            tempList.append(elem)
    return " ".join(tempList)


In [7]:
def load_data(data_path):
    data_set = []
    label_set = []

    filenames = getFileNames(data_path)
    filenames = [filename for filename in filenames if filename.endswith('.txt')]
    print(filenames)
    # exit()
    for i, filename in enumerate(filenames):
        # print (filename)
        lines = getContent(os.path.join(data_path, filename))

        data_set += lines
        # label = int(filename[:-4].replace('topic', ''))
        # print(label)
        # label_set += [int(filename[:-4].replace('topic', '').strip())] * len(lines)
        # label_set += [filename[:-4]] * len(lines)
        label_set += [i] * len(lines)

    data_set = [line.replace("\n", "") for line in data_set]
    shortestword = 3
    longestword = 20
    temp = []
    for line in data_set:
        line = deleteUnAnscii(line)
        line = " ".join([word for word in line.split() if longestword >= len(word) >= shortestword])
        temp.append(line)

    data_set[:] = temp[:]

    return data_set, label_set

In [8]:
def generate_tuplet(data, labels, tuplet_size):
    # data: original documents
    # labels: orginal labels for documents
    # tuplet_size: the number of documents in a tuplet

    data, labels = shuffle_data(data, labels)
    tuplets = []
    tuplet_labels = []

    for i in range(int(len(data) / tuplet_size)):
        start_id = i * tuplet_size
        end_id = start_id + tuplet_size

        tuplets.append(data[start_id: end_id])

        ratio = [0] * len(labels[0])
        # print(ratio)

        for elem in list(labels[start_id: end_id]):
            index = list(elem).index(1.0)

            ratio[index] = ratio[index] + 1.0 / tuplet_size

        # print(ratio, len(ratio))
        # exit()
        # for i in range(len(ratio)):
        # ratio[i] = ratio[i] + 1.0 * list(labels[start_id: end_id]).count(i) / tuplet_size

        # tuplet_labels.append(tf.convert_to_tensor(ratio))
        tuplet_labels.append(np.array(ratio))

    return tuplets, tuplet_labels

In [9]:
def shuffle_data(data, labels):
    shuffle_indices = np.random.permutation(np.arange(len(data)))
    data = data[shuffle_indices]
    labels = labels[shuffle_indices]

    return data, labels


# user defined kl_divergence function
def kl_divergence(p, q):
    eps = 0.000001
    # p = np.array(p)
    # q = np.array(q)
    p = np.array(K.eval(p))
    q = np.array(K.eval(q))

    value = 0

    value = round((q * np.log(q / (p + eps)).sum(), 5))
    return value


# using the builtin kl_divergence function    
def kl_divergence2(y_true, y_pred):
    # return keras.losses.kullback_leibler_divergence(y_true, y_pred)    # keras function
    return tf.keras.losses.KLD(y_true, y_pred)  # tensorflow function


def JSD_Loss(p, q):
    #m = 0.5 * (np.array(p) + np.array(q)) #tf 1
    m = 0.5 * (p + q)    # for tf2, we need using this line instead of above
    # compute the JSD Loss
    return 0.5 * (kl_divergence2(p, m) + kl_divergence2(q, m))


def cal_mae(array1, array2):
    s = 0.0
    for i in range(len(array1)):
        s += abs(array1[i] - array2[i])
    return s / len(array1)

In [10]:
def data_pre_test(folder):
    test_data_path = folder + '/' + 'test_data/'

    test_data, test_label = load_data(test_data_path)
    data = [document for document in test_data]
    labels = test_label

    # exit()
    from keras.preprocessing.text import one_hot
    from keras.preprocessing.text import text_to_word_sequence
    # define the document

    # estimate the size of the vocabulary
    words = set(text_to_word_sequence(' '.join(data)))
    vocab_size = len(words)
    print(vocab_size)

    # integer encode the document
    encode_data = []
    for i, document in enumerate(data):
        encode_data.append(one_hot(document, round(vocab_size)))

    encode_label = tf.keras.utils.to_categorical(labels, num_classes=None)

    return encode_data, encode_label

In [11]:
def orgnize_King_Congress(folder):
    lines = getContent('control10.txt')[1:]
    train, test = [], []
    train_label, test_label = [], []

    for line in lines:
        line = line.strip()
        filename = line.split('\t')[0]
        print(filename)
        label = line.split('\t')[1]
        test_or_train = line.split('\t')[2]
        page_num = int(filename.split('_')[2][0:4])
        if page_num % 2 == 0:
            test_or_train = 1
        else:
            test_or_train = 0

        file_lines = getContentUTF8(os.path.join(folder, filename))
        content = deleteUnAnscii(' '.join(file_lines))
        if test_or_train == 1:
            train.append(content)
            if label == '3':
                label = '1'
            if label == '2':
                label = '4'
            train_label.append(label)
        else:
            test.append(content)
            if label == '3':
                label = '1'
            if label == '2':
                label = '4'
            test_label.append(label)

    print(len(train))
    print(len(test))

    for name in list(set(train_label)):
        writeFileDefault(os.path.join('./data/concomb/train', name + '.txt'), '')
        writeFileDefault(os.path.join('./data/concomb/test', name + '.txt'), '')

    for i, content in enumerate(train):
        writeFileAdd(os.path.join('./data/concomb/train', train_label[i] + '.txt'), content + '\n')
        print(i)
    for i, content in enumerate(test):
        writeFileAdd(os.path.join('./data/concomb/test', test_label[i] + '.txt'), content + '\n')
        print(i)


In [12]:
def orgnize_King_Enron(folder):
    lines = getContent('controlenron.txt')[1:]
    train, test = [], []
    train_label, test_label = [], []

    for line in lines:
        line = line.strip()
        filename = line.split(',')[0]
        print(filename)
        label = line.split(',')[1]
        test_or_train = int(line.split(',')[2])

        file_lines = getContentUTF8(os.path.join(folder, filename))
        file_lines = [line.strip() for line in file_lines]

        content = deleteUnAnscii(' '.join(file_lines))
        index = content.find('Subject:')
        if index != -1:
            content = content[index + len('Subject: '):]
        else:
            pass
        if test_or_train == 1:
            train.append(content)
            train_label.append(label)
        else:
            test.append(content)
            test_label.append(label)

    print(len(train))
    print(len(test))
    print(test_label.count('11'), test_label.count('13'), test_label.count('14'), test_label.count('15'),
          test_label.count('16'))

    shuffle_indices = np.random.permutation(np.arange(len(train)))
    train = np.array(train)[shuffle_indices]
    train_label = np.array(train_label)[shuffle_indices]

    train, train_label = list(train), list(train_label)

    test += train[:300]
    test_label += train_label[:300]

    train = train[300:]
    train_label = train_label[300:]

    for name in list(set(train_label)):
        writeFileDefault(os.path.join('./enron/train', name + '.txt'), '')
        writeFileDefault(os.path.join('./enron/test', name + '.txt'), '')

    for i, content in enumerate(train):
        writeFileAdd(os.path.join('./enron/train', train_label[i] + '.txt'), content + '\n')
        print(i)
    for i, content in enumerate(test):
        writeFileAdd(os.path.join('./enron/test', test_label[i] + '.txt'), content + '\n')
        print(i)

In [13]:
def orgnize_King_Immigraion(folder):
    lines = getContent('controlimmig.txt')[1:]
    train, test = [], []
    train_label, test_label = [], []

    for line in lines:
        line = line.strip()
        filename = line.split(',')[0]
        print(filename)
        label = line.split(',')[1]
        test_or_train = int(line.split(',')[2])

        file_lines = getContent(os.path.join(folder, filename))

        file_lines = [line.strip() for line in file_lines]

        content = deleteUnAnscii(' '.join(file_lines))
        index = content.find('All Rights Reserved')
        if index != -1:
            content = content[index + len('All Rights Reserved.'):]
        else:
            pass

        content = deleteUnAnscii(' '.join(file_lines))
        if test_or_train == 1:
            train.append(content)
            if label == '3':
                label = '1'

            train_label.append(label)
        else:
            test.append(content)
            if label == '3':
                label = '1'
            test_label.append(label)

    print(len(train))
    print(len(test))

    for name in list(set(train_label)):
        writeFileDefault(os.path.join('./immigration/train', name + '.txt'), '')
        writeFileDefault(os.path.join('./immigration/test', name + '.txt'), '')

    for i, content in enumerate(train):
        writeFileAdd(os.path.join('./immigration/train', train_label[i] + '.txt'), content + '\n')
        print(i)
    for i, content in enumerate(test):
        writeFileAdd(os.path.join('./immigration/test', test_label[i] + '.txt'), content + '\n')
        print(i)

In [14]:
# generating training data for each epoch
def generate_data(X_train, y_train, tuplet_size):
    tuplets, tuplet_labels = generate_tuplet(X_train, y_train, tuplet_size)
    tuplet_labels = np.array(tuplet_labels)
    print('tuplet_labels.shape', tuplet_labels.shape)
    print('len(tuplets)', len(tuplets))

    data = []
    for i in range(tuplet_size):
        data.append([])

    for tuplet in tuplets:

        for index in range(tuplet_size):
            data[index].append(tuplet[index])

    print('len(data), len(data[0]), len(data[0][0])', len(data), len(data[0]), len(data[0][0]))
     # for tf2 version
    for i in range(tuplet_size):
        data[i] = np.array(data[i])
        
    return data, tuplet_labels

In [15]:
def data_pre(folder):
    train_data_path = folder + '/' + 'train_data/'

    train_data, train_label = load_data(train_data_path)

    data = [document for document in train_data]
    labels = train_label
    # labels = [tf.convert_to_tensor(label) for label in labels]
    # print(len(list(set(labels))))
    # print(list(set(labels)))
    # exit()
    from keras.preprocessing.text import one_hot
    from keras.preprocessing.text import text_to_word_sequence
    # define the document

    # estimate the size of the vocabulary
    words = set(text_to_word_sequence(' '.join(data)))
    vocab_size = len(words)
    print("number of words:", vocab_size)

    max_word_per_doc = 0
    # integer encode the document
    encode_data = []
    for i, document in enumerate(data):
      encode_data.append(one_hot(document, round(vocab_size)))
      ntokens = len(document.split())

      if max_word_per_doc < ntokens:
          max_word_per_doc = ntokens
    
    print("max_#words_per_doc: ", max_word_per_doc)
    #print(encode_data[:5])
    #print(train_label[:5])
    # print(len(encode_data))
    
    encode_label = tf.keras.utils.to_categorical(labels, num_classes=None)
    # encode_label = labels
    # print(encode_label[-5:])
    # print(len(list(encode_label[0])))

    return encode_data, encode_label, vocab_size, max_word_per_doc

In [16]:
#def train_rand_king_enron(folder, num_classes):
#Pak changed the header to pass the parameters
def train_rand_king(datasetname=None, folder=None, num_classes=None, model_dir=None, ckptfolder=None, 
               tuplet_size = 20, epochs=200, max_length=200,
               embedding_vecor_length = 150):
  
    X_train, y_train, vocab_size, max_wordsperdoc = data_pre(folder)
    # truncate and pad input sequences

    X_train = sequence.pad_sequences(X_train, maxlen=max_length)

    tweets = []
    for index in range(tuplet_size):
        tweets.append(Input(shape=(max_length,)))

    # only keep the top n words 
    # top_words is an integer
    top_words = vocab_size

    # create the model
    #embedding_vecor_length = 150
    shared_embedd = Embedding(top_words, embedding_vecor_length, input_length=max_length)

    tweets_embedd = []
    for index in range(tuplet_size):
        tweets_embedd.append(shared_embedd(tweets[index]))

    print(len(tweets_embedd), tweets_embedd[0].shape)

    # reuse the same layer layer
    #shared_lstm = LSTM(128, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=regularizers.l2(0.02),
    #                   bias_regularizer=regularizers.l2(0.02))
    #Pak changed from 128 to 64 to match the paper.
    shared_lstm = LSTM(64, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=regularizers.l2(0.02),
                       bias_regularizer=regularizers.l2(0.02))
    
    # concatenate the vectors
    features = []
    for index in range(tuplet_size):
        features.append(shared_lstm(tweets_embedd[index]))

    # for the NN Method
    features_NN = []
    shared_NN = Dense(256, activation='relu')
    for index in range(tuplet_size):
        features_NN.append(shared_NN(features[index]))

    '''
    computes the maximum (element-wise) a list of inputs.
    It takes as input a list of tensors,
    all of the same shape, and returns
    a single tensor (also of the same shape).
    
    '''
    merged_vector = keras.layers.maximum(features)  #
    # merged_vector = keras.layers.concatenate(features, axis=-1)

    features = Dense(256, activation='relu')(merged_vector)
    features = Dropout(0.5)(features)

    predictions = Dense(num_classes, activation='softmax')(features)

    # define a trainable model linking inputs to the predictions
    model = Model(inputs=tweets, outputs=predictions)
    opt = tf.keras.optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, amsgrad=True)

    model.compile(loss=JSD_Loss, optimizer=opt)

    print(model.summary())

    ckptfolder = os.path.join(model_dir, ckptfolder)

    if not os.path.exists(ckptfolder):
      os.makedirs(ckptfolder)

    # writeFileDefault('result__enron.txt', '')
    for epoch in range(epochs):
        print('epoch ', epoch)
        data, labels = generate_data(X_train, y_train, tuplet_size)
        #checkpoint_path = os.path.join(ckptfolder, "cp-{epoch:04d}")

      # Create a callback that saves the model's weights
        
        #Pak tried
        #cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
        #                                         save_weights_only=True,
        #                                         verbose=1)
        #

        model.fit(data, labels, epochs=1, batch_size=8, verbose=1)

        #Pak modified to save the checkpoints.
        #model.fit(data, labels, epochs=1, batch_size=8, verbose=1, callbacks=[cp_callback])
    
        #only save the model at the end
        model_json = model.to_json()
        model_json_filename = "model_" + datasetname + ".json"
        with open(os.path.join(model_dir, model_json_filename), "w") as json_file:
            json_file.write(model_json)

        # serialize weights to HDF5
        model_name = "model_" + datasetname + ".h5"
        model.save_weights(os.path.join(model_dir, model_name))
        print("epoch:", epoch, "saved ", model_name)
    #Pak added to return the maximum legnth used for training
    return max_length

In [17]:
def test_rand_king(folder, num_classes, modeldir, jsonfilename, model_filename, resultfile,
                         tuplet_size=100, epochs=100, max_length=200):
    # load json and create model
    opt = tf.keras.optimizers.Adam(lr=0.00001, beta_1=0.9, beta_2=0.999, amsgrad=True)
    jsonfilename = os.path.join(modeldir, jsonfilename)
    json_file = open(jsonfilename, 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)

    # load weights into new model
    model_filename = os.path.join(modeldir, model_filename)
    loaded_model.load_weights(model_filename)
    print("Loaded ", model_filename)

    # evaluate loaded model on test data
    loaded_model.compile(loss=JSD_Loss, optimizer=opt)
    loaded_model.summary()

    X_test, y_test = data_pre_test(folder)
    # truncate and pad input sequences

    X_test = sequence.pad_sequences(X_test, maxlen=max_length)

    maes = []
    # writeFileDefault('result__enron.txt', '')

    truth = get_truth(folder)
    resultfile = os.path.join(modeldir, resultfile)


    pred_ratio = np.array([0.0] * num_classes)
    for k in range(epochs):
        tmp_pred_ratio = np.array([0.0] * num_classes)
        tuplets, tuplet_labels = generate_tuplet(X_test, y_test, tuplet_size)
        tuplet_labels = np.array(tuplet_labels)
        print('tuplet_labels.shape', tuplet_labels.shape)
        print('len(tuplets)', len(tuplets))

        data = []
        for i in range(tuplet_size):
            data.append([])

        for tuplet in tuplets:

            for index in range(tuplet_size):
                data[index].append(tuplet[index])

            # print('len(data), len(data[0]), len(data[0][0])', len(data), len(data[0]), len(data[0][0]))
       
        for i in range(tuplet_size):
            data[i] = np.array(data[i])
        
        predictions = loaded_model.predict(data)

        for p in predictions:
            tmp_pred_ratio += p / len(predictions)

        print('epoch:', k)

        writeFileAdd(resultfile, str(list(tmp_pred_ratio)) + '\n')

        pred_ratio = pred_ratio + tmp_pred_ratio

    pred_ratio = pred_ratio / epochs

    #write the results to a file

    print('truth: ', truth)
    writeFileAdd(resultfile, "truth:\n")
    writeFileAdd(resultfile, str(truth) + '\n')
  
    print("pred_ratio", pred_ratio)
    writeFileAdd(resultfile, str(list(pred_ratio)) + '\n')
    from sklearn.metrics import mean_squared_error
    result = mean_squared_error(truth, list(pred_ratio))
    print("result:mse", result)
    writeFileAdd(resultfile, 'result: mse: ' + str(result) + '\n')
    

In [18]:
def get_truth(folder):
    test_data_path = folder + '/' + 'test_data/'

    test_data, test_label = load_data(test_data_path)

    y_test = test_label

    y_test = list(y_test)

    # y_test = sorted(y_test)
    print(y_test)
    labels = sorted(list(set(y_test)))
    truth = []
    for l in labels:
        truth.append(y_test.count(l) * 1.0 / len(y_test))
    print(truth)
    # exit()
    return truth

In [19]:
def train_model_plus(datafolder, model_dir, jsonfilename, model_filename, tuplet_size=100, epochs=100, max_length=200, last_epoch=0):
    if not model_dir:
        print('wrong training data directory or number of classes or model directory provided.')
        exit()

    # load json and create model
    opt = tf.keras.optimizers.Adam(lr=0.00001, beta_1=0.9, beta_2=0.999, amsgrad=True)
    jsonfilename = os.path.join(model_dir, jsonfilename)
    json_file = open(jsonfilename, 'r')
    loaded_model_json = json_file.read()
    json_file.close()

    model = tf.keras.models.model_from_json(loaded_model_json)
    # load weights into new model
    model_filename = os.path.join(model_dir, model_filename)
    model.load_weights(model_filename)
    print("Loaded model from directory: ", model_filename)
    print(model.summary())

    X_train, y_train, vocab_size = data_pre(datafolder)
    # truncate and pad input sequences
  
    X_train = sequence.pad_sequences(X_train, maxlen=max_length)

    model.compile(loss=JSD_Loss, optimizer=opt)
    for epoch in range(last_epoch, epochs):
        print('epoch ', epoch)
        data, labels = generate_data(X_train, y_train, tuplet_size)
        model.fit(data, labels, epochs=1, batch_size=8, verbose=1)
        model_json = model.to_json()

        with open(jsonfilename, "w") as json_file:
            json_file.write(model_json)

        # serialize weights to HDF5
        
        model.save_weights(model_filename)
        print("Saved model to directory: ", model_filename)

In [20]:
if __name__ == '__main__':
    model_dir = '/content/gdrive/MyDrive/TDQN/models'
    # first you need preprocess the orginal king's data
    # orgnize_King_Congress('./concomb')
    #orgnize_King_Enron('./enrontexts')
    # orgnize_King_Immigraion('./immigtexts')

    # folder = './congress'
    # train_rand_king_congress(folder, 2)
    # test_rand_king_congress(folder)
  
    
    
    # get_truth(folder)

    #train_rand_king_enron(folder, 5) 

    datafolder = '/content/gdrive/MyDrive/TDQN/data/README/enron'
    ckptfolder = 'checkpoints-enron'
    jsonfilename='model_enron.json'
    model_filename = 'model_enron.h5'
    resultfile = 'result__enron.txt'
    enron_epochs =11
    enron_tuplet_size = 100 

    #default, but get adjusted after we process the document
    enron_avg_nwords = 200
    
    train_rand_king("enron", datafolder, 5, model_dir, ckptfolder,
               tuplet_size = enron_tuplet_size, epochs= enron_epochs, 
               max_length= enron_avg_nwords, embedding_vecor_length = 150)

    print("max_length for enron:", enron_avg_nwords)
    
    #train_model_plus(datafolder, model_dir, jsonfilename, model_filename, tuplet_size=10, epochs=20, max_length=enron_max_length, last_epoch=10)

    #test_rand_king_enron(folder, 5)
    """ 
    test_rand_king(datafolder, 5, model_dir, jsonfilename, model_filename, resultfile, 
                   tuplet_size=enron_tuplet_size, epochs=enron_epochs, max_length=enron_avg_nwords)
    """
    
    print("\n\ndone...\n\n")

['11.txt', '16.txt', '14.txt', '13.txt', '15.txt']
number of words: 30109
max_#words_per_doc:  30014
100 (None, 200, 150)
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 200)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 200)]        0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 200)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 200)]        0           []    