In [None]:
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

def cleaning(text):
    """
        Filter the text from punctuation and stopwords. Lemmatize the words in the text.

        Steps, in order:
          1. every punctuation character except the apostrophe is replaced by a space,
          2. the text is lower-cased and split on whitespace,
          3. each token is lemmatized with WordNetLemmatizer (no part-of-speech given),
          4. tokens found in NLTK's English stopword list are dropped.

        Args:
            text (str): raw input text.

        Returns:
            str: the remaining tokens joined by single spaces (no leading or
            trailing whitespace); an empty string if nothing remains.
    """
    # Strip punctuation on the whole text *before* tokenizing. Doing it per token
    # after the split leaves padding spaces inside tokens ("dogs," -> "dogs "),
    # which makes both the lemmatizer and the stopword lookup miss them.
    # The apostrophe is kept so contractions stay in one token.
    punct = string.punctuation.replace('\'', '')
    table = str.maketrans(punct, ' ' * len(punct))

    # Lower-case before lemmatizing: the WordNet lookup is case-sensitive, so
    # capitalized words would otherwise pass through unchanged.
    tokens = text.translate(table).lower().split()

    # lemmatize text
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # a set gives constant-time membership tests instead of scanning a list per token
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # tokens contain no whitespace, so a plain join already yields single spaces
    return ' '.join(tokens)

In [None]:
import tensorflow as tf

def _split_into_chunks(text, size=100, limit=3000):
    """Cut `text` into consecutive pieces of `size` characters.

    The incomplete tail (fewer than `size` characters) is dropped and at most
    `limit` pieces are returned.
    """
    return [text[i * size:(i + 1) * size] for i in range(len(text) // size)][:limit]


#load and clean the moliere complete dataset
# NOTE(review): no encoding is passed, so the platform default is used — confirm
# the encoding of the file and pass it explicitly.
with open('./data/moliere_complete.txt', 'r') as f:
    moliere = f.read()
clean_moliere = cleaning(moliere)
dataset_moliere = _split_into_chunks(clean_moliere)

#load and clean the shakespeare complete dataset
path_to_file = tf.keras.utils.get_file('shakespeare.txt',
                                       'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
# read through a context manager so the file handle is closed deterministically
with open(path_to_file, 'rb') as f:
    shakespeare = f.read().decode(encoding='utf-8')
clean_shakespeare = cleaning(shakespeare)
dataset_shakespeare = _split_into_chunks(clean_shakespeare)

In [None]:
import pickle as pkl

def _load_generated(path, length=100):
    """Load a pickled collection of generated texts and normalize it.

    Every text is passed through `cleaning` and cut to its first `length`
    characters; texts that are shorter than `length` after cleaning are dropped.

    Args:
        path (str): path of the pickle file holding an iterable of strings.
        length (int): exact number of characters every returned sample has.

    Returns:
        list[str]: cleaned samples, each exactly `length` characters long.
    """
    # NOTE(review): pickle.load can execute arbitrary code — only load files
    # produced by a trusted source.
    with open(path, 'rb') as f:
        samples = pkl.load(f)
    truncated = (cleaning(sample)[:length] for sample in samples)
    return [sample for sample in truncated if len(sample) == length]


# load and clear the generated datasets
lstm_moliere = _load_generated('./result/lstm_moliere.pkl')
lstm_shakespeare = _load_generated('./result/lstm_shakespeare.pkl')

# each transformer list is filtered from its own samples (previously the length
# filter iterated over the lstm_* lists and overwrote the transformer data)
transformer_moliere = _load_generated('./result/transformer_moliere.pkl')
transformer_shakespeare = _load_generated('./result/transformer_shakespeare.pkl')

In [None]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

def create_input_output(moliere, shakespeare):
    """Build the model inputs and one-hot labels for the two text collections.

    The samples are concatenated as `moliere + shakespeare`, tokenized at
    character level and padded/truncated to 100 tokens.

    Note: a new tokenizer is fitted on every call, so the character-to-index
    mapping of two separate calls is not guaranteed to be the same.

    Args:
        moliere (list[str]): samples labelled as class 0.
        shakespeare (list[str]): samples labelled as class 1.

    Returns:
        tuple: `(X, Y)` where `X` is the padded integer matrix with one row per
        sample and 100 columns, and `Y` is a float array of shape
        `(len(moliere) + len(shakespeare), 2)` with one-hot rows.
    """
    dataset = moliere+shakespeare

    # create labels: column 0 marks moliere rows, column 1 marks shakespeare rows
    Y = np.zeros([len(dataset), 2])
    n_moliere = len(moliere)
    Y[:n_moliere, 0] = 1
    # shakespeare rows start right after the moliere rows; using len(shakespeare)
    # as the offset is only correct when both lists have the same length
    Y[n_moliere:, 1] = 1

    # tokenize the text
    tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
    tokenizer.fit_on_texts(dataset)

    # create the data matrix, truncate it to 100 length
    x = tokenizer.texts_to_sequences(dataset)
    X = tf.keras.preprocessing.sequence.pad_sequences(x, maxlen=100)
    # number of distinct characters seen by the tokenizer
    print(len(tokenizer.word_index))

    return X, Y

# create the input matrices from the original datasets
X, Y = create_input_output(dataset_moliere, dataset_shakespeare)

# create the input matrices from the generated datasets
# for transformer experiments change to transformer_...
# NOTE(review): this call fits its own tokenizer inside create_input_output, so a
# character may be mapped to a different index than in X — confirm both mappings
# agree before mixing X_aug into the training data.
X_aug, Y_aug = create_input_output(lstm_moliere, lstm_shakespeare)

# fixed seed so that the train/validation/test partition is reproducible
SPLIT_SEED = 42

# train, validation and test splits
# 10% of all samples go to validation; 0.1*(1/0.9) of the remaining 90% is again
# 10% of all samples, which becomes the test set
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X, Y, test_size=0.1, random_state=SPLIT_SEED)
X_train, X_test, Y_train, Y_test = train_test_split(
    X_train, Y_train, test_size=0.1*(1/0.9), random_state=SPLIT_SEED)

# concatenate the generated datasets to the train
# (validation and test sets contain original samples only)
X_train = np.concatenate([X_train, X_aug])
Y_train = np.concatenate([Y_train, Y_aug])

In [None]:
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout
from tensorflow.keras.models import Sequential

# size of the english character vocabulary
# NOTE(review): hard-coded; it has to be at least the largest token index in the
# input matrices (create_input_output prints the tokenizer vocabulary size) —
# confirm 38 still holds for the current data.
vocab_size = 38

# define early stopping
callback = tf.keras.callbacks.EarlyStopping(monitor='val_acc', patience=5, restore_best_weights=True)


def build_model():
    """Return a new, uncompiled LSTM classifier with freshly initialised weights.

    Architecture: Embedding(vocab_size+1 -> 64) -> LSTM(128) -> Dropout(0.1)
    -> Dense(256, relu) -> Dropout(0.1) -> Dense(2, softmax).
    """
    net = Sequential()
    # +1 because index 0 is used for padding by pad_sequences
    net.add(Embedding(
        input_dim=vocab_size+1,
        output_dim=64,
        trainable=True))
    net.add(LSTM(units=128, name='lstm_layer_1'))
    net.add(Dropout(0.1))
    net.add(Dense(units=256, activation='relu', name='dense_layer_1'))
    net.add(Dropout(0.1))
    net.add(Dense(units=2, name='output_layer', activation='softmax'))
    return net


#print the average accuracy of 10 run
results = []
for i in range(10):
    # Build a new model and optimizer for every run. Re-fitting one model
    # instance would continue from the previous run's weights, so the runs
    # would not be independent and the average would be meaningless.
    model = build_model()

    #define optimizer function
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

    model.compile(loss = 'binary_crossentropy',
          optimizer =  optimizer,
          metrics   =  ['acc'])

    model.fit(x=X_train, y = Y_train, validation_data=(X_valid, Y_valid), epochs=20, callbacks = [callback])
    # evaluate returns [loss, acc]; keep the accuracy
    result = model.evaluate(X_test, Y_test)
    results.append(result[1])

print(sum(results)/len(results))