In [1]:
import numpy as np
import pandas as pd
import gc
from nltk import FreqDist
import time
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, Embedding, Bidirectional
from keras.optimizers import RMSprop
from keras.layers.recurrent import LSTM
from utils import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
input_vocab_size = 250
target_vocab_size = 346    # 1000 in full dataset
# target_vocab_size = 1000
num_samples = 200000
context_size = 3
padding_entity = [0]
self_sil_retention_percent = 0.5
X_seq_len = 60
y_seq_len = 20
hidden = 256
layers = 2
epochs = 5    # used just 1 for full dataset
batch_size = 128
val_split = 0.1
learning_rate = 0.1

In [3]:
# Compiling model before loading any data (some GPUs fail to compile if data sets are large)
model = Sequential()

# Creating encoder network
model.add(Embedding(input_vocab_size+2, hidden, input_length=X_seq_len, mask_zero=True))
print('Embedding layer created')
model.add(Bidirectional(LSTM(hidden, return_sequences = True), merge_mode = 'concat'))
model.add(Bidirectional(LSTM(hidden, return_sequences = True), merge_mode = 'concat'))
model.add(Bidirectional(LSTM(hidden), merge_mode = 'concat'))
model.add(RepeatVector(y_seq_len))
print('Encoder layer created')

# Creating decoder network
for _ in range(layers):
    model.add(LSTM(hidden, return_sequences=True))
model.add(TimeDistributed(Dense(target_vocab_size+1)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print('Decoder layer created')

# checking the model summary
model.summary()

Embedding layer created
Encoder layer created
Decoder layer created
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 60, 256)           64512     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 60, 512)           1050624   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 60, 512)           1574912   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 512)               1574912   
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 20, 512)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 20, 256)           787456    
_________________________________________________________________
lstm_5 (

In [4]:
start = time.time()
# Load training data
X_train_data = pd.read_csv("en_train.csv")
X_train_data['before'] = X_train_data['before'].apply(str)
X_train_data['after'] = X_train_data['after'].apply(str)

# Class counts
# DATE = 258,348
# LETTERS = 152,795
# CARDINAL = 133744
# VERBATIM - has lots of special symbols - 78108
# MEASURE = 14783
# MONEY = 6128

print('Training data loaded in {0} s.'.format(time.time()-start))
print(X_train_data.shape)
X_train_data = X_train_data.iloc[:num_samples]
print(X_train_data.shape)
X_train_data.head()

Training data loaded in 10.44523811340332 s.
(9918441, 5)
(200000, 5)


Unnamed: 0,sentence_id,token_id,class,before,after
0,0,0,PLAIN,Brillantaisia,Brillantaisia
1,0,1,PLAIN,is,is
2,0,2,PLAIN,a,a
3,0,3,PLAIN,genus,genus
4,0,4,PLAIN,of,of


In [5]:
start = time.time()

# Create vocabularies
# Target vocab
y = list(np.where(X_train_data['class'] == "PUNCT", "sil.",
      np.where(X_train_data['before'] == X_train_data['after'], "<self>",
               X_train_data['after'])))

y = [token.split() for token in y]
dist = FreqDist(np.hstack(y))
temp = dist.most_common(target_vocab_size-1)
temp = [word[0] for word in temp]
temp.insert(0, 'ZERO')
temp.append('UNK')

target_vocab = {word:ix for ix, word in enumerate(temp)}
target_vocab_reversed = {ix:word for word,ix in target_vocab.items()}

# Input vocab
X = list(X_train_data['before'])
X = [list(token) for token in X]

dist = FreqDist(np.hstack(X))
temp = dist.most_common(input_vocab_size-1)
temp = [char[0] for char in temp]
temp.insert(0, 'ZERO')
temp.append('<norm>')
temp.append('UNK')

input_vocab = {char:ix for ix, char in enumerate(temp)}

gc.collect()

print('Vocab created in {0} s.'.format(time.time()-start))

Vocab created in 3.219439744949341 s.


In [6]:
start = time.time()

# Converting input and target tokens to index values
X = index(X, input_vocab)
y = index(y, target_vocab)

gc.collect()

print('Replaced tokens with integers in {0} s.'.format(time.time()-start))

Replaced tokens with integers in 0.4380378723144531 s.


In [7]:
start = time.time()

# Adding a context window of 3 words in Input, with token separated by <norm>
X = add_context_window(X, context_size, padding_entity, input_vocab)

print('Added context window to X in {0} s.'.format(time.time()-start))

Added context window to X in 0.8442680835723877 s.


In [8]:
start = time.time()

X = batch_wise_padding(X, X_seq_len) # Padding
y = batch_wise_padding(y, y_seq_len)

# Convert X_test to integer array, batch-wise (converting full data to array at once takes a lot of time)
X = array_batchwise(X, X_seq_len)
y = array_batchwise(y, y_seq_len)

print('Added padding and converted to np array in {0} s.'.format(time.time()-start))

Added padding and converted to np array in 3.2257697582244873 s.


In [9]:
start = time.time()

y_sequences = sequences(y, y_seq_len, target_vocab)

print('Fitting model...')

#Fitting the model on the validation data with batch size set to 128 for a total of 10 epochs:
history_full_data = model.fit(np.asarray(X), np.asarray(y_sequences), batch_size=batch_size, epochs=epochs, verbose=1)

Fitting model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
model.save_weights('text_norm_model_full.hdf5')

In [None]:
from matplotlib import pyplot as plt 

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model train vs validation loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [12]:
# Load weights from training
model.load_weights('text_norm_model.hdf5')

In [19]:
# Prepare test data in the right format
X_test_data = pd.read_csv("en_test.csv")
X_test_data['before'] = X_test_data['before'].apply(str)
X_test_data.head()

Unnamed: 0,sentence_id,token_id,before
0,0,0,Another
1,0,1,religious
2,0,2,family
3,0,3,is
4,0,4,of


In [22]:
X_test_data = X_test_data.iloc[:10000]
X_test = list(X_test_data['before'])
X_test = [list(token) for token in X_test]

X_test = index(X_test, input_vocab) # Convert to integer index
X_test = add_context_window(X_test, context_size, padding_entity, input_vocab) # Add context window
X_test = batch_wise_padding(X_test, X_seq_len) # Padding

# Convert X_test to integer array, batch-wise (converting full data to array at once takes a lot of time)
X_test = array_batchwise(X_test, X_seq_len)

In [23]:
# Make predictions
# Predicting for 1000 sequences at a time
for i in range(0, len(X_test), 10000):
    if i + 10000 >= len(X_test):
        i_end = len(X_test)
    else:
        i_end = i + 10000
    X_test_small = X_test[i:i_end]
    print('Predictions done for {}/{} samples '.format(i, len(X_test)))
    test_predictions = np.argmax(model.predict(np.asarray(X_test_small), batch_size = 64, verbose=1), axis=2)

predicted_test_sequences = []
for prediction in test_predictions:
    sequence = ' '.join([target_vocab_reversed[index] for index in prediction if index > 0])
    predicted_test_sequences.append(sequence)
np.savetxt('test_result', predicted_test_sequences, fmt='%s')

Predictions done for 0/10000 samples 
