# Import 

In [1]:
# https://towardsdatascience.com/neural-machine-translation-with-python-c2f0a34f7dd

import collections
import pandas as pd

import helper
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


# Load Data

In [220]:
# Paths of the two input CSV files; both are passed to all_data() in a later cell.
# NOTE(review): both values are the bare string '.csv' -- presumably the real
# file names were stripped before sharing; restore them before running.
abbrev = '.csv'
clean = '.csv'

In [227]:
def tokenize(x, num_words=3000, oov_token="<UKN>"):
    """
    Tokenize x
    :param x: List of sentences/strings to be tokenized
    :param num_words: Value passed to the Keras Tokenizer as ``num_words``
        (default 3000, the value that used to be hard-coded here)
    :param oov_token: Token the Tokenizer substitutes for words it does not
        know (default "<UKN>", the value that used to be hard-coded here)
    :return: Tuple of (tokenized x data, tokenizer used to tokenize x)
    """
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(x)

    return tokenizer.texts_to_sequences(x), tokenizer


In [228]:
def pad(x, length=None):
    """
    Bring every sequence in x to a common length, padding at the end.
    :param x: List of sequences.
    :param length: Target length. When None, the longest sequence in x sets the length.
    :return: Padded numpy array of sequences
    """
    padded = pad_sequences(x, padding='post', maxlen=length)
    return padded

In [229]:
def preprocess(x, y):
    """
    Tokenize and pad both sides of a parallel corpus.
    :param x: Feature List of sentences
    :param y: Label List of sentences
    :return: Tuple of (Preprocessed x, Preprocessed y, x tokenizer, y tokenizer)
    """
    x_sequences, x_tk = tokenize(x)
    y_sequences, y_tk = tokenize(y)

    padded_x = pad(x_sequences)
    padded_y = pad(y_sequences)

    # sparse_categorical_crossentropy needs the labels in 3 dimensions,
    # so append a trailing axis of size 1.
    padded_y = padded_y.reshape(padded_y.shape + (1,))

    return padded_x, padded_y, x_tk, y_tk

In [230]:
# Build the data container from the two paths; later cells read
# .abbrev_df and .clean_df from it.
# NOTE(review): all_data is not defined or imported in any visible cell --
# presumably it came from a removed cell or the `helper` module; confirm,
# otherwise this fails on a fresh kernel.
data = all_data(abbrev, clean) 

In [231]:
# Pull the two text columns out as plain Python lists. abbrev_lst becomes the
# model input (x) and clean_lst the labels (y) in the preprocess() call below.
abbrev_lst = list(data.abbrev_df['Original'])
clean_lst = list(data.clean_df['Label'])

In [232]:
# Tokenize and pad both sides; abbreviations are the features, clean text the labels.
preproc_abbrev, preproc_clean, abbrev_tokenizer, clean_tokenizer = preprocess(abbrev_lst, clean_lst)
    
# Axis 1 of each padded array is the (padded) sequence length.
max_abbrev_length = preproc_abbrev.shape[1]
max_clean_length = preproc_clean.shape[1]
# NOTE(review): these counts are len(word_index), i.e. every distinct word seen.
# They can exceed the num_words cap set in tokenize(), and they omit the +1
# that the model-building cell adds for id 0.
abbrev_vocab_size = len(abbrev_tokenizer.word_index)
clean_vocab_size = len(clean_tokenizer.word_index)

print('Data Preprocessed')
print("Max Abbreviation length:", max_abbrev_length)
print("Max Clean length:", max_clean_length)
print("Abbreviation vocabulary size:", abbrev_vocab_size)
print("Clean vocabulary size:", clean_vocab_size)

Data Preprocessed
Max Abbreviation length: 5
Max Clean length: 5
Abbreviation vocabulary size: 3103
Clean vocabulary size: 1162


In [234]:
def logits_to_text(preds_index, preds, prob_threshold, tokenizer, oov_index=1):
    """
    Turn per-timestep predictions into text using the tokenizer.

    Positions whose confidence is below ``prob_threshold`` are rendered as the
    word stored under ``oov_index`` instead of the predicted word.

    :param preds_index: Sequence of predicted word ids, one per timestep.
        It is read only; the caller's object is never modified.
    :param preds: Sequence of confidences aligned with ``preds_index``
    :param prob_threshold: Minimum confidence for a predicted word to be kept
    :param tokenizer: Keras Tokenizer fit on the labels (only ``word_index`` is read)
    :param oov_index: Word id substituted at low-confidence positions
        (default 1, the value that used to be hard-coded here)
    :return: String of the decoded words joined by single spaces
    """
    index_to_words = {index: word for word, index in tokenizer.word_index.items()}
    # Id 0 is not in word_index; it is what padding leaves behind.
    index_to_words[0] = '<PAD>'

    # Work on a copy so the caller's prediction array is left untouched.
    word_ids = list(preds_index)
    for position, confidence in enumerate(preds):
        if confidence < prob_threshold:
            word_ids[position] = oov_index

    return ' '.join(index_to_words[word_id] for word_id in word_ids)

In [235]:
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size, learning_rate=0.005):
    """
    Build and compile (but do not train) an RNN model using word embedding on x
    :param input_shape: Tuple of input shape; only input_shape[1:] is used
    :param output_sequence_length: Length of output sequence. Currently unused:
        every recurrent layer returns sequences, so the output has one step per input step.
    :param english_vocab_size: Size of the input vocabulary (number of embedding rows)
    :param french_vocab_size: Size of the output vocabulary (width of the final softmax)
    :param learning_rate: Learning rate handed to Adam (default 0.005, the
        value that used to be hard-coded here)
    :return: Keras model built and compiled, but not trained
    """
    model = Sequential()
    model.add(Embedding(english_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]))
    model.add(Bidirectional(GRU(256, return_sequences=True), input_shape=input_shape[1:]))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))

    # Compile model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

In [237]:
# Pad the inputs to the label sequence length, then flatten to a 2-D array
# whose second axis is that same length (preproc_clean carries a trailing
# axis of size 1, so shape[1] and shape[-2] are the same number).
tmp_x = pad(preproc_abbrev, preproc_clean.shape[1]).reshape((-1, preproc_clean.shape[-2]))

In [239]:
# Build the model. The second argument is the output sequence length, so pass
# the label length rather than the whole label array. Vocabulary sizes get +1
# because word ids start at 1 and id 0 is left for padding.
embed_rnn_model = embed_model(
    tmp_x.shape,
    preproc_clean.shape[1],
    len(abbrev_tokenizer.word_index) + 1,
    len(clean_tokenizer.word_index) + 1)

embed_rnn_model.summary()


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 5, 256)            794624    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 5, 512)            787968    
_________________________________________________________________
time_distributed_5 (TimeDist (None, 5, 1024)           525312    
_________________________________________________________________
dropout_3 (Dropout)          (None, 5, 1024)           0         
_________________________________________________________________
time_distributed_6 (TimeDist (None, 5, 1163)           1192075   
Total params: 3,299,979
Trainable params: 3,299,979
Non-trainable params: 0
_________________________________________________________________


In [240]:
# NOTE(review): these imports sit mid-notebook; ModelCheckpoint is already
# imported in the first cell. Consider moving them all to the top import cell.
from time import time
from keras.callbacks import TensorBoard
from keras.callbacks import EarlyStopping, ModelCheckpoint

# filename = 'cheez_translate_005.h5'
# checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# Training callbacks:
#   - EarlyStopping: stop once val_loss has not improved for 200 epochs.
#   - ModelCheckpoint: write the model to the .h5 file, keeping only the best val_loss.
#   - TensorBoard: log under logs/<current timestamp>, so each run gets its own directory.
callbacks = [EarlyStopping(monitor='val_loss', patience=200),
            ModelCheckpoint(filepath='1000_epochs_100_batch_patience_200_spell_check_12_10_19.h5', 
                            monitor='val_loss', 
                            save_best_only=True),
            TensorBoard(log_dir="logs/{}".format(time()))
            ]

# Train for up to 1000 epochs in batches of 100, holding out 10% of the
# samples for validation (the val_loss the callbacks monitor).
embed_rnn_model.fit(tmp_x, 
                    preproc_clean, 
                    batch_size=100, 
                    epochs=1000, 
                    validation_split=0.1,
                    callbacks=callbacks
                   )

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 3389 samples, validate on 377 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1

Epoch 112/1000
Epoch 113/1000
Epoch 114/1000
Epoch 115/1000
Epoch 116/1000
Epoch 117/1000
Epoch 118/1000
Epoch 119/1000
Epoch 120/1000
Epoch 121/1000
Epoch 122/1000
Epoch 123/1000
Epoch 124/1000
Epoch 125/1000
Epoch 126/1000
Epoch 127/1000
Epoch 128/1000
Epoch 129/1000
Epoch 130/1000
Epoch 131/1000
Epoch 132/1000
Epoch 133/1000
Epoch 134/1000
Epoch 135/1000
Epoch 136/1000
Epoch 137/1000
Epoch 138/1000
Epoch 139/1000
Epoch 140/1000
Epoch 141/1000
Epoch 142/1000
Epoch 143/1000
Epoch 144/1000
Epoch 145/1000
Epoch 146/1000
Epoch 147/1000
Epoch 148/1000
Epoch 149/1000
Epoch 150/1000
Epoch 151/1000
Epoch 152/1000
Epoch 153/1000
Epoch 154/1000
Epoch 155/1000
Epoch 156/1000
Epoch 157/1000
Epoch 158/1000
Epoch 159/1000
Epoch 160/1000
Epoch 161/1000
Epoch 162/1000
Epoch 163/1000
Epoch 164/1000
Epoch 165/1000
Epoch 166/1000
Epoch 167/1000
Epoch 168/1000
Epoch 169/1000
Epoch 170/1000
Epoch 171/1000
Epoch 172/1000
Epoch 173/1000
Epoch 174/1000
Epoch 175/1000
Epoch 176/1000
Epoch 177/1000
Epoch 178/

<keras.callbacks.callbacks.History at 0x141eeac10>

In [476]:
from keras.models import load_model

# Replace the in-memory model with one restored from disk.
# NOTE(review): 'abbrev_model.h5' is not the path the ModelCheckpoint callback
# in the training cell writes to ('1000_epochs_100_batch_patience_200_spell_check_12_10_19.h5')
# -- confirm which saved model is intended here.
embed_rnn_model = load_model('abbrev_model.h5')


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [477]:
# https://www.tensorflow.org/tensorboard/r2/tensorboard_in_notebooks
# Load the TensorBoard notebook extension
%load_ext tensorboard
# Point TensorBoard at the 'logs' directory, the parent of the per-run
# directories the TensorBoard training callback writes to.
%tensorboard --logdir logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 4166), started 2:59:34 ago. (Use '!kill 4166' to kill it.)

In [520]:
# Use abbrev_tokenizer here and clean tokenizer below
# Zero-filled (1, 5) buffer; a later cell copies the encoded query into it.
# The 5 matches the maximum sequence length printed by the preprocessing cell.
result = np.zeros((1,5))
result

# Encode a single query with the input-side tokenizer.
da = abbrev_tokenizer.texts_to_sequences(["Boat GRN ONIN"])
# !!! "GRN ONIN" is not in the label file, so it is generalizing at the character level
# NOTE(review): tokenize() does not enable character-level tokenization, so the
# "character level" claim above should be confirmed.
# Keep at most the first 5 token ids and wrap them as a batch of one.
da = pad(np.array([da[0][:5]]))
# da = da.astype(int)
da.shape

(1, 3)

In [526]:
# Show the token ids the abbreviation tokenizer produced for the query.
da

array([[   1, 2004, 1163]], dtype=int32)

In [528]:
# NOTE(review): `da` holds ids produced by abbrev_tokenizer, but they are decoded
# here with clean_tokenizer. The two tokenizers are fit independently in
# preprocess(), so their ids do not correspond -- confirm this mismatch is intended.
clean_tokenizer.sequences_to_texts(da)

['<UKN> <UKN> <UKN>']

In [521]:
# Copy the encoded query into the top-left corner of the zero buffer; the
# remaining positions stay 0.
result[:da.shape[0],:da.shape[1]] = da

In [531]:
# Model input: the encoded query zero-padded to 5 positions (these are input
# token ids, not predictions).
result

array([[1.000e+00, 2.004e+03, 1.163e+03, 0.000e+00, 0.000e+00]])

In [523]:
#  https://www.kaggle.com/hamishdickson/using-keras-oov-tokens
# <unk> can simply serve as a bucket for words we don't know and may want to add later.

# Take the first (only) sample of the prediction and keep the largest value
# along axis 1, giving one confidence per position.
preds = np.max(embed_rnn_model.predict(result)[0], 1)
preds

array([0.37835822, 1.        , 0.9999918 , 1.        , 1.        ],
      dtype=float32)

In [524]:
# Run the model once on the single query and derive, for every position,
# both the winning word id and the confidence attached to it.
query_probs = embed_rnn_model.predict(result)[0]
preds_index = np.argmax(query_probs, 1)
preds = np.max(query_probs, 1)

# Positions whose confidence is below this value are rendered as the OOV token
# by logits_to_text.
prob_threshold = .80

In [525]:
# Decode the predicted ids with the label-side tokenizer; positions whose
# confidence is under prob_threshold come out as the OOV token.
logits_to_text(preds_index,preds,prob_threshold, clean_tokenizer)

'<UKN> green onion <PAD> <PAD>'

In [456]:
# abbrev_tokenizer.word_index
# clean_tokenizer.word_index

In [78]:
# https://github.com/keras-team/keras/issues/9574
    
# Small demo of how num_words and oov_token interact: fit a Tokenizer capped
# at num_words=10 on one text, then print the full word_index and the encoding
# of a sentence containing a word ('elephant') that was never seen in fitting.
num_words=10
tokenizer = Tokenizer(num_words=num_words, oov_token="<UKN>")
tokenizer.fit_on_texts(["The quick brown fox jumps over the lazy dog. The lazy dog jumps over a frog."])
print(tokenizer.word_index)
print(tokenizer.texts_to_sequences(["the dog jumps over the elephant"]))

{'<UKN>': 1, 'the': 2, 'jumps': 3, 'over': 4, 'lazy': 5, 'dog': 6, 'quick': 7, 'brown': 8, 'fox': 9, 'a': 10, 'frog': 11}
[[2, 6, 3, 4, 2, 1]]


In [1]:
# https://github.com/Currie32/Spell-Checker/blob/master/SpellChecker.ipynb

letters = ['a','b','c','d','e','f','g','h','i','j','k','l','m',
           'n','o','p','q','r','s','t','u','v','w','x','y','z',]

def noise_maker(sentence, threshold, vocab_to_int=None):
    '''Relocate, remove, or add characters to create spelling mistakes.

    Each position of ``sentence`` is kept unchanged with probability
    ``threshold``. Otherwise one of three mistakes is made, each roughly
    equally likely: swap with the next element, insert a random lowercase
    letter before it, or drop it.

    :param sentence: Indexable sequence to corrupt (a string, or a list of ids)
    :param threshold: Probability in [0, 1] that an element is left untouched
    :param vocab_to_int: Optional mapping applied to an inserted letter before
        it is appended (use it when ``sentence`` holds ids rather than
        characters). When None the letter itself is inserted. The mapping is
        no longer read from a global of the same name, which is undefined here.
    :return: List with the (possibly corrupted) elements
    '''
    noisy_sentence = []
    i = 0
    while i < len(sentence):
        # Most characters will be correct since the threshold value is high
        if np.random.uniform(0, 1) < threshold:
            noisy_sentence.append(sentence[i])
        else:
            mistake = np.random.uniform(0, 1)
            # ~33% chance characters will swap locations
            if mistake > 0.67:
                if i < len(sentence) - 1:
                    # Swap order with the following character and skip past it
                    noisy_sentence.append(sentence[i + 1])
                    noisy_sentence.append(sentence[i])
                    i += 1
                # The last character has nothing to swap with, so it is not
                # typed. Falling through to the shared increment drops it
                # rather than re-rolling the dice for the same position.
            # ~33% chance an extra lower case letter will be added to the sentence
            elif mistake < 0.33:
                random_letter = str(np.random.choice(letters))
                if vocab_to_int is not None:
                    random_letter = vocab_to_int[random_letter]
                noisy_sentence.append(random_letter)
                noisy_sentence.append(sentence[i])
            # ~33% chance a character will not be typed: append nothing
        i += 1
    return noisy_sentence

In [2]:
# Two sample sentences used by the noise_maker check in the next cell.
training_sorted = ["the cow ate grass","the dog barked"]

In [1]:
# NOTE(review): numpy is already imported in the first cell; this repeat import
# is redundant.
import numpy as np

# Check to ensure noise_maker is making mistakes correctly.
# Each sentence is printed first, then its noisy version (a list of elements).
threshold = 0.9
# The [:5] slice takes at most five sentences; training_sorted currently holds two.
for sentence in training_sorted[:5]:
    print(sentence)
    print(noise_maker(sentence, threshold))

In [30]:
# Write big_df to 'bi_rnn_output.csv' in the working directory.
# NOTE(review): big_df is not defined in any visible cell -- presumably it came
# from a removed cell; this fails on a fresh kernel until that is restored.
big_df.to_csv('bi_rnn_output.csv')

In [None]:
# Rules: 
#     -- Give it the surrounding words for context; they don't have to be next to each other in the product name
#     -- Get rid of extra random stuff
#     -- separate words that are run together, e.g. 'HERSHEYSCHOCOLATE'
#     --can often just copy and paste bad text into google and it will fix it (using quality model on ~0.02 items)
#         --or could use google API: https://developers.google.com/custom-search/v1/overview?authuser=1
#         --custom search API key: <REDACTED> (never commit keys; read it from an environment variable, and revoke the key that was previously committed here)
#                 --https://github.com/googleapis/google-api-python-client
#     --don't need to do 'chkn' a bunch.  Need a balanced dataset.  Can do undersampling/oversampling also
    
# try active learning next...
# https://hackernoon.com/teach-seq2seq-models-to-learn-from-their-mistakes-using-deep-curriculum-learning-tutorial-8-a730a387754
