# ML n DL for Programmers
-------------------------------
### Session IV

## Improving input tensor - Embeddings
* One-hot encoding is not an ideal way to represent words/characters.
* One-hot vectors don't capture semantic relationships between words/characters.


## First method
* Use the `Embedding` layer provided by Keras.
* This will be **version 4.**

In [None]:
# Load training data
import gensim.downloader as api
from smart_open import smart_open

# With return_path=True the downloader hands back a filesystem path to the
# text8 corpus instead of a loaded dataset object.
text8_path = api.load("text8", return_path=True)
with smart_open(text8_path, 'rb') as corpus_file:
    # Decode every line and join once: growing a string with repeated `+=`
    # re-copies it on each step and can degrade to quadratic time.
    text8_data = "".join(line.decode('utf8') for line in corpus_file)
# Drop surrounding whitespace, then keep only the first 1,000,000 characters.
text8_data = text8_data.strip()[:1000000]
print(f'Lenght of Corpus: {len(text8_data)}')

# Prepare dictionaries
# Sorting the distinct characters gives each one a deterministic integer id.
chars = sorted(set(text8_data))
char_indices = {c: i for i, c in enumerate(chars)}  # character -> id
indices_char = dict(enumerate(chars))               # id -> character
print(f'unique chars: {len(chars)}')

In [None]:
import numpy as np
from tqdm import tqdm_notebook as tqdm

# Encode the corpus: swap every character for its integer id.
text8_data = [char_indices[symbol] for symbol in tqdm(text8_data)]

# Slice the encoded corpus into fixed-length windows, each paired with the
# id that immediately follows it.
SEQUENCE_LENGTH = 30  # ids per input window
STEP = 3              # offset between consecutive window starts
sentences, next_chars = [], []
window_starts = range(0, len(text8_data) - SEQUENCE_LENGTH, STEP)
for start in tqdm(window_starts):
    stop = start + SEQUENCE_LENGTH
    sentences.append(text8_data[start:stop])  # input window
    next_chars.append(text8_data[stop])       # target: the id right after it

# Convert both lists to numpy arrays.
sentences = np.array(sentences)
next_chars = np.array(next_chars)

# Quick sanity check of the prepared data.
print(f'number of training sentences: {len(sentences)}')
print(f'2nd sentence: {sentences[2]}')
print(f'char after 2nd sentence: {next_chars[2]}')
print(f'3rd sentence: {sentences[3]}')
print(f'shape of sentences: {sentences.shape}')
print(f'shape of next_chars: {next_chars.shape}')

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, BatchNormalization, Dropout

# Next-character classifier: integer ids -> learned embeddings -> stacked
# LSTMs -> dense head -> softmax over the character vocabulary.
model = Sequential([
    # Learn a 20-dimensional vector for each of the len(chars) character ids.
    Embedding(len(chars), 20, input_length=SEQUENCE_LENGTH, name='input_layer'),
    # return_sequences=True keeps the per-timestep outputs so the second
    # LSTM still receives a sequence.
    LSTM(150, return_sequences=True),
    BatchNormalization(),
    Dropout(0.3),
    LSTM(100),
    Dense(100, activation='relu'),
    Dropout(0.3),
    Dense(40, activation='relu'),
    BatchNormalization(),
    # One probability per known character.
    Dense(len(chars), activation='softmax', name='output_layer'),
])

model.summary()

In [None]:
import os
import pickle

from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.utils import to_categorical

# Stop training after 5 epochs without improvement; after 3 such epochs,
# first try shrinking the learning rate to 20% of its current value.
early_stop = EarlyStopping(patience=5)
reduce_lr = ReduceLROnPlateau(factor=0.2, patience=3, verbose=1)
callbacks = [early_stop, reduce_lr]

# Convert labels (integers to categorical data), basically one-hot encode labels
next_chars = to_categorical(next_chars, len(chars))

# Make sure the output directory exists *before* the long training run, so a
# missing or unwritable 'models/' folder fails fast instead of after training.
os.makedirs('models', exist_ok=True)

# Train
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(sentences, next_chars, validation_split=0.1, batch_size=64, epochs=50, callbacks=callbacks, shuffle=True)

# Save model and its history
model.save('models/predictive_keyboard_v4.h5')
# `pickle` was previously used without being imported (NameError after
# training); the context manager also guarantees the file is flushed and closed.
with open('models/history_pk_v4.p', 'wb') as history_file:
    pickle.dump(history.history, history_file)