# ML n DL for Programmers
-------------------------------
### Session IV

## Improving input tensor - Embeddings
* One-hot encoding is not an ideal way to represent words/characters.
* One-hot vectors don't capture semantic relationships between words/characters.


## First method
* Use the `Embedding` layer provided by Keras.

In [9]:
# Load training data
import gensim.downloader as api
from smart_open import smart_open

# Number of leading characters of the corpus kept for training.
CORPUS_LENGTH = 1_000_000

# Ask gensim's downloader for the path of the "text8" dataset file rather than
# a loaded corpus object (return_path=True).
text8_path = api.load("text8", return_path=True)

# Decode the file line by line and join the pieces in one go; growing a string
# with `+=` inside the loop can degrade to quadratic time.
# NOTE(review): `smart_open.smart_open` is deprecated in newer smart_open
# releases in favour of `smart_open.open` -- confirm against the installed version.
with smart_open(text8_path, 'rb') as file:
    text8_data = ''.join(line.decode('utf8') for line in file)

# Trim surrounding whitespace first, then keep only the leading slice so the
# slice length is not reduced by stripped characters.
text8_data = text8_data.strip()
text8_data = text8_data[:CORPUS_LENGTH]
print(f'Lenght of Corpus: {len(text8_data)}')

# Build the vocabulary: every distinct character of the corpus, in sorted
# order, plus the two lookup tables that map between characters and indices.
chars = sorted(set(text8_data))
char_indices = {symbol: position for position, symbol in enumerate(chars)}
indices_char = dict(enumerate(chars))
print(f'unique chars: {len(chars)}')

Lenght of Corpus: 1000000
unique chars: 27


In [13]:
from tqdm import tqdm_notebook as tqdm
# Replace every character of the corpus by its integer index from
# char_indices (tqdm only adds a progress bar around the iteration).
# NOTE: text8_data is rebound from a string to a list of ints here, so running
# this cell a second time would look up ints in char_indices and fail.
text8_data = [char_indices[symbol] for symbol in tqdm(text8_data)]

# Slice the encoded corpus into overlapping training windows.
# Each sample is SEQUENCE_LENGTH consecutive indices; its target is the index
# that immediately follows the window. Consecutive windows start STEP apart.
SEQUENCE_LENGTH = 30
STEP = 3

window_starts = range(0, len(text8_data) - SEQUENCE_LENGTH, STEP)
sentences, next_chars = [], []
for start in tqdm(window_starts):
    stop = start + SEQUENCE_LENGTH
    sentences.append(text8_data[start:stop])  # the input window
    next_chars.append(text8_data[stop])       # the element right after it

print(f'number of training sentences: {len(sentences)}')
print(f'2nd sentence: {sentences[2]}')
print(f'char after 2nd sentence: {next_chars[2]}')
print(f'3rd sentence: {sentences[3]}')

HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=333324), HTML(value='')))


number of training sentences: 333324
2nd sentence: [9, 19, 13, 0, 15, 18, 9, 7, 9, 14, 1, 20, 5, 4, 0, 1, 19, 0, 1, 0, 20, 5, 18, 13, 0, 15, 6, 0, 1, 2]
char after 2nd sentence: 21
3rd sentence: [0, 15, 18, 9, 7, 9, 14, 1, 20, 5, 4, 0, 1, 19, 0, 1, 0, 20, 5, 18, 13, 0, 15, 6, 0, 1, 2, 21, 19, 5]


In [15]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, BatchNormalization, Dropout

# Next-character prediction network.
# An Embedding layer maps each of the len(chars) indices to a 20-dimensional
# vector, two stacked LSTMs read the SEQUENCE_LENGTH-long window, and a small
# dense head ends in a softmax over the len(chars) possible characters.
model = Sequential([
    Embedding(len(chars), 20, input_length=SEQUENCE_LENGTH, name='input_layer'),
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(150, return_sequences=True),
    BatchNormalization(),
    Dropout(0.3),
    LSTM(100),
    Dense(100, activation='relu'),
    Dropout(0.3),
    Dense(40, activation='relu'),
    BatchNormalization(),
    Dense(len(chars), activation='softmax', name='output_layer'),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (Embedding)      (None, 30, 20)            540       
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 150)           102600    
_________________________________________________________________
batch_normalization_1 (Batch (None, 30, 150)           600       
_________________________________________________________________
dropout_1 (Dropout)          (None, 30, 150)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               100400    
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
__________

In [None]:
import os
import pickle

from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Callbacks: stop training after 5 epochs without improvement of the monitored
# quantity (Keras default), and multiply the learning rate by 0.2 after 3 such
# epochs.
early_stop = EarlyStopping(patience=5)
reduce_lr = ReduceLROnPlateau(factor=0.2, patience=3, verbose=1)
callbacks = [early_stop, reduce_lr]

# Train
# NOTE(review): input_sent and output_char are not defined in the cells shown
# here; presumably they are array versions of `sentences` and one-hot encoded
# `next_chars` ('categorical_crossentropy' expects one-hot targets) -- confirm.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(
    input_sent,
    output_char,
    validation_split=0.1,
    batch_size=64,
    epochs=50,
    callbacks=callbacks,
    shuffle=True,
)

# Save the model and its training history.
# Create the output directory first so saving does not fail on a fresh checkout.
os.makedirs('models', exist_ok=True)
model.save('models/predictive_keyboard_v2.h5')
# Use a context manager so the history file is flushed and closed.
with open('models/history_pk_v2.p', 'wb') as history_file:
    pickle.dump(history.history, history_file)