# ML n DL for Programmers
-------------------------------
### Session IV

## Improving input tensor - Embeddings
![embedding and nn](images/embeddingandNN.jpeg)
* One-hot encoding is not an ideal way to represent words/characters.
* It doesn't capture semantic relationships between them.


## First method
* Use embedding layer provided by keras.
* This will be **version 4.**

In [None]:
# Load training data
import gensim.downloader as api
from smart_open import smart_open

# Download (or reuse the cached copy of) the text8 corpus; return_path=True
# asks gensim for the file location instead of a loaded dataset object.
text8_path = api.load("text8", return_path=True)
with smart_open(text8_path, 'rb') as file:
    # The file is read as bytes, so every line is decoded before joining.
    text8_data = "".join(raw_line.decode('utf8') for raw_line in file)

# Drop surrounding whitespace, then keep only the first 1,000,000 characters.
text8_data = text8_data.strip()[:1000000]
print(f'Lenght of Corpus: {len(text8_data)}')

# Prepare dictionaries: character -> index and index -> character,
# with indices assigned in sorted character order.
chars = sorted(set(text8_data))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))
print(f'unique chars: {len(chars)}')

In [None]:
import numpy as np
from tqdm import tqdm_notebook as tqdm

# Encode the corpus: replace every character by its integer index.
# NOTE: this rebinds text8_data from a string to a list of ints.
text8_data = [char_indices[char] for char in tqdm(text8_data)]

# Prepare training data
SEQUENCE_LENGTH = 30  # number of characters in each input window
STEP = 3              # distance between the starts of consecutive windows
sentences, next_chars = [], []
window_starts = range(0, len(text8_data) - SEQUENCE_LENGTH, STEP)
for start in tqdm(window_starts):
    stop = start + SEQUENCE_LENGTH
    sentences.append(text8_data[start:stop])  # input window
    next_chars.append(text8_data[stop])       # the character that follows it

sentences = np.array(sentences)
next_chars = np.array(next_chars)

print(f'number of training sentences: {len(sentences)}')
print(f'2nd sentence: {sentences[2]}')
print(f'char after 2nd sentence: {next_chars[2]}')
print(f'3rd sentence: {sentences[3]}')
print(f'shape of sentences: {sentences.shape}')
print(f'shape of next_chars: {next_chars.shape}')

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, BatchNormalization, Dropout

# Next-character predictor: learned embedding -> two stacked LSTMs -> dense head.
model = Sequential([
    # One 5-dimensional vector per character index, for windows of
    # SEQUENCE_LENGTH indices.
    Embedding(len(chars), 5, input_length=SEQUENCE_LENGTH, name='input_layer'),
    # return_sequences=True so the second LSTM receives the whole sequence.
    LSTM(150, return_sequences=True),
    BatchNormalization(),
    Dropout(0.3),
    LSTM(100),
    Dense(100, activation='relu'),
    Dropout(0.3),
    Dense(40, activation='relu'),
    BatchNormalization(),
    # One softmax output per character in the vocabulary.
    Dense(len(chars), activation='softmax', name='output_layer'),
])

model.summary()

In [None]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.utils import to_categorical
import pickle

import os

# Callbacks (both use the Keras default monitored quantity):
# stop after 5 epochs without improvement, and multiply the learning rate
# by 0.2 after 3 epochs without improvement.
early_stop = EarlyStopping(patience=5)
reduce_lr = ReduceLROnPlateau(factor=0.2, patience=3, verbose=1)
callbacks = [early_stop, reduce_lr]

# Convert labels (integers to categorical data), basically one-hot encode labels
next_chars = to_categorical(next_chars, len(chars))

# Train; validation_split=0.1 holds back a tenth of the samples for validation.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(sentences, next_chars, validation_split=0.1, batch_size=64, epochs=50, callbacks=callbacks, shuffle=True)

# Save model and its history.
# Create the output directory first so a long training run is not lost to a
# missing folder, and close the history file deterministically.
os.makedirs('models', exist_ok=True)
model.save('models/predictive_keyboard_v4.h5')
with open('models/history_pk_v4.p', 'wb') as history_file:
    pickle.dump(history.history, history_file)

In [None]:
# load model back again
# This cell is meant to restore state without retraining, so it imports
# everything it needs itself instead of relying on the training cell.
import pickle

from keras.models import load_model

model = load_model('models/predictive_keyboard_v4.h5')
# NOTE: unpickling can execute arbitrary code; only load history files that
# were produced by this notebook.
with open("models/history_pk_v4.p", "rb") as history_file:
    history = pickle.load(history_file)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# plot accuracy
# `history` is either the dict loaded from disk or the History object returned
# by model.fit(); in the latter case the per-epoch values live in `.history`.
history_dict = getattr(history, 'history', history)
# Older Keras releases log the metric as 'acc'; newer ones use the name that
# was passed to compile(), i.e. 'accuracy'. Support both.
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'
plt.plot(history_dict[acc_key])
plt.plot(history_dict['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# The second curve comes from validation_split, not from the test set.
plt.legend(['train', 'validation'], loc='upper left');

In [None]:
# Prepare test data
# The training and validation windows were all cut from text8_data (the first
# len(text8_data) characters of the corpus). Taking the tail of that same list
# would evaluate on text the training run has already seen, so read the
# characters that come immediately after it from the corpus file instead.
TEST_SIZE = 50000
train_size = len(text8_data)
with smart_open(text8_path, 'rb') as file:
    # Same decoding and strip() as when the training slice was made, so the
    # offsets line up with text8_data.
    full_text = "".join(raw_line.decode('utf8') for raw_line in file).strip()
held_out_text = full_text[train_size:train_size + TEST_SIZE]
del full_text  # release the full corpus before building the arrays

# Encode with the training vocabulary; characters it does not contain are
# skipped because the model has no index for them.
test_data = [char_indices[char] for char in held_out_text if char in char_indices]

test_sentences = []
test_chars = []
for start in range(0, len(test_data) - SEQUENCE_LENGTH, STEP):
    stop = start + SEQUENCE_LENGTH
    test_sentences.append(test_data[start:stop])  # input window
    test_chars.append(test_data[stop])            # expected next character

print(f'number of test sentences: {len(test_sentences)}')
print(f'2nd sentence: {test_sentences[2]}')
print(f'char after 2nd sentence: {test_chars[2]}')
print(f'3rd sentence: {test_sentences[3]}')

test_sentences = np.array(test_sentences)
test_chars = np.array(test_chars)

# One-hot encode the labels to match the categorical_crossentropy loss.
test_chars = to_categorical(test_chars, len(chars))
model.evaluate(test_sentences, test_chars)

In [None]:
# Post processing
import heapq


def prepare_input(text):
    """Encode `text` as a one-row integer array for the model.

    Each character is looked up in `char_indices`; a character that is not in
    that dictionary raises KeyError. The result has shape (1, len(text)).
    """
    encoded = []
    for char in text:
        encoded.append(char_indices[char])
    # Wrap in an outer list so the array carries a leading batch axis of size 1.
    return np.array([encoded])

def sample(preds, top_n=3):
    """Return the indices of the `top_n` largest entries of `preds`.

    `preds` is converted to float64, passed through log and exp, and
    renormalised to sum to 1 before ranking. Indices are returned in
    descending order of value.
    """
    probs = np.asarray(preds).astype('float64')
    rescaled = np.exp(np.log(probs))
    normalised = rescaled / np.sum(rescaled)

    # Rank positions by their normalised value and keep the best top_n.
    return heapq.nlargest(top_n, range(len(normalised)), key=normalised.take)

In [None]:
def predict_completion(text, max_length=100):
    """Greedily extend `text` one character at a time until a space appears.

    At every step the most likely next character is taken from the model,
    appended to the completion, and the input window is shifted one character
    to the left so its length stays constant.

    Args:
        text: seed text; every character must be a key of `char_indices`.
        max_length: upper bound on the number of generated characters, so the
            loop still ends if the model never predicts a space.

    Returns:
        The generated characters, including the terminating space when one
        was produced within `max_length` characters.
    """
    completion = ''
    while len(completion) < max_length:
        x = prepare_input(text)
        preds = model.predict(x, verbose=0)[0]
        next_index = sample(preds, top_n=1)[0]
        next_char = indices_char[next_index]
        # Slide the window: drop the oldest character, append the new one.
        text = text[1:] + next_char
        completion += next_char

        # A space marks the end of the current word.
        if next_char == ' ':
            break
    return completion

def predict_completions(text, n=3):
    """Return `n` candidate completions for `text`.

    The `n` most likely next characters are taken from the model; each one is
    then extended with `predict_completion`, and the results are returned in
    the order `sample` ranked the starting characters.
    """
    model_input = prepare_input(text)
    probabilities = model.predict(model_input, verbose=0)[0]

    completions = []
    for idx in sample(probabilities, n):
        first_char = indices_char[idx]
        # Shift the window by one so the seed includes the chosen character.
        shifted_text = text[1:] + first_char
        completions.append(first_char + predict_completion(shifted_text))
    return completions

In [None]:
# Test model
test_sent = ["He told us a very exciting adventure story",
             "She wrote him a long letter but he did not read it",
             "The sky is clear black with shining stars",
             "I am counting my calories yet I really want dessert",
             "We need to rent a room for our party"
            ]
for sent in test_sent:
    # Feed the model exactly SEQUENCE_LENGTH characters (the input length it
    # was built with), lower-cased before the char_indices lookup.
    sent_4_NN = sent[:SEQUENCE_LENGTH].lower()
    print(sent_4_NN)
    print(predict_completions(sent_4_NN, 5))
    print()

## Embeddings, now for real

* A dense representation of a word/character, unlike one-hot encoding, which is a sparse representation.
* Captures semantics in language.

### Many different ways:

* Word2Vec
* GloVe
* fastText
* WordRank
* ULMFiT
* ELMo
* BERT
 and many more ....

## Word2Vec - As an example
* Based on the idea that words which often occur together tend to be used in the same context.

* Two ways:
    - Skipgram model
    - CBOW model

## Skipgram

![skipgram](images/skipgram.png)

## CBOW (Continuous Bag of Words)
<center><img src="images/cbow.png" width="350px" height="450px"/></center>