# Import Library

In [1]:
#######
## Imports
#######
import tensorflow as tf
import numpy as np
import os


import tensorlayer as tl
from tensorlayer.layers import *
from nltk.tokenize import RegexpTokenizer

  from ._conv import register_converters as _register_converters


# 1.0  Clean Data


### 1.1 Get  Word Frequency

In [2]:
def getDictElems(dictn, n = 2):
    return [{k: dictn[k] for k in list(dictn.keys())[:n]  }]

#.rstrip(os.linesep)

def correct_chat_text(line, corrector_dict = {} ):     
    line =  (line.encode('ascii', 'ignore')).decode("utf-8").lower()\
            .replace('.',' . ')\
        .replace('?',' ? ').replace(',',' , ')#.replace('<CHAT_EOD>','...'+os.linesep)
    new_line = []
    for word in line.split():
        #if w not in set(stopwords.words("english")):
        if word in  corrector_dict:
            new_line.append(corrector_dict[word] )
        else:
            new_line.append(word)
    line = " ".join(new_line)
    
    # Implement post correction of i'm  or else it breaks i'm to [i, m]
    tokenizer = RegexpTokenizer(r'\w+')
    line = " ".join(tokenizer.tokenize(line))
        
    return line

def getXYnWordFrequencyFromText(file  = "chatter.txt", corrector_dict = {} ):    
    print('Corrector dict len is', len(corrector_dict) )
    word_count = {}
    X,Y = [],[]
    cur_chat_response = ''
    next_line_is_new_chat = True
    
    with open(file) as fin:
        for line in fin:
            if line.replace('...','EOChat') != 'EOChat':
                line = correct_chat_text(line, corrector_dict)                         
            
            if next_line_is_new_chat :
                X.append(line.split() )
            else:
                cur_chat_response += line+' '
            
            if len(line.split()) == 0:
                next_line_is_new_chat = True
                Y.append(cur_chat_response.split())
                cur_chat_response =  ''
            else:
                next_line_is_new_chat = False
                           
            for word in line.split():
                if word in word_count.keys():
                    word_count[word] += 1
                else:
                    word_count[word] = 1
    return word_count, X,Y
                    
word_count, X,Y  = getXYnWordFrequencyFromText("chatter.txt")
print('Sample word count dictionary is :', getDictElems(word_count, 50) )
print('X is :', X[0:2] )
print('Y is :', Y[0:2] )


Corrector dict len is 0
Sample word count dictionary is : [{'are': 891, 'you': 2912, 'real': 40, 'no': 540, 'a': 962, 'bot': 64, 'is': 661, 'i': 2291, 'm': 361, 'not': 596, 'yes': 475, 'prove': 24, 'it': 607, 'first': 24, 'can': 225, 't': 685, 'something': 62, 'don': 418, 'believe': 66, 'the': 710, 'halting': 1, 'problem': 12, 'see': 55, 're': 289, 'that': 729, 's': 405, 'what': 627, 'told': 24, 'would': 126, 'marry': 5, 'for': 186, 'love': 111, 'of': 391, 'my': 225, 'life': 59, 'me': 492, 'sure': 93, 'course': 28, 'everyone': 11, 'how': 208, 'did': 149, 'learn': 19, 'learned': 5, 'in': 237, 'age': 6, 'legends': 1, 'people': 63, 'nowadays': 1, 'know': 370, 'so': 219}]
X is : [['are', 'you', 'real'], ['i', 'would', 'marry', 'for', 'the', 'love', 'of', 'my', 'life']]
Y is : [['no', 'a', 'bot', 'is', 'real', 'i', 'm', 'not', 'a', 'bot', 'yes', 'you', 'are', 'prove', 'it', 'you', 'first', 'i', 'can', 't', 'prove', 'something', 'i', 'don', 't', 'believe', 'prove', 'the', 'halting', 'problem

###  1.2  Add Words not in Glove embeddings to wordvec_embedding

Although limitin word embeddings reduces search space, we won't be doing so because
because we want our model to be able to also generate words not previously seen i.e If we only had "sad, happy" words  in embeddings then we would not be able to predict  words other than that   i.e "surprised".

Because we would not be able to reverse lookup the word surprised fromt he word embedding vectors if it is not there at all.

In [3]:
import zipfile
#vocabulary_size
#pret_embeddings = np.empty(shape=(vocabulary_size,50),dtype=np.float32)

def getWordEmbedNFounds(word_count, wordvec_embedding = {}):     
    words_found = []

    with zipfile.ZipFile('glove.6B.zip') as glovezip:
        with glovezip.open('glove.6B.50d.txt') as glovefile:
            for li, line in enumerate(glovefile):
                # Progress
                #if (li+1)%10000==0: print('.',end='')
                line_tokens = line.decode('utf-8').split(' ')
                word = line_tokens[0]            
                
                vector = [float(v) for v in line_tokens[1:]]
                wordvec_embedding[word] = vector                
                    
                if word in word_count.keys():
                    words_found.append(word)
    return wordvec_embedding, words_found

wordvec_embedding, words_found = getWordEmbedNFounds(word_count)

print('\n Words in text and glove  ', words_found[:10])
print('\n Words  in text but not in glove = ', len(word_count) - len(words_found))


 Words in text and glove   ['the', 'of', 'to', 'and', 'in', 'a', 'for', 'that', 'on', 'is']

 Words  in text but not in glove =  193


In [4]:
def getWordsNotFound(word_count, words_found):
    not_found_words = {}
    for word in word_count:
        if word not in words_found:
            not_found_words[word] = word_count[word]
    return not_found_words

not_found_words = getWordsNotFound(word_count, words_found)
print('Not found words sample : ', getDictElems(not_found_words,5))


Not found words sample :  [{'maam': 1, 'k___': 7, 'gooooood': 1, 'geekiest': 1, 'j_______': 1}]


### 1.3 Invalid Words and Correction

USe count  to filter  must / good to have / no fixes

In [5]:
print('Not found words: ' )
print(not_found_words)

Not found words: 
{'maam': 1, 'k___': 7, 'gooooood': 1, 'geekiest': 1, 'j_______': 1, 'diden': 1, 'j____': 12, 'hahahaha': 1, 'yuppers': 1, 'awwwww': 1, 'jabberwacky': 18, 'uhmm': 1, 't___': 1, 'unthing': 1, 'superthing': 2, 's__': 1, 'l____': 3, 'srsly': 1, 'bloodtypes': 1, 'internetworks': 1, 'aaaaaaaaaa': 1, 'noumenal': 1, 'philolsopher': 1, 'philolsophers': 2, 'misapplications': 1, 'priviledge': 1, 'naturlich': 1, 'yeaup': 1, 'achali': 1, 'escopo': 1, 'alvo': 1, 'asno': 1, 'significar': 1, 'concordo': 1, 'mquina': 3, 'sonho': 1, 'acordar': 1, 'ento': 1, 'reconhece': 1, 'apesar': 1, 'devemos': 1, 'discutir': 1, 'chatbot': 6, 'mellifluity': 1, 'aleatorically': 1, 'recurse': 1, 'interweb': 1, 'gumph': 1, 'hahaha': 10, 'quantuum': 1, '085': 1, 'clarabel': 1, 'goodybe': 1, 'dienstag': 1, 'papagei': 3, 'pinapple': 1, 'brillig': 3, 'slithey': 3, 'toves': 3, 'counfounding': 1, 'wouln': 1, 'c____': 3, 'rythmically': 1, 'existense': 1, 'jsut': 1, 'gaaagooo': 1, 'gagagagagoo': 1, 'inscrutible

In [6]:
def rerunWordVecEmbedsPostCorrection(corrector_dict, wordvec_embedding= {} ):    
    word_count, X,Y = getXYnWordFrequencyFromText("chatter.txt", corrector_dict )
    wordvec_embedding, words_found = getWordEmbedNFounds(word_count, wordvec_embedding)
    not_found_words = getWordsNotFound(word_count, words_found)    
    return word_count, wordvec_embedding, not_found_words, X, Y

#word_count  = getWordFrequencyFromText("chatter.txt")


### Add unk and pad to WordVec embedding 

Because we want these to be the first embedding. We will later create word2id and id2word dict based on it and want pad to be of  0 index hence.

In [7]:
wordvec_embedding = {}
wordvec_embedding['_pad_'] = list( np.random.uniform(0, 0, 50 ) )
wordvec_embedding['_unk_'] = list( np.random.uniform(-1.0, 1.0, 50 ) )

In [8]:
# Corrector dict from stackoverflow
corrector_dict = { 
    ",": " , " ,"i'm": "i am", "ain't": "am not","aren't": "are not","can't": "can not","can't've": "cannot have",\
    "'cause": "because","could've": "could have","couldn't": "could not",\
    "couldn't've": "could not have","didn't": "did not","doesn't": "does not",\
    "don't": "do not","hadn't": "had not","hadn't've": "had not have","hasn't": "has not",\
    "haven't": "have not","he'd": "he would","he'd've": "he would have","he'll": "he will",\
    "he's": "he is","how'd": "how did","how'll": "how will","how's": "how is","i'd": "i would",\
    "i'll": "i will","i'm": "i am","i've": "i have","isn't": "is not","it'd": "it would",\
    "it'll": "it will","it's": "it is","let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not","must've": "must have","mustn't": "must not","needn't": "need not","oughtn't": "ought not","shan't": "shall not","sha'n't": "shall not","she'd": "she would","she'll": "she will","she's": "she is","should've": "should have","shouldn't": "should not","that'd": "that would","that's": "that is","there'd": "there had","there's": "there is","they'd": "they would","they'll": "they will","they're": "they are","they've": "they have","wasn't": "was not","we'd": "we would","we'll": "we will","we're": "we are","we've": "we have","weren't": "were not","what'll": "what will","what're": "what are","what's": "what is","what've": "what have","where'd": "where did","where's": "where is","who'll": "who will","who's": "who is","won't": "will not","wouldn't": "would not","you'd": "you would","you'll": "you will","you're": "you are"
}

word_count, wordvec_embedding, not_found_words, X,Y = \
    rerunWordVecEmbedsPostCorrection(corrector_dict, wordvec_embedding )
print('\n Words  in text but not in glove = ', len(word_count) - len(words_found))
print(not_found_words) 


Corrector dict len is 74

 Words  in text but not in glove =  185
{'maam': 1, 'k___': 7, 'gooooood': 1, 'geekiest': 1, 'j_______': 1, 'diden': 1, 'j____': 12, 'hahahaha': 1, 'yuppers': 1, 'awwwww': 1, 'jabberwacky': 18, 'uhmm': 1, 't___': 1, 'unthing': 1, 'superthing': 2, 's__': 1, 'l____': 3, 'srsly': 1, 'bloodtypes': 1, 'internetworks': 1, 'aaaaaaaaaa': 1, 'noumenal': 1, 'philolsopher': 1, 'philolsophers': 2, 'misapplications': 1, 'priviledge': 1, 'naturlich': 1, 'yeaup': 1, 'achali': 1, 'escopo': 1, 'alvo': 1, 'asno': 1, 'significar': 1, 'concordo': 1, 'mquina': 3, 'sonho': 1, 'acordar': 1, 'ento': 1, 'reconhece': 1, 'apesar': 1, 'devemos': 1, 'discutir': 1, 'chatbot': 6, 'mellifluity': 1, 'aleatorically': 1, 'recurse': 1, 'interweb': 1, 'gumph': 1, 'hahaha': 10, 'quantuum': 1, '085': 1, 'clarabel': 1, 'goodybe': 1, 'dienstag': 1, 'papagei': 3, 'pinapple': 1, 'brillig': 3, 'slithey': 3, 'toves': 3, 'counfounding': 1, 'wouln': 1, 'c____': 3, 'rythmically': 1, 'existense': 1, 'jsut': 

### More cleaning to do later

In [9]:
#getDictElems( wordvec_embedding, 2)
print("Length of word embedding is :", len(wordvec_embedding) )
print("WordVector embedding of dimensions :",len(wordvec_embedding['the']) )


Length of word embedding is : 400002
WordVector embedding of dimensions : 50


### 1.4 Add new words (Unfound) to Wordvec_embeddings

1. WordVec_embeddings : Replaceall one occurence with UNK :: TO DO
2. WordVec : Extend wordvec embedding to include these unknown words and 50d vector.
3. Dictionary : Create Word2Id and Id2Word from new extended WordVec.
4. Create input, encode, decode and target data
5. Run it with script

In [10]:
new_embed = list( np.random.uniform(-1.0, 1.0, 50 ) )
len(new_embed)

50

In [11]:
print('Embedding length prior to new word addition is :', len(wordvec_embedding))
emb_dims = len(wordvec_embedding['the']) #or use static 50 if it fails
for word in not_found_words:
      # if word not in glove embedding, create random embed
      wordvec_embedding[word] = list( np.random.uniform(-1.0, 1.0, 50 ) ) 
    
print('Embedding length post new word addition is :', len(wordvec_embedding))    

Embedding length prior to new word addition is : 400002
Embedding length post new word addition is : 400195


### 1.5 Create Word2id,  id2word dict from wordvec_embedding

In [12]:
idx = 0 
word2id, id2word = {}, {}
for word in wordvec_embedding:
    word2id[word] = idx
    id2word[idx] = word
    idx +=1

print('Word 2 id is : ', getDictElems(word2id, 4))
print('Id 2 word is : ',getDictElems(id2word, 4) )

Word 2 id is :  [{'_pad_': 0, '_unk_': 1, 'the': 2, ',': 3}]
Id 2 word is :  [{0: '_pad_', 1: '_unk_', 2: 'the', 3: ','}]


### 1.6 Add Start_id , end_id  to dict

In [13]:
if 'start_id'  not in word2id.keys():
    start_id = len(word2id)
    end_id = len(word2id) + 1

word2id['start_id'], word2id['end_id'] = start_id, end_id
id2word[start_id], id2word[end_id] = 'start_id', 'end_id'
 

## 2.0 Input Data, Label

1. X  : First line after '...'
2. Y : All lines until  next '...'
3. Convert X, Y to word_id  format for further processing


In [14]:
print('X is ', X[0:2])
print('Y is ', Y[0:2])

X is  [['are', 'you', 'real'], ['i', 'would', 'marry', 'for', 'the', 'love', 'of', 'my', 'life']]
Y is  [['no', 'a', 'bot', 'is', 'real', 'i', 'am', 'not', 'a', 'bot', 'yes', 'you', 'are', 'prove', 'it', 'you', 'first', 'i', 'can', 'not', 'prove', 'something', 'i', 'do', 'not', 'believe', 'prove', 'the', 'halting', 'problem', 'you', 'first', 'see', 'you', 'are', 'not', 'that', 'is', 'what', 'i', 'told', 'you'], ['would', 'you', 'marry', 'me', 'sure', 'can', 'you', 'love', 'of', 'course', 'everyone', 'can', 'love', 'how', 'did', 'you', 'learn', 'i', 'learned', 'in', 'the', 'age', 'of', 'legends', 'people', 'nowadays', 'know', 'so', 'little', 'now', 'lie', 'on', 'the', 'bed', 'yes', 'maam', 'giggles', 'giggles', 'back']]


###  2.1  Input Data (words) to word_id

In [15]:
X_wordid, Y_wordid = [], []
for chat in X:
    X_wordid.append( [ word2id[word] for word in chat] )

for chat_response in Y:
    Y_wordid.append( [ word2id[word] for word in chat_response ]  )
        
        
print('X_wordid is :',X_wordid[0:2])
print('Y_wordid is :',Y_wordid[0:2])

X_wordid is : [[34, 83, 569], [43, 56, 6638, 12, 2, 837, 5, 194, 216]]
Y_wordid is : [[86, 9, 25458, 16, 569, 43, 915, 38, 9, 25458, 2774, 83, 34, 2829, 22, 83, 60, 43, 88, 38, 2829, 647, 43, 90, 38, 735, 2829, 2, 13059, 749, 83, 60, 255, 83, 34, 38, 14, 16, 104, 43, 156, 83], [56, 83, 6638, 287, 1087, 88, 83, 837, 5, 748, 1404, 88, 837, 199, 121, 83, 2370, 43, 2274, 8, 2, 466, 5, 10252, 71, 11204, 348, 102, 335, 116, 4623, 15, 2, 3829, 2774, 400002, 50912, 50912, 139]]


### 2.2 Calc  Encoding  and  decoding  sequence length


In [16]:
########
## Calc mean and stdev to fix  rnn_seq_in_lenght and rnn_seq_out_length
#####
X_lengths = [len(lst) for lst in X]
Y_lengths = [len(lst) for lst in Y]
#print('X_length is ', X_lengths[:2])

rnn_in_seq_mean = np.mean(X_lengths)
rnn_in_seq_stdv = np.std(X_lengths)
rnn_out_seq_mean = np.mean(Y_lengths)
rnn_out_seq_stdv = np.std(Y_lengths)

#print('Conv starter is ', conv_starter[:2])
#print('Rnn out seq is ', responses[:2])
print('rnn in seq mean & stdev :',  rnn_in_seq_mean, ' ', rnn_in_seq_stdv)
print('rnn out seq mean & stdev :',  rnn_out_seq_mean, ' ', rnn_out_seq_stdv)    

rnn in seq mean & stdev : 5.601489757914339   3.5508329885869556
rnn out seq mean & stdev : 68.77467411545624   67.88573123796462


###  2.3 Calc Rnn in/out seq lengths @ 84% coverage

In [17]:
# 1 sd away from mean = 84% coverage
rnn_in_seq_len =  rnn_in_seq_mean + rnn_in_seq_stdv
rnn_out_seq_len = rnn_out_seq_mean + rnn_out_seq_stdv

print('rnn_in_seq_len = ', rnn_in_seq_len, ' .rnn_out_seq_len :', rnn_out_seq_len)


rnn_in_seq_len =  9.152322746501294  .rnn_out_seq_len : 136.66040535342086


In [18]:
print("Total No of chats :", len(X_wordid), len(X))
trainX, trainY = X_wordid, Y_wordid

print("Sample 2 trainX :", trainX[0:2])
print("Sample 2 trainY :", trainY[0:2])


Total No of chats : 537 537
Sample 2 trainX : [[34, 83, 569], [43, 56, 6638, 12, 2, 837, 5, 194, 216]]
Sample 2 trainY : [[86, 9, 25458, 16, 569, 43, 915, 38, 9, 25458, 2774, 83, 34, 2829, 22, 83, 60, 43, 88, 38, 2829, 647, 43, 90, 38, 735, 2829, 2, 13059, 749, 83, 60, 255, 83, 34, 38, 14, 16, 104, 43, 156, 83], [56, 83, 6638, 287, 1087, 88, 83, 837, 5, 748, 1404, 88, 837, 199, 121, 83, 2370, 43, 2274, 8, 2, 466, 5, 10252, 71, 11204, 348, 102, 335, 116, 4623, 15, 2, 3829, 2774, 400002, 50912, 50912, 139]]


### 2.4 Create Dataset : Batch major (Not Time major)

Encoding and decoding length fixed.

Dataset should look like


In [19]:
"""
Encode_seq : ['How', 'are' , 'you', <PAD_ID>, <PAD_ID>]
====================== Thought vectors =======================================
Decode_seq : ['<START_ID>', 'I',  'am',   'fine',   <PAD_ID>, <PAD_ID>, <PAD_ID>, <PAD_ID>  ]
Target_seq : [ 'I',         'am', 'fine', <END_ID>  <PAD_ID>, <PAD_ID>, <PAD_ID>, <PAD_ID>  ]
Target_mask: [  1           ,1     ,1      ,1        ,0          ,0         ,0      ,0      ]
"""

def getEncodeNDecode(X,Y):    
    #max_len_x = max([ len(chats) for chats in  X ]   )
    #max_len_y = max([ len(response) for response in  Y ]   )
    
    max_len_x, max_len_y = 99999999,999999999
    
    max_padlen_x =  int(rnn_in_seq_len) if (int(rnn_in_seq_len) < max_len_x ) else max_len_x
    max_padlen_y =  int(rnn_out_seq_len) if (int(rnn_out_seq_len) < max_len_y ) else max_len_y
    
    _encode_seqs = tl.prepro.pad_sequences(X, maxlen=max_padlen_x, \
                                padding='post', truncating='post', value=word2id['_pad_'] )
    
    _decode_seqs = tl.prepro.sequences_add_start_id(Y, start_id=start_id, remove_last=False)
    _decode_seqs = tl.prepro.pad_sequences(_decode_seqs, maxlen=max_padlen_y,\
                                padding='post', truncating='post', value=word2id['_pad_'] )
    
    _target_seqs = tl.prepro.sequences_add_end_id(Y, end_id=end_id)
    _target_seqs = tl.prepro.pad_sequences(_target_seqs, maxlen=max_padlen_y,\
                                padding='post', truncating='post', value=word2id['_pad_'])

    _target_mask = tl.prepro.sequences_get_mask(_target_seqs)
    
    return _encode_seqs, _decode_seqs, _target_seqs, _target_mask
    
_encode_seqs, _decode_seqs, _target_seqs, _target_mask = getEncodeNDecode(\
                                                    X_wordid[3:5], Y_wordid[3:5])

print('\n  start_id : ',word2id['start_id'],' . end_id : ', word2id['end_id'])
#print('\n_encode_seqs : ',seqId2Words( _encode_seqs) )
print('\n_encode_seqs : ',_encode_seqs) 
print('\n_decode_seqs : ',_decode_seqs)
print('\n_target_seqs : ',_target_seqs)
print('\n_target_mask : ',_target_mask)


  start_id :  400195  . end_id :  400196

_encode_seqs :  [[104, 16, 394, 313, 0, 0, 0, 0, 0], [35, 83, 663, 53, 8, 837, 0, 0, 0]]

_decode_seqs :  [[400195, 194, 313, 16, 400003, 38, 2, 400003, 86, 122, 400003, 14, 16, 194, 313, 319, 590, 83, 392, 32, 19021, 38, 590, 104, 16, 394, 313, 400003, 14, 16, 194, 313, 319, 400003, 2774, 83, 119, 103, 43, 90, 38, 348, 104, 83, 1704, 3204, 2774, 83, 90, 194, 1411, 83, 34, 12312, 287, 14, 16, 221, 86, 22, 16, 38, 129, 104, 16, 221, 104, 16, 394, 2802, 194, 2802, 43, 90, 38, 348, 43, 915, 93, 10738, 14, 16, 66, 250, 88, 83, 1363, 287, 647, 2774, 104, 90, 83, 305, 6, 348, 43, 56, 119, 6, 348, 1176, 43, 88, 38, 1363, 83, 1176, 145, 122, 467, 7, 1183, 55, 45, 171, 133, 22, 43, 90, 38, 348, 113, 6, 467, 24, 2, 1086, 104, 16, 2, 1086, 104, 1581, 51, 2], [400195, 2774, 63, 37, 83, 8, 837, 43, 915, 151, 8, 837, 40, 34, 83, 8, 837, 19, 9, 1751, 41, 34, 60180, 43, 271, 102, 319, 102, 740, 90, 38, 83, 119, 103, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [20]:
def seqId2Words(seq, get_first_n_words_only = 5):
    lst = []
    for word_lst in seq:
        #print(word_lst)
        lst.append([ id2word[word_id] if word_id in id2word.keys() else word_id\
                    for word_id  in word_lst[0:get_first_n_words_only]  ] )            
    return  lst

lst = [[104, 16, 394, 313, 0, 0, 0, 0, 0], [35, 83, 663, 53, 8, 837, 0, 0, 0]]

seqId2Words(lst)

[['what', 'is', 'your', 'name', '_pad_'],
 ['have', 'you', 'ever', 'been', 'in']]

In [21]:
id2word[104]

'what'

## 3.0 Rnn Modeling

### 3.1 Rnn param

In [22]:
learning_rate = 0.1
batch_size = 4
xvocab_size = len(id2word) 
emb_dim = emb_dims
rnn_num_layers = 1
dropout_rate = 0.2


#  decay learning rate by 25% every 10 step
lr_decay_rate =  0.75
lr_decay_steps = 10
max_grad_clip_to = 5.0

### 3.1.1 Word Embedding Dict to Array

Because : EmbeddingInput layers expects the word embeddings to be in array format

In [23]:
word_embedding_matrix = []
for  word in word2id:    
    #print('word not found :', word)
    if word not in ('start_id','end_id'):
        word_embedding_matrix.append(list(wordvec_embedding[word]) )              
print( word_embedding_matrix[0:2])

[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.2780087830967479, -0.2291490430363281, 0.4263150069620718, 0.8240796365346232, 0.11578251058121602, 0.9783727681590155, 0.6802844059978832, -0.5722520069753132, -0.08437865859280058, 0.007096877190771345, -0.7691950386127993, 0.2704172290790836, -0.20767185852296466, -0.7476782399241471, 0.7596679100535166, -0.8262741784207053, 0.28771884679222604, 0.5728380437464076, -0.4025040775341664, -0.6707764355845043, -0.3360247065696329, -0.8520812026620561, -0.012002413791471156, -0.4155591553458333, 0.05833922642948175, 0.8510067796579759, 0.8050541550133667, 0.6686013492498821, -0.17922428851998817, 0.513702643378094, -0.13408779635199886, 0.6940429323134196, 0.7563422564788356, -0.10907479377709217, -0.09714066317431103, 0.336635776603707

In [24]:
word_embedding_matrix[1] 

[0.2780087830967479,
 -0.2291490430363281,
 0.4263150069620718,
 0.8240796365346232,
 0.11578251058121602,
 0.9783727681590155,
 0.6802844059978832,
 -0.5722520069753132,
 -0.08437865859280058,
 0.007096877190771345,
 -0.7691950386127993,
 0.2704172290790836,
 -0.20767185852296466,
 -0.7476782399241471,
 0.7596679100535166,
 -0.8262741784207053,
 0.28771884679222604,
 0.5728380437464076,
 -0.4025040775341664,
 -0.6707764355845043,
 -0.3360247065696329,
 -0.8520812026620561,
 -0.012002413791471156,
 -0.4155591553458333,
 0.05833922642948175,
 0.8510067796579759,
 0.8050541550133667,
 0.6686013492498821,
 -0.17922428851998817,
 0.513702643378094,
 -0.13408779635199886,
 0.6940429323134196,
 0.7563422564788356,
 -0.10907479377709217,
 -0.09714066317431103,
 0.33663577660370736,
 0.5674116625579169,
 0.6331483626142311,
 0.8262918688801566,
 0.9123801030584786,
 -0.9773072563426572,
 -0.44262581247627697,
 0.7593012716463687,
 0.7422919238810088,
 0.5070318924622625,
 -0.8808271737635853,


###  3.2 RNN Many to Many Model Defn

1. **Encoding layer**: Look up layer to lookup word id into embedding.
2. **Decoding layer**: Look up layer to lookup word id into embedding.
3. **Seq2Seq layer**: A simple dynamic Rnn layer.
      

REF: https://tensorlayer.readthedocs.io/en/stable/modules/layers.html#tensorlayer.layers.Seq2Seq

In [25]:
def model(encode_seqs, decode_seqs, is_train=True, reuse=False):
    with tf.variable_scope("model", reuse=reuse):
        with tf.variable_scope("embedding") as vs:
            # same embedding / thought vector to be used in both encoding and decoding
            # Embedding Input layers is a look up table for  wordid to embedding.            
            # same embedding / thought vector to be used in both encoding and decoding
            encoding_layer = EmbeddingInputlayer(
                inputs = encode_seqs,
                vocabulary_size = xvocab_size,
                embedding_size = emb_dim,
                # use glove embedding vectors as starting point
                E_init = tf.constant_initializer(word_embedding_matrix, dtype=tf.float32),
                name = 'encode_decode_seq_embedding')
            vs.reuse_variables()
            # Embedding Input layers is a look up table for  wordid to embedding.
            decoding_layer = EmbeddingInputlayer(
                inputs = decode_seqs,
                vocabulary_size = xvocab_size,
                embedding_size = emb_dim,
                # use glove embedding vectors as starting point
                E_init = tf.constant_initializer(word_embedding_matrix, dtype=tf.float32),
                name = 'encode_decode_seq_embedding')
        
        print('Seq2seq i/p : Encodinglayer must be of size [batch_sie, None, n_featueres] ',\
              encoding_layer)
       
        net_rnn = Seq2Seq(encoding_layer, decoding_layer,
                cell_fn = tf.contrib.rnn.BasicLSTMCell,
                n_hidden = emb_dim,
                # Using Xavier initializer for better perf
                #initializer = tf.random_uniform_initializer(-0.1, 0.1),
                initializer = tf.contrib.layers.xavier_initializer(),
                encode_sequence_length = retrieve_seq_length_op2(encode_seqs),
                decode_sequence_length = retrieve_seq_length_op2(decode_seqs),
                initial_state_encode = None,
                dropout = (dropout_rate if is_train else None),                
                n_layer = rnn_num_layers,
                return_seq_2d = True,
                name = 'seq2seq_rnn')   
        
        # Fully connected layer with relu activation
        net_out = DenseLayer(net_rnn, n_units=xvocab_size, act=tf.nn.relu, name='output')
    return net_out, net_rnn

## 3.3 Tf Placeholder

1. **Training Input Placeholder** :
2. **Inference Input Placeholders**

In [26]:
# model for training
encode_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="encode_seqs")
decode_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="decode_seqs")
target_seqs = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="target_seqs")
target_mask = tf.placeholder(dtype=tf.int64, shape=[batch_size, None], name="target_mask") 
# tl.prepro.sequences_get_mask()


### 3.3.1 Model Train Defn

VIMP : During training, the model should not be reused because we will be iteratively making it more and more better. However in test  case, we must set the model reuse to true, as we do want the previously trained model without any modification.

In [27]:
net_out, _ = model(encode_seqs, decode_seqs, is_train=True, reuse=False)

[TL] EmbeddingInputlayer model/embedding/encode_decode_seq_embedding: (400197, 50)
[TL] EmbeddingInputlayer model/embedding/encode_decode_seq_embedding: (400197, 50)
Seq2seq i/p : Encodinglayer must be of size [batch_sie, None, n_featueres]    Last layer is: EmbeddingInputlayer (model/embedding/encode_decode_seq_embedding) [4, None, 50]
[TL] [*] Seq2Seq model/seq2seq_rnn: n_hidden: 50 cell_fn: BasicLSTMCell dropout: 0.2 n_layer: 1
[TL] DynamicRNNLayer model/seq2seq_rnn/encode: n_hidden: 50, in_dim: 3 in_shape: (4, ?, 50) cell_fn: BasicLSTMCell dropout: 0.2 n_layer: 1
[TL]        batch_size (concurrent processes): 4
[TL] DynamicRNNLayer model/seq2seq_rnn/decode: n_hidden: 50, in_dim: 3 in_shape: (4, ?, 50) cell_fn: BasicLSTMCell dropout: 0.2 n_layer: 1
[TL]        batch_size (concurrent processes): 4
[TL] DenseLayer  model/output: 400197 relu


In [28]:
# model for inferencing
encode_seqs2 = tf.placeholder(dtype=tf.int64, shape=[1, None], name="encode_seqs")
decode_seqs2 = tf.placeholder(dtype=tf.int64, shape=[1, None], name="decode_seqs")
net, net_rnn = model(encode_seqs2, decode_seqs2, is_train=False, reuse=True)
y = tf.nn.softmax(net.outputs)


[TL] EmbeddingInputlayer model/embedding/encode_decode_seq_embedding: (400197, 50)
[TL] EmbeddingInputlayer model/embedding/encode_decode_seq_embedding: (400197, 50)
Seq2seq i/p : Encodinglayer must be of size [batch_sie, None, n_featueres]    Last layer is: EmbeddingInputlayer (model/embedding/encode_decode_seq_embedding) [1, None, 50]
[TL] [*] Seq2Seq model/seq2seq_rnn: n_hidden: 50 cell_fn: BasicLSTMCell dropout: None n_layer: 1
[TL] DynamicRNNLayer model/seq2seq_rnn/encode: n_hidden: 50, in_dim: 3 in_shape: (1, ?, 50) cell_fn: BasicLSTMCell dropout: None n_layer: 1
[TL]        batch_size (concurrent processes): 1
[TL] DynamicRNNLayer model/seq2seq_rnn/decode: n_hidden: 50, in_dim: 3 in_shape: (1, ?, 50) cell_fn: BasicLSTMCell dropout: None n_layer: 1
[TL]        batch_size (concurrent processes): 1
[TL] DenseLayer  model/output: 400197 relu


## 3.4 Loss  Defn

1. **Mask Loss**: is used. Mask is multiplied by calculated loss, to ensure padding losses are not accounted during loss calculation

In [29]:
#  we calculate  cross entropy with mask 
# because we do not want  the padding to have any effect on the lonss
loss = tl.cost.cross_entropy_seq_with_mask(logits=net_out.outputs, target_seqs=target_seqs,\
                                    input_mask=target_mask, return_details=False, name='cost')

### 3.4.1 Optimizer Defn : Decaying rate & Grad Clip

1. **Vanishing Gradient :** Since we are using relu, vanishing gradient is not a  big problem. 
2. **Exploding Gradient :** : Clipping gradients to prevent the problem
3. **Optimiser :** Adam Optimisers tend to converge faster. SGD although slow tend to outperfom Adam however
4. **Decaying learning rate**: For adam it is not required, because ADAM guarantees square root decay as per theorem 4.1 

t <- t +1

lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)

PS:  Theorem 4.1 of their ICLR article, one of their hypotheses is that the learning rate has a square root decay, αt=α/t√. Furthermore, for their logistic regression experiments they use the square root decay as well. 
Ref: https://stats.stackexchange.com/questions/200063/adam-optimizer-with-exponential-decay

In [30]:
global_step = tf.Variable(0, trainable=False)
inc_gstep = tf.assign(global_step,global_step + 1)

# Learning rate decay ~ lr * decay_rate ^(global_step/ decay_step)
print('learning rate :', learning_rate)
#learning_rate = tf.train.exponential_decay(
#    lr, global_step, decay_steps=lr_decay_steps,\
#    decay_rate=lr_decay_rate, staircase=True)

#lr = tf.placeholder(tf.float32, name = 'learning_rate')
# We define Adam Optimizer
train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# Gradient clipping
#gradients, v = zip(*optimizer.compute_gradients(loss))
#gradients, _ = tf.clip_by_global_norm(gradients, max_grad_clip_to)
#optimizer = optimizer.apply_gradients(
#    zip(gradients, v))

#train_op = optimizer

#net_out.print_params(False)
#train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)


learning rate : 0.1


# 4.0 Model Training

In [31]:
learning_rate = 0.3

import time
from sklearn.utils import shuffle

sess = tf.Session()
sess.run( tf.global_variables_initializer() )

n_epoch = 1
for epoch in range(n_epoch):
    
    total_err, steps = 0, 0
    epoch_time = time.time()    
    trainX, trainY = shuffle(trainX, trainY, random_state=0)
    
    # use minibatches generated from trainX, trainY 
    for X, Y in tl.iterate.minibatches(inputs=trainX, targets=trainY,\
                                batch_size=batch_size, shuffle=False):
        step_time = time.time()
        _encode_seqs, _decode_seqs, _target_seqs, _target_mask = getEncodeNDecode(X,Y)
        
        print(_encode_seqs)

[[83, 34, 38, 194, 1411, 0, 0, 0, 0], [194, 713, 16, 994, 0, 0, 0, 0, 0], [102, 122, 115, 43, 1473, 656, 14983, 83, 271], [34, 83, 1091, 1098, 0, 0, 0, 0, 0]]
[[104, 34, 394, 2722, 0, 0, 0, 0, 0], [34, 83, 9, 953, 0, 0, 0, 0, 0], [7476, 2253, 0, 0, 0, 0, 0, 0, 0], [83, 34, 194, 193, 256, 1411, 0, 0, 0]]
[[43, 119, 7377, 0, 0, 0, 0, 0, 0], [88, 55, 467, 76, 0, 0, 0, 0, 0], [57, 0, 0, 0, 0, 0, 0, 0, 0], [83, 34, 9, 1858, 0, 0, 0, 0, 0]]
[[63, 37, 83, 957, 129, 0, 0, 0, 0], [104, 90, 83, 843, 6, 90, 0, 0, 0], [43, 915, 6334, 14, 2, 4365, 16, 38, 10016], [43, 915, 224, 6, 2, 5579, 0, 0, 0]]
[[43, 915, 38, 1903, 6, 83, 132, 58, 0], [43, 915, 9, 475, 0, 0, 0, 0, 0], [83, 34, 9, 953, 0, 0, 0, 0, 0], [2474, 104, 34, 83, 1903, 61, 0, 0, 0]]
[[83, 35, 86, 13340, 199, 6, 26949, 48429, 19], [88, 83, 1681, 0, 0, 0, 0, 0, 0], [102, 104, 121, 83, 35, 12, 4155, 0, 0], [90, 83, 3712, 758, 15, 2, 927, 0, 0]]
[[43, 915, 2544, 8, 6880, 7636, 7, 8078, 0], [90, 83, 119, 2, 45999, 0, 0, 0, 0], [199, 262, 9, 

[[43, 915, 9, 11870, 0, 0, 0, 0, 0], [43, 5752, 643, 1948, 0, 0, 0, 0, 0], [83, 34, 9, 441, 122780, 85, 43, 90, 38], [43, 35, 6, 244, 116, 7, 893, 83, 0]]
[[4864, 888, 97, 284, 0, 0, 0, 0, 0], [664, 83, 34, 12219, 24, 2, 1602, 102, 85], [83, 2090, 194, 5046, 0, 0, 0, 0, 0], [90, 83, 119, 3558, 0, 0, 0, 0, 0]]
[[43, 743, 6, 1714, 83, 61, 2, 3977, 993], [199, 169, 34, 83, 0, 0, 0, 0, 0], [104, 16, 234, 234, 0, 0, 0, 0, 0], [221, 577, 0, 0, 0, 0, 0, 0, 0]]
[[0, 0, 0, 0, 0, 0, 0, 0, 0], [40, 16, 394, 8370, 0, 0, 0, 0, 0], [205, 647, 5468, 0, 0, 0, 0, 0, 0], [104, 90, 83, 119, 6, 90, 12, 2907, 0]]
[[43, 90, 38, 153, 348, 83, 0, 0, 0], [43, 915, 9, 1285, 0, 0, 0, 0, 0], [104, 16, 2, 1830, 0, 0, 0, 0, 0], [590, 4003, 43, 806, 14, 83, 37, 9, 953]]
[[43, 90, 38, 271, 0, 0, 0, 0, 0], [43, 271, 394, 1011, 34, 3962, 12318, 0, 0], [2, 6474, 16, 177, 1558, 0, 0, 0, 0], [7944, 65, 0, 0, 0, 0, 0, 0, 0]]
[[88, 55, 244, 139, 6, 2, 4365, 61, 1535], [86, 43, 915, 224, 6, 2201, 0, 0, 0], [4347, 1622, 0, 0,

[[36, 43, 305, 79, 4296, 0, 0, 0, 0], [199, 16, 394, 124, 53, 0, 0, 0, 0], [83, 34, 12312, 287, 116, 0, 0, 0, 0], [44, 36354, 121, 83, 808, 24, 0, 0, 0]]
[[1535, 16, 194, 8382, 0, 0, 0, 0, 0], [34, 83, 569, 0, 0, 0, 0, 0, 0], [121, 83, 119, 22, 0, 0, 0, 0, 0], [199, 56, 43, 256, 15353, 83, 0, 0, 0]]
[[104, 34, 83, 0, 0, 0, 0, 0, 0], [83, 34, 4436, 0, 0, 0, 0, 0, 0], [104, 16, 3368, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[83, 0, 0, 0, 0, 0, 0, 0, 0], [86, 1089, 0, 0, 0, 0, 0, 0, 0], [104, 16, 2, 5161, 81, 0, 0, 0, 0], [88, 43, 35, 394, 2253, 0, 0, 0, 0]]
[[22, 16, 5493, 19, 1187, 4754, 0, 0, 0], [90, 83, 837, 287, 0, 0, 0, 0, 0], [34, 83, 9, 9249, 0, 0, 0, 0, 0], [7392, 8, 1882, 14, 16, 193, 992, 2414, 0]]
[[88, 83, 685, 5213, 0, 0, 0, 0, 0], [83, 34, 4801, 15, 39, 2559, 0, 0, 0], [83, 34, 9, 4436, 25497, 400012, 0, 0, 0], [199, 61, 9, 188, 5, 7164, 0, 0, 0]]
[[199, 90, 83, 664, 0, 0, 0, 0, 0], [1363, 287, 647, 58, 27, 394, 9203, 0, 0], [45, 953, 8078, 50, 124, 192, 76, 2, 87]

In [33]:
#learning_rate = 0.3
#asdasda
import time
from sklearn.utils import shuffle

sess = tf.Session()
sess.run( tf.global_variables_initializer() )

n_epoch = 2000
for epoch in range(n_epoch):
    
    total_err, steps = 0, 0
    epoch_time = time.time()    
    trainX, trainY = shuffle(trainX, trainY, random_state=0)
    
    # use minibatches generated from trainX, trainY 
    for X, Y in tl.iterate.minibatches(inputs=trainX, targets=trainY,\
                                batch_size=batch_size, shuffle=False):
        step_time = time.time()
        _encode_seqs, _decode_seqs, _target_seqs, _target_mask = getEncodeNDecode(X,Y)
        
        _, err = sess.run([train_op, loss],
                        {encode_seqs: _encode_seqs,    decode_seqs: _decode_seqs,
                        target_seqs: _target_seqs,     target_mask: _target_mask } )
        #                lr: learning_rate }  )
        total_err += err; 
        
        if steps % (20) == 0:
            print("Epoch[%d/%d] Iter: %d loss:%f learning rate:%f " \
                    % (epoch, n_epoch, steps,  err, 1) )
            #print('global steps is ', global_step)
        steps += 1

        ####### 
        ## test for  bot responses
        #######
        if steps % (100) == 0:
            test_qns = ["happy birthday have a nice day", "is it going to rain tomorrow"  ]
            for qn in test_qns:
                print("Qn >", qn)
                seed_id = [word2id[w] for w in qn.split(" ")]
                
                # Get 2  responses
                for _ in range(2):
                    # Use encoding layers final state
                    state = sess.run(net_rnn.final_state_encode,
                                    {encode_seqs2: [seed_id]})
                    
                    # Start decoding : decode indicator : start_id, get first word                    
                    o, state = sess.run([y, net_rnn.final_state_decode],
                                    {net_rnn.initial_state_decode: state,\
                                     decode_seqs2: [[start_id]]})
                    
                    w_id = tl.nlp.sample_top(o[0], top_k=3)
                    w = id2word[w_id]
                    
                    # 3. decode, feed state iteratively
                    sentence = [w]
                    for _ in range(30): # max sentence length
                        o, state = sess.run([y, net_rnn.final_state_decode],
                                        {net_rnn.initial_state_decode: state,
                                        decode_seqs2: [[w_id]]})
                        w_id = tl.nlp.sample_top(o[0], top_k=2)
                        w = id2word[w_id]
                        if w_id == end_id:
                            break
                        sentence = sentence + [w]
                    print("BOT ===> ", ' '.join(sentence))
    ##########
    ## Decay Learning rate @ every epoch
    ##########
    #learning_rate = learning_rate * lr_decay_rate
    #if learning_rate < min_learning_rate:
    #    learning_rate = min_learning_rate
        
        
    print("Epoch[%d/%d] averaged loss:%f took:%.5fs" % (epoch, n_epoch, total_err/steps, time.time()-epoch_time))
    

print('Completed')

Epoch[0/2000] Iter: 0 loss:12.930230 learning rate:1.000000 
Epoch[0/2000] Iter: 20 loss:10.471587 learning rate:1.000000 


KeyboardInterrupt: 

# 4.0 Computation Space/ time reduction :

1. Negative Sampling for target sequences :
2. Training : Smaller / limited WordVector embeddings
3. Prediction : Trained embedding + Bigger embedding containing words not in the trained embeddings

During training, use  only the words in the training instance. But since we want to be able to predict words that  are limited not only  by the  training words. During the prediction or chat bot resonse generation phase, we can use bigger embedding matrix for lookup to supplement the trained embedding.


## 5.0 Future  :  To Dos



1. **Loss Function** : Evaluate it against other loss functions such as perplexity, BLEU or other measures
2. **Beam Search**: Increased performance implied by literatures.
3. **Attention Mechanism** : Increased performance implied by literatures.
