In [145]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tqdm import tqdm
from datasets import list_datasets, load_dataset
from pprint import pprint
import random
import keras
import os

In [13]:
dataset = load_dataset('squad') ## downloading squad dataset

Found cached dataset squad (C:/Users/user/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [15]:
train_squad = pd.DataFrame(dataset["train"])
test_squad = pd.DataFrame(dataset["validation"])

In [16]:
train_squad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87599 entries, 0 to 87598
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        87599 non-null  object
 1   title     87599 non-null  object
 2   context   87599 non-null  object
 3   question  87599 non-null  object
 4   answers   87599 non-null  object
dtypes: object(5)
memory usage: 3.3+ MB


In [17]:
train_squad.head()
## we dont need id gonna drop it

Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...


In [18]:
train_squad.drop("id",axis=1,inplace=True)
test_squad.drop("id",axis=1,inplace=True)

In [19]:
train_squad["title"].value_counts()

New_York_City            817
American_Idol            802
Beyoncé                  758
Frédéric_Chopin          697
Queen_Victoria           680
                        ... 
Great_Plains              47
Tristan_da_Cunha          44
Pitch_(music)             36
Matter                    24
Myocardial_infarction     22
Name: title, Length: 442, dtype: int64

In [20]:
avg_words_context = round(sum([len(i.split()) for i in train_squad["context"]])/len(train_squad))
avg_words_question = round(sum([len(i.split()) for i in train_squad["question"]])/len(train_squad))
avg_words_context,avg_words_question

(120, 10)

In [21]:
max_words_context = (max([len(i.split()) for i in train_squad["context"]]))
max_words_question = (max([len(i.split()) for i in train_squad["question"]]))
max_words_context,max_words_question

(653, 40)

In [22]:
(653*0.1 + 120*1.9) / 2

146.65

## Tokenization

In [83]:
encoder_text = train_squad["context"][:4000]
decoder_text = train_squad["question"][:4000]

In [84]:
from keras.preprocessing.text import Tokenizer

def token_fit(texts):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    dictionary = tokenizer.word_index
    
    word2i ={k:v for (k,v) in dictionary.items()}
    i2word = {v:k for (k,v) in word2i.items()}
    
    return tokenizer, word2i,i2word


In [85]:
tokenizer,word2i,i2word = token_fit(encoder_text+decoder_text)

In [86]:
print("Most occured 10 words in vocab")
sorted(tokenizer.word_counts,key=tokenizer.word_counts.get,reverse=True)[:10]

Most occured 10 words in vocab


['the', 'of', 'in', 'and', 'to', 'a', 'was', 'for', 'on', 'as']

In [87]:
vocab_size= len(word2i)
vocab_size

12113

In [88]:
def text2sequence(context,question,tokenizer):
    encoder_seq = tokenizer.texts_to_sequences(context)
    decoder_seq = tokenizer.texts_to_sequences(question)
    
    return encoder_seq,decoder_seq

In [89]:
encoder_seq,decoder_seq = text2sequence(encoder_text,decoder_text,tokenizer)

In [100]:
print("Original Text \n-----------\n",encoder_text[0])
print("Turned into sequences \n-----------\n",encoder_seq[0])

Original Text 
-----------
 Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Turned into sequences 
-----------
 [8672, 1, 97, 35, 6, 556, 658, 779, 1, 274, 8673, 1148, 2376, 16, 6, 1170, 2451, 2, 1, 3917, 1400, 1926, 3, 3329, 2, 1, 274, 258, 4, 3454, 23, 16, 6, 5041, 2451, 2, 5778, 12, 5779, 8674, 12, 1, 702, 8675, 8676, 739, 8677, 599, 5, 1, 274, 25

In [115]:

max_len_enc = 0
max_len_dec = 0

for enc,dec in zip(encoder_seq,decoder_seq):
    if len(enc) > max_len_enc:
        max_len_enc = len(enc)
    if len(dec) > max_len_dec:
        max_len_dec= len(dec)

In [118]:
max_len_enc,max_len_dec

(518, 29)

In [131]:
from keras.utils import pad_sequences
encoder_data = pad_sequences(encoder_seq,maxlen=max_len_enc,dtype="int32",padding="post",truncating="post")
decoder_data = pad_sequences(decoder_seq,maxlen=max_len_dec,dtype="int32",padding="post",truncating="post")

#### as we can see it added extra zeros to make them same size(we selected post to add zeros after)

In [139]:
encoder_data[0][40:200]

array([  12,    1,  702, 8675, 8676,  739, 8677,  599,    5,    1,  274,
        258,   16,    1, 2531,    2,    1, 4684, 2076, 1926,  907,    1,
       2531,   16,    1, 2618,    6, 8678,  351,    2, 4117,    4, 8679,
         23,   16,    6, 5780,    2,    1, 2618,   14, 3739,  606,   70,
          1, 3917, 1400, 8680,  480,    5, 2532, 8681, 8682,    3, 7988,
         14,    1,  327,    2,    1,  274, 1548,    4,    3,    6, 1081,
        233,   11, 6389,  206,  158, 4118,    4,    1, 1148, 2376,   16,
          6, 2182,  499, 2131, 2451,    2, 1400,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

### Pretrained embedding using glove

In [143]:
!wget http://nlp.stanford.edu/data/glove.6B.zip;

--2022-11-11 20:14:10--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-11-11 20:14:11--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-11-11 20:14:11--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: 'glove.6B.zip'

    

 30500K .......... .......... .......... .......... ..........  3% 3,22M 7m8s
 30550K .......... .......... .......... .......... ..........  3% 3,24M 7m7s
 30600K .......... .......... .......... .......... ..........  3% 3,26M 7m7s
 30650K .......... .......... .......... .......... ..........  3% 3,23M 7m7s
 30700K .......... .......... .......... .......... ..........  3% 3,24M 7m6s
 30750K .......... .......... .......... .......... ..........  3% 2,34M 7m6s
 30800K .......... .......... .......... .......... ..........  3% 3,25M 7m6s
 30850K .......... .......... .......... .......... ..........  3% 3,26M 7m6s
 30900K .......... .......... .......... .......... ..........  3% 3,24M 7m5s
 30950K .......... .......... .......... .......... ..........  3% 3,18M 7m5s
 31000K .......... .......... .......... .......... ..........  3% 3,21M 7m5s
 31050K .......... .......... .......... .......... ..........  3% 3,14M 7m4s
 31100K .......... .......... .......... .......... ..........  

 63800K .......... .......... .......... .......... ..........  7% 3,05M 5m20s
 63850K .......... .......... .......... .......... ..........  7% 3,05M 5m20s
 63900K .......... .......... .......... .......... ..........  7% 3,23M 5m20s
 63950K .......... .......... .......... .......... ..........  7% 2,60M 5m20s
 64000K .......... .......... .......... .......... ..........  7% 3,42M 5m20s
 64050K .......... .......... .......... .......... ..........  7% 3,22M 5m20s
 64100K .......... .......... .......... .......... ..........  7% 2,65M 5m20s
 64150K .......... .......... .......... .......... ..........  7% 3,14M 5m20s
 64200K .......... .......... .......... .......... ..........  7% 2,57M 5m20s
 64250K .......... .......... .......... .......... ..........  7% 3,16M 5m20s
 64300K .......... .......... .......... .......... ..........  7% 3,16M 5m20s
 64350K .......... .......... .......... .......... ..........  7% 2,41M 5m20s
 64400K .......... .......... .......... .......... 

 74150K .......... .......... .......... .......... ..........  8% 3,77M 5m4s
 74200K .......... .......... .......... .......... ..........  8% 3,01M 5m4s
 74250K .......... .......... .......... .......... ..........  8% 4,07M 5m4s
 74300K .......... .......... .......... .......... ..........  8% 3,29M 5m4s
 74350K .......... .......... .......... .......... ..........  8% 2,45M 5m4s
 74400K .......... .......... .......... .......... ..........  8% 4,00M 5m4s
 74450K .......... .......... .......... .......... ..........  8% 4,38M 5m4s
 74500K .......... .......... .......... .......... ..........  8% 3,29M 5m4s
 74550K .......... .......... .......... .......... ..........  8% 4,34M 5m4s
 74600K .......... .......... .......... .......... ..........  8% 4,12M 5m3s
 74650K .......... .......... .......... .......... ..........  8% 3,74M 5m3s
 74700K .......... .......... .......... .......... ..........  8% 2,85M 5m3s
 74750K .......... .......... .......... .......... ..........  

124100K .......... .......... .......... .......... .......... 14% 4,39M 4m4s
124150K .......... .......... .......... .......... .......... 14% 4,10M 4m4s
124200K .......... .......... .......... .......... .......... 14% 4,48M 4m4s
124250K .......... .......... .......... .......... .......... 14% 4,05M 4m4s
124300K .......... .......... .......... .......... .......... 14% 4,42M 4m4s
124350K .......... .......... .......... .......... .......... 14% 3,27M 4m4s
124400K .......... .......... .......... .......... .......... 14% 4,00M 4m4s
124450K .......... .......... .......... .......... .......... 14% 4,48M 4m4s
124500K .......... .......... .......... .......... .......... 14% 4,34M 4m4s
124550K .......... .......... .......... .......... .......... 14% 4,12M 4m4s
124600K .......... .......... .......... .......... .......... 14% 4,36M 4m4s
124650K .......... .......... .......... .......... .......... 14% 4,14M 4m4s
124700K .......... .......... .......... .......... .......... 1

138600K .......... .......... .......... .......... .......... 16% 4,46M 3m53s
138650K .......... .......... .......... .......... .......... 16% 4,37M 3m53s
138700K .......... .......... .......... .......... .......... 16% 4,05M 3m53s
138750K .......... .......... .......... .......... .......... 16% 3,23M 3m53s
138800K .......... .......... .......... .......... .......... 16% 4,43M 3m52s
138850K .......... .......... .......... .......... .......... 16% 4,17M 3m52s
138900K .......... .......... .......... .......... .......... 16% 4,34M 3m52s
138950K .......... .......... .......... .......... .......... 16% 4,08M 3m52s
139000K .......... .......... .......... .......... .......... 16% 4,40M 3m52s
139050K .......... .......... .......... .......... .......... 16% 4,37M 3m52s
139100K .......... .......... .......... .......... .......... 16% 4,08M 3m52s
139150K .......... .......... .......... .......... .......... 16% 3,32M 3m52s
139200K .......... .......... .......... .......... 

174050K .......... .......... .......... .......... .......... 20% 4,36M 3m30s
174100K .......... .......... .......... .......... .......... 20% 4,04M 3m30s
174150K .......... .......... .......... .......... .......... 20% 3,88M 3m30s
174200K .......... .......... .......... .......... .......... 20% 3,20M 3m30s
174250K .......... .......... .......... .......... .......... 20% 3,73M 3m29s
174300K .......... .......... .......... .......... .......... 20% 4,01M 3m29s
174350K .......... .......... .......... .......... .......... 20% 3,08M 3m29s
174400K .......... .......... .......... .......... .......... 20% 3,56M 3m29s
174450K .......... .......... .......... .......... .......... 20% 3,96M 3m29s
174500K .......... .......... .......... .......... .......... 20% 4,07M 3m29s
174550K .......... .......... .......... .......... .......... 20% 4,43M 3m29s
174600K .......... .......... .......... .......... .......... 20% 4,00M 3m29s
174650K .......... .......... .......... .......... 

199050K .......... .......... .......... .......... .......... 23% 4,48M 3m17s
199100K .......... .......... .......... .......... .......... 23% 3,99M 3m17s
199150K .......... .......... .......... .......... .......... 23% 2,86M 3m17s
199200K .......... .......... .......... .......... .......... 23% 4,12M 3m17s
199250K .......... .......... .......... .......... .......... 23% 3,73M 3m17s
199300K .......... .......... .......... .......... .......... 23% 4,07M 3m17s
199350K .......... .......... .......... .......... .......... 23% 4,07M 3m17s
199400K .......... .......... .......... .......... .......... 23% 4,08M 3m17s
199450K .......... .......... .......... .......... .......... 23% 4,05M 3m17s
199500K .......... .......... .......... .......... .......... 23% 3,07M 3m17s
199550K .......... .......... .......... .......... .......... 23% 2,70M 3m17s
199600K .......... .......... .......... .......... .......... 23% 4,00M 3m17s
199650K .......... .......... .......... .......... 

249000K .......... .......... .......... .......... .......... 29% 3,78M 2m55s
249050K .......... .......... .......... .......... .......... 29% 4,00M 2m55s
249100K .......... .......... .......... .......... .......... 29% 4,48M 2m55s
249150K .......... .......... .......... .......... .......... 29% 3,06M 2m55s
249200K .......... .......... .......... .......... .......... 29% 4,45M 2m55s
249250K .......... .......... .......... .......... .......... 29% 4,07M 2m55s
249300K .......... .......... .......... .......... .......... 29% 4,36M 2m55s
249350K .......... .......... .......... .......... .......... 29% 4,12M 2m55s
249400K .......... .......... .......... .......... .......... 29% 4,37M 2m55s
249450K .......... .......... .......... .......... .......... 29% 4,06M 2m55s
249500K .......... .......... .......... .......... .......... 29% 4,35M 2m55s
249550K .......... .......... .......... .......... .......... 29% 3,09M 2m55s
249600K .......... .......... .......... .......... 

274000K .......... .......... .......... .......... .......... 32% 4,31M 2m45s
274050K .......... .......... .......... .......... .......... 32% 3,11M 2m45s
274100K .......... .......... .......... .......... .......... 32% 4,42M 2m45s
274150K .......... .......... .......... .......... .......... 32% 4,06M 2m45s
274200K .......... .......... .......... .......... .......... 32% 4,29M 2m45s
274250K .......... .......... .......... .......... .......... 32% 4,20M 2m45s
274300K .......... .......... .......... .......... .......... 32% 4,04M 2m45s
274350K .......... .......... .......... .......... .......... 32% 3,26M 2m45s
274400K .......... .......... .......... .......... .......... 32% 4,31M 2m45s
274450K .......... .......... .......... .......... .......... 32% 4,18M 2m45s
274500K .......... .......... .......... .......... .......... 32% 4,08M 2m45s
274550K .......... .......... .......... .......... .......... 32% 3,40M 2m45s
274600K .......... .......... .......... .......... 

308350K .......... .......... .......... .......... .......... 36%  196M 2m33s
308400K .......... .......... .......... .......... .......... 36%  280M 2m33s
308450K .......... .......... .......... .......... .......... 36%  215M 2m33s
308500K .......... .......... .......... .......... .......... 36%  285M 2m33s
308550K .......... .......... .......... .......... .......... 36%  283M 2m33s
308600K .......... .......... .......... .......... .......... 36%  218M 2m33s
308650K .......... .......... .......... .......... .......... 36%  283M 2m33s
308700K .......... .......... .......... .......... .......... 36%  214M 2m33s
308750K .......... .......... .......... .......... .......... 36%  224M 2m33s
308800K .......... .......... .......... .......... .......... 36% 4,20M 2m33s
308850K .......... .......... .......... .......... .......... 36% 2,80M 2m33s
308900K .......... .......... .......... .......... .......... 36% 3,06M 2m33s
308950K .......... .......... .......... .......... 

323950K .......... .......... .......... .......... .......... 38% 3,29M 2m28s
324000K .......... .......... .......... .......... .......... 38% 4,05M 2m28s
324050K .......... .......... .......... .......... .......... 38% 4,43M 2m28s
324100K .......... .......... .......... .......... .......... 38% 4,05M 2m28s
324150K .......... .......... .......... .......... .......... 38% 4,44M 2m28s
324200K .......... .......... .......... .......... .......... 38% 4,03M 2m28s
324250K .......... .......... .......... .......... .......... 38% 4,04M 2m28s
324300K .......... .......... .......... .......... .......... 38% 4,45M 2m28s
324350K .......... .......... .......... .......... .......... 38% 3,09M 2m28s
324400K .......... .......... .......... .......... .......... 38% 4,26M 2m28s
324450K .......... .......... .......... .......... .......... 38% 4,26M 2m28s
324500K .......... .......... .......... .......... .......... 38% 4,33M 2m28s
324550K .......... .......... .......... .......... 

391500K .......... .......... .......... .......... .......... 46% 3,81M 2m7s
391550K .......... .......... .......... .......... .......... 46% 2,81M 2m7s
391600K .......... .......... .......... .......... .......... 46% 3,36M 2m7s
391650K .......... .......... .......... .......... .......... 46% 3,43M 2m7s
391700K .......... .......... .......... .......... .......... 46% 4,10M 2m7s
391750K .......... .......... .......... .......... .......... 46% 2,67M 2m7s
391800K .......... .......... .......... .......... .......... 46% 2,46M 2m7s
391850K .......... .......... .......... .......... .......... 46% 2,68M 2m7s
391900K .......... .......... .......... .......... .......... 46% 3,93M 2m7s
391950K .......... .......... .......... .......... .......... 46% 1,96M 2m7s
392000K .......... .......... .......... .......... .......... 46% 3,37M 2m7s
392050K .......... .......... .......... .......... .......... 46% 3,78M 2m7s
392100K .......... .......... .......... .......... .......... 4

411000K .......... .......... .......... .......... .......... 48% 2,95M 2m2s
411050K .......... .......... .......... .......... .......... 48% 4,38M 2m2s
411100K .......... .......... .......... .......... .......... 48% 4,30M 2m2s
411150K .......... .......... .......... .......... .......... 48% 3,06M 2m2s
411200K .......... .......... .......... .......... .......... 48% 2,88M 2m2s
411250K .......... .......... .......... .......... .......... 48% 3,32M 2m2s
411300K .......... .......... .......... .......... .......... 48% 3,50M 2m2s
411350K .......... .......... .......... .......... .......... 48% 3,20M 2m2s
411400K .......... .......... .......... .......... .......... 48% 3,28M 2m2s
411450K .......... .......... .......... .......... .......... 48% 3,64M 2m2s
411500K .......... .......... .......... .......... .......... 48% 3,33M 2m2s
411550K .......... .......... .......... .......... .......... 48% 1,96M 2m2s
411600K .......... .......... .......... .......... .......... 4

448850K .......... .......... .......... .......... .......... 53% 4,58M 1m51s
448900K .......... .......... .......... .......... .......... 53% 2,41M 1m51s
448950K .......... .......... .......... .......... .......... 53% 4,06M 1m51s
449000K .......... .......... .......... .......... .......... 53% 3,54M 1m51s
449050K .......... .......... .......... .......... .......... 53% 4,41M 1m51s
449100K .......... .......... .......... .......... .......... 53% 4,06M 1m51s
449150K .......... .......... .......... .......... .......... 53% 3,25M 1m51s
449200K .......... .......... .......... .......... .......... 53% 4,29M 1m51s
449250K .......... .......... .......... .......... .......... 53% 4,46M 1m51s
449300K .......... .......... .......... .......... .......... 53% 4,42M 1m51s
449350K .......... .......... .......... .......... .......... 53% 2,47M 1m51s
449400K .......... .......... .......... .......... .......... 53% 2,87M 1m51s
449450K .......... .......... .......... .......... 

472400K .......... .......... .......... .......... .......... 56% 4,05M 1m46s
472450K .......... .......... .......... .......... .......... 56% 4,24M 1m46s
472500K .......... .......... .......... .......... .......... 56% 2,99M 1m46s
472550K .......... .......... .......... .......... .......... 56% 4,32M 1m46s
472600K .......... .......... .......... .......... .......... 56% 3,66M 1m46s
472650K .......... .......... .......... .......... .......... 56% 2,66M 1m46s
472700K .......... .......... .......... .......... .......... 56% 3,11M 1m46s
472750K .......... .......... .......... .......... .......... 56% 2,45M 1m46s
472800K .......... .......... .......... .......... .......... 56% 2,17M 1m45s
472850K .......... .......... .......... .......... .......... 56% 2,69M 1m45s
472900K .......... .......... .......... .......... .......... 56% 3,06M 1m45s
472950K .......... .......... .......... .......... .......... 56% 3,48M 1m45s
473000K .......... .......... .......... .......... 

509450K .......... .......... .......... .......... .......... 60% 4,45M 95s
509500K .......... .......... .......... .......... .......... 60% 4,39M 95s
509550K .......... .......... .......... .......... .......... 60% 3,06M 95s
509600K .......... .......... .......... .......... .......... 60% 4,51M 95s
509650K .......... .......... .......... .......... .......... 60% 4,28M 95s
509700K .......... .......... .......... .......... .......... 60% 4,09M 95s
509750K .......... .......... .......... .......... .......... 60% 4,45M 95s
509800K .......... .......... .......... .......... .......... 60% 4,42M 95s
509850K .......... .......... .......... .......... .......... 60% 4,13M 95s
509900K .......... .......... .......... .......... .......... 60% 4,26M 95s
509950K .......... .......... .......... .......... .......... 60% 3,29M 94s
510000K .......... .......... .......... .......... .......... 60% 4,16M 94s
510050K .......... .......... .......... .......... .......... 60% 4,35M 94s

523800K .......... .......... .......... .......... .......... 62% 3,28M 91s
523850K .......... .......... .......... .......... .......... 62% 2,63M 91s
523900K .......... .......... .......... .......... .......... 62% 3,55M 90s
523950K .......... .......... .......... .......... .......... 62% 2,62M 90s
524000K .......... .......... .......... .......... .......... 62% 3,33M 90s
524050K .......... .......... .......... .......... .......... 62% 3,48M 90s
524100K .......... .......... .......... .......... .......... 62% 3,38M 90s
524150K .......... .......... .......... .......... .......... 62% 3,04M 90s
524200K .......... .......... .......... .......... .......... 62% 3,74M 90s
524250K .......... .......... .......... .......... .......... 62% 2,18M 90s
524300K .......... .......... .......... .......... .......... 62% 3,48M 90s
524350K .......... .......... .......... .......... .......... 62% 2,56M 90s
524400K .......... .......... .......... .......... .......... 62% 3,08M 90s

561050K .......... .......... .......... .......... .......... 66% 1,88M 81s
561100K .......... .......... .......... .......... .......... 66% 3,06M 81s
561150K .......... .......... .......... .......... .......... 66% 1,99M 81s
561200K .......... .......... .......... .......... .......... 66% 3,00M 81s
561250K .......... .......... .......... .......... .......... 66% 2,49M 81s
561300K .......... .......... .......... .......... .......... 66% 2,39M 81s
561350K .......... .......... .......... .......... .......... 66% 2,18M 80s
561400K .......... .......... .......... .......... .......... 66% 2,74M 80s
561450K .......... .......... .......... .......... .......... 66% 3,17M 80s
561500K .......... .......... .......... .......... .......... 66% 1,79M 80s
561550K .......... .......... .......... .......... .......... 66% 2,60M 80s
561600K .......... .......... .......... .......... .......... 66% 2,59M 80s
561650K .......... .......... .......... .......... .......... 66% 2,98M 80s

573750K .......... .......... .......... .......... .......... 68% 3,77M 77s
573800K .......... .......... .......... .......... .......... 68% 2,95M 77s
573850K .......... .......... .......... .......... .......... 68% 2,96M 77s
573900K .......... .......... .......... .......... .......... 68% 3,72M 77s
573950K .......... .......... .......... .......... .......... 68% 2,71M 77s
574000K .......... .......... .......... .......... .......... 68% 4,12M 77s
574050K .......... .......... .......... .......... .......... 68% 4,36M 77s
574100K .......... .......... .......... .......... .......... 68% 4,01M 77s
574150K .......... .......... .......... .......... .......... 68% 3,30M 77s
574200K .......... .......... .......... .......... .......... 68% 3,40M 77s
574250K .......... .......... .......... .......... .......... 68% 2,35M 77s
574300K .......... .......... .......... .......... .......... 68% 3,26M 77s
574350K .......... .......... .......... .......... .......... 68% 2,58M 77s

623700K .......... .......... .......... .......... .......... 74% 4,57M 63s
623750K .......... .......... .......... .......... .......... 74% 3,97M 63s
623800K .......... .......... .......... .......... .......... 74% 4,47M 63s
623850K .......... .......... .......... .......... .......... 74% 4,38M 63s
623900K .......... .......... .......... .......... .......... 74% 4,13M 63s
623950K .......... .......... .......... .......... .......... 74% 3,23M 62s
624000K .......... .......... .......... .......... .......... 74% 4,35M 62s
624050K .......... .......... .......... .......... .......... 74% 4,17M 62s
624100K .......... .......... .......... .......... .......... 74% 4,35M 62s
624150K .......... .......... .......... .......... .......... 74% 4,04M 62s
624200K .......... .......... .......... .......... .......... 74% 4,47M 62s
624250K .......... .......... .......... .......... .......... 74% 4,06M 62s
624300K .......... .......... .......... .......... .......... 74% 4,41M 62s

648700K .......... .......... .......... .......... .......... 77% 4,07M 55s
648750K .......... .......... .......... .......... .......... 77% 2,33M 55s
648800K .......... .......... .......... .......... .......... 77% 3,87M 55s
648850K .......... .......... .......... .......... .......... 77% 4,25M 55s
648900K .......... .......... .......... .......... .......... 77% 3,71M 55s
648950K .......... .......... .......... .......... .......... 77% 3,76M 55s
649000K .......... .......... .......... .......... .......... 77% 4,03M 55s
649050K .......... .......... .......... .......... .......... 77% 3,70M 55s
649100K .......... .......... .......... .......... .......... 77% 3,05M 55s
649150K .......... .......... .......... .......... .......... 77% 2,48M 55s
649200K .......... .......... .......... .......... .......... 77% 3,69M 55s
649250K .......... .......... .......... .......... .......... 77% 3,08M 55s
649300K .......... .......... .......... .......... .......... 77% 3,19M 55s

688700K .......... .......... .......... .......... .......... 81% 3,61M 44s
688750K .......... .......... .......... .......... .......... 81% 2,88M 44s
688800K .......... .......... .......... .......... .......... 81% 4,30M 44s
688850K .......... .......... .......... .......... .......... 81% 4,41M 44s
688900K .......... .......... .......... .......... .......... 81% 3,81M 44s
688950K .......... .......... .......... .......... .......... 81% 2,72M 44s
689000K .......... .......... .......... .......... .......... 81% 2,86M 44s
689050K .......... .......... .......... .......... .......... 81% 3,59M 44s
689100K .......... .......... .......... .......... .......... 81% 2,81M 44s
689150K .......... .......... .......... .......... .......... 81% 3,21M 44s
689200K .......... .......... .......... .......... .......... 81% 3,77M 44s
689250K .......... .......... .......... .......... .......... 81% 3,54M 44s
689300K .......... .......... .......... .......... .......... 81% 4,13M 44s

698650K .......... .......... .......... .......... .......... 82% 4,48M 41s
698700K .......... .......... .......... .......... .......... 82% 4,10M 41s
698750K .......... .......... .......... .......... .......... 82% 1,65M 41s
698800K .......... .......... .......... .......... .......... 83% 3,04M 41s
698850K .......... .......... .......... .......... .......... 83% 2,66M 41s
698900K .......... .......... .......... .......... .......... 83% 2,06M 41s
698950K .......... .......... .......... .......... .......... 83% 2,51M 41s
699000K .......... .......... .......... .......... .......... 83% 3,18M 41s
699050K .......... .......... .......... .......... .......... 83% 2,38M 41s
699100K .......... .......... .......... .......... .......... 83% 4,30M 41s
699150K .......... .......... .......... .......... .......... 83% 3,26M 41s
699200K .......... .......... .......... .......... .......... 83% 4,48M 41s
699250K .......... .......... .......... .......... .......... 83% 3,72M 41s

744500K .......... .......... .......... .......... .......... 88% 4,35M 28s
744550K .......... .......... .......... .......... .......... 88% 4,06M 28s
744600K .......... .......... .......... .......... .......... 88% 4,40M 28s
744650K .......... .......... .......... .......... .......... 88% 4,30M 28s
744700K .......... .......... .......... .......... .......... 88% 4,25M 28s
744750K .......... .......... .......... .......... .......... 88% 3,22M 28s
744800K .......... .......... .......... .......... .......... 88% 4,07M 28s
744850K .......... .......... .......... .......... .......... 88% 4,44M 28s
744900K .......... .......... .......... .......... .......... 88% 4,44M 28s
744950K .......... .......... .......... .......... .......... 88% 4,03M 28s
745000K .......... .......... .......... .......... .......... 88% 4,35M 28s
745050K .......... .......... .......... .......... .......... 88% 4,19M 28s
745100K .......... .......... .......... .......... .......... 88% 4,28M 28s

773600K .......... .......... .......... .......... .......... 91% 4,16M 19s
773650K .......... .......... .......... .......... .......... 91% 4,41M 19s
773700K .......... .......... .......... .......... .......... 91% 4,14M 19s
773750K .......... .......... .......... .......... .......... 91% 4,29M 19s
773800K .......... .......... .......... .......... .......... 91% 3,56M 19s
773850K .......... .......... .......... .......... .......... 91% 3,17M 19s
773900K .......... .......... .......... .......... .......... 91% 4,08M 19s
773950K .......... .......... .......... .......... .......... 91% 3,28M 19s
774000K .......... .......... .......... .......... .......... 91% 4,13M 19s
774050K .......... .......... .......... .......... .......... 91% 4,43M 19s
774100K .......... .......... .......... .......... .......... 91% 4,06M 19s
774150K .......... .......... .......... .......... .......... 91% 4,32M 19s
774200K .......... .......... .......... .......... .......... 91% 4,14M 19s

798550K .......... .......... .......... .......... .......... 94% 3,11M 12s
798600K .......... .......... .......... .......... .......... 94% 1,98M 12s
798650K .......... .......... .......... .......... .......... 94% 3,28M 12s
798700K .......... .......... .......... .......... .......... 94% 3,22M 12s
798750K .......... .......... .......... .......... .......... 94% 2,75M 12s
798800K .......... .......... .......... .......... .......... 94% 1,88M 12s
798850K .......... .......... .......... .......... .......... 94% 3,21M 12s
798900K .......... .......... .......... .......... .......... 94% 2,63M 12s
798950K .......... .......... .......... .......... .......... 94% 2,26M 12s
799000K .......... .......... .......... .......... .......... 94% 3,44M 12s
799050K .......... .......... .......... .......... .......... 94% 3,11M 12s
799100K .......... .......... .......... .......... .......... 94% 1,87M 12s
799150K .......... .......... .......... .......... .......... 94% 2,74M 12s

823100K .......... .......... .......... .......... .......... 97% 3,63M 5s
823150K .......... .......... .......... .......... .......... 97% 2,93M 5s
823200K .......... .......... .......... .......... .......... 97% 4,34M 5s
823250K .......... .......... .......... .......... .......... 97% 4,12M 5s
823300K .......... .......... .......... .......... .......... 97% 4,40M 5s
823350K .......... .......... .......... .......... .......... 97% 4,48M 5s
823400K .......... .......... .......... .......... .......... 97% 3,76M 5s
823450K .......... .......... .......... .......... .......... 97% 4,42M 5s
823500K .......... .......... .......... .......... .......... 97% 4,34M 5s
823550K .......... .......... .......... .......... .......... 97% 3,27M 5s
823600K .......... .......... .......... .......... .......... 97% 4,11M 5s
823650K .......... .......... .......... .......... .......... 97% 4,43M 5s
823700K .......... .......... .......... .......... .......... 97% 4,34M 5s
823750K ....

In [155]:
path_to_glove_file = os.path.join(
 "./glove.6B.100d.txt"
)

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [156]:
num_tokens = vocab_size + 2
embedding_dim = 100
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens, embedding_dim))

In [157]:
for word, i in word2i.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1

In [158]:
print("Converted %d words (%d misses)" % (hits, misses))

Converted 11076 words (1037 misses)


In [159]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [162]:
num_samples = len(encoder_seq)
num_samples

4000

In [160]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

latent_dim = 128
encoder_inputs = Input(shape=(None, max_len_enc))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_data)

encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None, max_len_dec))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_data,
                                     initial_state=encoder_states)
decoder_dense = Dense(max_len_dec, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

ValueError: Input 0 of layer "lstm" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (4000, 518)