In [None]:
total_samples = 20000

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import numpy as np
import os
import re
import string
import random
import tensorflow_datasets as tfds
import pickle
from keras.layers.normalization import BatchNormalization

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [9]:
train = tf.data.TextLineDataset("/content/drive/MyDrive/data/sentences_file.txt")

In [10]:
def custom_standard(input_string):
    return tf.strings.lower(input_string)

In [11]:
maxlen=20 # max sequence length
vectorize_layer_train = TextVectorization(
    standardize=custom_standard,
    max_tokens=None,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)

In [12]:
vectorize_layer_train.adapt(train)

In [13]:
vocab = vectorize_layer_train.get_vocabulary()
vocab_size = len(vocab)
print(vocab_size)

28376


In [14]:
def prepare_train_X1(text):
    '''
    text = [1,2,3,4,5] seq_len = 4
    x = [1,2,3,4] y = [2,3,4,5]
    '''
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer_train(text)
    x = tokenized_sentences[:, :-1]
    #y = tokenized_sentences[:, 1:]
    return x

In [15]:
def prepare_train_y1(text):
    '''
    text = [1,2,3,4,5] seq_len = 4
    x = [1,2,3,4] y = [2,3,4,5]
    '''
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer_train(text)
    #x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return y

In [16]:
X1,y1 = train.map(prepare_train_X1),train.map(prepare_train_y1)

In [17]:
lids = tf.data.TextLineDataset("/content/drive/MyDrive/data/lid_file.txt")

In [18]:
maxlen=20 # max sequence length
vectorize_layer_lids = TextVectorization(
    standardize=None,
    max_tokens=None,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)

In [19]:
vectorize_layer_lids.adapt(lids)

In [20]:
vocab_lids = vectorize_layer_lids.get_vocabulary()
vocab_lids_size = len(vocab_lids)
print(vocab_lids_size)

5


In [21]:
print(vocab_lids)

['', '[UNK]', '1', '0', '2']


In [22]:
def prepare_lids_X2(text):
    '''
    text = [1,2,3,4,5] seq_len = 4
    x = [1,2,3,4] y = [2,3,4,5]
    '''
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer_train(text)
    x = tokenized_sentences[:, :-1]
    #y = tokenized_sentences[:, 1:]
    return x

In [23]:
def prepare_lids_y2(text):
    '''
    text = [1,2,3,4,5] seq_len = 4
    x = [1,2,3,4] y = [2,3,4,5]
    '''
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer_train(text)
    #x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return y

In [24]:
X2,y2 = lids.map(prepare_lids_X2),lids.map(prepare_lids_y2)

In [25]:
seq_length = 20
embed_dim = 50
lstm_cells = 50
dense_cells = 100

In [26]:
from keras.layers.merge import concatenate

In [27]:
def get_model():
  inputs1 = tf.keras.layers.Input(shape=(seq_length))
  embed1 = tf.keras.layers.Embedding(vocab_size,embed_dim, input_length=seq_length)(inputs1)
  lstm1 = tf.keras.layers.LSTM(lstm_cells, return_sequences=True)(embed1)
  dense1 = tf.keras.layers.Dense(dense_cells, activation='relu')(lstm1)

  inputs2 = tf.keras.layers.Input(shape=(seq_length))
  embed2 = tf.keras.layers.Embedding(vocab_size,embed_dim, input_length=seq_length)(inputs2)
  lstm2 = tf.keras.layers.LSTM(lstm_cells, return_sequences=True)(embed2)
  dense2 = tf.keras.layers.Dense(dense_cells, activation='relu')(lstm2)

  merged = concatenate([dense1, dense2])
  dense_merged = tf.keras.layers.Dense(dense_cells,activation='relu')(merged)

  word_dense = tf.keras.layers.Dense(vocab_size)(dense_merged)


  model = tf.keras.Model(inputs=[inputs1,inputs2], outputs=[word_dense])
  return model

In [28]:
l_model = get_model()
loss_fn1 = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
l_model.compile("adam", loss=[loss_fn1])

In [29]:
l_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 20, 50)       1418800     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 20, 50)       1418800     input_2[0][0]                    
______________________________________________________________________________________________

In [30]:
lt = []
for i in X1:
  lt.append(i.numpy())

In [31]:
ut = []
for i in X2:
  ut.append(i.numpy())

In [32]:
ik = []
for i in y1:
  ik.append(i.numpy())

In [33]:
mk = []
for i in y2:
  mk.append(i.numpy())

In [34]:
x_1 = np.array(lt).reshape(16220,-1,)
x_2 = np.array(ut).reshape(16220,-1,)
y_1 = np.array(ik).reshape(16220,-1,)
y_2 = np.array(mk).reshape(16220,-1,)

In [37]:
sample = 8000

In [38]:
class Custom_Saver_for_lm(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
      if(epoch%4 == 0):
        self.model.save("/content/drive/MyDrive/fin_model/"+str(epoch)+"/model/")

In [39]:
saver_final = Custom_Saver_for_lm()

In [40]:
l_model.fit(x=[x_1[:sample],x_2[:sample]],y=y_1[:sample],epochs = 10,callbacks=[saver_final])

Epoch 1/10




INFO:tensorflow:Assets written to: /content/drive/MyDrive/fin_model/0/model/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/fin_model/0/model/assets


Epoch 2/10
Epoch 3/10
Epoch 4/10




INFO:tensorflow:Assets written to: /content/drive/MyDrive/fin_model/3/model/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/fin_model/3/model/assets


Epoch 5/10
Epoch 6/10
Epoch 7/10




INFO:tensorflow:Assets written to: /content/drive/MyDrive/fin_model/6/model/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/fin_model/6/model/assets


Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: /content/drive/MyDrive/fin_model/9/model/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/fin_model/9/model/assets


<tensorflow.python.keras.callbacks.History at 0x7fb17df316d0>

In [41]:
l_model.save("/content/drive/MyDrive/fin_model/"+str(10)+"/model/")



INFO:tensorflow:Assets written to: /content/drive/MyDrive/fin_model/10/model/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/fin_model/10/model/assets


In [61]:
class TextGenerator():
    def __init__( 
        self, max_tokens, start_tokens, index_to_word, top_k=10,maxlen=20):
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.k = top_k
        self.maxlen=maxlen

    def sample_from(self, logits):
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        return self.index_to_word[number]

    def get_sen(self, model):
        start_tokens = [_ for _ in self.start_tokens]
        num_tokens_generated = 0
        tokens_generated = []
        # print(start_tokens)
        while num_tokens_generated <= self.max_tokens:
            pad_len = self.maxlen - len(start_tokens)
            sample_index = len(start_tokens) - 1
            if pad_len < 0:
                x = start_tokens[:self.maxlen]
                sample_index = self.maxlen - 1
            elif pad_len > 0:
                x = start_tokens + [0] * pad_len
            else:
                x = start_tokens
            x = np.array([x])
            print(x)
            num_tokens_generated+=1 #rem
        #     y = model.predict([x)
        #     sample_token = self.sample_from(y[0][sample_index])
        #     tokens_generated.append(sample_token)
        #     start_tokens.append(sample_token)
        #     num_tokens_generated = len(tokens_generated)
        # txt = " ".join(
        #     [self.detokenize(_) for _ in self.start_tokens + tokens_generated]
        # )
        # return txt

In [None]:
model = keras.models.load_model('/content/drive/MyDrive/naive_model/naive_simple_model_20')

In [66]:
start_prompt = "ye sab kya"
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index
word_to_lid = {}
for index, word in enumerate(vocab_lids):
    word_to_lid[word] = index
num_tokens_generated = 100
start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()]
lid_tokens = [word_to_lid.get(_,1) for _ in start_prompt.split()]

In [62]:
gen = TextGenerator(num_tokens_generated, start_tokens, vocab,lid_tokens,vocab_lid)

In [63]:
gen.get_sen(model=None)

[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
[[18 52 22  0  0  0  0  0  0  0  0  0  0

In [51]:
print(start_tokens)

[18, 52, 22]


In [52]:
l_model.predict([start_tokens,[1,1,1]])

ValueError: ignored

In [50]:
gen.get_sen(l_model)

[18, 52, 22]
[18, 52, 22]
