In [44]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import numpy as np
import os
import re
import string
import random
import tensorflow_datasets as tfds
import pickle
from keras.layers.normalization import BatchNormalization

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [None]:
train = tf.data.TextLineDataset("/content/drive/MyDrive/data/sentences_file.txt")

In [None]:
def custom_standard(input_string):
    return tf.strings.lower(input_string)

In [None]:
maxlen=20 # max sequence length
vectorize_layer = TextVectorization(
    standardize=custom_standard,
    max_tokens=None,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)

In [None]:
vectorize_layer.adapt(train)

In [None]:
for i in train.take(2):
  print(i.numpy().decode())
# data is lowercased

ye to hona hi tha .... kabhi kisi neta ko saja mili hi ... always clean chit .. 
hey frd . hw r u ? msg now 


In [None]:
vocab = vectorize_layer.get_vocabulary()
vocab_size = len(vocab)

In [None]:
print(vocab_size)

28376


In [None]:
def prepare_dataset(text):
    '''
    text = [1,2,3,4,5] seq_len = 4
    x = [1,2,3,4] y = [2,3,4,5]
    '''
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y

In [None]:
train_ds = train.map(prepare_dataset) # train_ds[0] = tuple(x[0],y[0])

In [None]:
type(train_ds)

tensorflow.python.data.ops.dataset_ops.MapDataset

In [None]:
# data is tokenised and ready for the model
for i in train_ds.take(1):
  print(i[0].numpy())
  print(i[1].numpy())

[[  18    4  193   19   48   28  216   87  221    5  780 1054   19   13
   922 1641 3479   16    0    0]]
[[   4  193   19   48   28  216   87  221    5  780 1054   19   13  922
  1641 3479   16    0    0    0]]


In [None]:
seq_length = 20
embed_dim = 50
lstm_cells = 50
dense_cells = 100

In [None]:
def simple_naive():
  model = Sequential()
  model.add(Embedding(vocab_size,embed_dim, input_length=seq_length))
  model.add(LSTM(lstm_cells, return_sequences=True))
  model.add(Dense(vocab_size))
  #loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
  model.compile("adam",loss='sparse_categorical_crossentropy')
  return model

In [None]:
#simple naive
class Custom_Saver_for_simple_naive(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
      if(epoch%4 == 0):
        self.model.save("/content/drive/MyDrive/naive_model/"+str(epoch)+"/model/")
        #self.model.save("/content/drive/MyDrive/naive_model/naive_simple_model_"+str(epoch)+".hd5")

In [None]:
naive_model = simple_naive()
print(naive_model.summary())
saver_simple = Custom_Saver_for_simple_naive()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_21 (Embedding)     (None, 20, 50)            1418800   
_________________________________________________________________
lstm_20 (LSTM)               (None, 20, 50)            20200     
_________________________________________________________________
dense_23 (Dense)             (None, 20, 28376)         1447176   
Total params: 2,886,176
Trainable params: 2,886,176
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
print(type(train_ds))

<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>


In [None]:
naive_model.fit(train_ds,epochs=21,callbacks=[saver_simple])

Epoch 1/21




INFO:tensorflow:Assets written to: /content/drive/MyDrive/naive_model/naive_simple_model_0/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/naive_model/naive_simple_model_0/assets


Epoch 2/21
Epoch 3/21
Epoch 4/21
Epoch 5/21




INFO:tensorflow:Assets written to: /content/drive/MyDrive/naive_model/naive_simple_model_4/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/naive_model/naive_simple_model_4/assets


Epoch 6/21
Epoch 7/21
Epoch 8/21
Epoch 9/21




INFO:tensorflow:Assets written to: /content/drive/MyDrive/naive_model/naive_simple_model_8/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/naive_model/naive_simple_model_8/assets


Epoch 10/21
Epoch 11/21
Epoch 12/21
Epoch 13/21




INFO:tensorflow:Assets written to: /content/drive/MyDrive/naive_model/naive_simple_model_12/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/naive_model/naive_simple_model_12/assets


Epoch 14/21
Epoch 15/21
Epoch 16/21
Epoch 17/21




INFO:tensorflow:Assets written to: /content/drive/MyDrive/naive_model/naive_simple_model_16/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/naive_model/naive_simple_model_16/assets


Epoch 18/21
Epoch 19/21
Epoch 20/21
Epoch 21/21




INFO:tensorflow:Assets written to: /content/drive/MyDrive/naive_model/naive_simple_model_20/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/naive_model/naive_simple_model_20/assets


<tensorflow.python.keras.callbacks.History at 0x7fa44f475210>

In [47]:
class TextGenerator():
    def __init__(
        self, max_tokens, start_tokens, index_to_word, top_k=10,maxlen=20):
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.k = top_k
        self.maxlen=maxlen

    def sample_from(self, logits):
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        return self.index_to_word[number]

    def get_sen(self, model):
        start_tokens = [_ for _ in self.start_tokens]
        num_tokens_generated = 0
        tokens_generated = []
        while num_tokens_generated <= self.max_tokens:
            pad_len = self.maxlen - len(start_tokens)
            print(pad_len) #rem
            sample_index = len(start_tokens) - 1
            print(sample_index)
            if pad_len < 0:
                x = start_tokens[:self.maxlen]
                sample_index = self.maxlen - 1
            elif pad_len > 0:
                x = start_tokens + [0] * pad_len
            else:
                x = start_tokens
            print(x)
            x = np.array([x])
            y = model.predict(x)
            sample_token = self.sample_from(y[0][sample_index])
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)
            num_tokens_generated = len(tokens_generated)
        txt = " ".join(
            [self.detokenize(_) for _ in self.start_tokens + tokens_generated]
        )
        return txt

In [45]:
model = keras.models.load_model('/content/drive/MyDrive/naive_model/naive_simple_model_20')

In [49]:
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index
num_tokens_generated = 10
start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()]

In [50]:
start_prompt = "ye sab kya"
gen = TextGenerator(num_tokens_generated, start_tokens, vocab)

In [51]:
gen.get_sen(model)

17
2
[18, 52, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
16
3
[18, 52, 22, 2706, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
15
4
[18, 52, 22, 2706, 15240, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
14
5
[18, 52, 22, 2706, 15240, 8999, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
13
6
[18, 52, 22, 2706, 15240, 8999, 17031, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
12
7
[18, 52, 22, 2706, 15240, 8999, 17031, 15240, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
11
8
[18, 52, 22, 2706, 15240, 8999, 17031, 15240, 8999, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
10
9
[18, 52, 22, 2706, 15240, 8999, 17031, 15240, 8999, 9998, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
9
10
[18, 52, 22, 2706, 15240, 8999, 17031, 15240, 8999, 9998, 22782, 0, 0, 0, 0, 0, 0, 0, 0, 0]
8
11
[18, 52, 22, 2706, 15240, 8999, 17031, 15240, 8999, 9998, 22782, 19087, 0, 0, 0, 0, 0, 0, 0, 0]
7
12
[18, 52, 22, 2706, 15240, 8999, 17031, 15240, 8999, 9998, 22782, 19087, 17031, 0, 0, 0, 0, 0, 0, 0]


'ye sab kya pellet smone waar prayer smone waar potli gujaara martaba prayer prayer'