<a href="https://colab.research.google.com/github/bharathkp/api_a/blob/main/Text_Generation_using_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Text Generation** using a **Recurrent Neural Networks**, specifically a **Long Short-Term Memory Network**, implementing this network in Python, and use it to generate some text.

In [2]:
#importing dependencies
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from google.colab import files


file = files.upload()

Saving New.txt to New.txt


In [8]:
#loading the dataset
file  = file['New.txt'].decode('utf-8')  # Decode from bytes to string

# Now you can work with the file_content as a string

In [9]:
#now tokenizing words
def tokenize_words(input):
    # lowercase everything to standardize it
    input = input.lower()

    # instantiate the tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # if the created token isn't in the stop words, make it part of "filtered"
    filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
    return " ".join(filtered)

In [10]:
# preprocess the input data, makes tokens
processed_inputs = tokenize_words(file)

In [11]:
#converting characters to numbers as neural networks works on numbers
chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c, i) for i, c in enumerate(chars))

In [12]:
#checking number of characters and vocabularies
input_len = len(processed_inputs)
vocab_len = len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 2376
Total vocab: 53


In [13]:
seq_length = 100
x_data = []
y_data = []

In [14]:
# loop through inputs, start at the beginning and go until we hit
# the final character we can create a sequence out of
for i in range(0, input_len - seq_length, 1):
    # Define input and output sequences
    # Input is the current character plus desired sequence length
    in_seq = processed_inputs[i:i + seq_length]

    # Out sequence is the initial character plus total sequence length
    out_seq = processed_inputs[i + seq_length]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [15]:
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 2276


In [16]:
X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

In [19]:
from keras.utils import to_categorical # Import np_utils from keras.utils

y = to_categorical(y_data) # Now np_utils is accessible

In [20]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

  super().__init__(**kwargs)


In [23]:
filepath = "model_weights_saved.keras"  # Changed the file extension to .keras
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [24]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [27]:
model.fit(X, y, epochs=10, batch_size=256, callbacks=desired_callbacks)

Epoch 1/10
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - loss: 3.0945
Epoch 1: loss improved from inf to 3.07534, saving model to model_weights_saved.keras
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 4s/step - loss: 3.0926
Epoch 2/10
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - loss: 3.0246
Epoch 2: loss improved from 3.07534 to 3.03473, saving model to model_weights_saved.keras
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4s/step - loss: 3.0256
Epoch 3/10
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - loss: 2.9963
Epoch 3: loss improved from 3.03473 to 3.00931, saving model to model_weights_saved.keras
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 4s/step - loss: 2.9976
Epoch 4/10
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - loss: 2.9866
Epoch 4: loss improved from 3.00931 to 3.00846, saving model to model_weights_saved.keras
[

<keras.src.callbacks.history.History at 0x7d2287f62210>

In [29]:
filename = "model_weights_saved.keras"  # Use the correct filename and extension
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [30]:
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [39]:
start = numpy.random.randint(0, len(x_data) - 1)
pattern = x_data[start]
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" ctice cryptographic algorithms designed around computational hardness assumptions making algorithms  "


In [32]:
# generate the text
for i in range(1000):
  x = numpy.reshape(pattern, (1,len(pattern), 1))
  x = x/float(vocab_len)
  prediction = model.predict(x, verbose=0)
  index = numpy.argmax(prediction)
  result = num_to_char[index]
  seq_in = [num_to_char[value] for value in pattern]
  sys.stdout.write(result)
  pattern.append(index)
  pattern = pattern[ 1:len(pattern)]


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [38]:
print(prediction)

[[0.11163992 0.00068991 0.0007865  0.00098244 0.00103986 0.00084841
  0.00090733 0.00080171 0.00078755 0.06717286 0.00884308 0.05428566
  0.02820126 0.10842822 0.00664265 0.02388936 0.02085463 0.07610501
  0.00126399 0.00304311 0.03727389 0.03202148 0.05583789 0.05637149
  0.03925689 0.00170329 0.06519409 0.05131968 0.06512408 0.02357564
  0.01222755 0.00502228 0.00217644 0.02609301 0.00098546 0.00072572
  0.00042367 0.00083973 0.00097104 0.00092914 0.00065928 0.00066011
  0.00032995 0.0011426  0.00115146 0.00077066]]
