<a href="https://colab.research.google.com/github/Yashwant-SriSai/Shakesphere_Copy_Cat/blob/main/text_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
#Imports
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding,Dropout
from tensorflow.keras.optimizers import RMSprop
import random
import sys

In [11]:
# Load the Shakespeare dataset.
# get_file downloads the corpus from `origin` and returns the local file path.
print("Downloading Sakesphere's Data....")
path = tf.keras.utils.get_file(
    'shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)
with open(path, encoding="utf-8") as corpus_file:
  raw_text = corpus_file.read()
# Lower-case first so the model does not have to learn that "a" and "A" are
# the same letter, then keep only the first 200,000 characters as the corpus.
text = raw_text.lower()[:200000]
print(f'Corpus Length:{len(text)}')


Downloading Sakesphere's Data....
Corpus Length:200000


In [12]:
# Map characters to integers.
# The vocabulary is the sorted set of distinct characters found in the corpus.
chars = sorted(set(text))
print(f'total no.of unique characters:{len(chars)}')
# Two lookup tables: character -> integer id, and integer id -> character.
char_indices = dict(zip(chars, range(len(chars))))
indices_char = dict(enumerate(chars))

total no.of unique characters:37


In [13]:
# Build the training sequences with a sliding window over the text.
# Each window of `max_len` characters becomes one entry of `sentences`, and the
# single character that follows it becomes the matching entry of `next_char`
# (the value the model should predict). The window advances `step` characters
# at a time, so overlapping windows yield many training examples.
max_len=40
step=3
# Use `step` here (the original hard-coded 3), so changing the setting above
# actually changes the stride.
window_starts=range(0,len(text)-max_len,step)
sentences=[text[start:start+max_len] for start in window_starts]
next_char=[text[start+max_len] for start in window_starts]
print(f'Number of training Sequences:{len(sentences)}')

Number of training Sequences:66654


In [14]:
# Integer-encode the windows: every character is replaced by its integer id.
print("Integer Ecoding data for the embedding layer...")
# x has shape (len(sentences), max_len) and y has shape (len(sentences),).
# Both start as all zeros and are then filled with the ids from char_indices.
x = np.zeros((len(sentences), max_len), dtype=np.int32)
y = np.zeros((len(sentences),), dtype=np.int32)

for row, window in enumerate(sentences):
  # Write the ids of one whole window into its row in a single assignment.
  x[row, :len(window)] = [char_indices[symbol] for symbol in window]
  # The target is the id of the character that follows this window.
  y[row] = char_indices[next_char[row]]
print(f'Id_s of x: {x}')
print(f'Target values Id_s: {y}')

Integer Ecoding data for the embedding layer...
Id_s of x: [[16 19 28 ... 16 31 28]
 [29 30  1 ... 30 18 15]
 [13 19 30 ... 28  5  1]
 ...
 [15 24  1 ... 15 15 10]
 [30 18 25 ...  0  0 13]
 [31  1 18 ... 22 11 28]]
Target values Id_s: [30 28 18 ...  0 22 15]


In [15]:
# Build the Embedding + LSTM model.
print("Building Model...")
vocab_size = len(chars)
model = Sequential()
# Embedding takes input_dim (the vocabulary size, i.e. the number of unique
# characters) and output_dim (the size of the dense vector each id maps to).
model.add(Embedding(input_dim=vocab_size, output_dim=64))
model.add(LSTM(256, return_sequences=False))
model.add(Dropout(0.2))
# One softmax output per vocabulary entry: a distribution over the next character.
model.add(Dense(vocab_size, activation="softmax"))

# categorical_crossentropy expects one-hot targets such as [0, 0, 0, 1, 0, 0].
# sparse_categorical_crossentropy expects a single integer class id such as 3,
# which is the form `y` is stored in, so the sparse variant is used.
optimizer = RMSprop(learning_rate=0.001, clipnorm=1.0)
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer)

Building Model...


In [16]:
# Sampling helper that controls how random the generated characters are.
def sample(preds,temp):
  """Draw one index from `preds` after rescaling it by temperature `temp`.

  Args:
    preds: 1-D array-like of probabilities (e.g. one softmax output row).
    temp: Temperature. Values below 1 sharpen the distribution, so the most
      likely index dominates; values above 1 flatten it, so unlikely indices
      are picked more often. A temperature of exactly 0 selects the most
      likely index deterministically (greedy decoding).

  Returns:
    The sampled index as a NumPy integer.
  """
  preds=np.asarray(preds).astype("float64")
  if temp==0:
    # Dividing by zero would turn every probability into NaN; the limit of
    # temperature -> 0 is simply "always pick the most likely index".
    return np.argmax(preds)
  # Convert the probabilities back to log-scores and rescale by temperature.
  # The 1e-7 offset avoids log(0), which is undefined.
  scaled=np.log(preds+1e-7)/temp
  # Subtract the maximum before exponentiating. This leaves the normalised
  # result mathematically unchanged but keeps at least one term equal to 1,
  # so very small temperatures cannot underflow every term to 0 (0/0 -> NaN).
  scaled=scaled-np.max(scaled)
  exp_preds=np.exp(scaled)
  preds=exp_preds/np.sum(exp_preds)

  # Draw one sample from the rescaled distribution; the result is one-hot.
  probas=np.random.multinomial(1,preds,1)
  return np.argmax(probas)

In [17]:
# Training settings.
epochs = 50          # Number of passes over the training data
batch_size = 512
generation_temp = 0.5 # Sampling temperature used for the preview text after each epoch

print(f"Starting training for {epochs} epochs...")

# Train one epoch at a time and, after each epoch, generate a short sample so
# the progress of the model can be inspected.
for epoch in range(epochs):
    print(f"\n--- Epoch {epoch + 1} ---")

    # Train for one epoch
    model.fit(x, y, batch_size=batch_size, epochs=1)

    # Pick a random starting seed of max_len characters from the corpus
    start_index = random.randint(0, len(text) - max_len - 1)
    sentence = text[start_index: start_index + max_len]

    print(f"\n--- Generating with Temperature: {generation_temp} ---")
    print(f'Seed: "{sentence}"\n')

    generated = sentence
    sys.stdout.write(generated)

    # Generate 100 characters
    for _ in range(100):
        # Encode the current window as integer ids for the Embedding layer
        x_pred = np.zeros((1, max_len), dtype=np.int32)
        for t, char in enumerate(sentence):
            x_pred[0, t] = char_indices[char]

        # Predict, Sample, and Decode
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, generation_temp)
        # Use a dedicated name for the generated character: rebinding the
        # module-level `next_char` list (the training targets) to a string
        # would break any re-run of the cell that encodes `y` from it.
        predicted_char = indices_char[next_index]

        # Slide the window: drop the oldest character, append the new one
        generated += predicted_char
        sentence = sentence[1:] + predicted_char

        sys.stdout.write(predicted_char)
        sys.stdout.flush()
    print("\n") # Add a final newline for clean formatting

Starting training for 50 epochs...

--- Epoch 1 ---
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 1s/step - loss: 3.1647

--- Generating with Temperature: 0.5 ---
Seed: "d in the leash,
to let him slip at will."

d in the leash,
to let him slip at will. tout teuu chet yon
tou aers
hte tose sars ten ttece ne wouttn
ttod touun:
es
g: ant berunnes tuntto


--- Epoch 2 ---
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 1s/step - loss: 2.6496

--- Generating with Temperature: 0.5 ---
Seed: " behold'st!

aufidius:
you keep a consta"

 behold'st!

aufidius:
you keep a constace inco wal vanent one bas bere
doul caveeng,

wand setf som inen
bans siuredd
ant sore beakd tous h


--- Epoch 3 ---
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 1s/step - loss: 2.4141

--- Generating with Temperature: 0.5 ---
Seed: "eir tides for their glories,
an outward "

eir tides for their glories,
an outward mone int haml thith were on maron mane th