<a href="https://colab.research.google.com/github/alexuqt/neural-networks/blob/main/6_RNN_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from numpy import array

import matplotlib.pyplot as plt
import pickle
import time

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [None]:
# source text
# Tiny sample sentence, useful for trying the pipeline without the Drive corpus.
# NOTE: the Drive-loading cell below rebinds `data` to the full lower-cased corpus.
data = "I am not the kind of girl"

In [None]:
# Mount Google Drive and load the training corpus as one lower-cased string.
from google.colab import drive
drive.mount('/content/drive')
# Alternative corpora are kept commented out; exactly one `path` must be active.
# path = '/content/drive/My Drive/CTTC/1661-0.txt'
path = '/content/drive/My Drive/CTTC/RNN-data/all_tswift_lyrics.txt'
# path = '/content/drive/My Drive/CTTC/RNN-data/NosotrosEnLaLuna.txt'
# Use a context manager so the file handle is closed deterministically, and an
# explicit encoding so the result does not depend on the platform default.
# 'utf-8-sig' reads plain UTF-8 unchanged and strips a leading BOM if present,
# which would otherwise end up glued to the first token.
with open(path, encoding='utf-8-sig') as corpus_file:
    data = corpus_file.read().lower()
print('length of the corpus is: :', len(data))

MessageError: ignored

In [None]:
# Fit a word-level tokenizer on the corpus and map the corpus to integer ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

word_index = tokenizer.word_index
# Index 0 is reserved for padding, so the vocabulary needs one extra slot.
vocab_size = 1 + len(word_index)

# texts_to_sequences works on a batch of texts; we passed one, so take item 0.
encoded_batch = tokenizer.texts_to_sequences([data])
encoded_data = encoded_batch[0]

print('Tokenizer/Dictionary size (0 padding included):', vocab_size, word_index)
print("Encoded data:", encoded_data)

In [None]:
# Slide a window of two ids over the corpus: each item is [word, following word].
sequences = [encoded_data[i - 1:i + 1] for i in range(1, len(encoded_data))]
print('Total sequences:', len(sequences))

# Sanity check: show the first five pairs as words instead of ids.
for pair in sequences[:5]:
    decoded_words = [tokenizer.index_word[idx] for idx in pair]
    # ' '.join(decoded_words) would turn ['you', "don't"] into "you don't"
    print(decoded_words)

In [None]:
# Turn the list of pairs into a 2-column array: column 0 is the input word id
# (X), column 1 is the id of the word that follows it (y).
sequences = array(sequences)
X = sequences[:, 0]
y = sequences[:, 1]
print(X[:5])
print(y[:5])

In [None]:
# one hot encode outputs: every target id becomes a vector with vocab_size classes
y = to_categorical(y, num_classes=vocab_size)
# peek at the first five encoded targets (shown as the cell's output)
y[:5]

In [None]:
# Next-word model: one input word id -> 10-dim embedding -> LSTM(50) ->
# softmax over the whole vocabulary (one probability per word id).
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line after the table.
model.summary()

In [None]:
# compile network
# categorical_crossentropy pairs with the one-hot targets built by
# to_categorical earlier in the notebook; accuracy is tracked for the plots.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# fit network and report how long training took
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted while training runs.
start_time = time.perf_counter()

history = model.fit(X, y, epochs=30)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"It took {elapsed_time:.2f} seconds to train the model")

In [None]:
# Persist the trained model and its training history, then read both back so
# the following cells work from the saved artifacts.
model.save('ts.h5')
# Context managers guarantee the pickle file is flushed and closed before it
# is reopened for reading (the bare open() calls leaked both handles).
with open("history.p", "wb") as history_file:
    pickle.dump(history, history_file)

model = load_model('ts.h5')
# NOTE: unpickling executes code embedded in the file; only load a history.p
# that this notebook wrote itself.
with open("history.p", "rb") as history_file:
    history = pickle.load(history_file)

In [None]:
# Training curves side by side: accuracy on the left, loss on the right.
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: accuracy per epoch
accuracy_ax.plot(history.history['accuracy'])
accuracy_ax.set_title('Model Accuracy')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_xlabel('Epoch')

# Right panel: loss per epoch, in a different color to tell the curves apart
loss_ax.plot(history.history['loss'], color='orange')
loss_ax.set_title('Model Loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')

plt.tight_layout()  # keep titles and axis labels from overlapping
plt.show()

### **Option 1: get a phrase adding N words**

In [None]:
def generate_seq(model, tokenizer, enter_text, n_pred):
    """Greedily extend a seed word with up to ``n_pred`` predicted words.

    Starting from ``enter_text``, the most probable next word (argmax of the
    model output) is appended and then used as the input for the next step.

    Args:
        model: Object with ``predict(x, verbose=0)`` returning one row of
            per-word-id probabilities for an input of shape ``(1, 1)``.
        tokenizer: Fitted tokenizer exposing ``word_index``, ``index_word``
            and ``texts_to_sequences``.
        enter_text: Seed word; looked up case-insensitively in the vocabulary.
        n_pred: Maximum number of words to append.

    Returns:
        The seed followed by the generated words, separated by single spaces,
        or a "not in the tokenizer's vocabulary" message when the seed is
        unknown.
    """
    if tokenizer.word_index.get(enter_text.lower()) is None:
        return f"'{enter_text}' is not in the tokenizer's vocabulary"

    in_text = enter_text
    words = [enter_text]

    for _ in range(n_pred):
        # Encode the current word as a (1, 1) batch of word ids
        encoded = np.array(tokenizer.texts_to_sequences([in_text])[0])

        # Probabilities for every word id in the vocabulary
        yhat_probs = model.predict(encoded.reshape(1, -1), verbose=0)[0]
        yhat_idx = int(np.argmax(yhat_probs))

        out_word = tokenizer.index_word.get(yhat_idx)
        if out_word is None:
            # The model picked an id with no word (e.g. padding index 0).
            # Feeding an empty word back would give the model an empty input
            # and leave dangling spaces in the result, so stop here.
            break

        words.append(out_word)
        in_text = out_word

    return ' '.join(words)





In [None]:
# evaluate: print the seed word 'this' followed by 13 predicted words
print(generate_seq(model, tokenizer, 'this', 13))

### **Option 2: get the top N candidates for the next word**

In [None]:
def get_next(model, tokenizer, enter_text, top_n=3):
    """Print and return the ``top_n`` most probable words after a seed word.

    Each candidate is printed as ``rank  word  probability%`` in descending
    order of probability.

    Args:
        model: Object with ``predict(x, verbose=0)`` returning one row of
            per-word-id probabilities for an input of shape ``(1, 1)``.
        tokenizer: Fitted tokenizer exposing ``word_index``, ``index_word``
            and ``texts_to_sequences``.
        enter_text: Seed word; looked up case-insensitively in the vocabulary.
        top_n: Maximum number of candidates to report.

    Returns:
        A list with up to ``top_n`` candidate words, most probable first, or
        a "not in the tokenizer's vocabulary" message when the seed is unknown.
    """
    if tokenizer.word_index.get(enter_text.lower()) is None:
        return f"'{enter_text}' is not in the tokenizer's vocabulary"

    # Encode the seed as a (1, 1) batch of word ids
    encoded = np.array(tokenizer.texts_to_sequences([enter_text])[0])

    # Probabilities for every word id in the vocabulary
    yhat_probs = model.predict(encoded.reshape(1, -1), verbose=0)[0]

    # Walk the ids from most to least probable, keeping only ids that map to
    # a real word. The output layer also has a slot for padding index 0,
    # which has no entry in index_word and used to raise KeyError whenever it
    # ranked among the top candidates.
    top_indices = []
    for idx in np.argsort(-yhat_probs):
        if len(top_indices) >= top_n:
            break
        idx = int(idx)
        if idx in tokenizer.index_word:
            top_indices.append(idx)

    for position, idx in enumerate(top_indices, start=1):
        print(f"{position:<3} {tokenizer.index_word[idx]:<12} {yhat_probs[idx]*100:.1f}%")

    return [tokenizer.index_word[idx] for idx in top_indices]

In [None]:
# Print the three most probable words after 'why'; the trailing ';' keeps the
# notebook from also echoing the returned list.
get_next(model, tokenizer, 'why', 3);

### **Option 3: build a phrase choosing between N words**

In [None]:
def _prompt_choice(upper):
    """Ask until the user types 'q' (returns None) or an integer in 1..upper."""
    while True:
        choice = input(f"Please enter a number between 1 and {upper}: ")
        if choice == 'q':
            return None
        # isdecimal() only accepts characters int() can parse; isdigit() also
        # accepts e.g. superscript digits, which make int() raise ValueError.
        if choice.isdecimal():
            chosen_index = int(choice)
            if 1 <= chosen_index <= upper:
                return chosen_index
            print(f"Number is not in the range 1-{upper}. Please try again.")


def build_phrase(model, tokenizer, enter_text, top_n=3):
    """Interactively grow a phrase by picking among the top predicted words.

    On every round the ``top_n`` most probable next words are listed (via
    ``get_next``) and the user picks one by number; the pick is appended to
    the phrase and becomes the input of the next round. Typing ``q`` ends the
    session. The phrase so far is printed after each pick and once more when
    the session ends.

    Args:
        model: Trained next-word model, passed through to ``get_next``.
        tokenizer: Fitted tokenizer exposing ``word_index``.
        enter_text: Seed word; looked up case-insensitively in the vocabulary.
        top_n: Number of candidates offered on each round.

    Returns:
        A "not in the tokenizer's vocabulary" message when the seed is
        unknown; otherwise None (the phrase is only printed).
    """
    if tokenizer.word_index.get(enter_text.lower()) is None:
        return f"'{enter_text}' is not in the tokenizer's vocabulary"

    in_text, result = enter_text, enter_text
    print("Press 'q' to quit")

    while True:
        options = get_next(model, tokenizer, in_text, top_n)

        chosen_index = None
        # get_next hands back a message string instead of a list when it
        # cannot look the word up; an empty list means nothing to offer.
        if not isinstance(options, str) and len(options) > 0:
            # Never offer more choices than there are candidates.
            chosen_index = _prompt_choice(min(top_n, len(options)))

        if chosen_index is None:
            # Quit (or nothing to choose from). Leave before indexing into
            # options: the old code fell through and used an unset or stale
            # chosen_index here.
            break

        in_text = options[chosen_index - 1]
        result += ' ' + in_text
        print("-----")
        print(result)
        print("-----")

    # Show the final phrase once more on exit
    print("-----")
    print(result)
    print("-----")

In [None]:
# Start an interactive session seeded with 'hola'.
# NOTE(review): if 'hola' is not in the fitted vocabulary this call just
# returns the not-in-vocabulary message -- confirm the seed matches the corpus.
build_phrase(model, tokenizer, 'hola', 3)