In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from google.colab import drive
import os
import string
import random

# Mount Google Drive
drive.mount('/content/drive')

# Load and preprocess dataset
file_path = '/content/drive/MyDrive/wonderland.txt'

# Read and clean the text
with open(file_path, 'r', encoding='utf-8') as file:
    raw_text = file.read()

# Remove punctuation and convert to lowercase
raw_text = raw_text.translate(str.maketrans('', '', string.punctuation)).lower()

# create mapping of unique chars to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
print(chars)

#here we can see all the unique charecter are mapped to a unique number
print(char_to_int)

n_chars = len(raw_text)
n_vocab = len(chars)
print ("Total Characters: ", n_chars)
print ("Total Vocab: ", n_vocab)

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([raw_text])
sequences = tokenizer.texts_to_sequences([raw_text])[0]
vocab_size = len(tokenizer.word_index) + 1
print (vocab_size)

# Prepare input-output pairs for training
seq_length = 50
X = []
y = []
for i in range(seq_length, len(sequences)):
    X.append(sequences[i - seq_length:i])
    y.append(sequences[i])

# Pad sequences and one-hot encode the output
X = np.array(pad_sequences(X, maxlen=seq_length))
y = to_categorical(y, num_classes=vocab_size)

# Build the LSTM model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=seq_length),
    LSTM(256, return_sequences=True),
    Dropout(0.4),
    LSTM(256),
    Dropout(0.4),
    Dense(vocab_size, activation='softmax')
])

# Build the model explicitly
model.build(input_shape=(None, seq_length))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display the model summary
model.summary()



Mounted at /content/drive
['\n', ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '‘', '’', '“', '”', '\ufeff']
{'\n': 0, ' ': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, 'i': 10, 'j': 11, 'k': 12, 'l': 13, 'm': 14, 'n': 15, 'o': 16, 'p': 17, 'q': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27, '‘': 28, '’': 29, '“': 30, '”': 31, '\ufeff': 32}
Total Characters:  139054
Total Vocab:  33




In [3]:
# Define a checkpoint callback
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.keras"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

# Train the model
epochs = 100  # Increased epochs
batch_size = 32  # Experiment with different batch sizes
model.fit(X, y, epochs=epochs, batch_size=batch_size, callbacks=callbacks_list)

# Menyimpan model yang telah dilatih
model.save("trained_model.keras")
print("Model telah disimpan sebagai 'trained_model.keras'")

# Menampilkan file .keras di direktori kerja
files = [f for f in os.listdir() if f.endswith('.keras')]
print("Files found:", files)


Epoch 1/100
[1m820/823[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.0595 - loss: 6.6446
Epoch 1: loss improved from inf to 6.45643, saving model to weights-improvement-01-6.4564.keras
[1m823/823[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 12ms/step - accuracy: 0.0595 - loss: 6.6437
Epoch 2/100
[1m822/823[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.0612 - loss: 6.1774
Epoch 2: loss improved from 6.45643 to 6.15935, saving model to weights-improvement-02-6.1594.keras
[1m823/823[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.0612 - loss: 6.1774
Epoch 3/100
[1m819/823[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.0674 - loss: 6.0522
Epoch 3: loss improved from 6.15935 to 5.99741, saving model to weights-improvement-03-5.9974.keras
[1m823/823[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.0674 - loss: 6.0518
Epoch

In [4]:
# Train the model
epochs = 100  # Increased epochs
batch_size = 64  # Experiment with different batch sizes
model.fit(X, y, epochs=epochs, batch_size=batch_size, callbacks=callbacks_list)

# Menyimpan model yang telah dilatih
model.save("trained_model.keras")
print("Model telah disimpan sebagai 'trained_model.keras'")

# Menampilkan file .keras di direktori kerja
files = [f for f in os.listdir() if f.endswith('.keras')]
print("Files found:", files)

Epoch 1/100
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.6234 - loss: 1.3452
Epoch 1: loss improved from 1.43366 to 1.33580, saving model to weights-improvement-01-1.3358.keras
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.6234 - loss: 1.3451
Epoch 2/100
[1m411/412[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - accuracy: 0.6357 - loss: 1.2688
Epoch 2: loss improved from 1.33580 to 1.29972, saving model to weights-improvement-02-1.2997.keras
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.6357 - loss: 1.2689
Epoch 3/100
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.6452 - loss: 1.2590
Epoch 3: loss improved from 1.29972 to 1.28816, saving model to weights-improvement-03-1.2882.keras
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - accuracy: 0.6452 - loss: 1.2590
Epo

In [5]:
# Train the model
epochs = 100  # Increased epochs
batch_size = 256  # Experiment with different batch sizes
model.fit(X, y, epochs=epochs, batch_size=batch_size, callbacks=callbacks_list)

# Menyimpan model yang telah dilatih
model.save("trained_model.keras")
print("Model telah disimpan sebagai 'trained_model.keras'")

# Menampilkan file .keras di direktori kerja
files = [f for f in os.listdir() if f.endswith('.keras')]
print("Files found:", files)

Epoch 1/100
[1m102/103[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 34ms/step - accuracy: 0.7679 - loss: 0.7676
Epoch 1: loss improved from 0.80968 to 0.75885, saving model to weights-improvement-01-0.7588.keras
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.7680 - loss: 0.7674
Epoch 2/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.7865 - loss: 0.7154
Epoch 2: loss improved from 0.75885 to 0.72174, saving model to weights-improvement-02-0.7217.keras
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 35ms/step - accuracy: 0.7864 - loss: 0.7155
Epoch 3/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.7884 - loss: 0.6994
Epoch 3: loss improved from 0.72174 to 0.70189, saving model to weights-improvement-03-0.7019.keras
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.7884 - loss: 0.6995
Epoc

In [17]:
# Train the model
epochs = 10  # Increased epochs
batch_size = 3200  # Experiment with different batch sizes
history = model.fit(X, y, epochs=epochs, batch_size=batch_size, callbacks=callbacks_list)

# Menyimpan model yang telah dilatih
model.save("trained_model.keras")
print("Model telah disimpan sebagai 'trained_model.keras'")

# Menampilkan file .keras di direktori kerja
files = [f for f in os.listdir() if f.endswith('.keras')]
print("Files found:", files)

Epoch 1/10
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295ms/step - accuracy: 0.9091 - loss: 0.2933
Epoch 1: loss improved from 0.31078 to 0.30292, saving model to weights-improvement-01-0.3029.keras
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 306ms/step - accuracy: 0.9087 - loss: 0.2943
Epoch 2/10
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295ms/step - accuracy: 0.9029 - loss: 0.3143
Epoch 2: loss did not improve from 0.30292
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 296ms/step - accuracy: 0.9029 - loss: 0.3144
Epoch 3/10
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 307ms/step - accuracy: 0.9015 - loss: 0.3097
Epoch 3: loss did not improve from 0.30292
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 309ms/step - accuracy: 0.9017 - loss: 0.3093
Epoch 4/10
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 297ms/step - accuracy: 0.9033 - loss: 0.3129
Epoch 4: loss did not

In [20]:
import random
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Generate text from the model
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len, words_per_line=20):
    generated_text = ""
    word_count = 0

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len, padding='pre')
        prediction = model.predict(token_list, verbose=0)
        prediction = np.asarray(prediction).astype('float64')
        prediction = np.log(prediction + 1e-7) / 1.0  # Experiment with temperature
        exp_preds = np.exp(prediction)
        prediction = exp_preds / np.sum(exp_preds)
        index = np.argmax(np.random.multinomial(1, prediction[0], 1))
        output_word = ""
        for word, idx in tokenizer.word_index.items():
            if idx == index:
                output_word = word
                break

        seed_text += " " + output_word
        seed_text = seed_text.split(' ')[-max_sequence_len:]
        seed_text = ' '.join(seed_text)

        # Add the generated word to the output and manage line breaks
        generated_text += output_word + " "
        word_count += 1
        if word_count % words_per_line == 0:
            generated_text += "\n"

    return generated_text

# Pick a random seed
start_index = random.randint(0, len(sequences) - seq_length - 1)
seed_text = ' '.join([list(tokenizer.word_index.keys())[word - 1] for word in sequences[start_index: start_index + seq_length]])
print("Seed:")
print(seed_text)

# Generate words with paragraph formatting
print("\nGenerated text:")
print(generate_text(seed_text, 25, model, tokenizer, seq_length))


Seed:
puppy began a series of short charges at the stick running a very little way forwards each time and a long way back and barking hoarsely all the while till at last it sat down a good way off panting with its tongue hanging out of its mouth and its

Generated text:
great eyes half shut this seemed to alice a good opportunity for making her escape so she set off at 
once and ran till she 
