In [1]:
# Install necessary libraries
!pip install datasets

# Import libraries
import numpy as np
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
# Load the dataset (using Hugging Face's wikitext dataset)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Extract text data
text_data = dataset['train']['text']

# Filter out empty lines
text_data = [text for text in text_data if text.strip() != ""]

# Tokenize the text
tokenizer = Tokenizer(num_words=20000)  # Increase vocabulary size
tokenizer.fit_on_texts(text_data)

# Create input sequences
input_sequences = []
for text in text_data:
    tokenized_text = tokenizer.texts_to_sequences([text])[0]
    for i in range(1, len(tokenized_text)):
        input_sequences.append(tokenized_text[:i+1])

# Limit dataset size to avoid memory issues
input_sequences = input_sequences[:10000]  # Use only 10,000 sequences

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [3]:
# Set a smaller max sequence length
max_len = 50

# Pad sequences
padded_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

# Prepare X and y
X = padded_sequences[:, :-1]
y = padded_sequences[:, -1]

In [4]:
# Use a data generator to avoid memory issues
class DataGenerator(Sequence):
    def __init__(self, input_sequences, batch_size, max_len, vocab_size):
        self.input_sequences = input_sequences
        self.batch_size = batch_size
        self.max_len = max_len
        self.vocab_size = vocab_size

    def __len__(self):
        return int(np.ceil(len(self.input_sequences) / self.batch_size))

    def __getitem__(self, idx):
        batch_sequences = self.input_sequences[idx * self.batch_size:(idx + 1) * self.batch_size]
        padded_sequences = pad_sequences(batch_sequences, maxlen=self.max_len, padding='pre')
        X = padded_sequences[:, :-1]
        y = padded_sequences[:, -1]
        return X, y

In [5]:
# Create the generator
batch_size = 32
vocab_size = len(tokenizer.word_index) + 1
data_gen = DataGenerator(input_sequences, batch_size, max_len, vocab_size)


In [6]:
# Build the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len-1))
model.add(LSTM(256, return_sequences=True))  # Increase units
model.add(LSTM(256))  # Add another LSTM layer
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model with sparse_categorical_crossentropy
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Add early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
model.fit(data_gen, epochs=50, callbacks=[early_stopping])  # Train for more epochs



Epoch 1/50


  self._warn_if_super_not_called()


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 31ms/step - accuracy: 0.0785 - loss: 8.8154
Epoch 2/50
[1m  5/313[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m9s[0m 30ms/step - accuracy: 0.1066 - loss: 6.4152

  current = self.get_monitor_value(logs)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.0795 - loss: 6.6740
Epoch 3/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.0793 - loss: 6.5678
Epoch 4/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.0836 - loss: 6.3106
Epoch 5/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.0983 - loss: 6.1091
Epoch 6/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.0993 - loss: 5.9514
Epoch 7/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.1086 - loss: 5.7827
Epoch 8/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.1174 - loss: 5.6274
Epoch 9/50
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.1240 - loss: 5.4957
Epoch 10/50
[1m313/313[0m [32m━━

<keras.src.callbacks.history.History at 0x7f2f3e351cd0>

In [7]:
# Temperature sampling function
def generate_text_with_temperature(model, tokenizer, seed_text, max_len=50, temperature=1.0):
    for _ in range(max_len):
        tokenized_text = tokenizer.texts_to_sequences([seed_text])[0]
        padded_text = pad_sequences([tokenized_text], maxlen=max_len-1, padding='pre')
        predictions = model.predict(padded_text, verbose=0)[0]

        # Apply temperature sampling
        predictions = np.log(predictions) / temperature
        exp_predictions = np.exp(predictions)
        predictions = exp_predictions / np.sum(exp_predictions)

        predicted_word_idx = np.random.choice(len(predictions), p=predictions)
        predicted_word = tokenizer.index_word.get(predicted_word_idx, '')
        seed_text += " " + predicted_word
    return seed_text

In [11]:
# Test with temperature sampling
seed_text = "how are"
print(generate_text_with_temperature(model, tokenizer, seed_text, temperature=0.7))

how are launched and tgs 000 if the situations deemed the blitz system is carried over directly received barker 's children do though barker 's football in a trunk in two forces and her arms that he serialized in dengeki february 1973 aged 77 years two funeral services were held were calamity
