In [2]:
pip install tensorflow


Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow
  Downloading tensorflow-2.18.0-cp39-cp39-macosx_12_0_arm64.whl (239.4 MB)
[K     |████████████████████████████████| 239.4 MB 13.7 MB/s eta 0:00:01     |██████████████▉                 | 111.0 MB 18.6 MB/s eta 0:00:07
[?25hCollecting absl-py>=1.0.0
  Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[K     |████████████████████████████████| 133 kB 12.9 MB/s eta 0:00:01
[?25hCollecting termcolor>=1.1.0
  Downloading termcolor-2.5.0-py3-none-any.whl (7.8 kB)
Collecting h5py>=3.11.0
  Downloading h5py-3.12.1-cp39-cp39-macosx_11_0_arm64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 18.9 MB/s eta 0:00:01
[?25hCollecting libclang>=13.0.0
  Downloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl (25.8 MB)
[K     |████████████████████████████████| 25.8 MB 11.1 MB/s eta 0:00:01
Collecting google-pasta>=0.1.1
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 

In [2]:
pip install matplotlib

Defaulting to user installation because normal site-packages is not writeable
Collecting matplotlib
  Downloading matplotlib-3.9.4-cp39-cp39-macosx_11_0_arm64.whl (7.8 MB)
[K     |████████████████████████████████| 7.8 MB 5.4 MB/s eta 0:00:01
Collecting cycler>=0.10
  Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Collecting kiwisolver>=1.3.1
  Downloading kiwisolver-1.4.7-cp39-cp39-macosx_11_0_arm64.whl (64 kB)
[K     |████████████████████████████████| 64 kB 6.2 MB/s eta 0:00:011
Collecting pyparsing>=2.3.1
  Downloading pyparsing-3.2.1-py3-none-any.whl (107 kB)
[K     |████████████████████████████████| 107 kB 9.2 MB/s eta 0:00:01
[?25hCollecting contourpy>=1.0.1
  Downloading contourpy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl (249 kB)
[K     |████████████████████████████████| 249 kB 20.3 MB/s eta 0:00:01
Collecting fonttools>=4.22.0
  Downloading fonttools-4.55.3-cp39-cp39-macosx_10_9_universal2.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 24.9 MB/s eta 0:00:

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np
import requests
import zipfile
import os
import matplotlib.pyplot as plt

# Step 1: Download the dataset
dataset_url = "https://drive.google.com/uc?export=download&id=1GeUzNVqiixXHnTl8oNiQ2W3CynX_lsu2"
response = requests.get(dataset_url)
dataset_path = "dataset.zip"

# Save and extract the dataset
with open(dataset_path, "wb") as f:
    f.write(response.content)

with zipfile.ZipFile(dataset_path, "r") as zip_ref:
    zip_ref.extractall("dataset")

# Step 2: Read the dataset file
with open("dataset/text_file.txt", "r", encoding="utf-8") as f:  # Replace with your text file name
    data = f.read()

# Step 3: Clean and tokenize the text
data = data.lower().replace("\n", " ")  # Convert to lowercase and replace newlines

# Tokenizer for word indexing
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
total_words = len(tokenizer.word_index) + 1

# Step 4: Convert text into sequences of tokens
input_sequences = []
for line in data.split("."):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

# Step 5: Pad sequences to the same length
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Step 6: Split into predictors and label
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)

# Step 7: Build the LSTM model
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len - 1))
model.add(LSTM(150))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Step 8: Train the model
history = model.fit(X, y, epochs=10, verbose=1)

# Step 9: Plot training loss
plt.plot(history.history['loss'], label='Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.legend()
plt.show()

# Step 10: Function to predict the next word
def predict_next_word(seed_text, tokenizer, model, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = tf.keras.preprocessing.sequence.pad_sequences([token_list],
                                                                maxlen=max_sequence_len - 1, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            return word
    return ""

# Step 11: Test the model
seed_text = "the quick brown fox"
next_word = predict_next_word(seed_text, tokenizer, model, max_sequence_len)
print(f"Input: '{seed_text}' -> Predicted next word: '{next_word}'")




BadZipFile: File is not a zip file