In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.regularizers import l2

import pickle
import numpy as np


with open("/kaggle/input/pride-prejudice-subtitles-and-text/PP.txt", "r", encoding="utf8") as f:
    data = f.read()  # Lire le contenu du fichier comme une chaîne de caractères

text = data.replace('\n', ' ').replace('\r', ' ').replace('\ufeff', ' ').replace('“', '').replace('”', '')
text = ' '.join(text.split())  # Nettoyage des espaces supplémentaires

text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

total_words=len(tokenizer.word_index)
total_words
tokenizer.word_index
sequence_data = tokenizer.texts_to_sequences([text])[0]

#Pour enregistrer le tokenizer
pickle.dump(tokenizer, open('token.pkl', 'wb'))

vocab_size = len(tokenizer.word_index) + 1
print(f"Taille du vocabulaire : {vocab_size}")

sequences = []
for i in range(5, len(sequence_data)):
    words = sequence_data[i-5:i+1]
    sequences.append(words)

print(f"35 premier sequences: {sequences[:35]}")
print("*"*35)
print("La longueur des séquences est : ", len(sequences))

sequences = np.array(sequences)

X = sequences[:, :-1]  # Les trois premiers mots
y = sequences[:, -1]   # Le mot à prédire

y = to_categorical(y, num_classes=vocab_size)






Taille du vocabulaire : 6956
35 premier sequences: [[4073, 13, 24, 6, 539, 2492], [13, 24, 6, 539, 2492, 703], [24, 6, 539, 2492, 703, 12], [6, 539, 2492, 703, 12, 6], [539, 2492, 703, 12, 6, 1026], [2492, 703, 12, 6, 1026, 119], [703, 12, 6, 1026, 119, 7], [12, 6, 1026, 119, 7, 1263], [6, 1026, 119, 7, 1263, 3], [1026, 119, 7, 1263, 3, 6], [119, 7, 1263, 3, 6, 97], [7, 1263, 3, 6, 97, 381], [1263, 3, 6, 97, 381, 63], [3, 6, 97, 381, 63, 18], [6, 97, 381, 63, 18, 7], [97, 381, 63, 18, 7, 344], [381, 63, 18, 7, 344, 3], [63, 18, 7, 344, 3, 6], [18, 7, 344, 3, 6, 345], [7, 344, 3, 6, 345, 124], [344, 3, 6, 345, 124, 98], [3, 6, 345, 124, 98, 279], [6, 345, 124, 98, 279, 1], [345, 124, 98, 279, 1, 177], [124, 98, 279, 1, 177, 67], [98, 279, 1, 177, 67, 1096], [279, 1, 177, 67, 1096, 3], [1, 177, 67, 1096, 3, 52], [177, 67, 1096, 3, 52, 6], [67, 1096, 3, 52, 6, 119], [1096, 3, 52, 6, 119, 99], [3, 52, 6, 119, 99, 18], [52, 6, 119, 99, 18, 29], [6, 119, 99, 18, 29, 16], [119, 99, 18, 29, 16

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
model = Sequential()
model.add(Embedding(vocab_size, 30))
model.add(LSTM(400, return_sequences=True))
model.add(LSTM(400))
model.add(Dense(vocab_size, activation="softmax"))
model.build(input_shape=(None, 5))

In [4]:
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))


In [5]:
model.summary()


In [6]:
early_stopping = EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True)
checkpoint = ModelCheckpoint("/tmp/next_words.keras", monitor='val_loss', verbose=1, save_best_only=True)
tensorboard_callback = TensorBoard(log_dir='./logs', histogram_freq=1)




In [7]:
history = model.fit(X_train, y_train,
                    epochs=500,
                    batch_size=64,
                    validation_data=(X_val, y_val),
                    callbacks=[checkpoint, early_stopping, tensorboard_callback])

Epoch 1/500
[1m1520/1523[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 7ms/step - loss: 6.6088
Epoch 1: val_loss improved from inf to 6.01198, saving model to /tmp/next_words.keras
[1m1523/1523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - loss: 6.6080 - val_loss: 6.0120
Epoch 2/500
[1m1516/1523[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 7ms/step - loss: 5.7682
Epoch 2: val_loss improved from 6.01198 to 5.63699, saving model to /tmp/next_words.keras
[1m1523/1523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - loss: 5.7677 - val_loss: 5.6370
Epoch 3/500
[1m1520/1523[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 7ms/step - loss: 5.2852
Epoch 3: val_loss improved from 5.63699 to 5.52424, saving model to /tmp/next_words.keras
[1m1523/1523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 9ms/step - loss: 5.2852 - val_loss: 5.5242
Epoch 4/500
[1m1520/1523[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 7ms/s

In [8]:
# history = model.fit(X_train, y_train,
#                     epochs=50,
#                     batch_size=64,
#                     validation_data=(X_val, y_val),
#                     callbacks=[checkpoint, early_stopping, tensorboard_callback])

In [9]:
### Generation d'une texte a l'aide d'une phrase saisit par utilsiateur
initial_text = "this night"
num_words_to_generate = 150

for _ in range(num_words_to_generate):
    # Convertir le texte actuel en séquence d'indices
    sequence = tokenizer.texts_to_sequences([initial_text])[0]
    sequence = np.array(sequence[-5:]).reshape(1, -1)  

    # Prédire le prochain mot
    predicted_probabilities = model.predict(sequence)
    predicted_index = np.argmax(predicted_probabilities)
    predicted_word = tokenizer.index_word.get(predicted_index, '')

    initial_text += ' ' + predicted_word

print("Texte généré:", initial_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 