In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
import pickle
import numpy as np


# Ouvrir et lire le fichier
with open("/kaggle/input/pride-prejudice-subtitles-and-text/PP.txt", "r", encoding="utf8") as f:
    data = f.read()  # Lire le contenu du fichier comme une chaîne de caractères

# **Methode 1:** **Concaténation du texte dans une seule liste de tokens**
text = data.replace('\n', ' ').replace('\r', ' ').replace('\ufeff', ' ').replace('“', '').replace('”', '')
text = ' '.join(text.split())  # Nettoyage des espaces supplémentaires

text
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts([data])

total_words=len(tokenizer.word_index)
total_words
tokenizer.word_index
sequence_data = tokenizer.texts_to_sequences([text])[0]
#Pour enregistrer le tokenizer
pickle.dump(tokenizer, open('token.pkl', 'wb'))

# Taille du vocabulaire
vocab_size = len(tokenizer.word_index) + 1
print(f"Taille du vocabulaire : {vocab_size}")

# Création des séquences
sequences = []
for i in range(3, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)

print(f"35 premier sequences: {sequences[:35]}")
print("*"*35)
print("La longueur des séquences est : ", len(sequences))

# Conversion des séquences en numpy array
sequences = np.array(sequences)

# Séparation des données et des cibles
X = sequences[:, :-1]  # Les trois premiers mots
y = sequences[:, -1]   # Le mot à prédire

# Conversion des cibles en one-hot encoding
y = to_categorical(y, num_classes=vocab_size)






Taille du vocabulaire : 6956
35 premier sequences: [[4073, 13, 24, 6], [13, 24, 6, 539], [24, 6, 539, 2492], [6, 539, 2492, 703], [539, 2492, 703, 12], [2492, 703, 12, 6], [703, 12, 6, 1026], [12, 6, 1026, 119], [6, 1026, 119, 7], [1026, 119, 7, 1263], [119, 7, 1263, 3], [7, 1263, 3, 6], [1263, 3, 6, 97], [3, 6, 97, 381], [6, 97, 381, 63], [97, 381, 63, 18], [381, 63, 18, 7], [63, 18, 7, 344], [18, 7, 344, 3], [7, 344, 3, 6], [344, 3, 6, 345], [3, 6, 345, 124], [6, 345, 124, 98], [345, 124, 98, 279], [124, 98, 279, 1], [98, 279, 1, 177], [279, 1, 177, 67], [1, 177, 67, 1096], [177, 67, 1096, 3], [67, 1096, 3, 52], [1096, 3, 52, 6], [3, 52, 6, 119], [52, 6, 119, 99], [6, 119, 99, 18], [119, 99, 18, 29]]
***********************************
La longueur des séquences est :  121834


In [2]:
# Création du modèle LSTM
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=3))  # Embedding avec une taille plus grande
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

model.summary()




In [3]:
# Configuration de la sauvegarde du meilleur modèle
checkpoint = ModelCheckpoint("/tmp/next_words.keras", monitor='loss', verbose=1, save_best_only=True)

# Compilation du modèle
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))

In [4]:
# Entraînement du modèle
model.fit(X, y, epochs=70, batch_size=64, callbacks=[checkpoint])



Epoch 1/70
[1m1903/1904[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - loss: 6.4714
Epoch 1: loss improved from inf to 6.12491, saving model to /tmp/next_words.keras
[1m1904/1904[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 14ms/step - loss: 6.4710
Epoch 2/70
[1m1901/1904[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 14ms/step - loss: 5.5112
Epoch 2: loss improved from 6.12491 to 5.46437, saving model to /tmp/next_words.keras
[1m1904/1904[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 14ms/step - loss: 5.5111
Epoch 3/70
[1m1904/1904[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 5.1411
Epoch 3: loss improved from 5.46437 to 5.12834, saving model to /tmp/next_words.keras
[1m1904/1904[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 14ms/step - loss: 5.1411
Epoch 4/70
[1m1904/1904[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 4.8752
Epoch 4: loss improved from 5.12834 to 4.88205, 

<keras.src.callbacks.history.History at 0x7f2cb9bc7d00>

In [5]:
### Generation d'une texte a l'aide d'une phrase saisit par utilsiateur
# Génération de texte après l'entraînement
initial_text = "this morning i am very tired"
num_words_to_generate = 500

for _ in range(num_words_to_generate):
    # Convertir le texte actuel en séquence d'indices
    sequence = tokenizer.texts_to_sequences([initial_text])[0]
    sequence = np.array(sequence[-3:]).reshape(1, -1)  # Garder les 3 derniers mots

    # Prédire le prochain mot
    predicted_probabilities = model.predict(sequence)
    predicted_index = np.argmax(predicted_probabilities)
    predicted_word = tokenizer.index_word.get(predicted_index, '')

    # Ajouter le mot prédit au texte initial
    initial_text += ' ' + predicted_word

print("Texte généré:", initial_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 217ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1