In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.regularizers import l2

import pickle
import numpy as np
from sklearn.model_selection import train_test_split

# Read the full novel text from the Kaggle dataset.
with open("/kaggle/input/pride-prejudice-subtitles-and-text/PP.txt", "r", encoding="utf8") as src:
    data = src.read()

# Text preprocessing: line breaks and the BOM become spaces, curly double
# quotes are dropped, then every run of whitespace is collapsed to one space.
_cleanup_table = str.maketrans({
    '\n': ' ',
    '\r': ' ',
    '\ufeff': ' ',
    '“': None,
    '”': None,
})
text = ' '.join(data.translate(_cleanup_table).split())

# Fit a word-level tokenizer on the cleaned text (one single document).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Persist the tokenizer so the same word <-> index mapping can be reloaded later.
with open('token.pkl', 'wb') as sink:
    pickle.dump(tokenizer, sink)

# +1 because word indices start at 1; index 0 is never assigned to a word.
vocab_size = len(tokenizer.word_index) + 1
print(f"Taille du vocabulaire : {vocab_size}")



Taille du vocabulaire : 6745


In [2]:
# Encode the whole corpus as one list of word indices.
sequence_data = tokenizer.texts_to_sequences([text])[0]

# Slide a 6-token window over the corpus: each row holds 5 context tokens
# followed by the token that comes right after them.
sequences = np.array([
    sequence_data[end - 5:end + 1]
    for end in range(5, len(sequence_data))
])



In [3]:
# Split each window into model input and target:
#   X -> every column except the last (the context tokens)
#   y -> the last column (the next token), one-hot encoded over the vocabulary
X = sequences[:, :-1]
# NOTE(review): the one-hot target is a dense (n_samples, vocab_size) array;
# integer targets with a sparse loss would avoid that allocation — consider
# changing this together with the compile() call.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)





In [4]:
# Define the model: embedding -> GRU -> softmax over the vocabulary.
model = Sequential([
    Embedding(vocab_size, 50),                 # 50-dimensional word vectors
    GRU(120),                                  # GRU used in place of an LSTM
    Dense(vocab_size, activation='softmax'),   # next-word probability distribution
])
# Build explicitly so the summary can report shapes and parameter counts
# before any data has been passed through the model.
model.build(input_shape=(None, X.shape[1]))
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the cell output.
model.summary()

None


In [5]:
# One-hot targets -> categorical cross-entropy; Adam with its default settings.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training takes 80 epochs, so persist the model as it improves: without this
# the trained weights exist only in kernel memory and are lost on restart
# (only the tokenizer was being saved). Training loss is monitored because no
# validation data is passed to fit().
checkpoint = ModelCheckpoint('next_word_gru.keras', monitor='loss', save_best_only=True)

model.fit(X, y, epochs=80, verbose=1, callbacks=[checkpoint])



Epoch 1/80
[1m3809/3809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 17ms/step - accuracy: 0.0497 - loss: 6.4830
Epoch 2/80
[1m3809/3809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 17ms/step - accuracy: 0.1210 - loss: 5.3772
Epoch 3/80
[1m3809/3809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 17ms/step - accuracy: 0.1462 - loss: 4.9742
Epoch 4/80
[1m3809/3809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 17ms/step - accuracy: 0.1660 - loss: 4.6532
Epoch 5/80
[1m3809/3809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 17ms/step - accuracy: 0.1832 - loss: 4.3793
Epoch 6/80
[1m3809/3809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 17ms/step - accuracy: 0.2044 - loss: 4.1159
Epoch 7/80
[1m3809/3809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 17ms/step - accuracy: 0.2268 - loss: 3.8882
Epoch 8/80
[1m3809/3809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 17ms/step - accuracy: 0.2529 - loss: 3.6703
Epoch 9/

<keras.src.callbacks.history.History at 0x7cad53746c50>

In [6]:
# Greedy next-word generation: repeatedly feed the last `context_len` tokens to
# the model and append the most probable next word to the text.
input_text = " It is a truth universally"
predict_next_words = 45
# Must match the number of context tokens used to build the training windows.
context_len = 5

# Encode the seed once; generated tokens are appended as ids afterwards, so
# the growing string does not have to be re-tokenized on every step.
token_ids = tokenizer.texts_to_sequences([input_text])[0]
if not token_ids:
    # Words unknown to the tokenizer are dropped, which would leave the model
    # with an empty input sequence.
    raise ValueError("input_text contains no word known to the tokenizer")

for _ in range(predict_next_words):
    window = np.array(token_ids[-context_len:]).reshape(1, -1)

    predicted_prob = model.predict(window, verbose=0)
    # Batch size is 1: take the single argmax as a plain Python int.
    predicted_index = int(np.argmax(predicted_prob, axis=-1)[0])

    # index_word is the tokenizer's index -> word mapping; a direct lookup
    # replaces a linear scan of word_index for every generated word.
    output_word = tokenizer.index_word.get(predicted_index, "")

    if output_word == "":
        # Index 0 is not assigned to any word, so there is nothing to append.
        print("Aucun mot prédit trouvé.")
        break

    token_ids.append(predicted_index)
    input_text += " " + output_word

print(input_text)

 It is a truth universally acknowledged that it is only reply and too eager to pardon him i would not have been so much through the same parish and to see a girl whom chiefly as soon as she made go was sure over her mother’s thoughts had been produced
