In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

# Train a binary sentiment classifier (stacked LSTM) on review text and persist
# everything needed to reuse it later: architecture, weights, tokenizer and the
# padding length.

# --- Configuration: values a reader is most likely to tune -------------------
nrows_to_load = 50000
REVIEWS_CSV_PATH = "C:\\Users\\Yannick Gisa\\Desktop\\BookSell\\archive\\Reviews.csv"
SEED = 42
# Used by both the Tokenizer and the Embedding layer; the two values must agree,
# otherwise token ids can fall outside the embedding table.
VOCAB_SIZE = 10000

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and dropout are repeatable from run to run.
set_random_seed(SEED)

# Load the data
df = pd.read_csv(REVIEWS_CSV_PATH, nrows=nrows_to_load)

# Drop rows with a missing text or score: a NaN text would crash the tokenizer
# and a NaN score would silently be labelled as negative below.
df = df.dropna(subset=['Text', 'Score'])

# Discard rows whose score is 0. The explicit copy makes the column assignments
# below act on an independent frame instead of a view of the loaded data
# (avoids pandas' SettingWithCopyWarning).
df = df[df['Score'] != 0].copy()

if df.empty:
    print("No usable reviews left after filtering; nothing to train.")
else:
    # Binarise the score: 1 when the score is above 3, otherwise 0
    df['Score'] = (df['Score'] > 3).astype('int64')
    # Make sure every review is a string before it reaches the tokenizer
    df['Text'] = df['Text'].astype(str)

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        df['Text'], df['Score'], test_size=0.2, random_state=SEED
    )

    # Create the tokenizer, limited to the VOCAB_SIZE most frequent words
    tokenizer = Tokenizer(num_words=VOCAB_SIZE)

    # Fit the tokenizer on the training texts only, so that no vocabulary
    # information leaks from the test set
    tokenizer.fit_on_texts(X_train)

    # Convert the training and test texts to sequences of word indices
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)

    # Pad every sequence to the length of the longest training sequence.
    # NOTE(review): a single very long review sets the length for all samples,
    # which makes each LSTM step expensive; consider capping this value.
    max_sequence_length = max(len(seq) for seq in X_train_seq)
    X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length)
    X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length)

    # Build the RNN model (two stacked LSTM layers)
    model = Sequential([
        Embedding(input_dim=VOCAB_SIZE, output_dim=64),
        # return_sequences=True so the second LSTM receives the full sequence
        LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
        Dropout(0.5),
        LSTM(64, dropout=0.2, recurrent_dropout=0.2),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train the model, holding out 20% of the training data for validation
    model.fit(X_train_padded, y_train, epochs=5, batch_size=128, validation_split=0.2)

    # Evaluate the model on the held-out test set
    loss, accuracy = model.evaluate(X_test_padded, y_test)
    print("Accuracy:", accuracy)

    # Save the model architecture for future inference
    model_json = model.to_json()
    with open("model.json", "w", encoding="utf-8") as json_file:
        json_file.write(model_json)
    # Serialize the weights to HDF5
    model.save_weights("model.weights.h5")

    # Save the fitted tokenizer: inference must map words to the same indices
    # that were used during training, so the weights alone are not enough.
    with open("tokenizer.json", "w", encoding="utf-8") as tokenizer_file:
        tokenizer_file.write(tokenizer.to_json())

    # Save max_sequence_length so new texts can be padded to the same length
    with open('max_sequence_length.pkl', 'wb') as f:
        pickle.dump(max_sequence_length, f)

Epoch 1/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5093s[0m 20s/step - accuracy: 0.7882 - loss: 0.4798 - val_accuracy: 0.8692 - val_loss: 0.3112
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11594s[0m 46s/step - accuracy: 0.8855 - loss: 0.2814 - val_accuracy: 0.8827 - val_loss: 0.2899
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6493s[0m 26s/step - accuracy: 0.9081 - loss: 0.2311 - val_accuracy: 0.8779 - val_loss: 0.2945
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5983s[0m 24s/step - accuracy: 0.9195 - loss: 0.2084 - val_accuracy: 0.8838 - val_loss: 0.2993
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5935s[0m 24s/step - accuracy: 0.9321 - loss: 0.1804 - val_accuracy: 0.8845 - val_loss: 0.3088
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m279s[0m 893ms/step - accuracy: 0.8769 - loss: 0.3354
Accuracy: 0.8766000270843506
