Working with a dataset that contains movie reviews labeled as positive or negative.

In [4]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


df = pd.read_csv("imdb-dataset/IMDB Dataset.csv")

print(df.head())

#Affichage des colonnes du dataset

print(df.columns)



















                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Index(['review', 'sentiment'], dtype='object')


In [5]:
#Effectuez des opérations de nettoyage des textes (suppression de la ponctuation, des stop words, mise en minuscules, etc.)

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
df['cleaned_reviews'] = df['review'].apply(lambda x: ' '.join([word for word in word_tokenize(x.lower()) if word not in stop_words and word not in string.punctuation]))

print(df['cleaned_reviews'].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADEM\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADEM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0    one reviewers mentioned watching 1 oz episode ...
1    wonderful little production br br filming tech...
2    thought wonderful way spend time hot summer we...
3    basically 's family little boy jake thinks 's ...
4    petter mattei 's `` love time money '' visuall...
Name: cleaned_reviews, dtype: object


In [6]:
#encodage des labels pour transformer les labels catégoriels en valeurs numériques

le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df['sentiment'])



In [20]:

from sklearn.preprocessing import LabelEncoder

#Divisez les données en ensembles d'entraînement et de test.
#Using train_test_split to divide  data into training and test sets.

#feature_column is  review column      and    label_column is  sentiment  


# Features and labels
X = df['review']
y = df['sentiment']

# Encoder les labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Print the sizes of the splits
print(f'Training set size: {X_train.shape[0]}')
print(f'Test set size: {X_test.shape[0]}')


Training set size: 40000
Test set size: 10000


In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



# Utilisation de pad_sequences de Keras pour que toutes les séquences aient la même longueur.

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

#vérification de la taille des séquences

print("Taille de X_train_pad:", X_train_pad.shape)
print("Taille de X_test_pad:", X_test_pad.shape)






Taille de X_train_pad: (40000, 100)
Taille de X_test_pad: (10000, 100)


1. Early Stopping : Utilisez EarlyStopping de Keras pour surveiller la perte de validation (val_loss) et arrêter l'entraînement lorsque celle-ci cesse de s'améliorer.

2.	Learning Rate Decay : Proposez une fonction de décroissance pour le taux d'apprentissage, par exemple en utilisant ReduceLROnPlateau de Keras.

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder




# Définir le callback EarlyStopping
early_stopping = EarlyStopping(
    monitor='val_loss',  # métrique à surveiller
    patience=3,          # nombre d'époques à attendre après la dernière amélioration
    restore_best_weights=True  # restaurer les poids du meilleur modèle
)


# Définir le callback ReduceLROnPlateau   de  décroissance de taux d'apprentissage  (Learning Rate Decay)  
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',  # métrique à surveiller
    factor=0.1,           # facteur de réduction du taux d'apprentissage (nouveau_lr = lr * factor)
    patience=5,           # nombre d'époques à attendre après la dernière amélioration avant de réduire le taux d'apprentissage
    min_lr=0.0001,        # taux d'apprentissage minimal
    verbose=1             # afficher des messages pour indiquer les réductions de taux d'apprentissage
)




# utiliser les 2 callback lors de l'entraînement de modèle plus tard







Epoch 1/20




[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 44ms/step - accuracy: 0.7752 - loss: 0.4538 - val_accuracy: 0.8621 - val_loss: 0.3126
Epoch 2/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 44ms/step - accuracy: 0.8831 - loss: 0.2823 - val_accuracy: 0.8692 - val_loss: 0.3077
Epoch 3/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 44ms/step - accuracy: 0.9080 - loss: 0.2292 - val_accuracy: 0.8672 - val_loss: 0.3523
Epoch 4/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 44ms/step - accuracy: 0.9236 - loss: 0.1904 - val_accuracy: 0.8704 - val_loss: 0.3282
Epoch 5/20
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 45ms/step - accuracy: 0.9422 - loss: 0.1529 - val_accuracy: 0.8673 - val_loss: 0.3365
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.8657 - loss: 0.3071
Loss: 0.3077445924282074
Accuracy: 0.8691999912261963
