In [3]:
import pickle
import re

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, Dense, Attention, LayerNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
# Load the raw reviews. The path is relative, so it is resolved against the
# current working directory. Later cells read the 'review' and 'sentiment'
# columns of this frame.
dataset_path = "IMDB Dataset.csv"
df = pd.read_csv(dataset_path)

In [5]:
def clean_review(text, stop_words):
    """Normalise one raw review into a space-separated string of content words.

    Steps, in order:
      1. Replace HTML line-break tags (``<br>``, ``<br/>``, ``<br />``) with a
         space. Without this step the character filter below strips only the
         ``<``, ``/`` and ``>`` characters, leaving a spurious ``br`` that is
         glued onto the neighbouring words when the tag has no surrounding
         whitespace.
      2. Drop every character that is not an ASCII letter or whitespace.
      3. Lower-case the text.
      4. Remove the words contained in ``stop_words``.

    Args:
        text: Raw review text.
        stop_words: Collection of lower-case words to discard.

    Returns:
        The cleaned review as a single string.
    """
    # NOTE(review): IMDB review text commonly embeds `<br />` markup; confirm
    # on your copy of the CSV.
    text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    return ' '.join(word for word in text.split() if word not in stop_words)


nltk.download('stopwords')
# A set gives constant-time membership checks inside clean_review.
stop_words = set(stopwords.words('english'))

# Single pass over the raw column; 'review' itself is left untouched, so this
# cell can be re-run safely.
df['cleaned_review'] = df['review'].apply(lambda review: clean_review(review, stop_words))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ALI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Hold out 20% of the reviews for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [7]:
# Encode the sentiment labels as integers.
# The encoder learns its classes from the training labels only; the same
# fitted mapping is then applied to both splits.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [8]:
# Fit the vocabulary on the training reviews only, so the test split never
# influences the word index. num_words caps the vocabulary used when texts are
# converted to ids; words outside it are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Both splits share the same padding policy: every review becomes exactly 100
# ids, with longer reviews cut at the end and shorter ones padded at the end.
pad_options = dict(maxlen=100, truncating='post', padding='post')

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, **pad_options)

X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, **pad_options)

In [9]:
# Attention-based sentiment classifier (Keras functional API).
#
# The sequence length and vocabulary size are read from the preprocessing
# artefacts rather than repeated as literals, so the model cannot drift out of
# sync with the tokenizer / padding settings.
embedding_dim = 32
# NOTE: despite its name, this value is the number of attention blocks stacked
# by the loop below. The Keras `Attention` layer used here is a single
# dot-product attention, not a multi-head one.
num_heads = 2
ff_dim = 32

# Length of every padded review (second axis of the padded id matrix).
max_len = int(X_train_padded.shape[1])
# Ids produced by the tokenizer stay below num_words when it is set; if it is
# unset, fall back to the full word index (+1 because index 0 is reserved).
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

inputs = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)
transformer_block = embedding_layer

for _ in range(num_heads):
    # Self-attention: the current sequence is passed as both query and value.
    attn_output = Attention(use_scale=True)([transformer_block, transformer_block])
    # Residual connection, then layer normalisation and dropout.
    transformer_block = LayerNormalization(epsilon=1e-6)(transformer_block + attn_output)
    transformer_block = Dropout(0.1)(transformer_block)

# kernel_size=1 applies the same projection to every position independently;
# the pooling layer then averages over the sequence axis.
transformer_block = tf.keras.layers.Conv1D(filters=ff_dim, kernel_size=1, activation='relu')(transformer_block)
transformer_block = tf.keras.layers.GlobalAveragePooling1D()(transformer_block)
transformer_block = Dropout(0.1)(transformer_block)
transformer_block = Dense(20, activation='relu')(transformer_block)
# Single sigmoid unit: the output is one probability per review.
output_layer = Dense(1, activation='sigmoid')(transformer_block)

model = Model(inputs=inputs, outputs=output_layer)

In [10]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy as
# the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# NOTE(review): the held-out test split is also passed as validation data, so
# the per-epoch validation figures are computed on the same rows that the
# evaluation cell scores afterwards.
model.fit(
    X_train_padded,
    y_train_encoded,
    validation_data=(X_test_padded, y_test_encoded),
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1f3589800d0>

In [13]:
# Score the test split and turn the predicted probabilities into hard 0/1
# labels using a 0.5 threshold.
y_pred_prob = model.predict(X_test_padded)
y_pred = (y_pred_prob > 0.5).astype(int)

# Map the integer predictions back to the original sentiment labels.
y_pred_original = label_encoder.inverse_transform(y_pred.ravel())

# Metrics are computed on the encoded (integer) labels.
test_accuracy = accuracy_score(y_test_encoded, y_pred)
test_report = classification_report(y_test_encoded, y_pred)

print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)

Accuracy: 0.8576
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.87      0.86      4961
           1       0.86      0.85      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [15]:
# Persist the trained model to disk. The relative path is resolved against the
# current working directory.
model_path = 'transformer_classification_sentiment_model.h5'
model.save(model_path)

In [16]:
# Save the fitted tokenizer to a file so that inference code can rebuild the
# exact word -> id mapping the model was trained with.
# (`pickle` is imported in the notebook's import cell.)
# NOTE: only load this file back from a trusted location, because unpickling
# can execute arbitrary code.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)