In [None]:
import os
import re
import zipfile
import numpy as np
import pandas as pd
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dropout, Dense
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [None]:
!kaggle datasets download -d kazanova/sentiment140
with zipfile.ZipFile("sentiment140.zip", 'r') as zip_ref:
    zip_ref.extractall("./")

df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding='latin-1', header=None)
df.columns = ["target", "id", "date", "flag", "user", "text"]

df["target"] = df["target"].replace({4: 1})

nltk.download("stopwords")
nltk.download("wordnet")
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a raw tweet into a space-separated string of lemmatized words.

    Steps, in order: strip @mentions, strip URLs, drop every character that is
    neither an ASCII letter nor whitespace, lowercase, remove words found in
    the module-level ``stop_words`` set, and lemmatize the remaining words with
    the module-level ``lemmatizer``.

    Args:
        text: The raw tweet as a string.

    Returns:
        The surviving words joined by single spaces ("" if none survive).
    """
    text = re.sub(r"@\w+", "", text)  # Remove @mentions
    # Remove URLs. The dot after "www" must be escaped: an unescaped "." also
    # matches a space, so "awww so cute" used to lose " so" and became "a cute".
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Keep ASCII letters and whitespace only
    text = text.lower().strip()  # Lowercase and trim surrounding whitespace
    # Drop stop words and lemmatize in a single pass over the tokens.
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return " ".join(words)

# Clean every tweet; the raw "text" column is overwritten with the result.
df["text"] = df["text"].map(clean_text)

# Build the word index from the cleaned tweets. Words not seen during fitting
# are later encoded with the "<OOV>" token.
# NOTE(review): the tokenizer is fitted on every row of df, i.e. before any
# train/test split is made — confirm this is intended.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df["text"])

# Encode each cleaned tweet as a list of integer word ids.
df["tokens"] = tokenizer.texts_to_sequences(df["text"])
print(df["tokens"].head(n=5))  # Sanity-check the first rows



Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 95% 77.0M/80.9M [00:02<00:00, 46.5MB/s]
100% 80.9M/80.9M [00:02<00:00, 35.2MB/s]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


0        [843, 45, 1064, 3173, 16, 712, 8799, 1692, 3]
1    [642, 15, 291, 445, 1856, 194, 286, 993, 79, 1...
2       [61916, 218, 14, 864, 1481, 751, 366, 7, 2915]
3                         [329, 667, 37, 2672, 6, 997]
4                               [9332, 2, 485, 15, 23]
Name: tokens, dtype: object


In [None]:
!pip install --upgrade tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2


max_length = 50
padded_sequences = pad_sequences(df["tokens"], maxlen=max_length, padding="post")

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df["target"], test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128
rnn_units = 128

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_length,)),
    SimpleRNN(rnn_units, activation="tanh"),
    Dropout(0.5),
    Dense(64, activation="relu", kernel_regularizer=l2(0.01)),
    Dense(64, activation="relu"),
    Dense(2, activation="softmax")
])



model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

model.summary()

model.fit(X_train, y_train, epochs=2 ,batch_size=1024, validation_data=(X_test, y_test))

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

(1280000, 50) (1280000,)
(320000, 50) (320000,)


  super().__init__(**kwargs)


Epoch 1/2
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1430s[0m 1s/step - accuracy: 0.7366 - loss: 0.6702 - val_accuracy: 0.7840 - val_loss: 0.4618
Epoch 2/2
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1498s[0m 1s/step - accuracy: 0.8137 - loss: 0.4146 - val_accuracy: 0.7841 - val_loss: 0.4612
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 9ms/step - accuracy: 0.7842 - loss: 0.4601
Test Accuracy: 0.7841


In [None]:
from sklearn.metrics import classification_report

# Model output for the held-out rows: one score per class for every row.
y_pred_prob = model.predict(X_test)

# Reduce both the predicted scores and the one-hot labels to class indices.
y_pred = y_pred_prob.argmax(axis=1)
y_true = y_test.argmax(axis=1)

# Per-class precision / recall / F1, with class 0 shown as "Negative" and
# class 1 as "Positive".
report = classification_report(y_true, y_pred, target_names=["Negative", "Positive"])
print(report)


[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 9ms/step
              precision    recall  f1-score   support

    Negative       0.78      0.80      0.79    159494
    Positive       0.79      0.77      0.78    160506

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.78    320000
weighted avg       0.78      0.78      0.78    320000



In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hand-written probe tweets mapped to the sentiment a reader would expect.
expected_labels = {
    "I'm so happy because we helped those poor people": "Positive",
    "I thought the movie would be terrible, but it was actually fantastic!": "Positive",
    "This movie could have been better without that actor": "Negative",
    "I loved it!": "Positive",
}

tweets = list(expected_labels)

# Apply the same clean_text() normalization the tokenizer was fitted on.
# Tokenizing the raw tweets would feed the model words that were removed or
# rewritten during training (stop words, un-lemmatized forms), which the
# tokenizer can only encode as out-of-vocabulary ids.
cleaned_tweets = [clean_text(tweet) for tweet in tweets]
probe_sequences = tokenizer.texts_to_sequences(cleaned_tweets)
probe_padded = pad_sequences(probe_sequences, maxlen=max_length, padding="post")

# One batched predict() call instead of one call per tweet.
probabilities = model.predict(probe_padded)
predicted_classes = np.argmax(probabilities, axis=1)

class_names = ["Negative", "Positive"]  # index 0 -> Negative, index 1 -> Positive
results = [
    [tweet, expected, class_names[class_index]]
    for tweet, expected, class_index in zip(tweets, expected_labels.values(), predicted_classes)
]

# Use a dedicated name: rebinding `df` here would discard the training frame
# that the padding / train-test-split cell reads, breaking any re-run of it.
results_df = pd.DataFrame(results, columns=["Tweet", "Expected Sentiment", "Predicted Sentiment"])
pd.set_option("display.max_colwidth", None)  # Disable column truncation
print(results_df.to_string(index=False))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
                                                                Tweet Expected Sentiment Predicted Sentiment
                     I'm so happy because we helped those poor people           Positive            Negative
I thought the movie would be terrible, but it was actually fantastic!           Positive            Positive
                 This movie could have been better without that actor           Negative            Positive
                                                          I loved it!           Positive            Positive


In [None]:
import pandas as pd

# Builds the results table from manually entered values.
#
# NOTE(review): the "Predicted Sentiment" entries below are typed in by hand,
# not produced by the model. In the recorded output of the preceding cell the
# model predicted "Negative" for the first tweet and "Positive" for the third,
# which is the opposite of what is entered here. This table must not be
# presented as model output.
_COLUMNS = ("Tweet", "Expected Sentiment", "Predicted Sentiment")
_ROWS = [
    ("I'm so happy because we helped those poor people", "Positive", "Positive"),
    ("I thought the movie would be terrible, but it was actually fantastic!", "Positive", "Positive"),
    ("This movie could have been better without that actor", "Negative", "Negative"),
    ("I loved it!", "Positive", "Positive"),
]

# Pivot the row tuples into the column -> list-of-values mapping.
data = {
    column: [row[position] for row in _ROWS]
    for position, column in enumerate(_COLUMNS)
}

df = pd.DataFrame(data)

# Print the full table without truncating the tweet column.
pd.set_option("display.max_colwidth", None)
print(df.to_string(index=False))


                                                                Tweet Expected Sentiment Predicted Sentiment
                     I'm so happy because we helped those poor people           Positive            Positive
I thought the movie would be terrible, but it was actually fantastic!           Positive            Positive
                 This movie could have been better without that actor           Negative            Negative
                                                          I loved it!           Positive            Positive
