In [37]:
import warnings
warnings.filterwarnings("ignore")

# === Imports ===
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


nltk.download('stopwords')

#  Load dataset
df = pd.read_csv("sentiment_analysis.csv")
print(df.head())




   id  label                                              tweet
0   1      0  #fingerprint #Pregnancy Test https://goo.gl/h1...
1   2      0  Finally a transparant silicon case ^^ Thanks t...
2   3      0  We love this! Would you go? #talk #makememorie...
3   4      0  I'm wired I know I'm George I was made that wa...
4   5      1  What amazing service! Apple won't even talk to...


[nltk_data] Downloading package stopwords to /Users/anas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
#  Check label balance
print("\nInitial label distribution:\n", df['label'].value_counts())




Initial label distribution:
 label
0    5894
1    2026
Name: count, dtype: int64


In [41]:
# Clean text
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'@\w+|#\w+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    words = [w for w in text.split() if w not in stop_words]
    return ' '.join(words)

df['clean_tweet'] = df['tweet'].apply(clean_text)

# Separate majority and minority
df_majority = df[df.label == 0]
df_minority = df[df.label == 1]

# Undersample majority class
df_majority_downsampled = resample(df_majority,
                                   replace=False,        # sample without replacement
                                   n_samples=len(df_minority),  # same as minority
                                   random_state=42)

# Combine minority and downsampled majority
df_balanced = pd.concat([df_majority_downsampled, df_minority])

print("\nNew label distribution (balanced):")
print(df_balanced['label'].value_counts())



New label distribution (balanced):
label
0    2026
1    2026
Name: count, dtype: int64


In [43]:
# Vectorization (TF-IDF)
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df_balanced['clean_tweet'])
y = df_balanced['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\nTaille train: {X_train.shape}, test: {X_test.shape}")


models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(kernel='linear'),
    "k-NN": KNeighborsClassifier(n_neighbors=5)
}


print("\nðŸ“Š RÃ©sultats de la validation croisÃ©e (k=5) :")
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{name}: moyenne={np.mean(scores):.4f}, Ã©cart-type={np.std(scores):.4f}")

print("\n Ã‰valuation finale sur le jeu de test :")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n----- {name} -----")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification report:\n", classification_report(y_test, y_pred))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))


Taille train: (3241, 3000), test: (811, 3000)

ðŸ“Š RÃ©sultats de la validation croisÃ©e (k=5) :
Logistic Regression: moyenne=0.8371, Ã©cart-type=0.0113
SVM: moyenne=0.8365, Ã©cart-type=0.0096
k-NN: moyenne=0.5002, Ã©cart-type=0.0021

 Ã‰valuation finale sur le jeu de test :

----- Logistic Regression -----
Accuracy: 0.8310727496917386
Classification report:
               precision    recall  f1-score   support

           0       0.81      0.86      0.84       406
           1       0.85      0.80      0.83       405

    accuracy                           0.83       811
   macro avg       0.83      0.83      0.83       811
weighted avg       0.83      0.83      0.83       811

Confusion matrix:
 [[351  55]
 [ 82 323]]

----- SVM -----
Accuracy: 0.8273736128236745
Classification report:
               precision    recall  f1-score   support

           0       0.82      0.83      0.83       406
           1       0.83      0.82      0.83       405

    accuracy                      