### Chargement de données

In [3]:
import pandas as pd

df = pd.read_csv('../data/processed/data_cleaned.csv')

### 1. Convertir tout le texte en minuscules.

In [8]:
df['text'] = df['text'].str.lower()

### 2. Appliquer la tokenisation.

In [19]:
import nltk
from nltk.tokenize import word_tokenize

# Télécharger le tokenizer une seule fois
nltk.download("punkt")
nltk.download("punkt_tab")

# Exemple de tokenisation
df["tokens"] = df["text"].apply(word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anass\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\anass\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


### 3. Retirer les stopwords avec NLTK.

In [29]:
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")

stop_words = set(stopwords.words("english"))
df["clean_tokens"] = df["tokens"].apply(
    lambda tokens: [word for word in tokens if word.lower() not in stop_words]
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anass\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 4. Supprimer ponctuation & caractères spéciaux (regex).

In [45]:
import re

clean_token_pattern = re.compile(r"^[a-z]+$", flags=re.UNICODE)

df["clean_tokens_2"] = df["clean_tokens"].apply(
    lambda tokens: [
        token for token in tokens if clean_token_pattern.match(token)
    ]
)

### 5. Appliquer le stemming (PorterStemmer).

In [51]:
import nltk
from nltk.stem import PorterStemmer

nltk.download("punkt")       # pour tokenizer
nltk.download("stopwords")   # si tu supprimes les stopwords

stemmer = PorterStemmer()

df["stemmed_tokens"] = df["clean_tokens_2"].apply(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anass\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anass\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 6. Vectoriser le texte à l’aide de TfidfVectorizer() ou CountVectorizer().

- reconstruire le texte stemmé

In [57]:
df["stemmed_text"] = df["stemmed_tokens"].apply(lambda tokens: " ".join(tokens))

- Séparer features et labels

In [None]:
y = df["label"]
X = df["stemmed_text"]

- Split train/test

In [65]:
from sklearn.model_selection import train_test_split

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("X - Train", len(X_train_text))
print("X - Test", len(X_test_text))
print("y - Train", len(y_train))
print("y - Test", len(y_test))

X - Train 23048
X - Test 5763
y - Train 23048
y - Test 5763


- Vectorisation du texte

In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=8000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

print(X_train.shape, X_test.shape)

(23048, 8000) (5763, 8000)
