# Lemmatization

Objectif: Réduire les mots à leur forme canonique (lemme).

Partie de la story **SAE-73**.

In [1]:
import sys
import os
import pandas as pd
import nltk
from nltk.stem import PorterStemmer

# Add src to path
sys.path.append(os.path.abspath(os.path.join('../..', 'src')))

from text_preprocessing import tokenize_text, remove_stopwords, lemmatize_tokens

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\melou\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\melou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\melou\AppData\Roaming\nltk_data...


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\melou\AppData\Roaming\nltk_data...


True

## Chargement des Données

In [2]:
# Load cleaned reviews and use Sample
reviews_path = '../../data/cleaned/reviews_clean.parquet'
if os.path.exists(reviews_path):
    reviews = pd.read_parquet(reviews_path)
    reviews = reviews.head(1000).copy()
    print("Using sample of 1000 reviews")
else:
    print("Creating dummy data")
    reviews = pd.DataFrame({'text': ["The cats are running fairly quickly", "This was the best experience better than others"]})

# Apply previous steps (SAE-71/72)
reviews['tokens'] = reviews['text'].apply(str).apply(tokenize_text)
reviews['tokens_clean'] = reviews['tokens'].apply(remove_stopwords)

Using sample of 1000 reviews


## Application de la Lemmatization

In [3]:
reviews['tokens_lemma'] = reviews['tokens_clean'].apply(lemmatize_tokens)

print("Lemmatization applied.")
reviews[['tokens_clean', 'tokens_lemma']].head()

Lemmatization applied.


Unnamed: 0,tokens_clean,tokens_lemma
0,"[Went, lunch, found, burger, meh, ., obvious, ...","[Went, lunch, found, burger, meh, ., obvious, ..."
1,"[needed, new, tires, wife, 's, car, ., special...","[needed, new, tire, wife, 's, car, ., special,..."
2,"[Jim, Woltman, works, Goleta, Honda, 5, stars,...","[Jim, Woltman, work, Goleta, Honda, 5, star, !..."
3,"[times, get, shrimp, ., 've, got, nice, select...","[time, get, shrimp, ., 've, got, nice, selecti..."
4,"[one, fantastic, place, eat, whether, hungry, ...","[one, fantastic, place, eat, whether, hungry, ..."


## Comparaison: Stemming vs Lemmatization

In [4]:
stemmer = PorterStemmer()
test_words = ['running', 'cats', 'better', 'best', 'rocks', 'fairly']

print(f"{'Word':<10} | {'Stem (Porter)':<15} | {'Lemma (WordNet)':<15}")
print("-"*45)

for word in test_words:
    stem = stemmer.stem(word)
    lemma = lemmatize_tokens([word])[0]
    print(f"{word:<10} | {stem:<15} | {lemma:<15}")

Word       | Stem (Porter)   | Lemma (WordNet)
---------------------------------------------
running    | run             | running        
cats       | cat             | cat            
better     | better          | better         
best       | best            | best           
rocks      | rock            | rock           
fairly     | fairli          | fairly         


**Note:** WordNetLemmatizer sans POS tag considère souvent les mots comme des noms (ex: 'running' -> 'running' (noun) vs 'run' (verb)). Stemming est plus agressif mais parfois fait des erreurs ('fairly' -> 'fairli'). Pour notre usage, Lemma est souvent préféré pour la lisibilité.