# Pipeline Preprocessing Complet

Objectif : appliquer et tester la fonction `preprocess_pipeline`, qui regroupe toutes les étapes de prétraitement du texte.

Partie de la story **SAE-74**.

In [1]:
import sys
import os
import pandas as pd
import nltk
from tqdm import tqdm

# Make the project's src/ directory importable. The path is resolved relative
# to the notebook's current working directory.
src_dir = os.path.abspath(os.path.join('../..', 'src'))
if src_dir not in sys.path:
    # Guarded so that re-running this cell does not pile up duplicate entries.
    sys.path.append(src_dir)

from text_preprocessing import preprocess_pipeline

# Fetch the NLTK resources (presumably the ones preprocess_pipeline relies on;
# verify against src/text_preprocessing). quiet=True keeps the per-package
# download log, which includes the local nltk_data directory, out of the saved
# notebook output; the boolean return value is checked so that a failed
# download is still reported.
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    if not nltk.download(resource, quiet=True):
        print(f"WARNING: NLTK resource '{resource}' could not be downloaded")

# Register tqdm's pandas integration (provides Series.progress_apply).
tqdm.pandas()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\melou\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\melou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\melou\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\melou\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Tests unitaires du pipeline

In [2]:
# Checks preprocess_pipeline on one hand-written review under four option
# sets. Each result is printed for inspection and then asserted, so a
# regression in the pipeline makes this cell fail instead of passing silently.
test_text = "The food was AMAZING! I did not like the service at all."

print(f"Original: {test_text}\n")

# 1. Full pipeline with default options.
res1 = preprocess_pipeline(test_text)
print(f"Full Defaults: {res1}")

# 2. Stopword removal disabled.
res2 = preprocess_pipeline(test_text, remove_stopwords_flag=False)
print(f"Keep Stopwords: {res2}")

# 3. Negation words excluded from the stopword list so that they survive.
res3 = preprocess_pipeline(test_text, exclude_stopwords={'not', 'no'})
print(f"Keep Negation: {res3}")

# 4. Lemmatization disabled.
# NOTE(review): in the recorded run this output is identical to case 1, so
# this sentence does not actually exercise the lemmatizer; consider a text
# containing a plural or inflected content word.
res4 = preprocess_pipeline(test_text, lemmatize_flag=False)
print(f"No Lemma: {res4}")

# Invariants taken from the recorded outputs of this cell.
assert 'food' in res1 and 'service' in res1, "content words must survive the default pipeline"
assert 'not' not in res1, "default pipeline is expected to drop the stopword 'not'"
assert 'the' in res2 and 'not' in res2, "stopwords must be kept when remove_stopwords_flag=False"
assert 'not' in res3, "'not' must be kept when listed in exclude_stopwords"
assert 'the' not in res3, "other stopwords must still be removed when only negations are excluded"
assert 'food' in res4 and 'service' in res4, "content words must survive without lemmatization"

Original: The food was AMAZING! I did not like the service at all.



Full Defaults: ['food', 'amazing', 'like', 'service']
Keep Stopwords: ['the', 'food', 'wa', 'amazing', 'i', 'did', 'not', 'like', 'the', 'service', 'at', 'all']
Keep Negation: ['food', 'amazing', 'not', 'like', 'service']
No Lemma: ['food', 'amazing', 'like', 'service']


## Application au Dataset

In [3]:
# Load the cleaned reviews, keep a small sample, and tokenize every review
# with preprocess_pipeline while preserving negation words.
reviews_path = '../../data/cleaned/reviews_clean.parquet'

# Number of leading rows kept so that the cell stays fast.
SAMPLE_SIZE = 2000
# Stopwords that must NOT be removed (negations).
NEGATION_WORDS = ('not', 'no', 'nor', 'neither')

if os.path.exists(reviews_path):
    reviews = pd.read_parquet(reviews_path)
    reviews = reviews.head(SAMPLE_SIZE).copy()
    # head() returns fewer rows when the file holds fewer than SAMPLE_SIZE
    # reviews, so report the actual count rather than the requested one.
    print(f"Using sample of {len(reviews)} reviews")
else:
    # Make the fallback visible: results below then come from dummy data,
    # not from the real dataset.
    print(f"{reviews_path} not found - falling back to a 2-row demo frame")
    reviews = pd.DataFrame({'text': ["Sample review text.", "Another review here."]})

print("Processing...")
# Apply the pipeline (stopwords removed, negations kept). A fresh set is built
# for every call, exactly as the original set literal did, so the pipeline
# never sees a shared mutable object.
reviews['tokens_final'] = reviews['text'].progress_apply(
    lambda x: preprocess_pipeline(x, exclude_stopwords=set(NEGATION_WORDS))
)

reviews[['text', 'tokens_final']].head()

Using sample of 2000 reviews
Processing...


  0%|          | 0/2000 [00:00<?, ?it/s]

 11%|█▏        | 225/2000 [00:00<00:00, 2236.10it/s]

 23%|██▎       | 459/2000 [00:00<00:00, 2284.19it/s]

 34%|███▍      | 688/2000 [00:00<00:00, 2163.15it/s]

 45%|████▌     | 908/2000 [00:00<00:00, 2162.99it/s]

 56%|█████▋    | 1125/2000 [00:00<00:00, 2161.10it/s]

 68%|██████▊   | 1353/2000 [00:00<00:00, 2193.61it/s]

 79%|███████▊  | 1573/2000 [00:00<00:00, 2033.09it/s]

 91%|█████████ | 1820/2000 [00:00<00:00, 2157.99it/s]

100%|██████████| 2000/2000 [00:00<00:00, 2181.37it/s]




Unnamed: 0,text,tokens_final
0,Went for lunch and found that my burger was me...,"[went, lunch, found, burger, meh, obvious, foc..."
1,I needed a new tires for my wife's car. They h...,"[needed, new, tire, wife, car, special, order,..."
2,Jim Woltman who works at Goleta Honda is 5 sta...,"[jim, woltman, work, goleta, honda, 5, star, k..."
3,Been here a few times to get some shrimp. They...,"[time, get, shrimp, theyve, got, nice, selecti..."
4,This is one fantastic place to eat whether you...,"[one, fantastic, place, eat, whether, hungry, ..."


## Sauvegarde

In [4]:
# Persist the preprocessed frame as a pickle, creating the destination
# directory first if it does not exist yet.
output_path = '../../outputs/reviews_preprocessed.pkl'
output_dir = os.path.dirname(output_path)
os.makedirs(output_dir, exist_ok=True)

reviews.to_pickle(output_path)
print(f"Saved to {output_path}")

Saved to ../../outputs/reviews_preprocessed.pkl
