# SAE-75: TF-IDF Vectorization Demo

Ce notebook démontre l'utilisation de la fonction `compute_tfidf` pour vectoriser les reviews Yelp.

In [None]:
import pandas as pd
import numpy as np
import sys
import os

# Make the project's `src` package importable: append the absolute path of the
# directory two levels above the current working directory to sys.path.
# NOTE(review): this presumably resolves to the repository root when the
# notebook is launched from its own folder -- confirm against the repo layout.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

from src.features import compute_tfidf
from src.text_preprocessing import preprocess_text

## 1. Chargement des données

In [None]:
# Load the cleaned reviews and draw a reproducible sample for the demo.
# The dataset is addressed with a relative path, so the right one depends on
# the working directory: try the notebook-relative location first, then fall
# back to the location that is valid when running from the repository root.
SAMPLE_SIZE = 1000
CANDIDATE_PATHS = (
    '../../data/cleaned/reviews_clean.parquet',
    'data/cleaned/reviews_clean.parquet',
)

for path_rank, DATA_PATH in enumerate(CANDIDATE_PATHS):
    try:
        df = pd.read_parquet(DATA_PATH)
    except FileNotFoundError:
        print(f"File not found: {DATA_PATH}")
    else:
        # DATA_PATH keeps the value of the path that was actually loaded.
        break
else:
    # Every candidate failed: stop with an error that lists what was tried
    # instead of a traceback raised from inside an exception handler.
    raise FileNotFoundError(
        f"Reviews dataset not found in any of: {', '.join(CANDIDATE_PATHS)}"
    )

# Cap the sample size at the number of available rows: DataFrame.sample raises
# ValueError when n exceeds the population and sampling is without replacement.
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
origin_note = "" if path_rank == 0 else " (fallback path)"
print(f"Loaded {len(df_sample)} reviews{origin_note}.")

## 2. Preprocessing (si nécessaire)

In [None]:
# Reuse the preprocessed column when the dataset already provides it;
# otherwise derive it from the raw review text.
def _preprocess_review(raw_text):
    """Coerce a raw review value to ``str`` and pass it to ``preprocess_text``."""
    return preprocess_text(str(raw_text))


if 'text_preprocessed' in df_sample.columns:
    print("Using existing preprocessed text.")
else:
    print("Pre-processing text...")
    df_sample['text_preprocessed'] = df_sample['text'].apply(_preprocess_review)
    print("Preprocessing done.")

# Show the raw and preprocessed text side by side for the first rows.
preview_columns = ['text', 'text_preprocessed']
print(df_sample[preview_columns].head())

## 3. TF-IDF Vectorization

In [None]:
# Vectorize the preprocessed reviews with TF-IDF.
# All options are passed as keyword arguments; per the original notes,
# `stop_words` and `ngram_range` travel through compute_tfidf's **kwargs
# (presumably on to the underlying vectorizer -- confirm in src.features).
tfidf_options = {
    'max_features': 1000,
    'stop_words': 'english',
    'ngram_range': (1, 2),
}
tfidf_matrix, vectorizer = compute_tfidf(
    df_sample['text_preprocessed'],
    **tfidf_options,
)

matrix_shape = tfidf_matrix.shape
print(f"TF-IDF Matrix Shape: {matrix_shape}")

## 4. Analyse des résultats

In [None]:
# Names of the terms, as reported by the fitted vectorizer.
feature_names = vectorizer.get_feature_names_out()

# Cumulative TF-IDF score of each term: sum over the first axis of the matrix.
tfidf_sum = tfidf_matrix.sum(axis=0)

# Flatten the per-term totals to a 1-D array so they form a single 'score'
# column indexed by term, then rank the terms from highest to lowest score.
per_term_totals = np.asarray(tfidf_sum).ravel()
score_frame = pd.DataFrame({'score': per_term_totals}, index=feature_names)
tfidf_scores = score_frame.sort_values(by='score', ascending=False)

top_terms = tfidf_scores.head(20)
print("Top 20 termes les plus importants (par score cumulé) :")
print(top_terms)