# SAE-97 - Nettoyage Dataset Reviews

**Objectif:** Charger, analyser et nettoyer le dataset Reviews

**Input:** `data/raw/yelp_academic_reviews4students.jsonl`

**Output:** `data/cleaned/reviews_clean.parquet`

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook display setup: render every DataFrame column (no truncation) and
# use seaborn's "whitegrid" theme for the plots.
pd.set_option('display.max_columns', None)
sns.set_style('whitegrid')
print("✅ Imports réussis")

In [None]:
# Input and output locations, resolved relative to the directory the
# notebook is executed from.
RAW_DATA_PATH = Path('../data/raw/yelp_academic_reviews4students.jsonl')
CLEANED_DATA_PATH = Path('../data/cleaned/reviews_clean.parquet')
# Create the output folder up front so the final save does not fail on a
# missing directory after the long cleaning run.
CLEANED_DATA_PATH.parent.mkdir(parents=True, exist_ok=True)

print(f"📂 Chargement: {RAW_DATA_PATH}")
print(f"💾 Sauvegarde: {CLEANED_DATA_PATH}")

In [None]:
# Load the raw reviews. lines=True tells pandas the file holds one JSON
# object per line (JSON Lines) rather than a single JSON document.
print("⏳ Chargement... (1-2 min)")
reviews_df = pd.read_json(RAW_DATA_PATH, lines=True)
print(f"✅ {reviews_df.shape[0]:,} lignes, {reviews_df.shape[1]} colonnes")

In [None]:
# Preview the first five rows of the raw data.
reviews_df.head(n=5)

In [None]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
reviews_df.info()

In [None]:
# Per-column missing-value report. The null scan is done once and reused for
# both the count and the percentage, instead of scanning the full frame twice.
null_counts = reviews_df.isnull().sum()
missing = pd.DataFrame({
    'Colonne': null_counts.index,
    'Manquantes': null_counts,
    'Pct': (null_counts / len(reviews_df) * 100).round(2),
}).sort_values('Pct', ascending=False)
# Display only the columns that actually contain missing values.
missing[missing['Manquantes'] > 0]

In [None]:
# Work on a copy so the raw frame keeps its original row count for the
# before/after comparison.
reviews_clean = reviews_df.copy()
print(f"📋 Initial: {len(reviews_clean):,} lignes")

In [None]:
# Remove duplicated reviews: rows sharing a review_id are collapsed to the
# first occurrence (drop_duplicates' default).
dups = reviews_clean.duplicated(subset=['review_id']).sum()
if dups > 0:
    reviews_clean = reviews_clean.drop_duplicates(subset=['review_id'])
    print(f"❌ {dups} doublons supprimés")
else:
    print("✅ Aucun doublon")

In [None]:
# Drop rows that lack an essential field, one column at a time so the number
# of rows lost to each column can be reported separately.
for col in ['review_id', 'text', 'user_id', 'business_id']:
    if col not in reviews_clean.columns:
        continue  # column absent from this dataset: nothing to check
    before = len(reviews_clean)
    reviews_clean = reviews_clean.dropna(subset=[col])
    removed = before - len(reviews_clean)
    if removed:
        print(f"❌ {removed} sans {col}")

In [None]:
# Clean the review text and discard reviews that are too short to be useful.
MIN_TEXT_LENGTH = 10  # minimum number of characters, measured after cleanup

if 'text' in reviews_clean.columns:
    # Normalise whitespace BEFORE measuring: collapsing runs of whitespace and
    # trimming the ends can shorten a review, so measuring the raw text would
    # let padded or mostly-blank reviews slip past the minimum length.
    cleaned_text = (
        reviews_clean['text']
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )
    # assign() returns a new frame, so no column is ever written into a
    # filtered slice (which is what raises SettingWithCopyWarning).
    reviews_clean = reviews_clean.assign(
        text=cleaned_text,
        text_length=cleaned_text.str.len(),
    )

    before = len(reviews_clean)
    # .copy() detaches the filtered result from the frame it was sliced from.
    reviews_clean = reviews_clean[reviews_clean['text_length'] >= MIN_TEXT_LENGTH].copy()
    print(f"❌ {before - len(reviews_clean)} avis trop courts")
    print("✅ Texte nettoyé")

In [None]:
# Keep only ratings inside the valid 1-5 range. Missing ratings fail the
# comparison and are therefore removed as well.
if 'stars' in reviews_clean.columns:
    before = len(reviews_clean)
    # between() is inclusive on both bounds; .copy() detaches the filtered
    # result so later column assignments do not hit SettingWithCopyWarning.
    reviews_clean = reviews_clean[reviews_clean['stars'].between(1, 5)].copy()
    removed = before - len(reviews_clean)
    if removed:
        print(f"❌ {removed} notes invalides")
    else:
        print("✅ Notes valides (1-5)")

In [None]:
# Convert the date column to datetime and drop rows whose date cannot be parsed.
if 'date' in reviews_clean.columns:
    # errors='coerce' turns unparseable values into NaT instead of raising.
    parsed_dates = pd.to_datetime(reviews_clean['date'], errors='coerce')
    # assign() builds a new frame rather than writing into what may be a
    # filtered slice of an earlier frame (avoids SettingWithCopyWarning).
    reviews_clean = reviews_clean.assign(date=parsed_dates)
    invalid = parsed_dates.isnull().sum()
    if invalid > 0:
        reviews_clean = reviews_clean.dropna(subset=['date'])
        print(f"❌ {invalid} dates invalides")
    else:
        print("✅ Dates converties")

In [None]:
# Before/after summary of the cleaning run.
initial_count = len(reviews_df)
final_count = len(reviews_clean)

print("📊 RÉSUMÉ")
print("=" * 50)
print(f"Initial:    {initial_count:,}")
print(f"Final:      {final_count:,}")
print(f"Supprimé:   {initial_count - final_count:,}")
# An empty input would make the retention ratio a division by zero.
if initial_count:
    print(f"Conservé:   {(final_count / initial_count * 100):.2f}%")
else:
    print("Conservé:   n/a")

In [None]:
# Bar chart of how many reviews received each star rating.
if 'stars' in reviews_clean.columns:
    # Count the reviews per rating, ordered by rating value.
    star_counts = reviews_clean['stars'].value_counts().sort_index()
    plt.figure(figsize=(10, 5))
    star_counts.plot.bar()
    plt.title('Distribution des Notes')
    plt.ylabel('Nombre')
    plt.xlabel('Note')
    plt.show()

In [None]:
# Persist the cleaned dataset. text_length is only a filtering helper, so it
# is removed before the frame is written out.
if 'text_length' in reviews_clean.columns:
    reviews_clean = reviews_clean.drop(columns=['text_length'])

print("⏳ Sauvegarde...")
# index=False: the row index is not written to the Parquet file.
reviews_clean.to_parquet(CLEANED_DATA_PATH, index=False, compression='snappy')
print(f"✅ Sauvegardé: {CLEANED_DATA_PATH}")

## ✅ SAE-97 Terminé