# SAE-98 - Nettoyage Dataset Users

**Objectif:** Charger, analyser et nettoyer le dataset Users

**Input:** `data/raw/yelp_academic_dataset_user4students.jsonl`

**Output:** `data/cleaned/users_clean.parquet`

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook display setup: show every DataFrame column and use seaborn's
# "whitegrid" style for the plots below.
pd.options.display.max_columns = None
sns.set_style(style='whitegrid')
print("‚úÖ Imports r√©ussis")

In [None]:
# Input (raw JSON Lines) and output (cleaned parquet) locations, expressed
# relative to the directory the notebook is run from.
RAW_DATA_PATH = Path('..', 'data', 'raw', 'yelp_academic_dataset_user4students.jsonl')
CLEANED_DATA_PATH = Path('..', 'data', 'cleaned', 'users_clean.parquet')

# Create the output directory up front so the final save cannot fail on a
# missing folder.
CLEANED_DATA_PATH.parent.mkdir(exist_ok=True, parents=True)

print(f"üìÇ Chargement: {RAW_DATA_PATH}")
print(f"üíæ Sauvegarde: {CLEANED_DATA_PATH}")

In [None]:
# Read the raw JSON Lines file (one JSON record per line) into a DataFrame
# and report its dimensions.
print("‚è≥ Chargement... (1-2 min)")
users_df = pd.read_json(path_or_buf=RAW_DATA_PATH, lines=True)
print(f"‚úÖ {len(users_df):,} lignes, {len(users_df.columns)} colonnes")

In [None]:
# Preview the first rows of the raw dataset; as the last expression of the
# cell, the result is what the notebook displays.
users_df.head()

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes, memory).
users_df.info()

In [None]:
# Per-column missing-value report: absolute count and percentage of rows,
# sorted with the most incomplete columns first.
# The null count is computed once and reused; scanning the whole frame twice
# is wasted work on a dataset of this size.
null_counts = users_df.isnull().sum()
missing = pd.DataFrame({
    'Colonne': users_df.columns,
    'Manquantes': null_counts,
    'Pct': (null_counts / len(users_df) * 100).round(2),
}).sort_values('Pct', ascending=False)

# Only show the columns that actually have missing values.
missing[missing['Manquantes'] > 0]

In [None]:
# Work on an independent copy so the raw frame stays available for the
# before/after comparison in the summary.
users_clean = users_df.copy(deep=True)
print(f"üìã Initial: {users_clean.shape[0]:,} lignes")

In [None]:
# Remove duplicated users: keep the first row seen for each user_id.
dups = users_clean.duplicated(subset=['user_id']).sum()
if dups == 0:
    print("‚úÖ Aucun doublon")
else:
    users_clean = users_clean.drop_duplicates(subset=['user_id'])
    print(f"‚ùå {dups} doublons supprim√©s")

In [None]:
# Drop rows that have no user_id: they cannot be linked to anything else.
if not users_clean['user_id'].isnull().any():
    print("‚úÖ Tous les user_id pr√©sents")
else:
    before = len(users_clean)
    users_clean = users_clean.dropna(subset=['user_id'])
    print(f"‚ùå {before - len(users_clean)} sans user_id")

In [None]:
# Fill missing values in the numeric columns with 0 (columns absent from the
# dataset are skipped).
# NOTE(review): this also sets a missing average_stars to 0 — confirm that 0
# is an acceptable placeholder for a rating.
numeric_cols = ['review_count', 'useful', 'funny', 'cool', 'fans', 'average_stars']
for col in numeric_cols:
    if col not in users_clean.columns:
        continue
    users_clean[col] = users_clean[col].fillna(0)

# Fill missing values in the text column with a placeholder name.
if 'name' in users_clean.columns:
    users_clean['name'] = users_clean['name'].fillna('Unknown')

print("‚úÖ Valeurs manquantes trait√©es")

In [None]:
# Keep only the rows whose review_count is non-negative, and report how many
# rows were discarded.
if 'review_count' in users_clean.columns:
    before = len(users_clean)
    users_clean = users_clean[users_clean['review_count'].ge(0)]
    if len(users_clean) < before:
        print(f"‚ùå {before - len(users_clean)} review_count n√©gatifs")
    else:
        print("‚úÖ review_count valides")

In [None]:
# Keep only the rows whose average_stars lies in the closed interval [0, 5],
# and report how many rows were discarded.
if 'average_stars' in users_clean.columns:
    before = len(users_clean)
    users_clean = users_clean[users_clean['average_stars'].between(0, 5)]
    if len(users_clean) < before:
        print(f"‚ùå {before - len(users_clean)} average_stars invalides")
    else:
        print("‚úÖ average_stars valides (0-5)")

In [None]:
# Parse yelping_since into datetimes. Values that cannot be parsed become
# NaT (errors='coerce') and the rows are kept.
if 'yelping_since' in users_clean.columns:
    users_clean['yelping_since'] = pd.to_datetime(
        users_clean['yelping_since'],
        errors='coerce',
    )
    invalid = users_clean['yelping_since'].isnull().sum()
    if invalid == 0:
        print("‚úÖ Dates converties")
    else:
        print(f"‚ö†Ô∏è {invalid} dates invalides (conserv√©es comme NaT)")

In [None]:
# Cleaning summary: row counts before and after, rows removed, and the share
# of rows kept.
initial_rows = len(users_df)
final_rows = len(users_clean)
# Guard the ratio: an empty input would otherwise raise ZeroDivisionError.
kept_pct = (final_rows / initial_rows * 100) if initial_rows else 0.0

print("üìä R√âSUM√â")
print("=" * 50)
print(f"Initial:    {initial_rows:,}")
print(f"Final:      {final_rows:,}")
print(f"Supprim√©:   {initial_rows - final_rows:,}")
print(f"Conserv√©:   {kept_pct:.2f}%")

In [None]:
# Distribution of review_count, zoomed on the bulk of the users (values up
# to the 95th percentile).
if 'review_count' in users_clean.columns:
    upper = users_clean['review_count'].quantile(0.95)
    plt.figure(figsize=(12, 5))
    if upper > 0:
        # Restrict the 50 bins to the displayed interval. Binning the full
        # range and then cropping the axis leaves only a handful of bars
        # visible when the distribution has a long tail.
        plt.hist(users_clean['review_count'], bins=50, range=(0, upper), edgecolor='black')
        plt.xlim(0, upper)
    else:
        # The percentile is 0 or NaN (e.g. empty data): plot the full range
        # rather than setting degenerate axis limits.
        plt.hist(users_clean['review_count'], bins=50, edgecolor='black')
    plt.xlabel('Nombre d\'avis')
    plt.ylabel('Nombre d\'utilisateurs')
    plt.title('Distribution du Nombre d\'Avis par Utilisateur')
    plt.show()

In [None]:
# Distribution of the users' average star rating, in 30 bins.
if 'average_stars' in users_clean.columns:
    fig, ax = plt.subplots(figsize=(10, 5))
    users_clean['average_stars'].hist(ax=ax, bins=30, edgecolor='black')
    ax.set_xlabel('Note moyenne')
    ax.set_ylabel('Nombre d\'utilisateurs')
    ax.set_title('Distribution des Notes Moyennes des Utilisateurs')
    plt.show()

In [None]:
# Write the cleaned frame to parquet (snappy-compressed, without the index)
# and report the size of the resulting file in MB.
print("‚è≥ Sauvegarde...")
users_clean.to_parquet(
    CLEANED_DATA_PATH,
    compression='snappy',
    index=False,
)
print(f"‚úÖ Sauvegard√©: {CLEANED_DATA_PATH}")
print(f"üìä Taille: {CLEANED_DATA_PATH.stat().st_size / 1024**2:.2f} MB")

In [None]:
# Reload test: read the parquet file back and check that it holds the same
# number of rows as the frame that was saved. The success message is only
# printed once that check has passed.
test_df = pd.read_parquet(CLEANED_DATA_PATH)
if len(test_df) != len(users_clean):
    raise ValueError(
        f"Rechargement incorrect: {len(test_df):,} lignes lues, "
        f"{len(users_clean):,} attendues"
    )
print(f"‚úÖ Test r√©ussi: {len(test_df):,} lignes")

## ✅ SAE-98 Terminé

**Prochaines √©tapes:**
- SAE-67: Dashboard Profils Reviewers
- SAE-68: Dashboard Performance Établissements