# Nettoyage des données ASOS

## Importer les bibliothèques

In [70]:
import pandas as pd
import numpy as np
import re
import ast
from urllib.parse import urlparse

In [13]:
# Load the raw ASOS product export.
df = pd.read_csv('../data/products_asos.csv')

# A notebook cell only renders its LAST bare expression, so a bare
# `df.head(10)` placed above `df.tail(10)` is evaluated and thrown away.
# Call display() explicitly so both ends of the frame are actually shown.
display(df.head(10))
display(df.tail(10))

Unnamed: 0,url,name,size,category,price,color,sku,description,images
30835,https://www.asos.com/asos-design/asos-design-o...,ASOS DESIGN one shoulder corsage sweetheart ne...,"UK 4 - Out of stock,UK 6 - Out of stock,UK 8 -...",ASOS DESIGN one shoulder corsage sweetheart ne...,66.0,BLACK,120641835.0,[{'Product Details': 'Dresses by ASOS DESIGNIf...,['https://images.asos-media.com/products/asos-...
30836,https://www.asos.com/pretty-lavish/pretty-lavi...,Pretty Lavish one shoulder ruched satin split ...,"UK 6,UK 8,UK 10 - Out of stock,UK 12 - Out of ...",Pretty Lavish one shoulder ruched satin split ...,75.0,Cool Gold,121423897.0,[{'Product Details': 'Dresses by Pretty Lavish...,['https://images.asos-media.com/products/prett...
30837,https://www.asos.com/asos-luxe/asos-luxe-weddi...,ASOS LUXE Wedding embellished high neck mini d...,"UK 4 - Out of stock,UK 6 - Out of stock,UK 8 -...",ASOS LUXE Wedding embellished high neck mini d...,165.0,Natural,121144566.0,[{'Product Details': 'Dresses by ASOS LuxeAisl...,['https://images.asos-media.com/products/asos-...
30838,https://www.asos.com/asos-design/asos-design-c...,ASOS DESIGN cotton shirred maxi smock dress in...,"UK 4 - Out of stock,UK 6 - Out of stock,UK 8,U...",ASOS DESIGN cotton shirred maxi smock dress in...,44.0,MULTI,122463785.0,[{'Product Details': 'Dresses by ASOS DESIGNTh...,['https://images.asos-media.com/products/asos-...
30839,https://www.asos.com/asos-design/asos-design-s...,ASOS DESIGN satin midi dress with cowl neck an...,"UK 4 - Out of stock,UK 6 - Out of stock,UK 8,U...",ASOS DESIGN satin midi dress with cowl neck an...,55.0,CHOCOLATE,120479167.0,[{'Product Details': 'Dresses by ASOS DESIGNAl...,['https://images.asos-media.com/products/asos-...
30840,https://www.asos.com/urban-revivo/urban-revivo...,Urban Revivo square neck mini dress in floral ...,"XS - UK 6 - Out of stock,S - UK 8 - Out of sto...",Urban Revivo square neck mini dress in floral ...,44.0,Multi,116745746.0,[{'Product Details': 'Mini dress by Urban Revi...,['https://images.asos-media.com/products/urban...
30841,https://www.asos.com/asos-design/asos-design-l...,ASOS DESIGN long sleeve maxi t-shirt dress in ...,"UK 4 - Out of stock,UK 6 - Out of stock,UK 8 -...",ASOS DESIGN long sleeve maxi t-shirt dress in ...,24.0,Black,1444255.0,[{'Product Details': 'Dress by ASOS DESIGN Act...,['https://images.asos-media.com/products/asos-...
30842,https://www.asos.com/asyou/asyou-layered-t-shi...,ASYOU layered t-shirt dress with focus graphic...,"UK 4 - Out of stock,UK 6,UK 8 - Out of stock,U...",ASYOU layered t-shirt dress with focus graphic...,22.99,Washed black,110783769.0,[{'Product Details': 'Dress by ASYOU Exclusive...,['https://images.asos-media.com/products/asyou...
30843,https://www.asos.com/miss-selfridge/miss-selfr...,Miss Selfridge Petite rib knit frill hem funne...,"UK 4 - Out of stock,UK 6 - Out of stock,UK 8 -...",Miss Selfridge Petite rib knit frill hem funne...,32.99,BLACK,116363729.0,[{'Product Details': 'Petite by Miss Selfridge...,['https://images.asos-media.com/products/miss-...
30844,https://www.asos.com/other-stories/other-stori...,& Other Stories plisse mesh midi dress in blac...,"XS - UK 4-6 - Out of stock,S - UK 8-10,M - UK ...",& Other Stories plisse mesh midi dress in blac...,65.0,Black and white,124159122.0,[{'Product Details': 'Dresses by & Other Stori...,['https://images.asos-media.com/products/other...


In [72]:
# Build a boolean mask marking rows in which every column is missing.
empty_rows = df.isnull().all(axis=1)
print(f"Nombre de lignes complètement vides: {empty_rows.sum()}")

if not empty_rows.any():
    print("Aucune ligne complètement vide trouvée")
else:
    # Report where the blank rows sit before discarding them.
    print("Index des lignes vides:")
    empty_indices = df.index[empty_rows].tolist()
    print(empty_indices)

    # Discard the rows that carry no value at all.
    df = df.dropna(how='all')
    print(f"Après suppression: {df.shape}")

Nombre de lignes complètement vides: 18
Index des lignes vides:
[13, 145, 150, 180, 201, 215, 279, 339, 358, 402, 481, 531, 597, 604, 667, 683, 690, 694]
Après suppression: (30827, 9)


In [73]:
# Rebind the working frame to an independent (deep) copy of itself.
df = df.copy(deep=True)
print(
    f"Dataset mis à jour: {df.shape}",
    "Lignes vides supprimées avec succès!",
    sep="\n",
)

Dataset mis à jour: (30827, 9)
Lignes vides supprimées avec succès!


## Vérifier les données

In [74]:
# Report the dimensions of the working frame.
print("Nombre de lignes:", df.shape[0])
print("Nombre de colonnes:", df.shape[1])

# Count missing values per column.
print("\nValeurs manquantes:")
print(df.isna().sum())

# Count rows that exactly repeat an earlier row.
print("\nLignes dupliquées:", df.duplicated().sum())

Nombre de lignes: 30827
Nombre de colonnes: 9

Valeurs manquantes:
url            0
name           0
size           0
category       0
price          0
color          0
sku            0
description    0
images         0
dtype: int64

Lignes dupliquées: 326


## Nettoyer les prix

In [75]:
# Helper that normalises a raw price value to a float
def clean_price(price):
    """Convert a raw scalar price to ``float``.

    Parameters
    ----------
    price : object
        A scalar price: either a real number or text such as ``"£12.99"``.

    Returns
    -------
    float or None
        The parsed price, or ``None`` when the value is missing or the
        remaining text cannot be parsed as a number.
    """
    if pd.isna(price):
        return None
    # Values that are already numeric are converted directly. Sending them
    # through str() + the regex below would corrupt any value whose repr uses
    # scientific notation (1e-05 -> "1e-05" -> "105") and drop the sign.
    # bool is excluded so True/False keep falling through to the text path.
    if isinstance(price, (int, float, np.integer, np.floating)) and not isinstance(price, bool):
        return float(price)
    # Strip everything except digits and dots (currency symbols, thousands
    # separators, spaces; note a leading minus sign in text is stripped too).
    price_clean = re.sub(r'[^0-9.]', '', str(price))
    try:
        return float(price_clean)
    except ValueError:
        # Empty or malformed remainder such as "" or "1.2.3".
        return None

# Run the cleaner over every raw price and store the result in a new column.
df['price_clean'] = df['price'].apply(clean_price)

# count() tallies the non-missing entries; isna().sum() tallies the rest.
print("Prix nettoyés:", df['price_clean'].count())
print("Prix invalides:", df['price_clean'].isna().sum())

Prix nettoyés: 30827
Prix invalides: 0


## Nettoyer les catégories

In [76]:
# Title-case the category labels, then trim surrounding whitespace.
category_titled = df['category'].str.title()
df['category_clean'] = category_titled.str.strip()

# Compare label cardinality before and after normalisation.
print("Catégories avant:", df['category'].nunique())
print("Catégories après:", df['category_clean'].nunique())

# Show the five most frequent normalised labels.
print("\nTop 5 catégories:")
print(df['category_clean'].value_counts().head(5))

Catégories avant: 29492
Catégories après: 29483

Top 5 catégories:
category_clean
Asos 4505 Icon Performance T-Shirt                            7
Barney'S Originals Emma Real Leather Jacket With Belt         6
Barbour Beadnell Wax Jacket In Navy                           5
Columbia Puffect Cropped Jacket In Black Exclusive At Asos    5
Berghaus Nula Micro Water Resistant Padded Jacket In Black    5
Name: count, dtype: int64
category_clean
Asos 4505 Icon Performance T-Shirt                            7
Barney'S Originals Emma Real Leather Jacket With Belt         6
Barbour Beadnell Wax Jacket In Navy                           5
Columbia Puffect Cropped Jacket In Black Exclusive At Asos    5
Berghaus Nula Micro Water Resistant Padded Jacket In Black    5
Name: count, dtype: int64


## Nettoyer les couleurs

In [77]:
# Title-case the color labels, then trim surrounding whitespace.
color_titled = df['color'].str.title()
df['color_clean'] = color_titled.str.strip()

# Compare label cardinality before and after normalisation.
print("Couleurs avant:", df['color'].nunique())
print("Couleurs après:", df['color_clean'].nunique())

# Show the five most frequent normalised labels.
print("\nTop 5 couleurs:")
print(df['color_clean'].value_counts().head(5))

Couleurs avant: 3636
Couleurs après: 2929

Top 5 couleurs:
color_clean
Black    6558
White    1723
Multi    1436
Pink     1422
Green    1035
Name: count, dtype: int64


## Supprimer les doublons

In [78]:
# Drop exact duplicate rows and renumber the index 0..n-1 in the same call.
print("Avant:", len(df))
df = df.drop_duplicates(ignore_index=True)
print("Après suppression des doublons:", len(df))

Avant: 30827
Après suppression des doublons: 30501
Après suppression des doublons: 30501


## Normaliser les couleurs en minuscules

In [79]:
# Overwrite the raw color column with a lower-cased, trimmed version.
color_lowered = df['color'].str.lower()
df['color'] = color_lowered.str.strip()
print(f"Couleurs uniques: {df['color'].nunique()}")

# Last expression of the cell: the five most frequent colors.
df['color'].value_counts().head(5)

Couleurs uniques: 2929


color
black    6507
white    1712
multi    1421
pink     1397
green    1020
Name: count, dtype: int64

## Normaliser les catégories en minuscules

In [80]:
# Overwrite the raw category column with a lower-cased, trimmed version.
category_lowered = df['category'].str.lower()
df['category'] = category_lowered.str.strip()
print(f"Catégories uniques: {df['category'].nunique()}")

# Last expression of the cell: the five most frequent categories.
df['category'].value_counts().head(5)

Catégories uniques: 29483


category
asos 4505 icon performance t-shirt                            7
barney's originals emma real leather jacket with belt         6
asos 4505 icon long sleeve run top                            5
barbour beadnell wax jacket in navy                           5
columbia puffect cropped jacket in black exclusive at asos    5
Name: count, dtype: int64

## Supprimer les doublons restants et sauvegarder les données nettoyées

In [81]:
# Final pass: drop exact duplicates once more, then write the result to disk.
print(f"Forme originale: {df.shape}")

# Keep the first occurrence of each row; later exact repeats are dropped.
df = df[~df.duplicated()]
print(f"Après suppression des doublons: {df.shape}")

# Closing summary.
print("\nNettoyage terminé!", f"Dataset final: {df.shape}", sep="\n")

# Persist the cleaned dataset without writing the index as a column.
df.to_csv('../data/products_asos_cleaned.csv', index=False)
print("Dataset nettoyé sauvegardé: products_asos_cleaned.csv")

Forme originale: (30501, 12)
Après suppression des doublons: (30501, 12)

Nettoyage terminé!
Dataset final: (30501, 12)
Après suppression des doublons: (30501, 12)

Nettoyage terminé!
Dataset final: (30501, 12)
Dataset nettoyé sauvegardé: products_asos_cleaned.csv
Dataset nettoyé sauvegardé: products_asos_cleaned.csv
