In [None]:
import pandas as pd
import numpy as np
import ast

# Input: CSV file loaded into `df` by the read cell below
# (relative path, resolved against the kernel's working directory).
RAW_CSV = "../data/products.csv"
# Output: destination of the cleaned DataFrame written by the last cell.
FINAL_CSV = "../data/products_clean.csv"

In [None]:
# Read the CSV at RAW_CSV into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer=RAW_CSV)

Inspect the dataset: shape, sample rows, column types, missing values, and duplicate IDs

In [None]:
# Dimensions of the loaded frame as (rows, columns).
print(f"Shape: {df.shape}")

# Preview the first three rows using the notebook's rich rendering.
display(df.head(3))

# Basic info
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

# Missing values (count of nulls per column)
print(df.isnull().sum())

# Check duplicates: rows whose uniq_id repeats an earlier row's uniq_id
print("\nDuplicate uniq_id rows:", df.duplicated(subset="uniq_id").sum())

Prepare the dataset: remove duplicates, fill missing values, parse list columns, and normalize text

In [None]:
# De-duplicate on uniq_id: the first row seen for each id is kept and any
# later rows carrying the same id are discarded.
df = df.drop_duplicates(subset=["uniq_id"], keep="first")
print("After removing duplicates:", df.shape)

In [None]:
# fill missing description with title
# Vectorized replacement for a row-wise apply(): fillna() with a Series takes,
# for each missing description, the title found at the same index label.
# Rows where the title is missing as well stay missing.
df['description'] = df['description'].fillna(df['title'])

In [None]:
# fill price with median
# Remove every "$" and "," from string prices, then convert the column to float.
# The pattern is a raw string: "\$" is not a valid Python string escape, and a
# non-raw literal triggers an invalid-escape warning on modern Python versions.
# NOTE(review): presumably prices look like "$1,234.56" — confirm against the data.
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)
# median() skips missing values by default, so it reflects only known prices.
median_price = df['price'].median()
df['price'] = df['price'].fillna(median_price)

In [None]:
# Descriptive columns whose missing entries are replaced by the "Unknown" placeholder.
unknown_fill_cols = [
    'manufacturer',
    'package_dimensions',
    'country_of_origin',
    'material',
    'color',
]
for fill_col in unknown_fill_cols:
    df[fill_col] = df[fill_col].fillna("Unknown")

In [None]:
# parse list columns (convert to python list)
def parse_list_column(col):
    """Convert a Series of stringified Python literals into Python objects.

    Each cell is handled as follows:

    * a missing value (``None``/``NaN``) becomes an empty list;
    * a ``list`` or ``tuple`` is returned as a list without re-parsing, so
      running the parsing cell a second time on an already-parsed column is
      safe instead of raising;
    * anything else is passed to ``ast.literal_eval``.

    Parameters
    ----------
    col : pandas.Series
        Column whose cells hold literal strings such as ``"['a', 'b']"``.

    Returns
    -------
    pandas.Series
        Series with the same index as ``col`` holding the parsed values.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a cell is not a valid
        Python literal.
    """
    def _parse_cell(value):
        # Check containers first: pd.isnull() on a list returns an array,
        # which cannot be used as a plain truth value.
        if isinstance(value, (list, tuple)):
            return list(value)
        if pd.isnull(value):
            return []
        return ast.literal_eval(value)

    return col.apply(_parse_cell)

# Both columns are converted from their string form with parse_list_column.
for list_col in ('categories', 'images'):
    df[list_col] = parse_list_column(df[list_col])

In [None]:
# Normalize the free-text columns: trim leading/trailing whitespace first,
# then lowercase, in a single chained pass per column.
text_cols = ['title', 'description', 'brand', 'manufacturer', 'material', 'color']
for col in text_cols:
    df[col] = df[col].str.strip().str.lower()

In [None]:
# Write the cleaned frame to FINAL_CSV; index=False leaves the row index
# out of the file.
df.to_csv(path_or_buf=FINAL_CSV, index=False)
print("Clean dataset saved to:", FINAL_CSV)