# Product info data preprocess


## 1. Imports

In [21]:
import pandas as pd
import ast

## 2. Clean data, drop columns that will not be used

In [22]:
df_product_info = pd.read_csv('data/product_info.csv')
missing_percent = df_product_info.isnull().mean()
cols_to_drop = missing_percent[missing_percent > 0.3].index.tolist() + [
    'online_only', 'limited_edition', 'out_of_stock', 'sephora_exclusive',
    'size', 'variation_type', 'variation_value'
]
df_product_info = df_product_info.drop(columns=cols_to_drop)
print(f"Dropped columns: {list(cols_to_drop)}")

Dropped columns: ['variation_desc', 'value_price_usd', 'sale_price_usd', 'child_max_price', 'child_min_price', 'online_only', 'limited_edition', 'out_of_stock', 'sephora_exclusive', 'size', 'variation_type', 'variation_value']


## 3. Normalize the ingredients

In [23]:
def clean_ingredient_entry(entry):
    try:
        items = ast.literal_eval(entry) if isinstance(entry, str) else entry
        if not isinstance(items, list):
            return ""

        all_ingredients = []

        for item in items:
            if ',' not in item and ':' in item:
                continue

            split_ings = [ing.strip().rstrip('.') for ing in item.split(',')]
            all_ingredients.extend(split_ings)

        unique_ingredients = sorted(set(all_ingredients))
        return ', '.join(unique_ingredients)

    except Exception as e:
        print(f"Error parsing ingredients: {e}")
        return ""
    
df_product_info['ingredients'] = df_product_info['ingredients'].apply(clean_ingredient_entry)


In [24]:
product_info_cosmetics = df_product_info[df_product_info['primary_category'] != 'Fragrance']
product_info_cosmetics['product_name'] = (
    product_info_cosmetics['product_name'] + 
    ' (' + product_info_cosmetics['brand_name'] + ')'
)

print(product_info_cosmetics.head(5))


   product_id                                       product_name  brand_id  \
14    P476416  AFRICAN Beauty Butter- Intensive Dry Skin Trea...      6471   
15    P476418   African Beauty Butter Mini Gift Set (54 Thrones)      6471   
16    P476417  African Beauty Butter Collection Deluxe Tin (5...      6471   
17    P503832  Mini AFRICAN Beauty Butter- Intensive Dry Skin...      6471   
31    P443401  Blu Mediterraneo Mini Eau de Toilette Set (Acq...      5847   

        brand_name  loves_count  rating  reviews  \
14      54 Thrones        19028  4.3256    258.0   
15      54 Thrones         7526  3.5610     41.0   
16      54 Thrones         3741  4.2273     22.0   
17      54 Thrones         3392  4.5175    143.0   
31  Acqua di Parma         7290  3.3333     12.0   

                                          ingredients  price_usd  new  \
14  *Beta-Sitosterol, *Squalane, *Tocopherol (mixe...       38.0    0   
15  (Rose Absolute), *Beta-Sitosterol, *Squalane, ...       29.0    0   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_info_cosmetics['product_name'] = (


## 4. Save the processed data

In [25]:
product_info_cosmetics.dropna()
product_info_cosmetics.to_csv('data/product_info_cosmetics.csv', index=False)

In [26]:
df_product_info.dropna()
df_product_info.to_csv('data/product_info_processed.csv', index=False)