In [1]:
import pandas as pd
from pathlib import Path
from miguellib.datasets.utils import load_df, validate_data, flag_non_ingredients, standardize_data
from miguellib.datasets.preprocessing import run_pipeline

In [2]:
# Dataset locations used throughout the notebook.
#
# BASE_DIR is the kernel's current working directory (Path() is "."), so the
# relative hops below only land on miguellib/datasets when the kernel is
# started from the folder this notebook expects -- TODO confirm the intended
# launch directory.
BASE_DIR = Path().resolve()

# Single definition of the datasets folder; every file path derives from it so
# the directory chain is written (and can be changed) in exactly one place.
DATASETS_DIR = BASE_DIR / ".." / ".." / "miguellib" / "datasets"

RAW_PATH = DATASETS_DIR / "cosmetics_raw.csv"
PROCESSED_PATH = DATASETS_DIR / "cosmetics_processed.csv"
MANUAL_REVIEW_PATH = DATASETS_DIR / "flagged_ingredients_reviewed.xlsx"

In [3]:
# Load the raw dataset and show its size plus a preview of the first rows.
df_raw = load_df(RAW_PATH)
print("Shape:", df_raw.shape)

# Keep the preview as the cell's LAST expression: a bare expression in the
# middle of a cell is evaluated and thrown away, so the table never rendered.
df_raw.head()

Shape: (1472, 11)


In [4]:
# Validate the raw frame before any processing. Per the recorded output this
# prints "Validation passed." on success; the actual checks live in
# miguellib.datasets.utils.validate_data and are not visible from this notebook.
validate_data(df_raw)

Validation passed.


In [5]:
# Run the preprocessing pipeline on the raw CSV and keep both results:
# the processed frame and the rows flagged as suspicious ingredient lists.
# NOTE(review): processed_path=None / reviewed_path=None presumably mean
# "do not save" and "do not apply a review file" -- confirm in run_pipeline.
pipeline_options = dict(
    raw_path=RAW_PATH,
    processed_path=None,
    reviewed_path=None,
    flag_non_ingredients_rows=True,
)
df_processed, flagged = run_pipeline(**pipeline_options)

Validation passed.
Found 198 suspicious entries in Ingredients:

Index 7 | Brand: Drunk Elephant | Name: Virgin Marula Luxury Facial Oil
Ingredients: 100% unrefined sclerocraya birrea (marula) kernel oil.

Index 26 | Brand: Drunk Elephant | Name: Virgin Marula Luxury Facial Oil Mini
Ingredients: 100% unrefined sclerocraya birrea (marula) kernel oil.

Index 32 | Brand: Olehenriksen | Name: Sheer Transformation Perfecting Moisturizer
Ingredients: visit the olehenriksen boutique

Index 33 | Brand: Josie Maran | Name: 100 percent Pure Argan Oil
Ingredients: organic argania spinosa (argan) kernel oil*. *organic. **natural.

Index 39 | Brand: Dr. Jart+ | Name: Premium Beauty Balm SPF 45
Ingredients: #name?

Index 55 | Brand: Shiseido | Name: Eudermine Revitalizing Essence
Ingredients: visit the shiseido boutique

Index 90 | Brand: Origins | Name: High-Potency Night-a-Mins Oil-Free Resurfacing Cream with Fruit-Derived AHAs
Ingredients: * essential oil

Index 93 | Brand: Josie Maran | Name: Ar

In [6]:
# Show the first ten flagged rows, restricted to the identifying columns.
flagged_preview_cols = ['Brand', 'Name', 'Ingredients']
flagged_preview = flagged[flagged_preview_cols].head(10)

print("Flagged entries preview:")
print(flagged_preview)

Flagged entries preview:
             Brand                                               Name  \
7   Drunk Elephant                    Virgin Marula Luxury Facial Oil   
26  Drunk Elephant               Virgin Marula Luxury Facial Oil Mini   
32    Olehenriksen        Sheer Transformation Perfecting Moisturizer   
33     Josie Maran                         100 percent Pure Argan Oil   
39       Dr. Jart+                         Premium Beauty Balm SPF 45   
55        Shiseido                     Eudermine Revitalizing Essence   
90         Origins  High-Potency Night-a-Mins Oil-Free Resurfacing...   
93     Josie Maran                     Argan Daily Moisturizer SPF 47   
95      Jack Black  Double-Duty Face Moisturizer Broad Spectrum SP...   
98          La Mer                    The Moisturizing Cool Gel Cream   

                                          Ingredients  
7   100% unrefined sclerocraya birrea (marula) ker...  
26  100% unrefined sclerocraya birrea (marula) ker...  
32 

In [7]:
# Load the manually reviewed workbook of flagged ingredient entries.
# Later cells read its Brand, Name, Action and Fill_in_Ingredients columns.
# Reading .xlsx through pandas needs an Excel engine (e.g. openpyxl) installed.
df_reviewed = pd.read_excel(MANUAL_REVIEW_PATH)

In [8]:
# Show the first ten manual-review decisions alongside their product keys.
review_preview_cols = ['Brand', 'Name', 'Action', 'Fill_in_Ingredients']
review_preview = df_reviewed[review_preview_cols].head(10)

print("Sample manual review actions:")
print(review_preview)

Sample manual review actions:
            Brand                                               Name   Action  \
0  DRUNK ELEPHANT                    Virgin Marula Luxury Facial Oil  replace   
1  DRUNK ELEPHANT               Virgin Marula Luxury Facial Oil Mini  replace   
2    OLEHENRIKSEN       Sheer Transformation® Perfecting Moisturizer      NaN   
3     JOSIE MARAN                         100 percent Pure Argan Oil     keep   
4       DR. JART+                         Premium Beauty Balm SPF 45  replace   
5        SHISEIDO                     Eudermine Revitalizing Essence  replace   
6         ORIGINS  High-Potency Night-a-Mins™ Oil-Free Resurfacin...      NaN   
7     JOSIE MARAN                     Argan Daily Moisturizer SPF 47   remove   
8      JACK BLACK  Double-Duty Face Moisturizer Broad Spectrum SP...  replace   
9          LA MER                    The Moisturizing Cool Gel Cream  replace   

                                 Fill_in_Ingredients  
0  Sclerocarya Birrea S

In [9]:
# Attach the manual-review decisions (Action / Fill_in_Ingredients) to every
# processed product row.
#
# The review sheet spells its keys differently from the pipeline output shown
# above: Brand is upper-case there ("DRUNK ELEPHANT" vs "Drunk Elephant") and
# Name keeps trademark symbols ("Sheer Transformation® ..."). An exact merge on
# ['Brand', 'Name'] therefore silently leaves those rows unmatched, so the join
# is done on a normalized key instead.
KEY_COLS = ['Brand', 'Name']
REVIEW_COLS = ['Action', 'Fill_in_Ingredients']


def _make_review_key(frame):
    """Return a normalized "brand|name" Series used only for joining.

    Normalization: drop ®/™/© symbols, collapse runs of whitespace, strip,
    and casefold. A row with a missing Brand or Name gets a missing key.
    """
    normalized = [
        frame[col]
        .astype("string")
        .str.replace(r"[®™©]", "", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
        .str.casefold()
        for col in KEY_COLS
    ]
    return normalized[0].str.cat(normalized[1], sep="|")


# One review decision per key: rows without a usable key are dropped and
# duplicate keys keep their first decision, so the left merge cannot multiply
# product rows.
review_lookup = (
    df_reviewed[KEY_COLS + REVIEW_COLS]
    .assign(_review_key=_make_review_key)
    .dropna(subset=['_review_key'])
    .drop_duplicates(subset='_review_key', keep='first')
    .drop(columns=KEY_COLS)
)

n_rows_before_merge = len(df_processed)
df_processed = (
    df_processed
    # Dropping review columns left by a previous run keeps the cell re-runnable
    # (otherwise pandas would emit Action_x / Action_y on the second run).
    .drop(columns=REVIEW_COLS + ['_review_key'], errors='ignore')
    .assign(_review_key=_make_review_key)
    .merge(review_lookup, on='_review_key', how='left')
    .drop(columns='_review_key')
)
assert len(df_processed) == n_rows_before_merge, "left merge must not add or drop rows"

In [10]:
# Standardize the merged frame with the project helper. What gets standardized
# is defined in miguellib.datasets.utils.standardize_data (not visible here).
# Note that this rebinds df_processed, so the pre-standardization frame is
# only recoverable by re-running the earlier cells.
df_processed = standardize_data(df_processed) 

In [11]:
# Re-run the same validation on the merged and standardized frame before it is
# written to disk; the recorded output shows "Validation passed.".
validate_data(df_processed)

Validation passed.


In [12]:
# Persist the final frame as CSV (without the index column) and report where
# it was written.
df_processed.to_csv(path_or_buf=PROCESSED_PATH, index=False)
print(f"Processed dataset saved to: {PROCESSED_PATH}")

Processed dataset saved to: /Users/cayetanah/Downloads/SkinCares/miguellib/datasets/../../miguellib/datasets/cosmetics_processed.csv
