In [None]:
# openpyxl is the optional package pandas needs to read the .xlsx review file
# loaded later in this notebook; without it pd.read_excel raises ImportError.
%pip install openpyxl

In [9]:
import sys
from pathlib import Path
import pandas as pd

# Add the project root (two levels above the working directory) to sys.path so
# miguellib can be imported. The membership check keeps the cell idempotent:
# re-running it no longer prepends a duplicate entry each time.
project_root = Path().resolve().parent.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from miguellib.datasets.utils import load_df, validate_data, flag_non_ingredients, standardize_data, apply_review_actions

# Define run_pipeline2 locally - copy from (2)pipeline.py
def run_pipeline2(raw_path, processed_path=None, reviewed_path=None, flag_non_ingredients_rows=True):
    """
    Run the cosmetics preprocessing pipeline end to end and save the result.

    The raw file is loaded, validated and standardized, optionally scanned
    with ``flag_non_ingredients``, optionally patched with manual review
    actions, then standardized and validated a second time before being
    written to CSV.

    Args:
        raw_path: Location of the raw dataset, passed to ``load_df``.
        processed_path: Destination CSV. When None, the file is written as
            ``cosmetics_processed.csv`` in the same directory as ``raw_path``.
        reviewed_path: Manual review file handed to ``apply_review_actions``;
            the review step is skipped when None.
        flag_non_ingredients_rows: When truthy, run ``flag_non_ingredients``
            on the standardized data.

    Returns:
        A ``(df, flagged)`` tuple: the processed data and the result of
        ``flag_non_ingredients`` (None when flagging was skipped).
    """
    # Steps 1-3: load the raw data, check it, and normalize it.
    cosmetics = load_df(raw_path)
    validate_data(cosmetics)
    cosmetics = standardize_data(cosmetics)

    # Step 4: flagging runs on the standardized data, before any review actions.
    flagged = flag_non_ingredients(cosmetics) if flag_non_ingredients_rows else None

    # Step 5: apply manual review decisions only when a review file is given.
    if reviewed_path is not None:
        cosmetics = apply_review_actions(cosmetics, reviewed_path)
        print("Manual review actions applied.")

    # Steps 6-7: standardize and validate once more after the review step.
    cosmetics = standardize_data(cosmetics)
    validate_data(cosmetics)

    # Step 8: fall back to a file next to the raw data when no path is given.
    processed_path = (
        Path(raw_path).parent / "cosmetics_processed.csv"
        if processed_path is None
        else processed_path
    )
    cosmetics.to_csv(processed_path, index=False)
    print(f"Processed dataset saved to: {processed_path}")

    return cosmetics, flagged

In [10]:
# All input and output files are resolved against the current working directory.
BASE_DIR = Path().resolve()
RAW_PATH = BASE_DIR.joinpath("cosmetics_raw.csv")
PROCESSED_PATH = BASE_DIR.joinpath("cosmetics_processed.csv")
MANUAL_REVIEW_PATH = BASE_DIR.joinpath("flagged_ingredients_reviewed.xlsx")

In [11]:
# Load the raw dataset and report its dimensions.
df_raw = load_df(RAW_PATH)
print("Shape:", df_raw.shape)
# Keep the preview as the cell's last expression: a bare expression in the
# middle of a cell is evaluated and discarded, so a head() call placed before
# the print never rendered.
df_raw.head()

Shape: (1472, 11)


In [12]:
# Validate the raw frame on its own before running the full pipeline.
validate_data(df_raw)

Validation passed.


In [13]:
# Run the full pipeline on the raw file. No manual review file is supplied
# here, and the output location is left to the pipeline's default.
df_processed, flagged = run_pipeline2(
    raw_path=RAW_PATH,
    processed_path=None,
    reviewed_path=None,
    flag_non_ingredients_rows=True,
)

Validation passed.
Found 198 suspicious entries in Ingredients:

Index 7 | Brand: Drunk Elephant | Name: Virgin Marula Luxury Facial Oil
Ingredients: 100% unrefined sclerocraya birrea (marula) kernel oil.

Index 26 | Brand: Drunk Elephant | Name: Virgin Marula Luxury Facial Oil Mini
Ingredients: 100% unrefined sclerocraya birrea (marula) kernel oil.

Index 32 | Brand: Olehenriksen | Name: Sheer Transformation Perfecting Moisturizer
Ingredients: visit the olehenriksen boutique

Index 33 | Brand: Josie Maran | Name: 100 percent Pure Argan Oil
Ingredients: organic argania spinosa (argan) kernel oil*. *organic. **natural.

Index 39 | Brand: Dr. Jart+ | Name: Premium Beauty Balm SPF 45
Ingredients: #name?

Index 55 | Brand: Shiseido | Name: Eudermine Revitalizing Essence
Ingredients: visit the shiseido boutique

Index 90 | Brand: Origins | Name: High-Potency Night-a-Mins Oil-Free Resurfacing Cream with Fruit-Derived AHAs
Ingredients: * essential oil

Index 93 | Brand: Josie Maran | Name: Ar

In [14]:
print("Flagged entries preview:")
# End the cell with the frame itself so Jupyter renders it as a rich table
# instead of the wrapped plain-text dump that print() produces.
flagged[['Brand', 'Name', 'Ingredients']].head(10)

Flagged entries preview:
             Brand                                               Name  \
7   Drunk Elephant                    Virgin Marula Luxury Facial Oil   
26  Drunk Elephant               Virgin Marula Luxury Facial Oil Mini   
32    Olehenriksen        Sheer Transformation Perfecting Moisturizer   
33     Josie Maran                         100 percent Pure Argan Oil   
39       Dr. Jart+                         Premium Beauty Balm SPF 45   
55        Shiseido                     Eudermine Revitalizing Essence   
90         Origins  High-Potency Night-a-Mins Oil-Free Resurfacing...   
93     Josie Maran                     Argan Daily Moisturizer SPF 47   
95      Jack Black  Double-Duty Face Moisturizer Broad Spectrum SP...   
98          La Mer                    The Moisturizing Cool Gel Cream   

                                          Ingredients  
7   100% unrefined sclerocraya birrea (marula) ker...  
26  100% unrefined sclerocraya birrea (marula) ker...  
32 

In [15]:
# Load the manually reviewed flags from the Excel workbook. Reading .xlsx needs
# the optional openpyxl package; pandas raises ImportError when it is missing.
df_reviewed = pd.read_excel(MANUAL_REVIEW_PATH)

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

In [16]:
print("Sample manual review actions:")
# Leave the frame as the cell's last expression so it renders as a rich table
# rather than plain text from print().
df_reviewed[['Brand', 'Name', 'Action', 'Fill_in_Ingredients']].head(10)

Sample manual review actions:


NameError: name 'df_reviewed' is not defined

In [17]:
# Attach the reviewer's Action / Fill_in_Ingredients columns to each product,
# matching on (Brand, Name). Copies of those columns left behind by an earlier
# run of this cell are dropped first; without that, re-running the cell would
# produce suffixed Action_x / Action_y columns instead of a clean merge.
# NOTE(review): a left merge repeats a product row for every matching review
# row, so this assumes (Brand, Name) is unique in df_reviewed — confirm.
df_processed = (
    df_processed
    .drop(columns=['Action', 'Fill_in_Ingredients'], errors='ignore')
    .merge(
        df_reviewed[['Brand', 'Name', 'Action', 'Fill_in_Ingredients']],
        on=['Brand', 'Name'],
        how='left',
    )
)

NameError: name 'df_reviewed' is not defined

In [None]:
# Standardize again now that the review columns have been merged in.
# NOTE(review): assumes standardize_data is safe to re-apply to data that was
# already standardized (run_pipeline2 itself calls it twice) — confirm.
df_processed = standardize_data(df_processed) 

In [None]:
# Re-check the merged, re-standardized frame before it is written to disk.
validate_data(df_processed)

Validation passed.


In [None]:
# Write the final frame to PROCESSED_PATH without the index column, then
# report where it went. With the paths configured in this notebook this is the
# same file run_pipeline2 wrote by default, so it is overwritten here.
df_processed.to_csv(PROCESSED_PATH, index=False)
print(f"Processed dataset saved to: {PROCESSED_PATH}")

Processed dataset saved to: /Users/sabinabacaoanu/SkinCares/miguellib/datasets/../../miguellib/datasets/cosmetics_processed.csv
