We canonicalize and tokenize ingredient lists because raw INCI text contains inconsistent naming (synonyms, formatting variants) that would fragment features and weaken TF-IDF. In this notebook we briefly explore the most common ingredient tokens and edge cases, then define a synonym → canonical dictionary and apply it to produce cleaned ingredient tokens per product for downstream feature engineering.

In [None]:
import pandas as pd
import numpy as np
import re
import json
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from collections import Counter
import matplotlib.pyplot as plt


df = pd.read_csv("cosmetics_processed.csv")

In [3]:
df["Ingredients"].head(10)

0    algae (seaweed) extract, mineral oil, petrolat...
1    galactomyces ferment filtrate (pitera), butyle...
2    water, dicaprylyl carbonate, glycerin, ceteary...
3    algae (seaweed) extract, cyclopentasiloxane, p...
4    water, snail secretion filtrate, phenyl trimet...
5    water, saccharomyces/camellia sinensis leaf/cl...
6    water, glycerin, caprylic/ capric triglyceride...
7    100% unrefined sclerocraya birrea (marula) ker...
8    water, glycerin, cyclohexasiloxane, squalane, ...
9    algae (seaweed) extract, mineral oil, petrolat...
Name: Ingredients, dtype: object

In [4]:
def split_tokens(s):
    if pd.isna(s) or not str(s).strip():
        return []
    return [t.strip().lower() for t in str(s).split(",") if t.strip()]


all_tokens = []
for s in df["Ingredients"]:
    all_tokens.extend(split_tokens(s))

cnt = Counter(all_tokens)

print("Products:", len(df))
print("Total tokens (with repeats):", len(all_tokens))
print("Unique tokens:", len(cnt))

cnt.most_common(30)

Products: 1472
Total tokens (with repeats): 45812
Unique tokens: 6143


[('water', 995),
 ('glycerin', 980),
 ('phenoxyethanol', 806),
 ('butylene glycol', 788),
 ('disodium edta', 547),
 ('sodium hyaluronate', 476),
 ('caprylyl glycol', 461),
 ('dimethicone', 442),
 ('xanthan gum', 416),
 ('tocopheryl acetate', 415),
 ('ethylhexylglycerin', 398),
 ('citric acid', 347),
 ('tocopherol', 313),
 ('caprylic/capric triglyceride', 310),
 ('potassium sorbate', 301),
 ('carbomer', 297),
 ('sodium hydroxide', 278),
 ('fragrance', 270),
 ('sodium benzoate', 260),
 ('limonene', 249),
 ('propanediol', 245),
 ('silica', 242),
 ('linalool', 240),
 ('glyceryl stearate', 239),
 ('pentylene glycol', 226),
 ('caffeine', 211),
 ('polysorbate 20', 207),
 ('squalane', 206),
 ('1', 198),
 ('cetearyl alcohol', 195)]

In [5]:
# tokens containing parentheses
paren = [t for t in cnt if "(" in t or ")" in t]
print("Tokens with parentheses:", len(paren))
paren[:30]

Tokens with parentheses: 1571


['algae (seaweed) extract',
 'citrus aurantifolia (lime) extract',
 'sesamum indicum (sesame) seed oil',
 'eucalyptus globulus (eucalyptus) leaf oil',
 'sesamum indicum (sesame) seed powder',
 'medicago sativa (alfalfa) seed powder',
 'helianthus annuus (sunflower) seedcake',
 'prunus amygdalus dulcis (sweet almond) seed meal',
 'galactomyces ferment filtrate (pitera)',
 'butyrospermum parkii (shea butter)',
 'citrus aurantifolia (lime) peel extract',
 'glycine soja (soybean) seed extract',
 'citrus medica limonum (lemon) peel oil',
 'citrus aurantium bergamia (bergamot) fruit oil',
 'citrus aurantium dulcis (orange) peel oil',
 'citrus aurantifolia (lime) oil',
 'vitis vinifera (grape) seed oil',
 'persea gratissima (avocado) oil',
 'citrus grandis (grapefruit) peel oil',
 'olea europaea (olive) leaf extract',
 'glycine soja (soybean) sprout extract',
 'curcuma longa (turmeric) root extract',
 'chlorhexidine digluconate. may contain: iron oxides (ci 77492',
 'ci 77499).',
 'pistacia l

In [6]:
# This finds ingredient tokens that include extra notes in parentheses (e.g., "aqua (water)") and records a clean version with the parentheses removed.
# Output is a list of (original_token, cleaned_token) pairs so we can decide which mappings to add to the synonym → canonical dictionary.
def strip_parentheses(t):
    return re.sub(r"\s*\([^)]*\)", "", t).strip()


paren_pairs = []
for t in paren[:5000]:
    stripped = strip_parentheses(t)
    if stripped and stripped != t:
        paren_pairs.append((t, stripped))

paren_pairs[:30]

[('algae (seaweed) extract', 'algae extract'),
 ('citrus aurantifolia (lime) extract', 'citrus aurantifolia extract'),
 ('sesamum indicum (sesame) seed oil', 'sesamum indicum seed oil'),
 ('eucalyptus globulus (eucalyptus) leaf oil', 'eucalyptus globulus leaf oil'),
 ('sesamum indicum (sesame) seed powder', 'sesamum indicum seed powder'),
 ('medicago sativa (alfalfa) seed powder', 'medicago sativa seed powder'),
 ('helianthus annuus (sunflower) seedcake', 'helianthus annuus seedcake'),
 ('prunus amygdalus dulcis (sweet almond) seed meal',
  'prunus amygdalus dulcis seed meal'),
 ('galactomyces ferment filtrate (pitera)', 'galactomyces ferment filtrate'),
 ('butyrospermum parkii (shea butter)', 'butyrospermum parkii'),
 ('citrus aurantifolia (lime) peel extract',
  'citrus aurantifolia peel extract'),
 ('glycine soja (soybean) seed extract', 'glycine soja seed extract'),
 ('citrus medica limonum (lemon) peel oil', 'citrus medica limonum peel oil'),
 ('citrus aurantium bergamia (bergamot

In [7]:
# tokens with strange punctuation or double spaces
weird = [
    t
    for t in cnt
    if "  " in t or t.endswith(".") or t.endswith("-") or t.startswith("-")
]
print("Weird tokens:", len(weird))
weird[:30]

Weird tokens: 521


['alcohol denat.',
 'fragrance.',
 'sorbic acid.',
 'symphytum officinale callus culture extract.',
 'ci 77499).',
 'phenoxyethanol. *hadasei-3.',
 'ethylhexylglycerin.',
 '100% unrefined sclerocraya birrea (marula) kernel oil.',
 'caprylyl glycol.',
 'citral.',
 'sunflower seed oil.',
 'linalool. *napiers aqua formula. **napiers original formula. +fragrances of natural origin.',
 'ci 60725 (violet 2).',
 'phenoxyethanol.',
 'geraniol.',
 'ci 77492. *pitera™.',
 'iron oxides.',
 'limonene. *napiers original formula. **napiers moisture formula. ***fragrances of natural origin.',
 'ci 77891 (titanium dioxide).',
 'organic argania spinosa (argan) kernel oil*. *organic. **natural.',
 'titanium dioxide].',
 'titanium dioxide.',
 'blue 1 (ci 42090).',
 'chlorella vulgaris extract.',
 'moringa pterygosperma seed oil.',
 'sodium anisate.',
 'carmine (ci 75470).',
 'chlorphenesin.',
 'eugenol*. *plant origin.',
 'nitrogen. *plant origin.']

In [10]:
def normalize_basic(t):
    t = t.lower().strip()
    t = re.sub(r"\s+", " ", t)
    t = t.strip(" .-–—_")
    return t


# group tokens by their "basic normalized" form
groups = {}
for t in cnt:
    key = normalize_basic(t)
    groups.setdefault(key, []).append(t)

# show keys that have multiple variants
multi = [(k, v) for k, v in groups.items() if len(v) > 1]
multi[:30]

[('glycerin', ['glycerin', 'glycerin.']),
 ('sodium gluconate', ['sodium gluconate', 'sodium gluconate.']),
 ('copper gluconate', ['copper gluconate', 'copper gluconate.']),
 ('calcium gluconate', ['calcium gluconate', 'calcium gluconate.']),
 ('zinc gluconate', ['zinc gluconate', 'zinc gluconate.']),
 ('water', ['water', 'water.']),
 ('beta-carotene', ['beta-carotene', 'beta-carotene.']),
 ('citric acid', ['citric acid', 'citric acid.']),
 ('cyanocobalamin', ['cyanocobalamin', 'cyanocobalamin.']),
 ('limonene', ['limonene', 'limonene.']),
 ('geraniol', ['geraniol', 'geraniol.']),
 ('linalool', ['linalool', 'linalool.']),
 ('citronellol', ['citronellol', 'citronellol.']),
 ('benzyl salicylate', ['benzyl salicylate', 'benzyl salicylate.']),
 ('citral', ['citral', 'citral.']),
 ('sodium benzoate', ['sodium benzoate', 'sodium benzoate.']),
 ('alcohol denat', ['alcohol denat.', 'alcohol denat']),
 ('fragrance', ['fragrance.', 'fragrance']),
 ('pentylene glycol', ['pentylene glycol', 'penty

In [None]:
mask = df["Ingredients"].str.contains("100%", na=False)
df_percent_rows = df.loc[mask, ["Brand", "Name", "Ingredients"]]

df_percent_rows.head(20)

NameError: name 'df_step2' is not defined

## Starting to tweak the ingredient words:


### Cleans the raw Ingredients text into a consistent comma-separated string by removing wrappers/marketing noise, normalizing spacing/dashes, and keeping meaningful ingredient info

In [None]:
# base cleaner (keeps meaningful parentheses like "(ci 77891)",
# but removes wrappers like "+/-", stars, "may contain", marketing sentences, etc.)
def clean_ingredients(ing: str) -> str:
    if not isinstance(ing, str) or not ing.strip():
        return ""

    parts = [p.strip() for p in ing.split(",") if p.strip()]
    cleaned = []

    for raw in parts:
        t = raw.lower().strip()
        t = re.sub(r"\s+", " ", t)
        t = t.replace("–", "-").replace("—", "-")

        # remove leading bullets / stars / simple prefixes
        t = re.sub(r"^[\*\u2022]+\s*", "", t)
        t = re.sub(r"^[\+\-]+\s*", "", t)

        # remove broken "+/-" header/prefix variants

        t = re.sub(
            r"^[\[\(]?\s*[\+\/-]{1,6}\s*(?:may contain|peut contenir)?\s*[\]\):\-]*\s*",
            "",
            t,
        )
        t = re.sub(r"^may contain\s*:?\s*", "", t)
        t = re.sub(r"^peut contenir\s*:?\s*", "", t)

        t = t.strip("[]{}")

        #  "(-)-alpha-bisabolol" -> "alpha-bisabolol"
        t = re.sub(r"^\(\s*[-+]\s*\)\s*-\s*", "", t)

        t = re.sub(r"^\(\s*(?:may contain|\+\/-|\+|\-)\s*\)\s*", "", t)

        m = re.match(r"^\(([^()]{1,80})\)$", t)
        if m:
            t = m.group(1).strip()

        t = re.sub(r"\s*please be aware.*$", "", t)
        t = re.sub(r"\s*ingredient lists may change.*$", "", t)
        t = re.sub(r"\s*please refer.*$", "", t)
        t = re.sub(r"\[\s*\+\/-\s*:.*$", "", t)
        t = re.sub(r"\(\s*\+\/\s*\)\s*:.*$", "", t)
        t = re.sub(r"^\s*may contain\s*:?\s*", "", t)
        t = t.strip(" .;:-")

        if t.count("(") > t.count(")"):
            t = t.split("(", 1)[0].strip()

        t = re.sub(r"[\*\.;:]+$", "", t).strip()
        t = t.strip(" -_")

        # drop obvious marketing / non-ingredient sentences
        if re.search(r"\bpercent\b", t) and any(w in t for w in ["off", "save", "discount", "free shipping"]):
            continue
        if len(t) > 120 and (":" in t or "helps" in t or "step" in t):
            continue
        if "division:" in t or "active ingredients:" in t:
            continue
        if t.startswith("and ") or t.startswith("all products are"):
            continue

        if not t:
            continue

        cleaned.append(t)

    return ", ".join(cleaned)


df_step2 = df.copy()
df_step2["Ingredients"] = df_step2["Ingredients"].apply(clean_ingredients)
df_step2["Ingredients"].head(10)

0    algae (seaweed) extract, mineral oil, petrolat...
1    galactomyces ferment filtrate (pitera), butyle...
2    water, dicaprylyl carbonate, glycerin, ceteary...
3    algae (seaweed) extract, cyclopentasiloxane, p...
4    water, snail secretion filtrate, phenyl trimet...
5    water, saccharomyces/camellia sinensis leaf/cl...
6    water, glycerin, caprylic/ capric triglyceride...
7    100% unrefined sclerocraya birrea (marula) ker...
8    water, glycerin, cyclohexasiloxane, squalane, ...
9    algae (seaweed) extract, mineral oil, petrolat...
Name: Ingredients, dtype: object

 ### Splits the cleaned Ingredients string into a deduplicated list of normalized ingredient tokens (ingredient_tokens) for easier downstream processing.

In [None]:
def normalize_ingredient_token(token: Optional[str]) -> str:
   
    if not isinstance(token, str) or not token.strip():
        return ""

    t = token.strip().lower()
    t = t.replace("–", "-").replace("—", "-")
    t = re.sub(r"\s+", " ", t)

   
    t = t.strip(" \t\n\r[]{}")

   
    t = t.strip(" .;:-")

    return t

In [None]:
# Turn the cleaned Ingredients string into a list of tokens
# - Split by commas into ingredient items
# - Normalize again defensively (safe even if already normalized)
# - Deduplicate while preserving order


def ingredient_tokens(ing: str) -> List[str]:
    if not isinstance(ing, str) or not ing.strip():
        return []

    parts = [
        p.strip() for p in ing.split(",") if p.strip()
    ]  
    tokens = [normalize_ingredient_token(p) for p in parts]
    tokens = [t for t in tokens if t and t != "nan"]

    seen, out = set(), []
    for t in tokens:
        if t and t not in seen:
            out.append(t)
            seen.add(t)
    return out


df_step2["ingredient_tokens"] = df_step2["Ingredients"].apply(ingredient_tokens)

# Preview
df_step2[["Brand", "Name", "ingredient_tokens"]].head(5)

Unnamed: 0,Brand,Name,ingredient_tokens
0,La Mer,Crème de la Mer,"[algae (seaweed) extract, mineral oil, petrola..."
1,Sk-Ii,Facial Treatment Essence,"[galactomyces ferment filtrate (pitera), butyl..."
2,Drunk Elephant,Protini Polypeptide Cream,"[water, dicaprylyl carbonate, glycerin, cetear..."
3,La Mer,The Moisturizing Soft Cream,"[algae (seaweed) extract, cyclopentasiloxane, ..."
4,It Cosmetics,Your Skin But Better CC+ Cream with SPF 50+,"[water, snail secretion filtrate, phenyl trime..."


### Flattens all token lists, counts frequency, and prints the top N most common tokens to help spot typos, variants, and candidates for synonym/canon rules.

In [None]:
# print all tokens and start scanning for synonimc, then chose canonicals
tokens = [t for toks in df_step2["ingredient_tokens"].dropna() for t in toks if isinstance(toks, list)]
c = Counter(tokens)

TOP_N = 1700

top_tokens = [t for t, _ in c.most_common(TOP_N)]

print("\n".join(top_tokens))

water
glycerin
phenoxyethanol
butylene glycol
disodium edta
sodium hyaluronate
caprylyl glycol
dimethicone
tocopheryl acetate
xanthan gum
ethylhexylglycerin
citric acid
tocopherol
fragrance
caprylic/capric triglyceride
potassium sorbate
carbomer
linalool
sodium hydroxide
limonene
sodium benzoate
silica
propanediol
glyceryl stearate
pentylene glycol
squalane
caffeine
polysorbate 20
cetearyl alcohol
lecithin
1
cyclopentasiloxane
2-hexanediol
acrylates/c10-30 alkyl acrylate crosspolymer
stearic acid
panthenol
bht
peg-100 stearate
camellia sinensis leaf extract
hexylene glycol
chlorphenesin
mica
citronellol
cetyl alcohol
alcohol
trehalose
titanium dioxide (ci 77891)
geraniol
sodium citrate
allantoin
betaine
sodium pca
helianthus annuus (sunflower) seed oil
sodium chloride
dipropylene glycol
alcohol denat
niacinamide
algae extract
lactic acid
hydrogenated lecithin
dipotassium glycyrrhizate
sucrose
adenosine
tromethamine
polysorbate 60
cucumis sativus (cucumber) fruit extract
sorbitol
propyl

###  Loads synonyms.json once (cached), replaces token variants with their standardized synonym forms, drops tokens mapped to "", and deduplicates again.

In [None]:
# Apply synonyms from synonyms.json onto token lists
def apply_synonyms_to_tokens(
    tokens: list[str], synonyms_path: str = "synonyms.json"
) -> list[str]:
    # Load + cache synonyms once
    if not hasattr(apply_synonyms_to_tokens, "_syn_cache"):
        p = Path(synonyms_path)
        raw = json.loads(p.read_text(encoding="utf-8")) if p.exists() else {}
        apply_synonyms_to_tokens._syn_cache = {
            normalize_ingredient_token(k): (normalize_ingredient_token(v) if v else "")
            for k, v in raw.items()
        }

    syn = apply_synonyms_to_tokens._syn_cache

    if not isinstance(tokens, list):
        return []

    mapped = []
    for t in tokens:
        t_norm = normalize_ingredient_token(t)
        t_final = syn.get(t_norm, t_norm)  
        if t_final:  # drop "" mappings
            mapped.append(t_final)


    seen, out = set(), []
    for t in mapped:
        if t not in seen:
            out.append(t)
            seen.add(t)
    return out


df_step2["ingredient_tokens"] = df_step2["ingredient_tokens"].apply(
    apply_synonyms_to_tokens
)

# Preview
df_step2[["Brand", "Name", "ingredient_tokens"]].head(5)

Unnamed: 0,Brand,Name,ingredient_tokens
0,La Mer,Crème de la Mer,"[algae (seaweed) extract, mineral oil, petrola..."
1,Sk-Ii,Facial Treatment Essence,"[galactomyces ferment filtrate (pitera), butyl..."
2,Drunk Elephant,Protini Polypeptide Cream,"[water, dicaprylyl carbonate, glycerin, cetear..."
3,La Mer,The Moisturizing Soft Cream,"[algae (seaweed) extract, cyclopentasiloxane, ..."
4,It Cosmetics,Your Skin But Better CC+ Cream with SPF 50+,"[water, snail secretion filtrate, phenyl trime..."


### Prints the unique token vocabulary after synonym mapping

In [None]:
all_tokens = (
    pd.Series([t for row in df_step2["ingredient_tokens"] for t in row])
    .dropna()
    .astype(str)
)
unique_tokens = sorted(all_tokens.unique())

print("Unique token count:", len(unique_tokens))
print("\nFirst 300 tokens:\n")
for t in unique_tokens[:1500]:
    print(t)

Unique token count: 5343

First 300 tokens:

(bergamot) fruit oil
(bilberry) extract
(evening primrose) oil
(jojoba) seed oil
(mel) honey
(rosemary) leaf extract (rosmarinus officinalis leaf extract)
*naturally derived fragrance
/peut contenir/+/-:titanium dioxide (ci 77891)
000 percent better than pure ascorbic acid
1.2-hexanediol
10-decanediol
10-hydroxydecanoic acid
100 percent pure argan oil: nourishes and protects skin with essential fatty acids
100 percent sugarcane-derived squalane
100% natural abaca leaf fiber
100% natural fragrance
100% unrefined sclerocraya birrea (marula) kernel oil
15 dimethicone
2 hexanediol
2-dimethylhydrocinnamal
2-hexandiol
2-hexanediol
2-o-ethyl ascorbic acid
3-0-ethyl ascorbic acid
3-aminopropane sulfonic acid
3-butylene glycol
3-methyl-4
3-o-ethyl ascorbic acid
4 isostearate
4-t-butylcyclohexanol
6
6 copolymer
6-naphthalate
6-trimethyl-2-cyclohexene-1-yl)-3-butene-2-one
7-dehydrocholesterol
77491
77492 (iron oxides)
77499)
a 70 propellant
a70 propell

### Canonization rules that collapse many equivalent forms (water/aqua/eau, CI pigments, citrus variants, etc.) into one canonical token.

In [None]:
CANON_RULES: List[Tuple[str, List[str]]] = [
    #water / base-water 
    ("water", [
        r".*:\s*water\s*$",
        r".*ingredients:\s*water\s*$",
        r".*\bbase:\s*water\s*$",
        r".*\bbase concentrate.*water\s*$",
        r".*\bmask:\s*water\s*$",
        r".*\bcleanser:\s*water\s*$",
        r".*\beye.*cream:\s*water\s*$",

        r"^\s*aqua\s*$",
        r"^\s*water\s*$",
        r"^\s*eau\s*$",
        r"aqua\s*/\s*water\s*/\s*eau",
        r"water\s*/\s*aqua\s*/\s*eau",
        r"aqua\s*\(water\)",
        r"water\s*\(aqua\)",
        r"purified\s+water",
    ]),

    ("fragrance", [
        r"^\s*parfum\s*$",
        r"^\s*fragrance\s*$",
        r"parfum\s*\(fragrance\)",
        r"fragrance\s*\(parfum\)",
        r"fragrance\s*/\s*parfum",
        r"parfum\s*/\s*fragrance",
        r"natural\s+fragrance",
        r"\baroma\b",
        r"\bflavor\b",
    ]),

    ("vitamin e", [
        r"\btocopherol\b",
        r"\btocopheryl\s+acetate\b",
        r"\btocopheryl\s+succinate\b",
        r"\btocotrienols\b",
    ]),

    # pigments 
    ("mica", [r"\bmica\b", r"\bci\s*77019\b"]),

    ("titanium dioxide", [
        r"titanium\s+dioxide",
        r"\bci\s*77891\b",
        r"\bci77891\b",
        r"\bci7789\b",
    ]),

    ("iron oxides", [
        r"iron\s+oxides",
        r"\bci\s*77491\b",
        r"\bci\s*77492\b",
        r"\bci\s*77499\b",
        r"\b77491\b",
        r"\b77492\b",
        r"\b77499\b",
    ]),

    # citrus 
    ("orange extract", [
        r"citrus\s+sinensis",
        r"aurantium\s+dulcis",
        r"\borange\b.*\b(peel|fruit|flower|leaf|oil|water|extract|powder|wax)\b",
        r"\b(sweet|blood)\s+orange\b",
    ]),

    ("lemon extract", [
        r"citrus\s+limon",
        r"medica\s+limonum",
        r"\blemon\b.*\b(peel|fruit|oil|water|extract|powder)\b",
    ]),

    ("lime extract", [
        r"citrus\s+aurantifolia",
        r"\blime\b.*\b(peel|fruit|oil|water|extract|powder)\b",
    ]),

    ("grapefruit extract", [
        r"citrus\s+paradisi",
        r"citrus\s+grandis",
        r"\b(grapefruit|pomelo)\b.*\b(peel|fruit|oil|water|extract|powder)\b",
    ]),

    ("bergamot extract", [
        r"citrus\s+aurantium\s+bergamia",
        r"aurantium\s+bergamia",
        r"\bbergamot\b.*\b(peel|fruit|oil|water|extract|powder)\b",
    ]),

    ("mandarin/tangerine extract", [
        r"citrus\s+reticulata",
        r"citrus\s+nobilis",
        r"citrus\s+tangerina",
        r"\b(mandarin|tangerine)\b.*\b(peel|fruit|oil|water|extract|powder)\b",
    ]),

    ("yuzu extract", [
        r"citrus\s+junos",
        r"\byuzu\b.*\b(peel|fruit|oil|water|extract|powder)\b",
    ]),
]

In [None]:
CANON_RULES_COMPILED = [
    (canon, [re.compile(pat) for pat in pats])
    for canon, pats in CANON_RULES
]

def apply_canon_to_tokens(
    tokens: list[str],
    canon_rules_compiled: list[tuple[str, list[re.Pattern]]],
) -> list[str]:
    if not isinstance(tokens, list):
        return []

    out = []
    seen = set()

    for tok in tokens:
        t = tok.strip()
        if not t:
            continue

        canon = None
        for canon_token, patterns in canon_rules_compiled:
            if any(p.search(t) for p in patterns):
                canon = canon_token
                break

        final_tok = canon if canon else t

        if final_tok not in seen:
            out.append(final_tok)
            seen.add(final_tok)

    return out

df_step2["ingredient_tokens"] = df_step2["ingredient_tokens"].apply(
    lambda toks: apply_canon_to_tokens(toks, CANON_RULES_COMPILED)
)

df_step2[["Brand", "Name", "ingredient_tokens"]].head(5)

Unnamed: 0,Brand,Name,ingredient_tokens
0,La Mer,Crème de la Mer,"[algae (seaweed) extract, mineral oil, petrola..."
1,Sk-Ii,Facial Treatment Essence,"[galactomyces ferment filtrate (pitera), butyl..."
2,Drunk Elephant,Protini Polypeptide Cream,"[water, dicaprylyl carbonate, glycerin, cetear..."
3,La Mer,The Moisturizing Soft Cream,"[algae (seaweed) extract, cyclopentasiloxane, ..."
4,It Cosmetics,Your Skin But Better CC+ Cream with SPF 50+,"[water, snail secretion filtrate, phenyl trime..."


In [None]:
df_step3 = df_step2.copy()
df_step3[["Brand","Name","Ingredients","ingredient_tokens"]].head(10)

Unnamed: 0,Brand,Name,Ingredients,ingredient_tokens
0,La Mer,Crème de la Mer,"algae (seaweed) extract, mineral oil, petrolat...","[algae (seaweed) extract, mineral oil, petrola..."
1,Sk-Ii,Facial Treatment Essence,"galactomyces ferment filtrate (pitera), butyle...","[galactomyces ferment filtrate (pitera), butyl..."
2,Drunk Elephant,Protini Polypeptide Cream,"water, dicaprylyl carbonate, glycerin, ceteary...","[water, dicaprylyl carbonate, glycerin, cetear..."
3,La Mer,The Moisturizing Soft Cream,"algae (seaweed) extract, cyclopentasiloxane, p...","[algae (seaweed) extract, cyclopentasiloxane, ..."
4,It Cosmetics,Your Skin But Better CC+ Cream with SPF 50+,"water, snail secretion filtrate, phenyl trimet...","[water, snail secretion filtrate, phenyl trime..."
5,Tatcha,The Water Cream,"water, saccharomyces/camellia sinensis leaf/cl...","[water, saccharomyces/camellia sinensis leaf/c..."
6,Drunk Elephant,Lala Retro Whipped Cream,"water, glycerin, caprylic/ capric triglyceride...","[water, glycerin, caprylic/ capric triglycerid..."
7,Drunk Elephant,Virgin Marula Luxury Facial Oil,100% unrefined sclerocraya birrea (marula) ker...,[100% unrefined sclerocraya birrea (marula) ke...
8,Kiehl'S Since 1851,Ultra Facial Cream,"water, glycerin, cyclohexasiloxane, squalane, ...","[water, glycerin, cyclohexasiloxane, squalane,..."
9,La Mer,Little Miss Miracle Limited-Edition Crème de l...,"algae (seaweed) extract, mineral oil, petrolat...","[algae (seaweed) extract, mineral oil, petrola..."


In [None]:
PROCESSED_PATH = Path("datasets") / "cosmetics_processed_clean_tokens.csv"
PROCESSED_PATH.parent.mkdir(parents=True, exist_ok=True)

df_step3.to_csv(PROCESSED_PATH, index=False)
print(f"Processed dataset saved to: {PROCESSED_PATH.resolve()}")

Processed dataset saved to: /Users/sabinabacaoanu/SkinCares/miguellib/datasets/datasets/cosmetics_processed_clean_tokens.csv
