In [2]:
"""This script analyzes cosmetics dataset, cleans the data, 
splits it by category and skin type, and visualizes ingredient similarity using t-SNE."""

import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool

# Read the raw cosmetics table from disk
df = pd.read_csv("datasets/cosmetics.csv")

# Quick look at the data: first rows plus product-type and brand frequencies
print(df.head())
label_counts = df["Label"].value_counts()
print("Types of products:\n", label_counts)
brand_counts = df["Brand"].value_counts()
print("\nBrands:\n", brand_counts)

# Report how many products are flagged (value 1) for each skin type
skin_types = ["Combination", "Dry", "Normal", "Oily", "Sensitive"]
for skin in skin_types:
    flagged = df[skin].eq(1).sum()
    print(f"For {skin} skin: {flagged}")

# Remove exact duplicate rows, then drop rows whose ingredient text contains
# "visit" (case-insensitive); missing ingredient values are kept (na=False).
df = df.drop_duplicates()
mentions_visit = df["Ingredients"].str.contains("visit", case=False, na=False)
df = df[~mentions_visit]

# Split data by category and skin type
def split_by_skin_type(category_df, label, skin_columns=None):
    """Split a DataFrame into one sub-DataFrame per skin type.

    Args:
        category_df: DataFrame holding one indicator column per skin type;
            a row is selected for a skin type when that column equals 1.
        label: Prefix used when building the keys of the returned dict.
        skin_columns: Optional iterable of skin-type column names. When
            omitted, the module-level ``skin_types`` list is used, which is
            what existing two-argument callers rely on.

    Returns:
        A dict mapping ``"{label}_{skin.lower()}"`` to the matching rows of
        ``category_df``, each with a fresh 0-based index.
    """
    if skin_columns is None:
        # Fall back to the script-wide list so existing calls keep working.
        skin_columns = skin_types
    return {
        f"{label}_{skin.lower()}": category_df[category_df[skin] == 1].reset_index(drop=True)
        for skin in skin_columns
    }

categories = df["Label"].unique()

# One entry per product category, keyed by the lower-cased label
dataframes_dict = {}
for category in categories:
    dataframes_dict[category.lower()] = df[df["Label"] == category]

# Add the per-skin-type subsets of every category. Iterate over a snapshot of
# the items because the dict grows while we walk it.
for category, category_df in list(dataframes_dict.items()):
    dataframes_dict.update(split_by_skin_type(category_df, category))


# Function to tokenize ingredients
def tokenize_ingredients(dataframes):
    """Build a binary document-by-ingredient matrix for each DataFrame.

    For every ``name -> DataFrame`` pair, the ``"Ingredients"`` column is cast
    to ``str``, lower-cased and split on ``", "``. Each distinct token gets a
    column index in order of first appearance, and each row of the resulting
    matrix holds 1 in the columns of the tokens that row contains.

    Args:
        dataframes: Mapping of names to DataFrames with an ``"Ingredients"``
            column.

    Returns:
        A dict mapping ``"matrix_{name}"`` to a float NumPy array of shape
        ``(number of rows, number of distinct tokens)``.
    """
    encoded = {}

    for name, frame in dataframes.items():
        # Token list per product, in row order
        documents = [
            raw.lower().split(", ") for raw in frame["Ingredients"].astype(str)
        ]

        # Assign consecutive column indices in order of first appearance
        vocabulary = {}
        for doc in documents:
            for token in doc:
                vocabulary.setdefault(token, len(vocabulary))

        # Fill the indicator matrix row by row
        matrix = np.zeros((len(frame), len(vocabulary)))
        for row, doc in enumerate(documents):
            for token in doc:
                matrix[row, vocabulary[token]] = 1

        encoded[f"matrix_{name}"] = matrix

    return encoded

ingredient_matrices = tokenize_ingredients(dataframes_dict)

# Perform dimensionality reduction: embed the ingredient matrix of the
# cleanser / combination-skin subset into two dimensions. random_state is
# fixed so repeated runs use the same seed.
tsne_model = TSNE(n_components=2, learning_rate=200, random_state=42)
cleanser_comb_tsne = tsne_model.fit_transform(ingredient_matrices["matrix_cleanser_combination"])

# Add t-SNE coordinates to DataFrame so each product carries its plot position
dataframes_dict["cleanser_combination"]["X"] = cleanser_comb_tsne[:, 0]
dataframes_dict["cleanser_combination"]["Y"] = cleanser_comb_tsne[:, 1]

# Render Bokeh output inline in the notebook
output_notebook()

# Create scatter plot
source = ColumnDataSource(dataframes_dict["cleanser_combination"])
plot = figure(x_axis_label="T-SNE 1", y_axis_label="T-SNE 2", width=500, height=400)
# scatter() draws circle markers by default; calling circle() with a
# screen-unit `size` is deprecated in recent Bokeh releases.
plot.scatter(x="X", y="Y", source=source, size=10, color="#FF7373", alpha=0.8)

# Add hover tool: each "@Column" is filled from the matching column of `source`
hover = HoverTool(tooltips=[("Item", "@Name"), ("Brand", "@Brand"),
                            ("Price", "$@Price"), ("Rank", "@Rank")])
plot.add_tools(hover)

# Show plot
show(plot)

         Label           Brand                                           Name  \
0  Moisturizer          LA MER                                Crème de la Mer   
1  Moisturizer           SK-II                       Facial Treatment Essence   
2  Moisturizer  DRUNK ELEPHANT                     Protini™ Polypeptide Cream   
3  Moisturizer          LA MER                    The Moisturizing Soft Cream   
4  Moisturizer    IT COSMETICS  Your Skin But Better™ CC+™ Cream with SPF 50+   

   Price  Rank                                        Ingredients  \
0    175   4.1  Algae (Seaweed) Extract, Mineral Oil, Petrolat...   
1    179   4.1  Galactomyces Ferment Filtrate (Pitera), Butyle...   
2     68   4.4  Water, Dicaprylyl Carbonate, Glycerin, Ceteary...   
3    175   3.8  Algae (Seaweed) Extract, Cyclopentasiloxane, P...   
4     38   4.1  Water, Snail Secretion Filtrate, Phenyl Trimet...   

   Combination  Dry  Normal  Oily  Sensitive  
0            1    1       1     1          1  
1   

