In [1]:
import numpy as np 
import pandas as pd
import csv 
import difflib
import itertools
import operator

import os 
os.chdir("/Users/susanchen/Documents/GitHub/skincare_classification/Data_sets")

### This notebook will accomplish two preprocessing steps: Ingredients cleaning and Ingredients matching. The general steps in each process is listed below.

### Ingredients Cleaning:
- 2.1 Split ingredients into active and inactive 
- 2.2 Find number of inactive and active ingredients (if possible)
- 2.3 Check if ingredients are listed alphabetically 

### Ingredients Matching:
- 2.4 Create two dictionaries containing the ingredient name as the key and the rating and category as values respectively
- 2.5 Find the matching key of all these ingredient from our dictionary 
- 2.6 For all products loop over the ingredient list and find each ingredient's rating and category 
- 2.7 For all products, count how many ingredients belong to a certain category (antioxidants, emulsifiers,  emollients, etc)
- 2.8 For all products, find the top 3 ingredients by concentration

## 2.0 Load the 2 datafiles 

In [2]:
ingredientInfo = pd.read_csv('ingredient_cleaned.txt', sep = ",")
df = pd.read_csv('Master_uncleaned.csv')

In [3]:
cat_list =[]
for i in ingredientInfo.category.values:
    i = i.replace("['", "").replace("']", '').replace("'", '')
    cat_list.append(str(i)) 

In [4]:
ingredientInfo["Category"] = cat_list
ingredientInfo.head()

Unnamed: 0,name,category,rating,rating_num,Category
0,"1, 2-Hexanediol",['Preservatives'],Good,2,Preservatives
1,10-Hydroxydecanoic Acid,['Emollients'],Good,2,Emollients
2,4-T-butylcyclohexanol,"['Emollients', 'Skin-Soothing']",Good,2,"Emollients, Skin-Soothing"
3,Acacia farnesiana extract,"['Plant Extracts', 'Fragrance: Synthetic and F...",Poor,0,"Plant Extracts, Fragrance: Synthetic and Fragr..."
4,acacia senegal gum,"['Texture Enhancer', 'Plant Extracts', 'Skin-S...",Good,2,"Texture Enhancer, Plant Extracts, Skin-Soothing"


In [5]:
df.head()

Unnamed: 0,Product,Brand,Ingredients,Price,Skin Type,Combination,Dry,Normal,Oily,Sensitive
0,Glycolic Acid 7% Toning Solution,The Ordinary,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",8.7,Oily,1,1,1,1,1
1,The Microdelivery Exfoliating Facial Wash,Philosophy,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",29.0,Oily,1,1,1,1,1
2,AHA/BHA Exfoliating Cleanser,Murad,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",40.0,Oily,1,1,1,1,1
3,Anti-Aging Cleansing Gel,Peter Thomas Roth,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",39.0,Oily,1,1,1,1,1
4,Hydrating Facial Cleanser,CeraVe,"Purified Water, Glycerin, Behentrimonium Metho...",13.49,Oily,1,1,1,1,1


## 2.1 Split Ingredients into Active and Inactive

### It is a required standard to label ingredients in beauty and hygienic products as active and inactive if the product fall within in two categories. Those two categories are mandated by the FDA as (1) if the product contains ingredients are FDA classified drugs or (2) the product claims to have some medical benefit. In contract, if a product does not contain FDA classifed drugs or does not claim to have medical benefits, then its ingredients are consider to be comestic ingredients. Cosmetic products do not need to label their comestic ingredients as active or inactive, all ingredients can be listed in a single area on the label. The FDA only requires manufacturers to list cosmetic ingredients in order of the amount/concentration present, from highest to lowest.

In [6]:
def split_by_active_inactive(sr_ingredient):
            inactive_start = pd.concat([sr_ingredient.str.find('Other'),
                                       sr_ingredient.str.find('Inactive')],
                                       axis=1).max(axis=1)
            
            inactive_start = inactive_start.replace(-1, 0)
            inactive = [ingredient[start:] for (ingredient, start) in zip(sr_ingredient, inactive_start)]
            inactive = [ingredient[ingredient.find(':')+1:] for ingredient in inactive]                           
            active = [ingredient[:start] for (ingredient, start) in zip(sr_ingredient, inactive_start)]
            active = [ingredient[ingredient.rfind(':')+1:] for ingredient in active]
            return active, inactive 

## 2.3 Check if Ingredients are in alphabetically order. 

### Comestic products can label their ingredients in two ways: alphabetically or by concentration. Althought the latter (by concentration) is the standard, we still need to check. If ingredients are listed by concentration, we will then use this information to find the top 3 ingredients (excluding water) in step 2.8. 


In [7]:
def check_alphabetical(ingredient_list, tol = 1):
            if (len(ingredient_list)) <= tol:
                return True 
            for i in range(len(ingredient_list) - 1):
                if ingredient_list[i] > ingredient_list[i+1]:
                    return False 
            return True 

## All together:

In [8]:
class data_cleaning: 
    def __init__(self, df):
        self.df = df.copy(deep = True)

    def clean_ingredients(self):
        def split_by_active_inactive(sr_ingredient):
            inactive_start = pd.concat([sr_ingredient.str.find('Other'),
                                       sr_ingredient.str.find('Inactive')],
                                       axis=1).max(axis=1)
            
            inactive_start = inactive_start.replace(-1, 0)
            inactive = [ingredient[start:] for (ingredient, start) in zip(sr_ingredient, inactive_start)]
            inactive = [ingredient[ingredient.find(':')+1:] for ingredient in inactive]                           
            active = [ingredient[:start] for (ingredient, start) in zip(sr_ingredient, inactive_start)]
            active = [ingredient[ingredient.rfind(':')+1:] for ingredient in active]
            return active, inactive 

        def check_alphabetical(ingredient_list, tol = 1):
            if (len(ingredient_list)) <= tol:
                return True 
            for i in range(len(ingredient_list) - 1):
                if ingredient_list[i] > ingredient_list[i+1]:
                    return False 
            return True 
                
        
        self.df["Ingredients"].fillna('', inplace = True)
        self.df['active_ingredient'], self.df['inactive_ingredient']= split_by_active_inactive(self.df['Ingredients'])
        
        self.df['active_ingredient_list'] = self.df['active_ingredient'].apply(lambda x: [l.strip() for l in str(x).split(',') if l.lower().islower()])
        self.df['inactive_ingredient_list'] = self.df['inactive_ingredient'].apply(lambda x: [l.strip() for l in str(x).split(',') if l.lower().islower()])

        self.df["num_inactive_ingredients"] = self.df["inactive_ingredient_list"].apply(lambda x: len(x))
        self.df["num_active_ingredients"] = self.df["active_ingredient_list"].apply(lambda x: len (x))
        self.df["Is_alphabetical"] = self.df["inactive_ingredient_list"].apply(check_alphabetical)


    def clean(self):
        self.clean_ingredients()

    def get_df(self):
        return self.df

In [9]:
data_cleaner= data_cleaning(df)
data_cleaner.clean()
df_cleaned = data_cleaner.get_df()
df_cleaned

Unnamed: 0,Product,Brand,Ingredients,Price,Skin Type,Combination,Dry,Normal,Oily,Sensitive,active_ingredient,inactive_ingredient,active_ingredient_list,inactive_ingredient_list,num_inactive_ingredients,num_active_ingredients,Is_alphabetical
0,Glycolic Acid 7% Toning Solution,The Ordinary,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",8.70,Oily,1,1,1,1,1,,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",[],"[Aqua (Water), Glycolic Acid, Rosa Damascena F...",42,0,False
1,The Microdelivery Exfoliating Facial Wash,Philosophy,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",29.00,Oily,1,1,1,1,1,,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",[],"[Aqua/Water/Eau, Solum Diatomeae/Diatomaceous ...",32,0,False
2,AHA/BHA Exfoliating Cleanser,Murad,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",40.00,Oily,1,1,1,1,1,,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",[],"[Water (Aqua), Sodium Laureth Sulfate, Cocamid...",21,0,False
3,Anti-Aging Cleansing Gel,Peter Thomas Roth,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",39.00,Oily,1,1,1,1,1,,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",[],"[Water/Aqua/Eau, Sodium Laureth Sulfate, Cocam...",26,0,False
4,Hydrating Facial Cleanser,CeraVe,"Purified Water, Glycerin, Behentrimonium Metho...",13.49,Oily,1,1,1,1,1,,"Purified Water, Glycerin, Behentrimonium Metho...",[],"[Purified Water, Glycerin, Behentrimonium Meth...",22,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2036,Instant Hydration Facial Sheet Mask,Andalou Naturals,"Aloe Barbadensis Leaf Juice*, Purified Water (...",4.99,Sensitive,0,0,0,0,1,,"[magnolia Officinalis Bark Extract, Boswellia...",[],"[[magnolia Officinalis Bark Extract, Boswellia...",16,0,False
2037,Prep Cicaronic SOS Ampoules,SNP,"Water, Glycerin, Butylene Glycol, Glycereth-26...",16.00,Sensitive,0,0,0,0,1,,"Water, Glycerin, Butylene Glycol, Glycereth-26...",[],"[Water, Glycerin, Butylene Glycol, Glycereth-2...",31,0,False
2038,Pro-Collagen Hydra-Gel Eye Mask,ELEMIS,"Aqua/Water/Eau, Glycerin, Ceratonia Siliqua Gu...",76.00,Sensitive,0,0,0,0,1,,"Aqua/Water/Eau, Glycerin, Ceratonia Siliqua Gu...",[],"[Aqua/Water/Eau, Glycerin, Ceratonia Siliqua G...",21,0,False
2039,ADVANCED Ceramide Capsules Daily Youth Restori...,Elizabeth Arden,"Cyclopentasiloxane, Isononyl Isononanoate, Iso...",63.00,Sensitive,0,0,0,0,1,,"Cyclopentasiloxane, Isononyl Isononanoate, Iso...",[],"[Cyclopentasiloxane, Isononyl Isononanoate, Is...",27,0,False


## 2.4 Create the rating and category dictionaries 

In [10]:
ingredients_rating_dict = {name.lower(): row['rating_num'] for (idx, row) in ingredientInfo.iterrows() for name in row["name"].split('/')}

In [11]:
ingredients_category_dict = {name.lower(): row["Category"] for (idx, row) in ingredientInfo.iterrows() for name in row["name"].split('/')}

## 2.5 Ingredient Matching 
### Create a set of all unique ingredients in our dataframe. Use this set to build the matching dictionary. 

In [12]:
all_ingredients = set(list(itertools.chain(*df_cleaned['inactive_ingredient_list'].values)))

In [13]:
Match_dict={}
for ingredient in all_ingredients:
    match_matric = {key: difflib.SequenceMatcher(None, key, ingredient).ratio() for key in ingredients_category_dict.keys()} 
    best_match, best_metric = max(match_matric.items(), key = operator.itemgetter(1))

    if best_metric > .5:
        Match_dict[ingredient.lower()] = best_match
    else: 
        Match_dict[ingredient.lower()] = "unknown" 

In [26]:
Match_ratings = {}
Match_categories ={}
for k in Match_dict.keys():
    if k in ingredients_rating_dict.keys():
        Match_ratings[k] = ingredients_rating_dict.get(k)
        Match_categories[k] = ingredients_category_dict.get(k).replace("[", '').replace("]", '').split(",")


## 2.6 Map over the Ingredient rating and category(s)
### For all products loop over the ingredient list and find each ingredient's rating and category 

In [27]:
category_list = []
ratings_list = []
for Ingredients in df_cleaned['inactive_ingredient_list'].values:
    temp_list = []
    temp_list2 = []
    for i in Ingredients:
        i = i.lower()
        temp_list.append(Match_ratings.get(i))
        temp_list2.append(Match_categories.get(i))
    ratings_list.append(temp_list)
    category_list.append(temp_list2)

df_cleaned["ingredients_ratings_list"] = ratings_list
df_cleaned["ingredients_category_list"] = category_list

## The ingredient category list as of right now is 2 dimensional list. We need it to be 1 dimensional.

In [28]:
from iteration_utilities import deepflatten
flatten_list = []
for i in range(len(df_cleaned.ingredients_category_list)):
    flat = list(deepflatten(df_cleaned["ingredients_category_list"][i], depth =1))
    flatten_list.append(flat)

In [29]:
df_cleaned["ingredients_category_list"] = flatten_list
df_cleaned

Unnamed: 0,Product,Brand,Ingredients,Price,Skin Type,Combination,Dry,Normal,Oily,Sensitive,...,num_of_Emollients,num_of_Hydration,num_of_Skin-Restoring,num_of_Plant Extracts,num_of_Preservatives,num_of_Skin-Softening,num_of_Sensitizing,num_of_Skin-Replenishing,top_3,top3_category_list
0,Glycolic Acid 7% Toning Solution,The Ordinary,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",8.70,Oily,1,1,1,1,1,...,0,1,0,0,2,0,0,6,"[Glycolic Acid, Rosa Damascena Flower Water, C...","[None, None, None]"
1,The Microdelivery Exfoliating Facial Wash,Philosophy,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",29.00,Oily,1,1,1,1,1,...,0,0,0,0,3,0,0,2,[Solum Diatomeae/Diatomaceous Earth/Terre De D...,"[None, None, None]"
2,AHA/BHA Exfoliating Cleanser,Murad,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",40.00,Oily,1,1,1,1,1,...,0,0,0,1,2,0,0,1,"[Sodium Laureth Sulfate, Cocamidopropyl Betain...","[None, Cleansing Agents, None]"
3,Anti-Aging Cleansing Gel,Peter Thomas Roth,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",39.00,Oily,1,1,1,1,1,...,0,0,0,0,3,0,0,0,"[Sodium Laureth Sulfate, Cocamidopropyl Betain...","[None, Cleansing Agents, None]"
4,Hydrating Facial Cleanser,CeraVe,"Purified Water, Glycerin, Behentrimonium Metho...",13.49,Oily,1,1,1,1,1,...,1,0,0,0,3,0,0,1,"[Glycerin, Behentrimonium Methosulfate and Cet...","[Skin-Replenishing, Skin-Restoring, None, None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2036,Instant Hydration Facial Sheet Mask,Andalou Naturals,"Aloe Barbadensis Leaf Juice*, Purified Water (...",4.99,Sensitive,0,0,0,0,1,...,0,0,0,0,0,0,0,0,"[Boswellia Serrata (boswellin) Extract], Biosa...","[None, None, None]"
2037,Prep Cicaronic SOS Ampoules,SNP,"Water, Glycerin, Butylene Glycol, Glycereth-26...",16.00,Sensitive,0,0,0,0,1,...,0,0,0,1,0,0,0,2,"[Glycerin, Butylene Glycol, Glycereth-26]","[Skin-Replenishing, Skin-Restoring, Texture En..."
2038,Pro-Collagen Hydra-Gel Eye Mask,ELEMIS,"Aqua/Water/Eau, Glycerin, Ceratonia Siliqua Gu...",76.00,Sensitive,0,0,0,0,1,...,0,0,0,4,1,0,0,2,"[Glycerin, Ceratonia Siliqua Gum, Carrageenan]","[Skin-Replenishing, Skin-Restoring, None, None]"
2039,ADVANCED Ceramide Capsules Daily Youth Restori...,Elizabeth Arden,"Cyclopentasiloxane, Isononyl Isononanoate, Iso...",63.00,Sensitive,0,0,0,0,1,...,5,0,0,0,1,0,0,0,"[Isononyl Isononanoate, Isododecane, Isopropyl...","[None, None, None]"


## 2.7 Count Ingredient Categories 

### The categories to count for are:
1. Antioxidants
2. Emollients
3. Hydration
4. Skin-Restoring
5. Plant Extracts
6. Preservatives
7. Skin-Softening
8. Sensitizing
9. Skin-Replenishing


In [30]:
from collections import Counter 
categories = ["Antioxidants", "Emollients", "Hydration", "Skin-Restoring", "Plant Extracts", "Preservatives", "Skin-Softening", "Sensitizing", "Skin-Replenishing"]


for category in categories:
    count = []
    for i in df_cleaned.ingredients_category_list.values:
        # declaring category to count
        x = category
        # initiating counter
        d = Counter(i) 
        # Add count 
        count.append(d[x])
    df_cleaned["num_of_" + category] = count

## 2.8 Find the top 3 ingredients by concentration
### Since the ingredients are not listed alphabetically, we can find the top 3 ingredients by concentration. The top 3 would be the first 3 listed ingredients or the first 4 listed ingredients if including water. 

In [31]:
top3 =[]
for i in range(len(df_cleaned)):
    main = df_cleaned.inactive_ingredient[i].split(", ")[:4]
    #print(main[0]) #print all the ways water is listed and to check what is the first listed ingredient in all products. It turns out to be water and it is not unusal that the ingredient in highest concentration is water. 
    if ("Water" or "Aqua" or "Eau" or "Water (Aqua)" in main[0]):
        main = main[1:4]
    else:
        main = main[:3]
    top3.append(main)
         
df_cleaned["top_3"] = top3

In [32]:
top_categories ={}
for Ingredients in df_cleaned['top_3'].values:
    for i in Ingredients:
        top_categories[i] = ingredients_category_dict.get(i)

top3category = []
for Ingredients in df_cleaned['top_3'].values:
    temp_list = []
    for i in Ingredients:
        i = i.lower()
        temp_list.append(top_categories.get(i))
    top3category.append(temp_list)
df_cleaned["top3_category_list"] = top3category

## Remove unnecessary columns and save data set

In [37]:
df_cleaned.drop(["Skin Type", "inactive_ingredient_list", "active_ingredient_list"], axis =1)


Unnamed: 0,Product,Brand,Ingredients,Price,Combination,Dry,Normal,Oily,Sensitive,active_ingredient,...,num_of_Emollients,num_of_Hydration,num_of_Skin-Restoring,num_of_Plant Extracts,num_of_Preservatives,num_of_Skin-Softening,num_of_Sensitizing,num_of_Skin-Replenishing,top_3,top3_category_list
0,Glycolic Acid 7% Toning Solution,The Ordinary,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",8.70,1,1,1,1,1,,...,0,1,0,0,2,0,0,6,"[Glycolic Acid, Rosa Damascena Flower Water, C...","[None, None, None]"
1,The Microdelivery Exfoliating Facial Wash,Philosophy,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",29.00,1,1,1,1,1,,...,0,0,0,0,3,0,0,2,[Solum Diatomeae/Diatomaceous Earth/Terre De D...,"[None, None, None]"
2,AHA/BHA Exfoliating Cleanser,Murad,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",40.00,1,1,1,1,1,,...,0,0,0,1,2,0,0,1,"[Sodium Laureth Sulfate, Cocamidopropyl Betain...","[None, Cleansing Agents, None]"
3,Anti-Aging Cleansing Gel,Peter Thomas Roth,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",39.00,1,1,1,1,1,,...,0,0,0,0,3,0,0,0,"[Sodium Laureth Sulfate, Cocamidopropyl Betain...","[None, Cleansing Agents, None]"
4,Hydrating Facial Cleanser,CeraVe,"Purified Water, Glycerin, Behentrimonium Metho...",13.49,1,1,1,1,1,,...,1,0,0,0,3,0,0,1,"[Glycerin, Behentrimonium Methosulfate and Cet...","[Skin-Replenishing, Skin-Restoring, None, None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2036,Instant Hydration Facial Sheet Mask,Andalou Naturals,"Aloe Barbadensis Leaf Juice*, Purified Water (...",4.99,0,0,0,0,1,,...,0,0,0,0,0,0,0,0,"[Boswellia Serrata (boswellin) Extract], Biosa...","[None, None, None]"
2037,Prep Cicaronic SOS Ampoules,SNP,"Water, Glycerin, Butylene Glycol, Glycereth-26...",16.00,0,0,0,0,1,,...,0,0,0,1,0,0,0,2,"[Glycerin, Butylene Glycol, Glycereth-26]","[Skin-Replenishing, Skin-Restoring, Texture En..."
2038,Pro-Collagen Hydra-Gel Eye Mask,ELEMIS,"Aqua/Water/Eau, Glycerin, Ceratonia Siliqua Gu...",76.00,0,0,0,0,1,,...,0,0,0,4,1,0,0,2,"[Glycerin, Ceratonia Siliqua Gum, Carrageenan]","[Skin-Replenishing, Skin-Restoring, None, None]"
2039,ADVANCED Ceramide Capsules Daily Youth Restori...,Elizabeth Arden,"Cyclopentasiloxane, Isononyl Isononanoate, Iso...",63.00,0,0,0,0,1,,...,5,0,0,0,1,0,0,0,"[Isononyl Isononanoate, Isododecane, Isopropyl...","[None, None, None]"


In [38]:
df_cleaned.to_csv('Master_cleaned.csv', index= False)

## The data set is now ready for exploratory analysis and plots. Refer to notebook 3 for this process. 