In [103]:
import numpy as np 
import pandas as pd
import csv 
import difflib
import itertools
import operator


import os 
os.chdir("/Users/susanchen/Documents/GitHub/skincare_classification/Data_sets")

## 2.0 Load the dataset 

In [104]:
ingredientInfo = pd.read_csv('ingredient_cleaned.txt', sep = ",")
df = pd.read_csv('Master_uncleaned.csv')

In [105]:
df.head()

Unnamed: 0,Product,Brand,Ingredients,Price,Skin Type,Combination,Dry,Normal,Oily,Sensitive
0,Glycolic Acid 7% Toning Solution,The Ordinary,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",8.7,Oily,1,1,1,1,1
1,The Microdelivery Exfoliating Facial Wash,Philosophy,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",29.0,Oily,1,1,1,1,1
2,AHA/BHA Exfoliating Cleanser,Murad,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",40.0,Oily,1,1,1,1,1
3,Anti-Aging Cleansing Gel,Peter Thomas Roth,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",39.0,Oily,1,1,1,1,1
4,Hydrating Facial Cleanser,CeraVe,"Purified Water, Glycerin, Behentrimonium Metho...",13.49,Oily,1,1,1,1,1


In [106]:
ingredientInfo.head()

Unnamed: 0,name,category,rating,rating_num
0,"1, 2-Hexanediol",['Preservatives'],Good,2
1,10-Hydroxydecanoic Acid,['Emollients'],Good,2
2,4-T-butylcyclohexanol,"['Emollients', 'Skin-Soothing']",Good,2
3,Acacia farnesiana extract,"['Plant Extracts', 'Fragrance: Synthetic and F...",Poor,0
4,acacia senegal gum,"['Texture Enhancer', 'Plant Extracts', 'Skin-S...",Good,2


In [107]:
class look_up_ingredient():

    def __init__(self, rating_dict, category_dict):
        self.rating_dict = rating_dict
        self.rating_dict['unknown'] = []
        self.category_dict = category_dict
        self.category_dict["unknown"] = []

        self.rating = [value for value in self.rating_dict.values()]
        self.category = set([value for values in self.category_dict.values() for value in values])
        self.match_dict = {}

    def find_matching_ingredient(self, my_ingredients, threshold = .25):
        for ingredient in my_ingredients:
            continue
        match_matric ={key: difflib.SequenceMatcher(None, key, ingredient).ratio() for key in self.rating_dict.keys()} 
        best_match, best_metric = max(match_matric.items(), key = operator.itemgetter(1))

        if best_metric > threshold:
            self.match_dict[ingredient] = best_match
        else: 
            self.match_dict[ingredient] = "unknown" 

    def lookup(self, ingredient, option = ''):
        key = self.match_dict.get(ingredient, 'unknown')
        rating = self.rating_dict.get(key, -1)
        category = self.category_dict.get(key, [])

        if option == 'ingredient':
            return key
        elif option == "rating":
            return rating 
        elif option == 'category':
            return category
        else:
            return key, rating, category

In [108]:
ingredients_rating_dict = {name: row['rating_num'] for (idx, row) in ingredientInfo.iterrows() for name in row["name"].split('/')}

In [109]:
ingredients_category_dict = {name: row["category"] for (idx, row) in ingredientInfo.iterrows() for name in row["name"].split('/')}

In [110]:
lookup = look_up_ingredient(ingredients_rating_dict, ingredients_category_dict)

In [111]:
class data_cleaning: 
    def __init__(self, df):
        self.df = df.copy(deep = True)

    def clean_ingredients(self):
        def split_by_active_inactive(sr_ingredient):
            inactive_start = pd.concat([sr_ingredient.str.find('Other'),
                                       sr_ingredient.str.find('Inactive')],
                                       axis=1).max(axis=1)
            
            inactive_start = inactive_start.replace(-1, 0)
            inactive = [ingredient[start:] for (ingredient, start) in zip(sr_ingredient, inactive_start)]
            inactive = [ingredient[ingredient.find(':')+1:] for ingredient in inactive]                           
            active = [ingredient[:start] for (ingredient, start) in zip(sr_ingredient, inactive_start)]
            active = [ingredient[ingredient.rfind(':')+1:] for ingredient in active]
            return active, inactive 
        
        self.df["Ingredients"].fillna('', inplace = True)
        self.df['active_ingredient'], self.df['inactive_ingredient']= split_by_active_inactive(self.df['Ingredients'])
        
        self.df['active_ingredient_list'] = self.df['active_ingredient'].apply(lambda x: [l.strip() for l in str(x).split(',') if l.lower().islower()])
        self.df['inactive_ingredient_list'] = self.df['inactive_ingredient'].apply(lambda x: [l.strip() for l in str(x).split(',') if l.lower().islower()])

        self.df["num_inactive_ingredients"] = self.df["inactive_ingredient_list"].apply(lambda x: len(x))
        self.df["num_active_ingredients"] = self.df["active_ingredient_list"].apply(lambda x: len (x))

    def search_for_ingredients(self, lookup):
        print('processing all ingredients...')
        merged_ingredients = set(list(itertools.chain(*self.df['inactive_ingredient_list'].values)))
        merged_ingredients = merged_ingredients.union(
                            set(list(itertools.chain(*self.df['active_ingredient_list'].values))))
        lookup.find_matching_ingredient(merged_ingredients)
        ingredient_property = pd.DataFrame(index=merged_ingredients)
        
        print("find all ingredients information...")
        ingredient_property['matching'] = [lookup.lookup(ingredient, option='ingredient') 
                                        for ingredient in merged_ingredients]
        ingredient_property['rating'] = [lookup.lookup(ingredient, option='rating') 
                                        for ingredient in merged_ingredients]
        ingredient_property['category'] = [lookup.lookup(ingredient, option='category') 
                                        for ingredient in merged_ingredients]
        
        # map origianal ingredient list to matched ingredient
        self.df['inactive_ingredient_matched_list'] = [[ingredient_property.loc[ingredient, 'matching'] 
                                                        for ingredient in ingredients]
                                                        for ingredients in self.df['inactive_ingredient_list'].values]
        self.df['active_ingredient_matched_list'] = [[ingredient_property.loc[ingredient, 'matching'] 
                                                        for ingredient in ingredients]
                                                        for ingredients in self.df['active_ingredient_list'].values]
        
        # map original ingredient list to ingredient rating
        self.df['inactive_ingredient_rating_list'] = [[ingredient_property.loc[ingredient, 'rating'] 
                                                        for ingredient in ingredients]
                                                        for ingredients in self.df['inactive_ingredient_list'].values]
        self.df['active_ingredient_rating_list'] = [[ingredient_property.loc[ingredient, 'rating'] 
                                                        for ingredient in ingredients]
                                                        for ingredients in self.df['active_ingredient_list'].values]
        
        # map original ingredient list to ingredient category
        self.df['inactive_ingredient_category_list'] = [[ingredient_property.loc[ingredient, 'category'] 
                                                        for ingredient in ingredients]
                                                        for ingredients in self.df['inactive_ingredient_list'].values]
        self.df['active_ingredient_category_list'] = [[ingredient_property.loc[ingredient, 'category'] 
                                                        for ingredient in ingredients]
                                                        for ingredients in self.df['active_ingredient_list'].values]
    
    
    def clean(self):
        self.clean_ingredients()

    def get_df(self):
        return self.df


### Test the cleaning on a sample

In [112]:
sample = df.sample(10)
data_cleaner= data_cleaning(sample)
data_cleaner.clean()
sample_cleaned = data_cleaner.get_df()

In [113]:

merged_ingredients = set(list(itertools.chain(*sample_cleaned['inactive_ingredient_list'].values)))
ingredient_property = pd.DataFrame(index=merged_ingredients)
lookup.find_matching_ingredient(merged_ingredients)
ingredient_property['matching'] = [lookup.lookup(ingredient, option='ingredient') for ingredient in merged_ingredients]
ingredient_property['rating'] = [lookup.lookup(ingredient, option='rating') for ingredient in merged_ingredients]
ingredient_property['category'] = [lookup.lookup(ingredient, option='category') for ingredient in merged_ingredients]
ingredient_property



Unnamed: 0,matching,rating,category
Mica,unknown,[],[]
Juniperus Virginiana (Juniper) Oil¿,unknown,[],[]
Citric Acid.,unknown,[],[]
Sodium Polyacrylate,unknown,[],[]
Betaine,unknown,[],[]
...,...,...,...
Bis-Peg-18 Methyl Ether Dimethyl Silane,unknown,[],[]
Microcitrus Australasica Fruit Extract¿,unknown,[],[]
Tripeptide-1,unknown,[],[]
Stearyl Glycyrrhetinate,unknown,[],[]


In [122]:
data_cleaner.search_for_ingredients(lookup)
sample_cleaned = data_cleaner.get_df()
sample_cleaned

processing all ingredients...
find all ingredients information...


Unnamed: 0,Product,Brand,Ingredients,Price,Skin Type,Combination,Dry,Normal,Oily,Sensitive,...,active_ingredient_list,inactive_ingredient_list,num_inactive_ingredients,num_active_ingredients,inactive_ingredient_matched_list,active_ingredient_matched_list,inactive_ingredient_rating_list,active_ingredient_rating_list,inactive_ingredient_category_list,active_ingredient_category_list
1599,Bienfait Aqua Vital Lotion Continuous Infusing...,Lancôme,"Aqua / Water / Eau, Glycerin, Alcohol Denat., ...",52.00,Dry,0,1,1,0,1,...,[],"[Aqua / Water / Eau, Glycerin, Alcohol Denat.,...",51,0,"[unknown, unknown, unknown, unknown, unknown, ...",[],"[[], [], [], [], [], [], [], [], [], [], [], [...",[],"[[], [], [], [], [], [], [], [], [], [], [], [...",[]
24,Clarifying Cleanser,Murad,Active Ingredient: Salicylic Acid (1.51%). Oth...,32.00,Oily,1,0,0,1,0,...,[Salicylic Acid (1.51%).],"[Water (Aqua), Cocamidopropyl Betaine, Sodium ...",31,1,"[unknown, unknown, unknown, unknown, unknown, ...",[unknown],"[[], [], [], [], [], [], [], [], [], [], [], [...",[[]],"[[], [], [], [], [], [], [], [], [], [], [], [...",[[]]
14,Deep Cleansing Oil,DHC,"Olea Europaea (Olive) Fruit Oil, Caprylic/Capr...",28.00,Oily,1,1,1,1,1,...,[],"[Olea Europaea (Olive) Fruit Oil, Caprylic/Cap...",8,0,"[unknown, unknown, unknown, unknown, unknown, ...",[],"[[], [], [], [], [], [], [], []]",[],"[[], [], [], [], [], [], [], []]",[]
789,Rapid Tone Repair Correcting Cream,Neutrogena,"Water, Pentaerythrityl Tetraethylhexanoate, Di...",35.99,Oily,1,1,1,1,1,...,[],"[Water, Pentaerythrityl Tetraethylhexanoate, D...",32,0,"[unknown, unknown, unknown, unknown, unknown, ...",[],"[[], [], [], [], [], [], [], [], [], [], [], [...",[],"[[], [], [], [], [], [], [], [], [], [], [], [...",[]
1080,Triple Oxygen Mask,Bliss,"Disodium Laureth Sulfosuccinate, Methyl Perflu...",22.00,Oily,1,1,1,1,1,...,[],"[Disodium Laureth Sulfosuccinate, Methyl Perfl...",59,0,"[unknown, unknown, acetyl tributyl citrate, un...",[],"[[], [], 1, [], [], [], [], [], [], [], [], []...",[],"[[], [], ['Film-Forming/Holding Agents'], [], ...",[]
289,PREBIOTIX Instant Flash Facial,Juice Beauty,"Aloe Barbadensis Leaf Juice*, Pyrus Malus (App...",rice$36.40Original Price$52.00,Oily,1,1,1,1,0,...,[],"[Aloe Barbadensis Leaf Juice*, Pyrus Malus (Ap...",41,0,"[unknown, unknown, unknown, unknown, unknown, ...",[],"[[], [], [], [], [], [], [], [], [], [], [], [...",[],"[[], [], [], [], [], [], [], [], [], [], [], [...",[]
710,Travel Size Revitalizing Supreme+ Global Anti-...,Estée Lauder,"Water\Aqua\Eau, Dimethicone, Glycerin, Isohexa...",27.00,Oily,1,1,1,1,1,...,[],"[Water\Aqua\Eau, Dimethicone, Glycerin, Isohex...",51,0,"[unknown, unknown, unknown, unknown, unknown, ...",[],"[[], [], [], [], [], [], [], [], [], [], [], [...",[],"[[], [], [], [], [], [], [], [], [], [], [], [...",[]
862,Kakadu C Intensive Vitamin C Peel Pad with Fer...,Dermadoctor,"Aqua (Water), Glycolic Acid, Rose Centifolia F...",rice$58.50Original Price$78.00,Oily,1,1,1,1,1,...,[],"[Aqua (Water), Glycolic Acid, Rose Centifolia ...",31,0,"[unknown, unknown, unknown, unknown, unknown, ...",[],"[[], [], [], [], [], [], [], [], [], [], [], [...",[],"[[], [], [], [], [], [], [], [], [], [], [], [...",[]
1270,Clear Improvement Charcoal Honey Mask To Purif...,Origins,Water\Aqua\Eau Butylene Glycol Bentonite Montm...,34.00,Oily,1,1,1,1,0,...,[],[Water\Aqua\Eau Butylene Glycol Bentonite Mont...,18,0,"[unknown, unknown, unknown, unknown, unknown, ...",[],"[[], [], [], [], [], [], [], [], [], [], [], [...",[],"[[], [], [], [], [], [], [], [], [], [], [], [...",[]
1441,The Night Night Eyes - Nighttime Retinoid Eye ...,THE ROUTE,"Water/Aqua/Eau, Cetearyl Alcohol, Caprylic/Cap...",65.00,Oily,1,1,1,1,1,...,[],"[Water/Aqua/Eau, Cetearyl Alcohol, Caprylic/Ca...",60,0,"[unknown, unknown, unknown, unknown, unknown, ...",[],"[[], [], [], [], [], [], [], [], [], [], [], [...",[],"[[], [], [], [], [], [], [], [], [], [], [], [...",[]


In [129]:
sample_cleaned.inactive_ingredient_category_list[1599]

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]

### Cleaing the entire dataset 

In [146]:
data_cleaner2= data_cleaning(df)
data_cleaner2.clean()
df_cleaned = data_cleaner2.get_df()
df_cleaned

Unnamed: 0,Product,Brand,Ingredients,Price,Skin Type,Combination,Dry,Normal,Oily,Sensitive,active_ingredient,inactive_ingredient,active_ingredient_list,inactive_ingredient_list,num_inactive_ingredients,num_active_ingredients
0,Glycolic Acid 7% Toning Solution,The Ordinary,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",8.70,Oily,1,1,1,1,1,,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",[],"[Aqua (Water), Glycolic Acid, Rosa Damascena F...",42,0
1,The Microdelivery Exfoliating Facial Wash,Philosophy,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",29.00,Oily,1,1,1,1,1,,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",[],"[Aqua/Water/Eau, Solum Diatomeae/Diatomaceous ...",32,0
2,AHA/BHA Exfoliating Cleanser,Murad,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",40.00,Oily,1,1,1,1,1,,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",[],"[Water (Aqua), Sodium Laureth Sulfate, Cocamid...",21,0
3,Anti-Aging Cleansing Gel,Peter Thomas Roth,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",39.00,Oily,1,1,1,1,1,,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",[],"[Water/Aqua/Eau, Sodium Laureth Sulfate, Cocam...",26,0
4,Hydrating Facial Cleanser,CeraVe,"Purified Water, Glycerin, Behentrimonium Metho...",13.49,Oily,1,1,1,1,1,,"Purified Water, Glycerin, Behentrimonium Metho...",[],"[Purified Water, Glycerin, Behentrimonium Meth...",22,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2036,Instant Hydration Facial Sheet Mask,Andalou Naturals,"Aloe Barbadensis Leaf Juice*, Purified Water (...",4.99,Sensitive,0,0,0,0,1,,"[magnolia Officinalis Bark Extract, Boswellia...",[],"[[magnolia Officinalis Bark Extract, Boswellia...",16,0
2037,Prep Cicaronic SOS Ampoules,SNP,"Water, Glycerin, Butylene Glycol, Glycereth-26...",16.00,Sensitive,0,0,0,0,1,,"Water, Glycerin, Butylene Glycol, Glycereth-26...",[],"[Water, Glycerin, Butylene Glycol, Glycereth-2...",31,0
2038,Pro-Collagen Hydra-Gel Eye Mask,ELEMIS,"Aqua/Water/Eau, Glycerin, Ceratonia Siliqua Gu...",76.00,Sensitive,0,0,0,0,1,,"Aqua/Water/Eau, Glycerin, Ceratonia Siliqua Gu...",[],"[Aqua/Water/Eau, Glycerin, Ceratonia Siliqua G...",21,0
2039,ADVANCED Ceramide Capsules Daily Youth Restori...,Elizabeth Arden,"Cyclopentasiloxane, Isononyl Isononanoate, Iso...",63.00,Sensitive,0,0,0,0,1,,"Cyclopentasiloxane, Isononyl Isononanoate, Iso...",[],"[Cyclopentasiloxane, Isononyl Isononanoate, Is...",27,0


In [149]:
main = df_cleaned.inactive_ingredient[2].split(", ")[0]
print(main)

Water (Aqua)


In [177]:
#main =[]
for i in range(len(df_cleaned)):
    main = df_cleaned.inactive_ingredient[i].split(", ")[:4]
    print(main[0]) #print all the ways water is listed 
    if ("Water" or "Aqua" or "Eau" or "Water (Aqua)" in main[0]):
        main = main[1:4]
    df_cleaned['main_ingredients'][i] = main


Aqua (Water)


KeyError: 'main_ingredients'

In [174]:
df_cleaned['Price'][0]

'8.70'