In [1]:
import numpy as np 
import pandas as pd
import csv 
import difflib
import itertools
import operator

import os 
os.chdir("/Users/susanchen/Documents/GitHub/skincare_classification/Data_sets")

### This notebook will accomplish two preprocessing steps: Ingredients cleaning and Ingredients matching. The general steps in each process is listed below.

### Ingredients Cleaning:
- 2.1  split ingredients into active and inactive 
- 2.2 find number of inactive and active ingredients (if possible)
- 2.3 check if ingredients are listed alphabetically 

### Ingredients Matching:
- 2.4 get a set of all unique ingredients in our product dataframe 
- 2.5 find the matching key of all these ingredient from our dictionary 
- 2.6 for all products loop over the ingredient list and find each ingredient's rating and category 
- 2.7 for all products, count how many ingredients belong to a certain category (antioxidants, emulsifiers,  emollients, etc)

## 2.0 Load the datasets 

In [2]:
ingredientInfo = pd.read_csv('ingredient_cleaned.txt', sep = ",")
df = pd.read_csv('Master_uncleaned.csv')

In [4]:
cat_list =[]
for i in ingredientInfo.category.values:
    i = i.replace("['", "").replace("']", '').replace("'", '')
    cat_list.append(str(i)) 

In [5]:
ingredientInfo["Category"] = cat_list
ingredientInfo.head()

Unnamed: 0,name,category,rating,rating_num,Category
0,"1, 2-Hexanediol",['Preservatives'],Good,2,Preservatives
1,10-Hydroxydecanoic Acid,['Emollients'],Good,2,Emollients
2,4-T-butylcyclohexanol,"['Emollients', 'Skin-Soothing']",Good,2,"Emollients, Skin-Soothing"
3,Acacia farnesiana extract,"['Plant Extracts', 'Fragrance: Synthetic and F...",Poor,0,"Plant Extracts, Fragrance: Synthetic and Fragr..."
4,acacia senegal gum,"['Texture Enhancer', 'Plant Extracts', 'Skin-S...",Good,2,"Texture Enhancer, Plant Extracts, Skin-Soothing"


In [6]:
df.head()

Unnamed: 0,Product,Brand,Ingredients,Price,Skin Type,Combination,Dry,Normal,Oily,Sensitive
0,Glycolic Acid 7% Toning Solution,The Ordinary,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",8.7,Oily,1,1,1,1,1
1,The Microdelivery Exfoliating Facial Wash,Philosophy,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",29.0,Oily,1,1,1,1,1
2,AHA/BHA Exfoliating Cleanser,Murad,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",40.0,Oily,1,1,1,1,1
3,Anti-Aging Cleansing Gel,Peter Thomas Roth,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",39.0,Oily,1,1,1,1,1
4,Hydrating Facial Cleanser,CeraVe,"Purified Water, Glycerin, Behentrimonium Metho...",13.49,Oily,1,1,1,1,1


In [7]:
class look_up_ingredient():

    def __init__(self, rating_dict, category_dict):
        self.rating_dict = rating_dict
        self.rating_dict['unknown'] = []
        self.category_dict = category_dict
        self.category_dict["unknown"] = []

        self.rating = [value for value in self.rating_dict.values()]
        self.category = set([value for values in self.category_dict.values() for value in values])
        self.match_dict = {}

    def find_matching_ingredient(self, my_ingredients, threshold = .10):
        for ingredient in my_ingredients:
            continue
        match_matric ={key: difflib.SequenceMatcher(None, key, ingredient).ratio() for key in self.category_dict.keys()} 
        best_match, best_metric = max(match_matric.items(), key = operator.itemgetter(1))

        if best_metric > threshold:
            self.match_dict[ingredient] = best_match
        else: 
            self.match_dict[ingredient] = "unknown" 

    def lookup(self, ingredient, option = ''):
        key = self.match_dict.get(ingredient, 'unknown')
        rating = self.rating_dict.get(key, -1)
        category = self.category_dict.get(key, [])

        if option == 'ingredient':
            return key
        elif option == "rating":
            return rating 
        elif option == 'category':
            return category
        else:
            return key, rating, category

In [8]:
ingredients_rating_dict = {name.lower(): row['rating_num'] for (idx, row) in ingredientInfo.iterrows() for name in row["name"].split('/')}

In [9]:
ingredients_category_dict = {name.lower(): row["Category"] for (idx, row) in ingredientInfo.iterrows() for name in row["name"].split('/')}

In [10]:
lookup = look_up_ingredient(ingredients_rating_dict, ingredients_category_dict)

In [11]:
class data_cleaning: 
    def __init__(self, df):
        self.df = df.copy(deep = True)

    def clean_ingredients(self):
        def split_by_active_inactive(sr_ingredient):
            inactive_start = pd.concat([sr_ingredient.str.find('Other'),
                                       sr_ingredient.str.find('Inactive')],
                                       axis=1).max(axis=1)
            
            inactive_start = inactive_start.replace(-1, 0)
            inactive = [ingredient[start:] for (ingredient, start) in zip(sr_ingredient, inactive_start)]
            inactive = [ingredient[ingredient.find(':')+1:] for ingredient in inactive]                           
            active = [ingredient[:start] for (ingredient, start) in zip(sr_ingredient, inactive_start)]
            active = [ingredient[ingredient.rfind(':')+1:] for ingredient in active]
            return active, inactive 
        
        self.df["Ingredients"].fillna('', inplace = True)
        self.df['active_ingredient'], self.df['inactive_ingredient']= split_by_active_inactive(self.df['Ingredients'])
        
        self.df['active_ingredient_list'] = self.df['active_ingredient'].apply(lambda x: [l.strip() for l in str(x).split(',') if l.lower().islower()])
        self.df['inactive_ingredient_list'] = self.df['inactive_ingredient'].apply(lambda x: [l.strip() for l in str(x).split(',') if l.lower().islower()])

        self.df["num_inactive_ingredients"] = self.df["inactive_ingredient_list"].apply(lambda x: len(x))
        self.df["num_active_ingredients"] = self.df["active_ingredient_list"].apply(lambda x: len (x))

    def search_for_ingredients(self, lookup):
        print('processing all ingredients...')
        merged_ingredients = set(list(itertools.chain(*self.df['inactive_ingredient_list'].values)))
        merged_ingredients = merged_ingredients.union(
                            set(list(itertools.chain(*self.df['active_ingredient_list'].values))))
        lookup.find_matching_ingredient(merged_ingredients)
        ingredient_property = pd.DataFrame(index=merged_ingredients)
        
        print("find all ingredients information...")
        ingredient_property['matching'] = [lookup.lookup(ingredient, option='ingredient') 
                                        for ingredient in merged_ingredients]
        ingredient_property['rating'] = [lookup.lookup(ingredient, option='rating') 
                                        for ingredient in merged_ingredients]
        ingredient_property['category'] = [lookup.lookup(ingredient, option='category') 
                                        for ingredient in merged_ingredients]
        
        # map origianal ingredient list to matched ingredient
        self.df['inactive_ingredient_matched_list'] = [[ingredient_property.loc[ingredient, 'matching'] 
                                                        for ingredient in ingredients]
                                                        for ingredients in self.df['inactive_ingredient_list'].values]
        self.df['active_ingredient_matched_list'] = [[ingredient_property.loc[ingredient, 'matching'] 
                                                        for ingredient in ingredients]
                                                        for ingredients in self.df['active_ingredient_list'].values]
        
        # map original ingredient list to ingredient rating
        self.df['inactive_ingredient_rating_list'] = [[ingredient_property.loc[ingredient, 'rating'] 
                                                        for ingredient in ingredients]
                                                        for ingredients in self.df['inactive_ingredient_list'].values]
        self.df['active_ingredient_rating_list'] = [[ingredient_property.loc[ingredient, 'rating'] 
                                                        for ingredient in ingredients]
                                                        for ingredients in self.df['active_ingredient_list'].values]
        
        # map original ingredient list to ingredient category
        self.df['inactive_ingredient_category_list'] = [[ingredient_property.loc[ingredient, 'category'] 
                                                        for ingredient in ingredients]
                                                        for ingredients in self.df['inactive_ingredient_list'].values]
        self.df['active_ingredient_category_list'] = [[ingredient_property.loc[ingredient, 'category'] 
                                                        for ingredient in ingredients]
                                                        for ingredients in self.df['active_ingredient_list'].values]
    
    
    def clean(self):
        self.clean_ingredients()

    def get_df(self):
        return self.df


### Cleaing the entire dataset 

In [12]:
data_cleaner2= data_cleaning(df)
data_cleaner2.clean()
df_cleaned = data_cleaner2.get_df()
df_cleaned

Unnamed: 0,Product,Brand,Ingredients,Price,Skin Type,Combination,Dry,Normal,Oily,Sensitive,active_ingredient,inactive_ingredient,active_ingredient_list,inactive_ingredient_list,num_inactive_ingredients,num_active_ingredients
0,Glycolic Acid 7% Toning Solution,The Ordinary,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",8.70,Oily,1,1,1,1,1,,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",[],"[Aqua (Water), Glycolic Acid, Rosa Damascena F...",42,0
1,The Microdelivery Exfoliating Facial Wash,Philosophy,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",29.00,Oily,1,1,1,1,1,,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",[],"[Aqua/Water/Eau, Solum Diatomeae/Diatomaceous ...",32,0
2,AHA/BHA Exfoliating Cleanser,Murad,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",40.00,Oily,1,1,1,1,1,,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",[],"[Water (Aqua), Sodium Laureth Sulfate, Cocamid...",21,0
3,Anti-Aging Cleansing Gel,Peter Thomas Roth,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",39.00,Oily,1,1,1,1,1,,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",[],"[Water/Aqua/Eau, Sodium Laureth Sulfate, Cocam...",26,0
4,Hydrating Facial Cleanser,CeraVe,"Purified Water, Glycerin, Behentrimonium Metho...",13.49,Oily,1,1,1,1,1,,"Purified Water, Glycerin, Behentrimonium Metho...",[],"[Purified Water, Glycerin, Behentrimonium Meth...",22,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2036,Instant Hydration Facial Sheet Mask,Andalou Naturals,"Aloe Barbadensis Leaf Juice*, Purified Water (...",4.99,Sensitive,0,0,0,0,1,,"[magnolia Officinalis Bark Extract, Boswellia...",[],"[[magnolia Officinalis Bark Extract, Boswellia...",16,0
2037,Prep Cicaronic SOS Ampoules,SNP,"Water, Glycerin, Butylene Glycol, Glycereth-26...",16.00,Sensitive,0,0,0,0,1,,"Water, Glycerin, Butylene Glycol, Glycereth-26...",[],"[Water, Glycerin, Butylene Glycol, Glycereth-2...",31,0
2038,Pro-Collagen Hydra-Gel Eye Mask,ELEMIS,"Aqua/Water/Eau, Glycerin, Ceratonia Siliqua Gu...",76.00,Sensitive,0,0,0,0,1,,"Aqua/Water/Eau, Glycerin, Ceratonia Siliqua Gu...",[],"[Aqua/Water/Eau, Glycerin, Ceratonia Siliqua G...",21,0
2039,ADVANCED Ceramide Capsules Daily Youth Restori...,Elizabeth Arden,"Cyclopentasiloxane, Isononyl Isononanoate, Iso...",63.00,Sensitive,0,0,0,0,1,,"Cyclopentasiloxane, Isononyl Isononanoate, Iso...",[],"[Cyclopentasiloxane, Isononyl Isononanoate, Is...",27,0


In [13]:
all_ingredients = set(list(itertools.chain(*df_cleaned['inactive_ingredient_list'].values)))

In [18]:
Match_dict={}
for ingredient in all_ingredients:
    match_matric = {key: difflib.SequenceMatcher(None, key, ingredient).ratio() for key in ingredients_category_dict.keys()} 
    best_match, best_metric = max(match_matric.items(), key = operator.itemgetter(1))

    if best_metric > .5:
        Match_dict[ingredient.lower()] = best_match
    else: 
        Match_dict[ingredient.lower()] = "unknown" 

Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding all ingredients...
Matching all ingredients...
Finding al

In [19]:
Match_ratings = {}
Match_categories ={}
for k in Match_dict.keys():
    if k in ingredients_rating_dict.keys():
        Match_ratings[k] = ingredients_rating_dict.get(k)
        Match_categories[k] = ingredients_category_dict.get(k).replace("[", '').replace("]", '').split(",")


In [20]:
category_list = []
ratings_list = []
for Ingredients in df_cleaned['inactive_ingredient_list'].values:
    temp_list = []
    temp_list2 = []
    for i in Ingredients:
        i = i.lower()
        temp_list.append(Match_ratings.get(i))
        temp_list2.append(Match_categories.get(i))
    ratings_list.append(temp_list)
    category_list.append(temp_list2)

df_cleaned["ingredients_ratings_list"] = ratings_list
df_cleaned["ingredients_category_list"] = category_list

In [21]:
from iteration_utilities import deepflatten
flatten_list = []
for i in range(len(df_cleaned.ingredients_category_list)):
    flat = list(deepflatten(df_cleaned["ingredients_category_list"][i], depth =1))
    flatten_list.append(flat)

In [22]:
df_cleaned["ingredients_category_list"] = flatten_list
df_cleaned

Unnamed: 0,Product,Brand,Ingredients,Price,Skin Type,Combination,Dry,Normal,Oily,Sensitive,active_ingredient,inactive_ingredient,active_ingredient_list,inactive_ingredient_list,num_inactive_ingredients,num_active_ingredients,ingredients_ratings_list,ingredients_category_list
0,Glycolic Acid 7% Toning Solution,The Ordinary,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",8.70,Oily,1,1,1,1,1,,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",[],"[Aqua (Water), Glycolic Acid, Rosa Damascena F...",42,0,"[None, 3, None, None, None, 2, 3, 2, 2, 3, Non...","[None, Exfoliant, None, None, None, Uncategori..."
1,The Microdelivery Exfoliating Facial Wash,Philosophy,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",29.00,Oily,1,1,1,1,1,,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",[],"[Aqua/Water/Eau, Solum Diatomeae/Diatomaceous ...",32,0,"[None, None, None, 3, None, 2, 2, 2, 2, 2, 2, ...","[None, None, None, Skin-Replenishing, Skin-Re..."
2,AHA/BHA Exfoliating Cleanser,Murad,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",40.00,Oily,1,1,1,1,1,,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",[],"[Water (Aqua), Sodium Laureth Sulfate, Cocamid...",21,0,"[None, 2, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, Non...","[None, Cleansing Agents, Cleansing Agents, Pla..."
3,Anti-Aging Cleansing Gel,Peter Thomas Roth,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",39.00,Oily,1,1,1,1,1,,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",[],"[Water/Aqua/Eau, Sodium Laureth Sulfate, Cocam...",26,0,"[None, 2, 2, 2, 2, 3, 3, None, None, None, Non...","[None, Cleansing Agents, Cleansing Agents, Cle..."
4,Hydrating Facial Cleanser,CeraVe,"Purified Water, Glycerin, Behentrimonium Metho...",13.49,Oily,1,1,1,1,1,,"Purified Water, Glycerin, Behentrimonium Metho...",[],"[Purified Water, Glycerin, Behentrimonium Meth...",22,0,"[2, 3, None, None, None, None, 3, 3, None, Non...","[Uncategorized, Skin-Replenishing, Skin-Resto..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2036,Instant Hydration Facial Sheet Mask,Andalou Naturals,"Aloe Barbadensis Leaf Juice*, Purified Water (...",4.99,Sensitive,0,0,0,0,1,,"[magnolia Officinalis Bark Extract, Boswellia...",[],"[[magnolia Officinalis Bark Extract, Boswellia...",16,0,"[None, None, None, 3, None, 3, 3, None, None, ...","[None, None, None, Skin-Soothing, Plant Extra..."
2037,Prep Cicaronic SOS Ampoules,SNP,"Water, Glycerin, Butylene Glycol, Glycereth-26...",16.00,Sensitive,0,0,0,0,1,,"Water, Glycerin, Butylene Glycol, Glycereth-26...",[],"[Water, Glycerin, Butylene Glycol, Glycereth-2...",31,0,"[2, 3, 2, 2, 2, None, 2, None, None, None, 2, ...","[Miscellaneous, Skin-Replenishing, Skin-Resto..."
2038,Pro-Collagen Hydra-Gel Eye Mask,ELEMIS,"Aqua/Water/Eau, Glycerin, Ceratonia Siliqua Gu...",76.00,Sensitive,0,0,0,0,1,,"Aqua/Water/Eau, Glycerin, Ceratonia Siliqua Gu...",[],"[Aqua/Water/Eau, Glycerin, Ceratonia Siliqua G...",21,0,"[None, 3, None, 2, 2, 3, None, 2, None, None, ...","[None, Skin-Replenishing, Skin-Restoring, Non..."
2039,ADVANCED Ceramide Capsules Daily Youth Restori...,Elizabeth Arden,"Cyclopentasiloxane, Isononyl Isononanoate, Iso...",63.00,Sensitive,0,0,0,0,1,,"Cyclopentasiloxane, Isononyl Isononanoate, Iso...",[],"[Cyclopentasiloxane, Isononyl Isononanoate, Is...",27,0,"[2, 2, 2, 2, 2, None, None, 2, None, None, Non...","[Emollients, Emollients, Miscellaneous, Textu..."


In [23]:
from collections import Counter 
num_of_antioxidants = []
num_of_emollients = []
num_of_hydration = [] 
num_of_sensitizing = []
num_of_skin_restoring = []
num_of_plant_extracts = []
num_of_preservatives = []
num_of_skin_softening = []
num_of_skin_replenishing = []

for i in df_cleaned.ingredients_category_list.values:
    # declaring categories to count
    a = "Antioxidants"
    b = "Emollients"
    c = "Hydration"
    
    e = "Skin-Restoring"
    f = "Plant Extracts"
    g = "Preservatives"
    h = "Skin-Softening"
    j = "Sensitizing"
    k = "Skin-Replenishing"

    # initating counter
    d = Counter(i) 

    # Add count 
    num_of_antioxidants.append(d[a])
    num_of_emollients.append(d[b])
    num_of_hydration.append(d[c])
    num_of_sensitizing.append(d[j])
    num_of_skin_restoring.append(d[e])
    num_of_plant_extracts.append(d[f])
    num_of_preservatives.append(d[g])
    num_of_skin_softening.append(d[h])
    num_of_skin_replenishing.append(d[k])

    #print('{} has occurred {} times'.format(x, d[x])) 
df_cleaned["num_of_antioxidants"] = num_of_antioxidants
df_cleaned["num_of_emollients"] = num_of_emollients 
df_cleaned["num_of_hydration"] = num_of_hydration  
df_cleaned["num_of_sensitizing"] = num_of_sensitizing
df_cleaned["num_of_skin-restoring"] = num_of_skin_restoring
df_cleaned["num_of_plant_extracts"] = num_of_plant_extracts
df_cleaned["num_of_preservatives"] = num_of_preservatives
df_cleaned["num_of_skin-softening"] = num_of_skin_softening
df_cleaned["num_of_skin_replenshing"] = num_of_skin_replenishing

In [24]:
df_cleaned

Unnamed: 0,Product,Brand,Ingredients,Price,Skin Type,Combination,Dry,Normal,Oily,Sensitive,...,ingredients_category_list,num_of_antioxidants,num_of_emollients,num_of_hydration,num_of_sensitizing,num_of_skin-restoring,num_of_plant_extracts,num_of_preservatives,num_of_skin-softening,num_of_skin_replenshing
0,Glycolic Acid 7% Toning Solution,The Ordinary,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",8.70,Oily,1,1,1,1,1,...,"[None, Exfoliant, None, None, None, Uncategori...",11,0,1,0,0,0,2,0,6
1,The Microdelivery Exfoliating Facial Wash,Philosophy,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",29.00,Oily,1,1,1,1,1,...,"[None, None, None, Skin-Replenishing, Skin-Re...",3,0,0,0,0,0,3,0,2
2,AHA/BHA Exfoliating Cleanser,Murad,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",40.00,Oily,1,1,1,1,1,...,"[None, Cleansing Agents, Cleansing Agents, Pla...",0,0,0,0,0,1,2,0,1
3,Anti-Aging Cleansing Gel,Peter Thomas Roth,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",39.00,Oily,1,1,1,1,1,...,"[None, Cleansing Agents, Cleansing Agents, Cle...",2,0,0,0,0,0,3,0,0
4,Hydrating Facial Cleanser,CeraVe,"Purified Water, Glycerin, Behentrimonium Metho...",13.49,Oily,1,1,1,1,1,...,"[Uncategorized, Skin-Replenishing, Skin-Resto...",1,1,0,0,0,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2036,Instant Hydration Facial Sheet Mask,Andalou Naturals,"Aloe Barbadensis Leaf Juice*, Purified Water (...",4.99,Sensitive,0,0,0,0,1,...,"[None, None, None, Skin-Soothing, Plant Extra...",1,0,0,0,0,0,0,0,0
2037,Prep Cicaronic SOS Ampoules,SNP,"Water, Glycerin, Butylene Glycol, Glycereth-26...",16.00,Sensitive,0,0,0,0,1,...,"[Miscellaneous, Skin-Replenishing, Skin-Resto...",1,0,0,0,0,1,0,0,2
2038,Pro-Collagen Hydra-Gel Eye Mask,ELEMIS,"Aqua/Water/Eau, Glycerin, Ceratonia Siliqua Gu...",76.00,Sensitive,0,0,0,0,1,...,"[None, Skin-Replenishing, Skin-Restoring, Non...",0,0,0,0,0,4,1,0,2
2039,ADVANCED Ceramide Capsules Daily Youth Restori...,Elizabeth Arden,"Cyclopentasiloxane, Isononyl Isononanoate, Iso...",63.00,Sensitive,0,0,0,0,1,...,"[Emollients, Emollients, Miscellaneous, Textu...",2,5,0,0,0,0,1,0,0


In [42]:
top3 =[]
for i in range(len(df_cleaned)):
    main = df_cleaned.inactive_ingredient[i].split(", ")[:4]
    #print(main[0]) #print all the ways water is listed 
    if ("Water" or "Aqua" or "Eau" or "Water (Aqua)" in main[0]):
        main = main[1:4]
    else:
        main = main[:3]
    top3.append(main)
         
df_cleaned["top_3"] = top3

In [78]:
top_categories ={}
for Ingredients in df_cleaned['top_3'].values:
    for i in Ingredients:
        top_categories[i] = ingredients_category_dict.get(i)

top3category = []
for Ingredients in df_cleaned['top_3'].values:
    temp_list = []
    for i in Ingredients:
        i = i.lower()
        temp_list.append(top_categories.get(i))
    top3category.append(temp_list)
df_cleaned["top3_category_list"] = top3category

In [81]:
df_cleaned.drop(["Skin Type", "active_ingredient", "active_ingredient_list"], axis =1)


Unnamed: 0,Product,Brand,Ingredients,Price,Combination,Dry,Normal,Oily,Sensitive,inactive_ingredient,...,num_of_hydration,num_of_sensitizing,num_of_skin-restoring,num_of_plant_extracts,num_of_preservatives,num_of_skin-softening,num_of_skin_replenshing,top_3,top3_ratings_list,top3_category_list
0,Glycolic Acid 7% Toning Solution,The Ordinary,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",8.70,1,1,1,1,1,"Aqua (Water), Glycolic Acid, Rosa Damascena Fl...",...,1,0,0,0,2,0,6,"[Glycolic Acid, Rosa Damascena Flower Water, C...","[None, 3, None, None, None, 2, 3, 2, 2, 3, Non...","[None, None, None]"
1,The Microdelivery Exfoliating Facial Wash,Philosophy,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",29.00,1,1,1,1,1,"Aqua/Water/Eau, Solum Diatomeae/Diatomaceous E...",...,0,0,0,0,3,0,2,[Solum Diatomeae/Diatomaceous Earth/Terre De D...,"[None, None, None, 3, None, 2, 2, 2, 2, 2, 2, ...","[None, None, None]"
2,AHA/BHA Exfoliating Cleanser,Murad,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",40.00,1,1,1,1,1,"Water (Aqua), Sodium Laureth Sulfate, Cocamido...",...,0,0,0,1,2,0,1,"[Sodium Laureth Sulfate, Cocamidopropyl Betain...","[None, 2, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, Non...","[None, Cleansing Agents, None]"
3,Anti-Aging Cleansing Gel,Peter Thomas Roth,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",39.00,1,1,1,1,1,"Water/Aqua/Eau, Sodium Laureth Sulfate, Cocami...",...,0,0,0,0,3,0,0,"[Sodium Laureth Sulfate, Cocamidopropyl Betain...","[None, 2, 2, 2, 2, 3, 3, None, None, None, Non...","[None, Cleansing Agents, None]"
4,Hydrating Facial Cleanser,CeraVe,"Purified Water, Glycerin, Behentrimonium Metho...",13.49,1,1,1,1,1,"Purified Water, Glycerin, Behentrimonium Metho...",...,0,0,0,0,3,0,1,"[Glycerin, Behentrimonium Methosulfate and Cet...","[2, 3, None, None, None, None, 3, 3, None, Non...","[Skin-Replenishing, Skin-Restoring, None, None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2036,Instant Hydration Facial Sheet Mask,Andalou Naturals,"Aloe Barbadensis Leaf Juice*, Purified Water (...",4.99,0,0,0,0,1,"[magnolia Officinalis Bark Extract, Boswellia...",...,0,0,0,0,0,0,0,"[Boswellia Serrata (boswellin) Extract], Biosa...","[None, None, None, 3, None, 3, 3, None, None, ...","[None, None, None]"
2037,Prep Cicaronic SOS Ampoules,SNP,"Water, Glycerin, Butylene Glycol, Glycereth-26...",16.00,0,0,0,0,1,"Water, Glycerin, Butylene Glycol, Glycereth-26...",...,0,0,0,1,0,0,2,"[Glycerin, Butylene Glycol, Glycereth-26]","[2, 3, 2, 2, 2, None, 2, None, None, None, 2, ...","[Skin-Replenishing, Skin-Restoring, Texture En..."
2038,Pro-Collagen Hydra-Gel Eye Mask,ELEMIS,"Aqua/Water/Eau, Glycerin, Ceratonia Siliqua Gu...",76.00,0,0,0,0,1,"Aqua/Water/Eau, Glycerin, Ceratonia Siliqua Gu...",...,0,0,0,4,1,0,2,"[Glycerin, Ceratonia Siliqua Gum, Carrageenan]","[None, 3, None, 2, 2, 3, None, 2, None, None, ...","[Skin-Replenishing, Skin-Restoring, None, None]"
2039,ADVANCED Ceramide Capsules Daily Youth Restori...,Elizabeth Arden,"Cyclopentasiloxane, Isononyl Isononanoate, Iso...",63.00,0,0,0,0,1,"Cyclopentasiloxane, Isononyl Isononanoate, Iso...",...,0,0,0,0,1,0,0,"[Isononyl Isononanoate, Isododecane, Isopropyl...","[2, 2, 2, 2, 2, None, None, 2, None, None, Non...","[None, None, None]"


In [None]:
df_cleaned.to_csv('Master_cleaned.csv', index= False)