In [160]:
import numpy as np
import pandas as pd
import re

from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE



In [109]:
df = pd.read_csv('../Dataset/skincare_products.csv', delimiter = ',')
df.shape

(1138, 5)

In [110]:
df.head()

Unnamed: 0,product_name,product_url,product_type,clean_ingreds,price
0,The Ordinary Natural Moisturising Factors + HA...,https://www.lookfantastic.com/the-ordinary-nat...,Moisturiser,"['capric triglyceride', 'cetyl alcohol', 'prop...",£5.20
1,CeraVe Facial Moisturising Lotion SPF 25 52ml,https://www.lookfantastic.com/cerave-facial-mo...,Moisturiser,"['homosalate', 'glycerin', 'octocrylene', 'eth...",£13.00
2,The Ordinary Hyaluronic Acid 2% + B5 Hydration...,https://www.lookfantastic.com/the-ordinary-hya...,Moisturiser,"['sodium hyaluronate', 'sodium hyaluronate', '...",£6.20
3,AMELIORATE Transforming Body Lotion 200ml,https://www.lookfantastic.com/ameliorate-trans...,Moisturiser,"['ammonium lactate', 'c12-15', 'glycerin', 'pr...",£22.50
4,CeraVe Moisturising Cream 454g,https://www.lookfantastic.com/cerave-moisturis...,Moisturiser,"['glycerin', 'cetearyl alcohol', 'capric trigl...",£16.00


In [111]:
df = df.rename(columns={'clean_ingreds': 'ingredients'})
# df = df.rename(columns={'product_name': 'product'})
# df = df.rename(columns={'product_type': 'type'})
# df.drop('product_url', inplace=True, axis=1)

In [112]:
for i in range(len(df['ingredients'])):
    ingredient = str(df['ingredients'].iloc[i])
    ingredient = ingredient.replace('[', '').replace(']', '').replace("'", '').replace('"', '')
    df['ingredients'].iloc[i] = ingredient


In [113]:
all_ingreds = []

for i in df['ingredients']:
    ingreds_list = i.split(', ')
    for j in ingreds_list:
        all_ingreds.append(j)

In [114]:
all_ingreds = sorted(set(all_ingreds))
all_ingreds[10:20]

['3-octadecanediol',
 '4-t-butylcyclohexanol',
 '7-dehydrocholesterol',
 'abies alba leaf oil',
 'abies balsamea extract',
 'abies sibirica oil',
 'acacia concinna fruit extract',
 'acacia decurrens wax',
 'acacia senegal gum',
 'acacia seyal gum extract']

In [115]:
all_ingreds.remove('')
for i in range(len(all_ingreds)):
    if all_ingreds[i][-1] == ' ':
        all_ingreds[i] = all_ingreds[i][0:-1]
        
all_ingreds = sorted(set(all_ingreds))
all_ingreds[10:20]

['7-dehydrocholesterol',
 'abies alba leaf oil',
 'abies balsamea extract',
 'abies sibirica oil',
 'acacia concinna fruit extract',
 'acacia decurrens wax',
 'acacia senegal gum',
 'acacia seyal gum extract',
 'acer saccharum extract',
 'acetate']

In [116]:
one_hot_list = [[0] * 0 for i in range(len(all_ingreds))]

for i in df['ingredients']:
    k=0
    for j in all_ingreds:
        if j in i:
            one_hot_list[k].append(1)
        else:
            one_hot_list[k].append(0)
        k+=1
        
ingred_matrix = pd.DataFrame(one_hot_list).transpose()
ingred_matrix.columns = [sorted(set(all_ingreds))]

ingred_matrix

Unnamed: 0,"1,10-decanediol","1,2-hexanediol",1-methylhydantoin-2-imide,10-hydroxydecanoic acid,"2,6-dimethyl-7-octen-2-ol","2-bromo-2-nitropropane-1,3-diol",2-oleamido-1,3-o-ethyl ascorbic acid,3-octadecanediol,4-t-butylcyclohexanol,...,zinc glycine,zinc laurate,zinc oxide,zinc pca,zinc sulfate,zingiber aromaticus extract,zingiber cassumunar root oil,zingiber officinale root extract,zingiber officinale root oil,zizyphus jujuba seed extract
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1133,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1134,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1136,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [117]:
brand_list = ["111skin", "a'kin", "acorelle", "adam revolution", "aesop", "ahava", "alchimie forever",
             "algenist", "alpha-h", "ambre solaire", "ameliorate", "american crew", "anthony", "antipodes",
             "apivita", "argentum", "ark skincare", "armani", "aromatherapy associates", "aromaworks", "aromatica",
             "aurelia probiotic skincare", "aurelia skincare",
             "australian bodycare", "avant skincare", "aveda", "aveeno", "avene", "avène",
             "bakel", "balance me", "barber pro", "bareminerals", "barry m cosmetics",
             "baxter of california", "bbb london", "beautypro", "benefit", "benton", "bioderma",
             "bioeffect", "bloom & blossom", "bloom and blossom", "bobbi brown", "bondi sands", "bubble t", "bulldog", "burt's bees",
             "by terry", "carita", "caudalie", "cerave", "chantecaille", "clinique",
             "comfort zone", "connock london", "cosmetics 27", "cosrx", "cowshed", "crystal clear", 
             "cult51", "darphin", "dear, klairs", "decleor", "decléor", "dermalogica", "dhc", "doctors formula",
             "dr. brandt", "dr brandt", "dr. hauschka", "dr hauschka", "dr. jackson's", "dr.jart+", "dr. lipp",
             "dr botanicals", "dr dennis", "dr. pawpaw", "ecooking", "egyptian magic",
             "eisenberg", "elemental herbology", "elemis", "elizabeth arden", "embryolisse",
             "emma hardie", "erno laszlo", "espa", "estée lauder", "estee lauder", "eucerin",
             "eve lom", "eve rebirth", "fade out", "farmacy", "filorga", "first aid beauty", "fit", "foreo",
             "frank body", "freezeframe", "gallinée", "garnier", "gatineau", "glamglow", "goldfaden md",
             "green people", "hawkins and brimble", "holika holika", "house 99", "huxley",
             "ilapothecary", "ila-spa", "indeed labs", "inika", "instant effects", "institut esthederm", "ioma", "klorane",
             "j.one", "jack black", "james read", "jason", "jo malone london", "juice beauty", "jurlique",
             "korres", "l:a bruket", "l'oréal men expert", "l'oreal men expert", "l'oréal paris", "l'oreal paris",
             "l’oréal paris", "lab series skincare for men",
             "lancaster", "lancer skincare", "lancôme", "lancome", "lanolips", "la roche-posay", "laura mercier",
             "liftlab", "little butterfly london", "lixirskin", "liz earle", "love boo",
             "löwengrip", "lowengrip", "lumene", "mac", "madara", "mádara", "magicstripes", "magnitone london",
             "mama mio", "mancave", "manuka doctor", "mauli", "mavala", "maybelline", "medik8", "men-u", "menaji", "molton brown", "moroccanoil",
             "monu", "murad", "naobay", "nars", "natio", "natura bissé", "natura bisse",
             "neal's yard remedies", "neom", "neostrata", "neutrogena", "niod", "nip+fab", "nuxe", "nyx",
             "oh k!", "omorovicza", "origins", "ortigia fico", "oskia", "ouai", "pai ", "paula's choice", "payot",
             "perricone md", "pestle & mortar", "pestle and mortar", "peter thomas roth",
             "philosophy", "pierre fabre", "pixi", "piz buin", "polaar", "prai", "project lip",
             "radical skincare", "rapideye", "rapidlash", "real chemistry", "recipe for men",
             "ren ", "renu", "revolution beauty", "revolution skincare", "rituals", "rmk", "rodial", "roger&gallet", "salcura",
             "sanctuary spa", "sanoflore", "sarah chapman", "sea magik", "sepai",
             "shaveworks", "shea moisture", "shiseido", "skin79", "skin authority", "skinceuticals",
             "skinchemists", "skindoctors", "skin doctors", "skinny tan", "sol de janeiro", "spa magik organiks",
              "st. tropez", "starskin", "strivectin", "sukin",
             "svr", "swiss clinic", "talika", "tan-luxe", "tanorganic", "tanworx", "thalgo", "the chemistry brand",
             "the hero project", "the inkey list", "the jojoba company", "the ordinary",
             "the organic pharmacy", "the ritual of namasté", "this works", "too faced", "trilogy", "triumph and disaster",
             "ultrasun", "uppercut deluxe", "urban decay", "uriage", "verso", "vichy",
             "vida glow", "vita liberata", "wahl", "weleda", "westlab", "wilma schumann", "yes to",
             "ysl", "zelens"]
brand_list = sorted(brand_list, key=len, reverse=True)

In [93]:
svd = TruncatedSVD(n_components=150, n_iter = 1000, random_state = 6) # firstly reduce features to 150 with truncatedSVD - this suppresses some noise
svd_features = svd.fit_transform(ingred_matrix)
tsne = TSNE(n_components = 2, n_iter = 1000, random_state = 6) # reduce 150 features to 2 using t-SNE with exact method
tsne_features = tsne.fit_transform(svd_features)

df['X'] = tsne_features[:, 0]
df['Y'] = tsne_features[:, 1]

In [125]:
df['brand'] = df['product_name'].str.lower()
k=0
for i in df['brand']:
    for j in brand_list:
        if j in i:
            df['brand'][k] = df['brand'][k].replace(i, j.title())
    k+=1
    
df.head()

Unnamed: 0,product_name,product_url,product_type,ingredients,price,brand
0,The Ordinary Natural Moisturising Factors + HA...,https://www.lookfantastic.com/the-ordinary-nat...,Moisturiser,"capric triglyceride, cetyl alcohol, propanedio...",£5.20,The Ordinary
1,CeraVe Facial Moisturising Lotion SPF 25 52ml,https://www.lookfantastic.com/cerave-facial-mo...,Moisturiser,"homosalate, glycerin, octocrylene, ethylhexyl,...",£13.00,Cerave
2,The Ordinary Hyaluronic Acid 2% + B5 Hydration...,https://www.lookfantastic.com/the-ordinary-hya...,Moisturiser,"sodium hyaluronate, sodium hyaluronate, panthe...",£6.20,The Ordinary
3,AMELIORATE Transforming Body Lotion 200ml,https://www.lookfantastic.com/ameliorate-trans...,Moisturiser,"ammonium lactate, c12-15, glycerin, prunus amy...",£22.50,Ameliorate
4,CeraVe Moisturising Cream 454g,https://www.lookfantastic.com/cerave-moisturis...,Moisturiser,"glycerin, cetearyl alcohol, capric triglycerid...",£16.00,Cerave


In [164]:
search_term = 'aqua'
matching_products = df[df['ingredients'].str.contains(search_term, case=False)]

# Display the matching products
print(matching_products['product_name'])

23                     Bulldog Original Moisturiser 100ml
72                    Bulldog Sensitive Moisturiser 100ml
165     Clinique Repairwear Laser Focus Smooths, Resto...
214            Aveda Hand Relief Night Renewal Serum 30ml
290           Erno Laszlo Detoxifying Cleansing Oil 195ml
331              Elemis Pro-Collagen Rose Hydro-Mist 50ml
356        Emma Hardie Plump and Glow Hydrating Mist 90ml
360                                Eve Lom Face Mist 15ml
367              ARK Skincare Hydrating Beauty Mist 150ml
370     Laura Mercier Perfecting Water Moisture Mist 2...
383                       Eve Lom Radiance Face Mist 48ml
458                 NARS Cosmetics Aqua Gel Luminous Mask
472        AHAVA Single Use 24K Gold Mineral Mud Mask 6ml
488      AHAVA Single Use Overnight Deep Wrinkle Mask 6ml
492     AHAVA Single Use Brightening & Hydration Mask 6ml
495     Garnier Moisture Bomb Deep Sea Water & Hyaluro...
511     BARBER PRO Face Putty Black Peel-Off Mask with...
525           

In [130]:
df2 = df
df2.head()

Unnamed: 0,product_name,product_url,product_type,ingredients,price,brand
0,The Ordinary Natural Moisturising Factors + HA...,https://www.lookfantastic.com/the-ordinary-nat...,Moisturiser,"capric triglyceride, cetyl alcohol, propanedio...",£5.20,The Ordinary
1,CeraVe Facial Moisturising Lotion SPF 25 52ml,https://www.lookfantastic.com/cerave-facial-mo...,Moisturiser,"homosalate, glycerin, octocrylene, ethylhexyl,...",£13.00,Cerave
2,The Ordinary Hyaluronic Acid 2% + B5 Hydration...,https://www.lookfantastic.com/the-ordinary-hya...,Moisturiser,"sodium hyaluronate, sodium hyaluronate, panthe...",£6.20,The Ordinary
3,AMELIORATE Transforming Body Lotion 200ml,https://www.lookfantastic.com/ameliorate-trans...,Moisturiser,"ammonium lactate, c12-15, glycerin, prunus amy...",£22.50,Ameliorate
4,CeraVe Moisturising Cream 454g,https://www.lookfantastic.com/cerave-moisturis...,Moisturiser,"glycerin, cetearyl alcohol, capric triglycerid...",£16.00,Cerave


In [132]:
df3 = df['ingredients'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('ingredient')
df3 = df.drop('ingredients', axis=1).join(df3)

In [162]:
df3.head()

Unnamed: 0,product_name,product_url,product_type,price,brand,ingredient
0,The Ordinary Natural Moisturising Factors + HA...,https://www.lookfantastic.com/the-ordinary-nat...,Moisturiser,£5.20,The Ordinary,capric triglyceride
0,The Ordinary Natural Moisturising Factors + HA...,https://www.lookfantastic.com/the-ordinary-nat...,Moisturiser,£5.20,The Ordinary,cetyl alcohol
0,The Ordinary Natural Moisturising Factors + HA...,https://www.lookfantastic.com/the-ordinary-nat...,Moisturiser,£5.20,The Ordinary,propanediol
0,The Ordinary Natural Moisturising Factors + HA...,https://www.lookfantastic.com/the-ordinary-nat...,Moisturiser,£5.20,The Ordinary,stearyl alcohol
0,The Ordinary Natural Moisturising Factors + HA...,https://www.lookfantastic.com/the-ordinary-nat...,Moisturiser,£5.20,The Ordinary,glycerin


In [144]:
df3.describe()

Unnamed: 0,product_name,product_url,product_type,price,brand,ingredient
count,27983,27983,27983,27983,27983,27983
unique,1138,1126,14,290,187,2772
top,Filorga Scrub and Mask 55ml,https://www.lookfantastic.com/lancome-advanced...,Moisturiser,£22.00,Clinique,phenoxyethanol
freq,90,132,3526,898,1333,613


In [134]:
sorted(df3.brand.unique())

["A'Kin",
 'Acorelle',
 'Aesop',
 'Ahava',
 'Alchimie Forever',
 'Alpha-H',
 'Ambre Solaire',
 'Ameliorate',
 'Antipodes',
 'Apivita',
 'Ark Skincare',
 'Armani',
 'Aromatherapy Associates',
 'Aromaworks',
 'Aurelia Probiotic Skincare',
 'Aurelia Skincare',
 'Australian Bodycare',
 'Avant Skincare',
 'Aveda',
 'Aveeno',
 'Avene',
 'Avène',
 'Balance Me',
 'Barber Pro',
 'Bareminerals',
 'Bbb London',
 'Beautypro',
 'Benefit',
 'Benton',
 'Bioderma',
 'Bloom & Blossom',
 'Bloom And Blossom',
 'Bobbi Brown',
 'Bondi Sands',
 'Bubble T',
 'Bulldog',
 "Burt'S Bees",
 'By Terry',
 'Caudalie',
 'Cerave',
 'Chantecaille',
 'Clinique',
 'Comfort Zone',
 'Connock London',
 'Cosrx',
 'Cowshed',
 'Crystal Clear',
 'Darphin',
 'Dear, Klairs',
 'Decléor',
 'Dermalogica',
 'Dhc',
 'Dr Brandt',
 'Dr Dennis',
 'Dr Hauschka',
 'Dr. Brandt',
 'Dr. Hauschka',
 'Dr. Pawpaw',
 'Dr.Jart+',
 'Egyptian Magic',
 'Elemental Herbology',
 'Elemis',
 'Elizabeth Arden',
 'Embryolisse',
 'Emma Hardie',
 'Erno Laszlo

In [120]:
df['brand'] = df['brand'].replace(['Aurelia Probiotic Skincare'],'Aurelia Skincare')
df['brand'] = df['brand'].replace(['Avene'],'Avène')
df['brand'] = df['brand'].replace(['Bloom And Blossom'],'Bloom & Blossom')
df['brand'] = df['brand'].replace(['Dr Brandt'],'Dr. Brandt')
df['brand'] = df['brand'].replace(['Dr Hauschka'],'Dr. Hauschka')
df['brand'] = df['brand'].replace(["L'oreal Paris", 'L’oréal Paris'], "L'oréal Paris")

In [135]:
df3['brand'] = df3['brand'].replace(['Aurelia Probiotic Skincare'],'Aurelia Skincare')
df3['brand'] = df3['brand'].replace(['Avene'],'Avène')
df3['brand'] = df3['brand'].replace(['Bloom And Blossom'],'Bloom & Blossom')
df3['brand'] = df3['brand'].replace(['Dr Brandt'],'Dr. Brandt')
df3['brand'] = df3['brand'].replace(['Dr Hauschka'],'Dr. Hauschka')
df3['brand'] = df3['brand'].replace(["L'oreal Paris", 'L’oréal Paris'], "L'oréal Paris")

In [166]:
def recommender(search):
    cs_list = []
    brands = []
    output = []
    binary_list = []
    idx = df[df['product_name'] == search].index.item()
    for i in ingred_matrix.iloc[idx][1:]:
        binary_list.append(i)    
    point1 = np.array(binary_list).reshape(1, -1)
    point1 = [val for sublist in point1 for val in sublist]
    prod_type = df['product_type'][df['product_name'] == search].iat[0]
    brand_search = df['brand'][df['product_name'] == search].iat[0]
    data_by_type = df[df['product_type'] == prod_type]
    
    for j in range(data_by_type.index[0], data_by_type.index[0] + len(data_by_type)):
        binary_list2 = []
        for k in ingred_matrix.iloc[j][1:]:
            binary_list2.append(k)
        point2 = np.array(binary_list2).reshape(1, -1)
        point2 = [val for sublist in point2 for val in sublist]
        dot_product = np.dot(point1, point2)
        norm_1 = np.linalg.norm(point1)
        norm_2 = np.linalg.norm(point2)
        cos_sim = dot_product / (norm_1 * norm_2)
        cs_list.append(cos_sim)
    data_by_type = pd.DataFrame(data_by_type)
    data_by_type['cos_sim'] = cs_list
    data_by_type = data_by_type.sort_values('cos_sim', ascending=False)
    data_by_type = data_by_type[data_by_type.product_name != search] 
    l = 0
    for m in range(len(data_by_type)):
        brand = data_by_type['brand'].iloc[l]
        if len(brands) == 0:
            if brand != brand_search:
                brands.append(brand)
                output.append(data_by_type.iloc[l])
        elif brands.count(brand) < 2:
            if brand != brand_search:
                brands.append(brand)
                output.append(data_by_type.iloc[l])
        l += 1
        
    return print('\033[1m', 'Recommending products similar to', search,':', '\033[0m'), print(pd.DataFrame(output)[['ingredients', 'cos_sim']].head(10))

In [167]:
recommender('Bulldog Original Moisturiser 100ml')

[1m Recommending products similar to Bulldog Original Moisturiser 100ml : [0m
                                           ingredients  cos_sim
111  glycerin, cetearyl alcohol, glyceryl stearate,...      0.4
3    ammonium lactate, c12-15, glycerin, prunus amy...      0.4
78   capric triglyceride, glycerin, c12-15, ceteary...      0.3
61   squalene, cocos nucifera fruit extract, c12-16...      0.3
97   anthemis nobilis flower water, camellia oleife...      0.3
110  aloe barbadenis extract, ethylhexyl palmitate,...      0.3
34   glycerin, capric triglyceride, glyceryl steara...      0.3
45   glycerin, capric triglyceride, c12-20, simmond...      0.3
112  glyceryl stearate se, glycerin, capric triglyc...      0.3
68   propanediol, capric triglyceride, cetearyl alc...      0.3


(None, None)