In [303]:
#--- Libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from collections import Counter


In [304]:
#--- Load data
df = pd.read_csv("sephora_data.csv")
df.head()

Unnamed: 0,category,brand,name,price,rank,ingredients,combo,dry,normal,oily,sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1


In [305]:
#--- User-defined functions
def get_recommendations(title, cosine_sim = cosine_sim):
    #Source: https://gist.github.com/emmagrimaldi/4e33c0091d2294b04c063b552925fe5f#file-recommender-py
    
    # initializing the empty list of recommended products
    recommended_products = []
    #gettin the index of the product that matches the name
    idx = indices[indices == title].index[0]
    #creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    #getting the indexes of the 10 most similar products
    top_10_indexes = list(score_series.iloc[1:11].index)
    #populating the list with the names of the best 10 matching products
    for i in top_10_indexes:
        recommended_products.append(list(df.name)[i])
        
    return recommended_products

def sanitize_text(df, col):
    df[col+' Cleaned'] = df[col].str.replace(r"[^A-Za-z0-9(),!?@™\'\`\"\_\n]", " ")
    df[col+' Cleaned'] = df[col+' Cleaned'].replace('[^\w\s]', '')
    df[col+' Cleaned'] = df[col+' Cleaned'].str.lower()
    return df


### data exploration

In [306]:
#--- Average price for each category.
df.groupby('category')['price'].mean()

category
Cleanser       32.601423
Eye cream      63.602871
Face Mask      42.624060
Moisturizer    69.087248
Sun protect    45.905882
Treatment      79.177419
Name: price, dtype: float64

In [307]:
#--- Average rating for each category.
df.groupby('category')['rank'].mean()

category
Cleanser       4.312456
Eye cream      3.806220
Face Mask      4.166917
Moisturizer    4.238591
Sun protect    4.051176
Treatment      4.218145
Name: rank, dtype: float64

No significant difference in rating based on skin type.

In [308]:
len(df[(df['normal']==1) & (df['oily']==1) & (df['sensitive']==1) & (df['dry']==1)])/float(len(df['category']))

0.44972826086956524

Pretty even distribution per number of products per skin type. 10 pps less for sensitive skin. Yet, almost half of all products are marketed for all skin types. I am always skeptical of products marketed for all skin types - I mean are they really? I am no dermatologist though.

In [309]:
#Overall ingredients by skin type, strict
Counter(" ".join(df[(df['dry']==1) & (df['normal']==0) & (df['oily']==0) & (df['sensitive']==0)]['ingredients']).split(',')).most_common(10)


[(' Glycerin', 5),
 (' Phenoxyethanol', 5),
 (' Glyceryl Stearate', 3),
 (' Caprylyl Glycol', 3),
 (' Silica', 3),
 (' Glycerin ', 2),
 (' Petrolatum ', 2),
 (' Sodium Hyaluronate ', 2),
 (' Butylene Glycol ', 2),
 (' Stearic Acid', 2)]

In [310]:
Counter(" ".join(df[(df['dry']==0) & (df['normal']==1) & (df['oily']==0) & (df['sensitive']==0)]['ingredients']).split(',')).most_common(10)


[(' Glycerin', 5),
 (' Dimethicone', 3),
 (' Phenoxyethanol', 3),
 (' Lavandula Angustifolia (Lavender) Oil', 3),
 (' Caprylyl Glycol', 2),
 (' Chlorphenesin', 2),
 (' Tetrasodium Edta', 2),
 (' Tocopheryl Acetate', 2),
 (' Caprylic/Capric Triglyceride', 2),
 (' Glycol Distearate', 2)]

In [311]:
Counter(" ".join(df[(df['dry']==0) & (df['normal']==0) & (df['oily']==1) & (df['sensitive']==0)]['ingredients']).split(',')).most_common(10)


[(' Butylene Glycol', 13),
 (' Phenoxyethanol', 12),
 (' Glycerin', 12),
 (' Sodium Hydroxide', 7),
 (' Xanthan Gum', 6),
 (' Phenoxyethanol ', 5),
 (' Tocopheryl Acetate', 5),
 (' Glycolic Acid', 5),
 (' Ethylhexylglycerin', 5),
 (' Alcohol', 5)]

In [312]:
Counter(" ".join(df[(df['dry']==0) & (df['normal']==0) & (df['oily']==0) & (df['sensitive']==1)]['ingredients']).split(',')).most_common(10)


[(' Camellia Sinensis Leaf Extract', 2),
 (' Achillea Millefolium Extract', 2),
 (' Soy Isoflavones', 2),
 (' Copper PCA', 2),
 (' Zinc PCA', 2),
 (' Caprylic/Capric Triglyceride', 2),
 (' Polysorbate 20', 2),
 (' Phenoxyethanol', 2),
 ('-Lactic Acid: Supports natural cell turnover and cell renewal.\r\r\n-Mandelic Acid: Possesses anti-irritation an anti-redness properties.\r\r\n-Willow Bark Extract: Gently exfoliates and clears pores (contains salicylic acid).\r\r\n-Colloidal Oatmeal: Acts as an anti-irritant and anti-inflammatory.\r\r\n\r\r\nStep One:\r\r\nWater',
  1),
 (' Hamamelis Virginiana (Witch Hazel) Water', 1)]

In [313]:
#Look at price bracket and ingredients
Counter(" ".join(df[df['price']>=100]['ingredients']).split(',')).most_common(10)


[(' Glycerin', 84),
 (' Butylene Glycol', 70),
 (' Phenoxyethanol', 64),
 (' Dimethicone', 53),
 (' Sodium Hyaluronate', 50),
 (' Caprylic/Capric Triglyceride', 46),
 (' Limonene', 43),
 (' Xanthan Gum', 41),
 (' Fragrance', 39),
 (' Disodium Edta', 38)]

In [314]:
Counter(" ".join(df[df['price']<100]['ingredients']).split(',')).most_common(10)


[(' Glycerin', 806),
 (' Phenoxyethanol', 633),
 (' Butylene Glycol', 623),
 (' Caprylyl Glycol', 366),
 (' Sodium Hyaluronate', 353),
 (' Ethylhexylglycerin', 339),
 (' Xanthan Gum', 338),
 (' Dimethicone', 321),
 (' Tocopheryl Acetate', 315),
 (' Citric Acid', 303)]

In [315]:
#Look at rank bracket and ingredients
Counter(" ".join(df[df['rank']>=3.5]['ingredients']).split(',')).most_common(10)


[(' Glycerin', 843),
 (' Phenoxyethanol', 658),
 (' Butylene Glycol', 653),
 (' Caprylyl Glycol', 381),
 (' Sodium Hyaluronate', 380),
 (' Ethylhexylglycerin', 361),
 (' Xanthan Gum', 358),
 (' Dimethicone', 358),
 (' Tocopheryl Acetate', 335),
 (' Citric Acid', 303)]

In [316]:
Counter(" ".join(df[df['rank']<3.5]['ingredients']).split(',')).most_common(10)


[(' Glycerin', 47),
 (' Butylene Glycol', 40),
 (' Phenoxyethanol', 39),
 (' Sodium Hyaluronate', 23),
 (' Xanthan Gum', 21),
 (' Caprylyl Glycol', 21),
 (' Citric Acid', 19),
 (' Sodium Benzoate', 18),
 (' Dimethicone', 16),
 (' Carbomer', 16)]

### cosine similarities

In [317]:
#--- Prepare text and future df
df = sanitize_text(df, "ingredients")
indices = pd.Series(df.name) #to track the name of the product within the matrix


In [320]:
count = CountVectorizer()
count_matrix = count.fit_transform(df['ingredients Cleaned'])

cosine_sim = cosine_similarity(count_matrix, count_matrix)
get_recommendations('Rose Deep Hydration Facial Toner')

['Rose Deep Hydration Facial Toner',
 'Rose Face Mask',
 'Rose Stem Cell Bio-Repair Gel Mask',
 'Face Mask - Rose - Moisturizing',
 'Soy Face Cleanser',
 'Rose Cleansing Foam',
 'Rose Deep Hydration Face Serum',
 'Dermaclear™ Micro Water',
 'Rose Deep Hydration Moisturizer',
 'Keep Young And Beautiful™ Instant Brightening Beauty Shot Eye Lift']

In [321]:
count = TfidfVectorizer()
count_matrix = count.fit_transform(df['ingredients Cleaned'])

cosine_sim = cosine_similarity(count_matrix, count_matrix)
get_recommendations('Rose Deep Hydration Facial Toner')

['Rose Deep Hydration Facial Toner',
 'Rose Face Mask',
 'Rose Stem Cell Bio-Repair Gel Mask',
 'Face Mask - Rose - Moisturizing',
 'Soy Face Cleanser',
 'Rose Cleansing Foam',
 'Rose Deep Hydration Face Serum',
 'Dermaclear™ Micro Water',
 'Rose Deep Hydration Moisturizer',
 'Keep Young And Beautiful™ Instant Brightening Beauty Shot Eye Lift']