 This is the part 3 of cosmetic recommendation: analyzing cosmetic items similarities based on their ingredients
You can also download the csv file from same repository: cosmetic_p.csv

# Preprocess

In [67]:
#libraries
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE

In [68]:
cosm_data_2= pd.read_csv('datasets/cosmetic_p.csv')

will be tokenizing the list of ingredients in <code>Ingredients</code> column. After splitting them into tokens, we'll make a binary bag of words. Then we will create a dictionary with the tokens, <code>ingredient_idx</code>, which will have the following format:</p>
<p>{ <strong><em>"ingredient"</em></strong>: index value, … }</p>

In [69]:
# Initialize dictionary, list, and initial index
ingredient_idx = {}
corpus = []
idx = 0

# For loop for tokenization
for i in range(len(cosm_data_2)):    
    ingredients = cosm_data_2['ingredients'][i]
    ingredients_lower = ingredients.lower()
    tokens = ingredients_lower.split(', ')
    corpus.append(tokens)
    for ingredient in tokens:
        if ingredient not in ingredient_idx:
            ingredient_idx[ingredient] = idx
            idx += 1

print("The index for glycerin is", ingredient_idx['glycerin'])

The index for glycerin is 4


In [70]:
# Get the number of items and tokens 
M = len(cosm_data_2)
N = len(ingredient_idx)

# Initialize a matrix of zeros
A = np.zeros((M,N))

# Define the oh_encoder function
def oh_encoder(tokens):
    x = np.zeros(N)
    for ingredient in tokens:
        # Get the index for each ingredient
        idx = ingredient_idx[ingredient]
        # Put 1 at the corresponding indices
        x[idx] = 1
    return x

i = 0
for tokens in corpus:
    A[i, :] = oh_encoder(tokens)
    i += 1

# TNSE

In [71]:
# Dimension reduction with t-SNE
model = TSNE(n_components = 2, learning_rate = 50, random_state = 42)
tsne_features = model.fit_transform(A)

# Make X, Y columns 
cosm_data_2['X'] = tsne_features[:,0]
cosm_data_2['Y'] = tsne_features[:,1]



# Visualization

In [72]:
from bokeh.io import show, output_notebook, push_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool
output_notebook()

# Make a source and a scatter plot  
source = ColumnDataSource(cosm_data_2)
plot = figure(x_axis_label = 'T-SNE 1', 
              y_axis_label = 'T-SNE 2', 
              width = 500, height = 400)
plot.circle(x = 'X', 
    y = 'Y', 
    source = source, 
    size = 10, color = '#FF7373', alpha = .8)
hover = HoverTool(tooltips = [('Item', '@name'),
                              ('Brand', '@brand'), 
                              ('Price', '@price'),
                              ])
plot.add_tools(hover)

show(plot)

In [76]:

df_2 = cosm_data_2.reset_index().drop('index', axis = 1)
df_2['dist'] = 0.00

myItem = df_2[df_2.name.str.contains('Lait Capillaire Smoothie Litchi-Mure')]
myItem

Unnamed: 0,Label,URL,brand,name,price,ingredients,X,Y,dist
166,Styling,https://labelleboucle.fr/collections/coiffer/p...,LES SECRETS DE LOLY,Lait Capillaire Smoothie Litchi-Mure,,"Aqua, Simmondsia Chinensis Oil, Prunus Amygdal...",-3.329255,3.419685,0.0


In [77]:
# getting the array for myItem
P1 = np.array([myItem.X.values, myItem.Y.values]).reshape(1, -1)
P1

# cosine similarities with other items
for i in range(len(df_2)):
    P2 = np.array([df_2['X'][i], df_2['Y'][i]]).reshape(-1, 1)
    dist = (P1 * P2).sum() / (np.sqrt(np.sum(P1))*np.sqrt(np.sum(P2)))
    df_2.loc[i, 'dist'] = dist

df_2 = df_2.sort_values('dist')
df_2[['name', 'brand', 'ingredients', 'dist']].head(5)

  dist = (P1 * P2).sum() / (np.sqrt(np.sum(P1))*np.sqrt(np.sum(P2)))


Unnamed: 0,name,brand,ingredients,dist
141,Crème Capillaire Kurl Nectar,LES SECRETS DE LOLY,"Aqua, Aloe Barbadensis Leaf Juice, Cetearyl Al...",0.046328
85,Huile de Jojoba Bio,CENTIFOLIA,Simmondsia Chinensis Seed Oil,0.089638
166,Lait Capillaire Smoothie Litchi-Mure,LES SECRETS DE LOLY,"Aqua, Simmondsia Chinensis Oil, Prunus Amygdal...",0.090432
156,Lait Capillaire Smoothie Vanille-Ylang,LES SECRETS DE LOLY,"Aqua, Simmondsia Chinesis Oil, Prunus Amygdalu...",0.091085
168,Lait Capillaire Smoothie Ananas,LES SECRETS DE LOLY,"Aqua, Simmondsia Chinesis Oil, Prunus Amygdalu...",0.091195
