In [38]:
# import the data
import pandas as pd
foundation = pd.read_csv("foundation.csv")
print(foundation.shape)
foundation.head()

(217, 3)


Unnamed: 0.1,Unnamed: 0,name,ingredient
0,1,’Hello Flawless!’ Oxygen Wow Liquid Foundation,I just got this today and when I went to use i...
1,2,’Hello Flawless!’ Powder Foundation,Previously used this then tried a Clinique pow...
2,3,#FauxFilter Foundation,I purchased two shades that I thought would ma...
3,4,10 HR Wear Perfection Foundation,Perfect Match.......matches perfectly to my sk...
4,5,8 HR Mattifying Compact Foundation,I absolutely adore this foundation. It's light...


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [4]:
# make it into DocumentTermMatrix
tf_vect = CountVectorizer(lowercase=True, stop_words="english") 
X = tf_vect.fit_transform(foundation['ingredient'])
X = X.toarray()
X

array([[5, 0, 0, ..., 0, 0, 0],
       [7, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 2, 0, ..., 0, 0, 0]], dtype=int64)

In [39]:
dtm = pd.DataFrame(data = X, index = foundation.name, columns = tf_vect.vocabulary_)
dtm.head()
dtm.shape

(217, 43697)

In [6]:
# Dimension Reduction
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD()
Z = svd.fit_transform(X)

In [9]:
Z_df = pd.DataFrame(data = Z, columns = ['SVD1', 'SVD2'])
cosm_svd = pd.concat([foundation.iloc[:, 1], Z_df], axis = 1)
cosm_svd.head()

Unnamed: 0,name,SVD1,SVD2
0,’Hello Flawless!’ Oxygen Wow Liquid Foundation,3912.995322,-33.534149
1,’Hello Flawless!’ Powder Foundation,4736.539946,1486.429862
2,#FauxFilter Foundation,4445.494099,-839.743679
3,10 HR Wear Perfection Foundation,5547.132895,-556.698862
4,8 HR Mattifying Compact Foundation,1328.655802,230.87183


In [12]:
cosm_svd.to_csv('svd.csv')

In [18]:
# read in the final data
final_svd = pd.read_csv("final svd.csv")
final_svd.head()

Unnamed: 0.1,Unnamed: 0,name,brand,price,SVD1,SVD2,rating
0,1,’Hello Flawless!’ Oxygen Wow Liquid Foundation,Benefit Cosmetics,36.0,3912.995322,-33.534149,4
1,2,’Hello Flawless!’ Powder Foundation,Benefit Cosmetics,34.0,4736.539946,1486.429862,4
2,3,#FauxFilter Foundation,HUDA BEAUTY,40.0,4445.494099,-839.743679,4
3,4,10 HR Wear Perfection Foundation,SEPHORA COLLECTION,20.0,5547.132895,-556.698862,4
4,5,8 HR Mattifying Compact Foundation,SEPHORA COLLECTION,20.0,1328.655802,230.87183,4


In [15]:
from bokeh.plotting import figure, ColumnDataSource
from bokeh.io import output_file, show
from bokeh.models import HoverTool

In [16]:
# use bokeh to visualize the similarity
source = ColumnDataSource(final_svd)

p = figure()
p.circle(x = 'SVD1', y = 'SVD2', source = source, size = 8, color = 'pink')

hover = HoverTool(tooltips = [
        ('Item', '@name'),
        ('brand', '@brand'),
        ('Price', '$ @price'),
        ('Rating', '@rating')])

p.add_tools(hover)

show(p)

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
# find the index of my favorite foundation
myindex = final_svd.index[final_svd.name.str.contains('Luminous Silk Foundation')]
print(myindex)

Int64Index([99], dtype='int64')


In [34]:
# calcualte the cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
final_svd['cos_sim'] = 0.0
for i in range(len(X)):
    final_svd['cos_sim'][i] = cosine_similarity([X[99]], [X[i]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [36]:
# sorting by the similarity
final_svd = final_svd.sort_values('cos_sim', ascending=False)
final_svd[['brand', 'name', 'cos_sim']].head(6)

Unnamed: 0,brand,name,cos_sim
99,Giorgio Armani Beauty,Luminous Silk Foundation,1.0
34,Too Faced,Born This Way Foundation,0.980288
120,Urban Decay,Naked Skin Weightless Ultra Definition Liquid ...,0.977903
198,MAKE UP FOR EVER,Ultra HD Invisible Cover Foundation,0.977609
56,Dior,Diorskin Forever Perfect Foundation Broad Spec...,0.976383
146,NARS,Sheer Glow Foundation,0.976049
