In [259]:
import numpy as np
import pandas as pd
import random
from collections import Counter


In [260]:
# Load the skincare product table; the bare last expression displays (rows, columns).
DATASET_PATH = '../Dataset/skincare_new.csv'
df = pd.read_csv(DATASET_PATH, sep=',')
df.shape

(1138, 5)

In [261]:
# Flatten every product's comma-separated ingredient string into one long list
# of raw tokens (duplicates and stray whitespace are dealt with afterwards).
list_ingredients = [
    token
    for ingredient_text in df['ingredients']
    for token in ingredient_text.split(', ')
]

In [262]:
# Build the sorted vocabulary of unique ingredient names.
# Empty tokens are filtered out instead of calling `.remove('')`, which raises
# ValueError when no empty token exists (including on any re-run of this cell).
# A single trailing space is trimmed so 'water ' and 'water' collapse together.
list_ingredients = sorted({
    name[:-1] if name.endswith(' ') else name
    for name in list_ingredients
    if name != ''
})
list_ingredients[10:20]

['7-dehydrocholesterol',
 'abies alba leaf oil',
 'abies balsamea extract',
 'abies sibirica oil',
 'acacia concinna fruit extract',
 'acacia decurrens wax',
 'acacia senegal gum',
 'acacia seyal gum extract',
 'acer saccharum extract',
 'acetate']

In [263]:
# One-hot encode products against the ingredient vocabulary.
#
# Each product's ingredient string is tokenised the same way the vocabulary was
# built (split on ', ', one trailing space trimmed) and matched by exact token.
# A plain substring test (`name in text`) would flag 'acetate' for every product
# that merely contains 'tocopheryl acetate'.
product_tokens = [
    {
        token[:-1] if token.endswith(' ') else token
        for token in ingredient_text.split(', ')
    }
    for ingredient_text in df['ingredients']
]

# one_hot_list[k][r] is 1 when product row r contains ingredient k.
one_hot_list = [
    [1 if name in tokens else 0 for tokens in product_tokens]
    for name in list_ingredients
]

# Transpose so rows are products (in df row order) and columns are ingredients.
matrix_ingredients = pd.DataFrame(one_hot_list).transpose()
# Assign a flat list of labels; wrapping it in another list would create a
# one-level MultiIndex instead of a regular column Index.
matrix_ingredients.columns = sorted(set(list_ingredients))

matrix_ingredients

Unnamed: 0,"1,10-decanediol","1,2-hexanediol",1-methylhydantoin-2-imide,10-hydroxydecanoic acid,"2,6-dimethyl-7-octen-2-ol","2-bromo-2-nitropropane-1,3-diol",2-oleamido-1,3-o-ethyl ascorbic acid,3-octadecanediol,4-t-butylcyclohexanol,...,zinc glycine,zinc laurate,zinc oxide,zinc pca,zinc sulfate,zingiber aromaticus extract,zingiber cassumunar root oil,zingiber officinale root extract,zingiber officinale root oil,zizyphus jujuba seed extract
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1133,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1134,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1136,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


-----

salicylic acid = oily <br>
ceramide = sensitive<br>
niacinamide = combination<br>
squalene = dry skin<br>
sodium hyaluronate = redness<br>
benzoyl = acne<br>
retinol = aging<br>

In [264]:
def calculate_cosine_similarity(p1, p2):
    """Return the cosine similarity between two equal-length numeric vectors.

    Parameters
    ----------
    p1, p2 : array-like
        One-dimensional vectors of the same length.

    Returns
    -------
    float
        dot(p1, p2) / (||p1|| * ||p2||). When either vector has zero norm the
        similarity is undefined; 0.0 is returned instead of NaN so that such
        rows rank as "not similar" without emitting a divide warning.
    """
    norm_product = np.linalg.norm(p1) * np.linalg.norm(p2)
    if norm_product == 0:
        return 0.0
    return np.dot(p1, p2) / norm_product

In [265]:
def get_product_vector(product):
    """Return the one-hot ingredient vector of the product named ``product``.

    Parameters
    ----------
    product : str
        Value to look up in ``df['skincare_name']``.

    Returns
    -------
    list
        The product's row of ``matrix_ingredients`` without its first column.

    Raises
    ------
    ValueError
        If no row of ``df`` carries that name.

    Notes
    -----
    When several rows share the name, the first one is used, which is the same
    choice ``recommend_products`` makes with ``.iat[0]``.
    """
    # Row *positions* of the matching products; matrix_ingredients is built in
    # df row order, so positions (not index labels) are what .iloc needs.
    positions = np.flatnonzero((df['skincare_name'] == product).to_numpy())
    if positions.size == 0:
        raise ValueError(f"no product named {product!r} in df['skincare_name']")

    row_values = matrix_ingredients.iloc[positions[0]].to_numpy()
    # The first column is skipped so the vector has the same length as the ones
    # built in calculate_similarity_scores, which applies the same [1:] slice.
    # NOTE(review): this drops the first ingredient column - confirm intended.
    return list(row_values[1:])

In [266]:
def calculate_similarity_scores(p1, data_by_type):
    """Score every product in ``data_by_type`` against the vector ``p1``.

    Parameters
    ----------
    p1 : sequence of numbers
        Ingredient vector of the query product (first matrix column omitted).
    data_by_type : pandas.DataFrame
        Subset of ``df`` whose rows should be scored.

    Returns
    -------
    list
        One cosine similarity per row of ``data_by_type``, in row order.
        Empty when ``data_by_type`` has no rows.
    """
    # Use the subset's actual index values rather than a contiguous range
    # starting at index[0]: rows of one type need not be adjacent in df, and an
    # empty subset has no index[0] at all.
    # The index values are used as row positions of matrix_ingredients, which
    # assumes df still has the default 0..n-1 index from read_csv.
    row_ids = list(data_by_type.index)

    # Column 0 is skipped to stay aligned with the vectors from get_product_vector.
    candidate_vectors = matrix_ingredients.iloc[row_ids, 1:].to_numpy()
    return [calculate_cosine_similarity(p1, vector) for vector in candidate_vectors]

In [267]:
def recommend_products(product):
    """Rank products of the same type as ``product`` by ingredient similarity.

    The query product's ingredient vector is compared (cosine similarity) with
    every product sharing its ``skincare_type``. The result is that subset of
    ``df`` with an added ``cos_sim`` column, sorted from most to least similar,
    with rows named ``product`` removed.
    """
    query_vector = get_product_vector(product)

    # Type of the first row carrying this name decides the candidate pool.
    is_query = df['skincare_name'] == product
    prod_type = df.loc[is_query, 'skincare_type'].iat[0]
    same_type = df[df['skincare_type'] == prod_type]

    scores = calculate_similarity_scores(query_vector, same_type)

    ranked = (
        same_type
        .assign(cos_sim=scores)
        .sort_values('cos_sim', ascending=False)
    )

    # The query itself would trivially rank first, so leave it out.
    return ranked[ranked.skincare_name != product]

In [268]:
def get_top_ingredients(data_by_type, product=None):
    """Return the most common ingredients among the top-ranked products.

    Walks ``data_by_type`` in its existing row order, keeps at most two
    products per brand (skipping the query product's own brand), takes the
    first five kept products and counts their ingredients.

    Parameters
    ----------
    data_by_type : pandas.DataFrame
        Ranked products with at least ``brand`` and ``ingredients`` columns.
    product : str, optional
        Name of the query product whose brand should be excluded. When omitted,
        a module-level variable called ``product`` is used if one exists (the
        implicit behaviour of the earlier version of this function); if there
        is none, or the name is not found in ``df``, no brand is excluded.

    Returns
    -------
    pandas.Series
        Up to five ingredient names (index) with their occurrence counts,
        most frequent first. Empty when no product qualifies.
    """
    max_per_brand = 2
    n_products = 5
    n_ingredients = 5

    if product is None:
        # Legacy fallback: previously this function silently depended on a
        # notebook-global `product`, raising NameError on a fresh kernel.
        product = globals().get('product')

    excluded_brand = None
    if product is not None:
        brand_matches = df.loc[df['skincare_name'] == product, 'brand']
        if not brand_matches.empty:
            excluded_brand = brand_matches.iat[0]

    brand_counts = Counter()
    selected_ingredients = []
    for brand, ingredients in zip(data_by_type['brand'], data_by_type['ingredients']):
        if brand == excluded_brand or brand_counts[brand] >= max_per_brand:
            continue
        brand_counts[brand] += 1
        selected_ingredients.append(ingredients)
        # Only the first n_products kept rows are ever counted, so stop early.
        if len(selected_ingredients) == n_products:
            break

    # Split each ingredient string into separate values and flatten the list.
    tokens = [
        token
        for ingredients in selected_ingredients
        for token in ingredients.split(', ')
    ]
    if not tokens:
        return pd.Series(dtype='int64')

    # Count the occurrences of each ingredient.
    return pd.Series(tokens).value_counts().head(n_ingredients)

In [269]:
def recommend_products_by_ingredient(search_term):
    """Print the top ingredients of products similar to one containing ``search_term``.

    One product whose ingredient text contains ``search_term`` (case-insensitive)
    is picked at random; products similar to it are ranked with
    ``recommend_products`` and the most frequent ingredients among them are
    printed. Nothing is printed when no product matches.

    Parameters
    ----------
    search_term : str
        Ingredient name, matched as literal text rather than as a regular
        expression so names containing characters such as '(' or '+' are safe.
    """
    # na=False keeps the mask boolean even if an ingredients cell is missing.
    has_term = df['ingredients'].str.contains(
        search_term, case=False, regex=False, na=False
    )

    # Shuffle the matches and keep the first one; head() already copes with
    # fewer rows than requested, so no explicit min() is needed.
    picked = df[has_term].sample(frac=1).head(1)

    for product in picked['skincare_name']:
        recommended_products = recommend_products(product)
        # NOTE(review): get_top_ingredients is not handed this loop's `product`,
        # so it cannot know from this call whose brand to exclude.
        top_ingredients = get_top_ingredients(recommended_products)
        print(top_ingredients)

In [271]:
# Example query. The function samples one matching product at random without a
# fixed seed, so the printed ingredient counts can differ between runs.
recommend_products_by_ingredient('salicylic acid')

dimethicon                             7
butylene glycol                        5
phenoxyethanol                         5
tocopheryl acetate                     5
ammonium acryloyldimethyltaurate/vp    5
dtype: int64
