بِسْمِ اللَّهِ الرَّحْمَنِ الرَّحِيْم

# **I. Keyword Extraction**

In [250]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

df_idf = pd.read_excel('sc-price.xlsx')
df_idf[['DESC','ingredients']]= df_idf['DESC'].str.split('Ingredients:\n', expand=True)
df_idf.head()

Unnamed: 0,RATING,CATEGORY,PRODUCT,BRAND,DESC,PRICE,ingredients
0,4.0,serum,Niacinamide 10% + Zinc 1%,The Ordinary,The Ordinary Niacinamide 10% + Zinc 1% menyama...,157000,"Aqua (Water), Niacinamide, Zinc PCA, Tamarind..."
1,4.3,serum,Niacinamide + Moisture Beet Serum,Somethinc,Somethinc Niacinamide + Moisture Beet Serum me...,115000,"Aqua, Allantoin, edta, Niacinamide, hydroxyet..."
2,4.7,serum,N10-Dose+ Brightening Serum,Whitelab,Whitelab N10-Dose+ Brightening Serum merupakan...,75000,"Aqua, Propylene Glycol, Niacinamide, Ethyl Alc..."
3,4.8,serum,C White Lightening Serum,Azarine Cosmetics,Azarine Cosmetics C White Lightening Serum ada...,220000,"Kakadu Plum, Lemon, Niacinamide, Papaya, Aloe..."
4,4.3,serum,Advanced Snail 96 Mucin Power Essence,Cosrx,Cosrx Advanced Snail 96 Mucin Power Essence di...,47500,"Snail Secretion Filtrate, Betaine, Butylene G..."


In [251]:
df_idf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   RATING       209 non-null    float64
 1   CATEGORY     209 non-null    object 
 2   PRODUCT      209 non-null    object 
 3   BRAND        209 non-null    object 
 4   DESC         209 non-null    object 
 5   PRICE        209 non-null    object 
 6   ingredients  209 non-null    object 
dtypes: float64(1), object(6)
memory usage: 11.6+ KB


## 1.1 Case Folding

In [252]:
import re
def pre_process(text):

    # lowercase
    text=text.lower()

    #remove tags
    text=re.sub("</?.*?>"," <> ",text)

    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)

    return text

df_idf['DESC'] = df_idf['DESC'].apply(lambda x:pre_process(x))

## 1.2 Tokenizing - Stopword Removal - Count Vectorizer

### 1.2.1 CountVectorizer to create a vocabulary and generate word counts

CountVectorizer akan menghitung frekuensi kemunculan setiap kata atau token pada setiap dokumen `df_idf['text']` dalam bentuk sparse matriks `word_count_vector.shape` yang merupakan representasi dari jumlah, artinya setiap kolom mewakili kata dalam kosakata dan setiap baris mewakili dokumen dalam kumpulan data dimana nilainya adalah jumlah kata. Dengan representasi ini, hitungan beberapa kata bisa menjadi 0 jika kata tersebut tidak muncul di dokumen terkait.

In [253]:
from sklearn.feature_extraction.text import CountVectorizer
import re

with open('stpwrd.txt', 'r') as file:
    stopwords = [word.strip() for word in file.readlines()]

docs=df_idf['DESC'].tolist()

cv=CountVectorizer(min_df=0.07,stop_words=stopwords,ngram_range=(1,2))
word_count_vector=cv.fit_transform(docs)
word_count_vector.shape

(209, 109)

Bentuk vektor yang dihasilkan adalah **(209, 109)** karena memiliki 209 dokumen dalam kumpulan data (baris) dan ukuran kosakatanya adalah 109 artinya memiliki 20489 kata unik (kolom) dalam kumpulan data dikurangi stopwords. Untuk membatasi ukuran kosakata dengan menyetting `min_features=vocab_size` saat membuat `instance CountVectorizer`.

## 1.3 TF-IDF Transformer to Compute Inverse Document Frequency (IDF)

Mengambil sparse matrix dari `CountVectorizer` untuk menghasilkan IDF saat memanggil fit. IDF harus didasarkan pada kumpulan besar dan harus mewakili teks yang akan digunakan untuk mengekstrak keyword.

In [254]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [255]:
tfidf_transformer.idf_

array([3.57451881, 3.40266855, 2.70952137, 3.30258509, 3.45673577,
       3.57451881, 3.40266855, 3.51389419, 2.58590742, 2.39586381,
       3.51389419, 2.63353546, 2.94591015, 3.57451881, 2.91312033,
       3.40266855, 3.51389419, 3.45673577, 3.25606508, 3.35137526,
       2.82074701, 2.68354588, 3.40266855, 2.70952137, 3.25606508,
       1.59351734, 1.62860866, 2.73618962, 3.40266855, 2.39586381,
       3.1690537 , 3.51389419, 2.14241491, 3.35137526, 2.65822808,
       1.        , 2.73618962, 3.57451881, 3.57451881, 2.28666452,
       3.51389419, 3.51389419, 2.91312033, 2.54044504, 2.85059997,
       2.60943791, 3.21161331, 2.2039728 , 3.57451881, 2.91312033,
       2.68354588, 3.40266855, 2.94591015, 3.30258509, 2.15745279,
       2.37681562, 2.4152819 , 2.70952137, 3.1690537 , 2.32175584,
       3.30258509, 3.40266855, 2.76358859, 3.57451881, 3.35137526,
       3.57451881, 2.94591015, 3.12823171, 3.30258509, 3.51389419,
       2.79175947, 3.51389419, 3.35137526, 3.51389419, 2.33977

## 1.4 Computing TF-IDF and Extracting Keywords

Setelah menghitung `IDF` selanjutnya menghitung `TF-IDF` dan mengekstrak keywword teratas. Dimulai dengan membaca file pengujian, mengekstrak bidang yang diperlukan (product dan desc) dan memasukkan teks ke dalam daftar.

In [256]:
# read test docs into a dataframe and concatenate product and desc
df_test=pd.read_excel('sc-price.xlsx')
df_test['text'] = df_test['DESC'].apply(lambda x:pre_process(x))

# get test docs into a list
docs_test=df_test['DESC'].tolist()

In [257]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""

    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]

        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]

    return results

Langkah selanjutnya adalah menghitung nilai **tf-idf** untuk dokumen tertentu dalam set pengujian dengan memanggil `tfidf_transformer.transform(...)`. Ini menghasilkan vektor skor **tf-idf**. Lalu mengurutkan kata-kata dalam vektor dalam urutan menurun dari nilai **tf-idf** dan kemudian mengulang untuk mengekstrak _item top-n_ dengan nama fitur yang sesuai. Pada contoh di bawah ini keyword yang diekstrak adalah dokumen pertama dalam pengujian.

`Metode sort_coo(...)` pada dasarnya mengurutkan nilai dalam vektor sambil mempertahankan indeks kolom. Setelah memiliki indeks kolom maka sangat mudah untuk mencari nilai kata yang sesuai seperti di `extract_topn_from_vector(...)` dimana `feature_vals.append(feature_names[idx])`.

In [258]:
# you only needs to do this once
feature_names=cv.get_feature_names_out()

# get the document that we want to extract keywords from
doc=docs_test[0]

#generate tf-idf for the given document
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))

#sort the tf-idf vectors by descending order of scores
sorted_items=sort_coo(tf_idf_vector.tocoo())

#extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items,10)

## 1.5 Generate keywords for a batch of documents

In [259]:
tf_idf_vector=tfidf_transformer.transform(cv.transform(docs_test))

results=[]
for i in range(tf_idf_vector.shape[0]):

    # get vector for a single document
    curr_vector=tf_idf_vector[i]

    #sort the tf-idf vector by descending order of scores
    sorted_items=sort_coo(curr_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)


    results.append(keywords)

df=pd.DataFrame(zip(docs,results),columns=['doc','keywords'])
df

Unnamed: 0,doc,keywords
0,the ordinary niacinamide zinc menyamarkan noda...,"{'wajah': 0.385, 'bekas jerawat': 0.356, 'meny..."
1,somethinc niacinamide moisture beet serum meru...,"{'mengatasi': 0.433, 'kulit': 0.388, 'menyamar..."
2,whitelab n dose brightening serum merupakan re...,"{'kulit': 0.37, 'menyamarkan': 0.344, 'mencera..."
3,azarine cosmetics c white lightening serum ada...,"{'kulit': 0.591, 'mencerahkan kulit': 0.279, '..."
4,cosrx advanced snail mucin power essence difor...,"{'tekstur': 0.482, 'kulit': 0.335, 'sehat': 0...."
...,...,...
204,safi white expert deep exfoliator adalah pembe...,"{'membersihkan': 0.409, 'pori': 0.331, 'wajah'..."
205,neogen bio peel gauze peeling wine dikemas dal...,"{'wajah': 0.357, 'kulit': 0.332, 'sel': 0.316,..."
206,paula s choice skin perfecting bha liquid exfo...,"{'pori': 0.615, 'pori pori': 0.315, 'sel': 0.3..."
207,wardah exfoliating lotion merupakan lotion yan...,"{'pori': 0.387, 'sel': 0.382, 'mengangkat sel'..."


In [260]:
df['keywords'] = df['keywords'].astype(str)

---

# **II. Keyword Assignment**

In [261]:
import pandas as pd

def assign_keywords(data, keyword_column, keywords):
    data[keyword_column] = data["DESC"].apply(lambda x: [word for word in x.split() if word.lower() in keywords])
    return data

data = pd.read_excel('sc-price.xlsx')
keyword_column = "skin_type"
keywords = ["kering", "berminyak", "sensitif", "normal", "kombinasi", "semua"]

data = assign_keywords(data, keyword_column, keywords)
data['skin_type'] = data['skin_type'].astype(str)

In [262]:
data['skin_type'].value_counts()

['Semua']                                      84
['kering']                                     29
['berminyak']                                  22
['normal']                                     15
['sensitif']                                   12
['semua', 'Semua']                              9
['sensitif', 'Semua']                           3
['berminyak', 'Semua']                          2
['kering', 'kering']                            2
['kombinasi']                                   2
['semua', 'sensitif', 'Semua']                  2
['Semua', 'Kombinasi']                          1
['kering', 'sensitif']                          1
['kering', 'kering', 'berminyak']               1
['semua', 'sensitif']                           1
['kombinasi', 'Semua']                          1
['berminyak', 'berminyak']                      1
['normal', 'normal']                            1
['kering', 'berminyak']                         1
['berminyak', 'kombinasi']                      1


In [263]:
(data[data['skin_type'].str.len() == 0])

Unnamed: 0,RATING,CATEGORY,PRODUCT,BRAND,DESC,PRICE,skin_type


-----

# **III. New Dataframe**

## 3.1 Skin type dataframe

In [264]:
st = data.copy()
st.head()

Unnamed: 0,RATING,CATEGORY,PRODUCT,BRAND,DESC,PRICE,skin_type
0,4.0,serum,Niacinamide 10% + Zinc 1%,The Ordinary,The Ordinary Niacinamide 10% + Zinc 1% menyama...,157000,['normal']
1,4.3,serum,Niacinamide + Moisture Beet Serum,Somethinc,Somethinc Niacinamide + Moisture Beet Serum me...,115000,['Semua']
2,4.7,serum,N10-Dose+ Brightening Serum,Whitelab,Whitelab N10-Dose+ Brightening Serum merupakan...,75000,['Semua']
3,4.8,serum,C White Lightening Serum,Azarine Cosmetics,Azarine Cosmetics C White Lightening Serum ada...,220000,['Semua']
4,4.3,serum,Advanced Snail 96 Mucin Power Essence,Cosrx,Cosrx Advanced Snail 96 Mucin Power Essence di...,47500,['Semua']


### 3.1.1 Clean unnecessary characters

In [265]:
st['skin_type'] = st['skin_type'].replace({r"[].'.[]":""}, regex=True)
st

Unnamed: 0,RATING,CATEGORY,PRODUCT,BRAND,DESC,PRICE,skin_type
0,4.0,serum,Niacinamide 10% + Zinc 1%,The Ordinary,The Ordinary Niacinamide 10% + Zinc 1% menyama...,157000,normal
1,4.3,serum,Niacinamide + Moisture Beet Serum,Somethinc,Somethinc Niacinamide + Moisture Beet Serum me...,115000,Semua
2,4.7,serum,N10-Dose+ Brightening Serum,Whitelab,Whitelab N10-Dose+ Brightening Serum merupakan...,75000,Semua
3,4.8,serum,C White Lightening Serum,Azarine Cosmetics,Azarine Cosmetics C White Lightening Serum ada...,220000,Semua
4,4.3,serum,Advanced Snail 96 Mucin Power Essence,Cosrx,Cosrx Advanced Snail 96 Mucin Power Essence di...,47500,Semua
...,...,...,...,...,...,...,...
204,4.4,scrub,White Expert Deep Exfoliator,Safi,SAFI White Expert Deep Exfoliator adalah pembe...,46800,"semua, Semua"
205,4.5,scrub,Wine Lift PHA Gauze Peeling,Neogen,Neogen Bio-Peel Gauze Peeling Wine dikemas dal...,400000,Semua
206,4.4,scrub,Skin Perfecting 2% BHA Liquid Exfoliant,Paula's Choice,Paula's Choice Skin Perfecting 2% BHA Liquid E...,390000,berminyak
207,3.7,scrub,White Secret Exfoliating Lotion,Wardah,Wardah Exfoliating Lotion merupakan lotion yan...,70000,Semua


### 3.1.2 Remove duplicates in row

In [266]:
st['skin_type'] = st['skin_type'].str.lower().str.split(',\s*').apply(set).str.join(', ')
st

Unnamed: 0,RATING,CATEGORY,PRODUCT,BRAND,DESC,PRICE,skin_type
0,4.0,serum,Niacinamide 10% + Zinc 1%,The Ordinary,The Ordinary Niacinamide 10% + Zinc 1% menyama...,157000,normal
1,4.3,serum,Niacinamide + Moisture Beet Serum,Somethinc,Somethinc Niacinamide + Moisture Beet Serum me...,115000,semua
2,4.7,serum,N10-Dose+ Brightening Serum,Whitelab,Whitelab N10-Dose+ Brightening Serum merupakan...,75000,semua
3,4.8,serum,C White Lightening Serum,Azarine Cosmetics,Azarine Cosmetics C White Lightening Serum ada...,220000,semua
4,4.3,serum,Advanced Snail 96 Mucin Power Essence,Cosrx,Cosrx Advanced Snail 96 Mucin Power Essence di...,47500,semua
...,...,...,...,...,...,...,...
204,4.4,scrub,White Expert Deep Exfoliator,Safi,SAFI White Expert Deep Exfoliator adalah pembe...,46800,semua
205,4.5,scrub,Wine Lift PHA Gauze Peeling,Neogen,Neogen Bio-Peel Gauze Peeling Wine dikemas dal...,400000,semua
206,4.4,scrub,Skin Perfecting 2% BHA Liquid Exfoliant,Paula's Choice,Paula's Choice Skin Perfecting 2% BHA Liquid E...,390000,berminyak
207,3.7,scrub,White Secret Exfoliating Lotion,Wardah,Wardah Exfoliating Lotion merupakan lotion yan...,70000,semua


In [267]:
st['skin_type'].value_counts()

semua                       94
kering                      32
berminyak                   23
normal                      17
sensitif                    13
semua, sensitif              7
semua, berminyak             4
kombinasi                    3
kering, berminyak            3
semua, kering                2
normal, kering               2
kombinasi, semua             2
kering, sensitif             1
kombinasi, berminyak         1
normal, semua                1
normal, berminyak            1
normal, semua, sensitif      1
kombinasi, normal            1
normal, semua, berminyak     1
Name: skin_type, dtype: int64

## 3.2 Keywords dataframe

In [268]:
scc = df[['keywords']]
scc.head()

Unnamed: 0,keywords
0,"{'wajah': 0.385, 'bekas jerawat': 0.356, 'meny..."
1,"{'mengatasi': 0.433, 'kulit': 0.388, 'menyamar..."
2,"{'kulit': 0.37, 'menyamarkan': 0.344, 'mencera..."
3,"{'kulit': 0.591, 'mencerahkan kulit': 0.279, '..."
4,"{'tekstur': 0.482, 'kulit': 0.335, 'sehat': 0...."


### 3.2.1 Clean unnecessary characters

In [269]:
scc.keywords = scc.keywords.replace({r"[:.{}']":""}, regex=True)
scc.head()

Unnamed: 0,keywords
0,"wajah 0385, bekas jerawat 0356, menyamarkan no..."
1,"mengatasi 0433, kulit 0388, menyamarkan noda 0..."
2,"kulit 037, menyamarkan 0344, mencerahkan 0286,..."
3,"kulit 0591, mencerahkan kulit 0279, mengurangi..."
4,"tekstur 0482, kulit 0335, sehat 0294, menyerap..."


In [270]:
scc['keywords'] = scc['keywords'].str.replace('\d', '', regex=True)
scc.head()

Unnamed: 0,keywords
0,"wajah , bekas jerawat , menyamarkan noda , bek..."
1,"mengatasi , kulit , menyamarkan noda , kulit k..."
2,"kulit , menyamarkan , mencerahkan , wajah , be..."
3,"kulit , mencerahkan kulit , mengurangi , diper..."
4,"tekstur , kulit , sehat , menyerap , kelembapa..."


## 3.3 Merging skin type with the keywords

In [271]:
df = pd.merge(st, scc, left_index=True, right_index=True, how='inner')
df.head()

Unnamed: 0,RATING,CATEGORY,PRODUCT,BRAND,DESC,PRICE,skin_type,keywords
0,4.0,serum,Niacinamide 10% + Zinc 1%,The Ordinary,The Ordinary Niacinamide 10% + Zinc 1% menyama...,157000,normal,"wajah , bekas jerawat , menyamarkan noda , bek..."
1,4.3,serum,Niacinamide + Moisture Beet Serum,Somethinc,Somethinc Niacinamide + Moisture Beet Serum me...,115000,semua,"mengatasi , kulit , menyamarkan noda , kulit k..."
2,4.7,serum,N10-Dose+ Brightening Serum,Whitelab,Whitelab N10-Dose+ Brightening Serum merupakan...,75000,semua,"kulit , menyamarkan , mencerahkan , wajah , be..."
3,4.8,serum,C White Lightening Serum,Azarine Cosmetics,Azarine Cosmetics C White Lightening Serum ada...,220000,semua,"kulit , mencerahkan kulit , mengurangi , diper..."
4,4.3,serum,Advanced Snail 96 Mucin Power Essence,Cosrx,Cosrx Advanced Snail 96 Mucin Power Essence di...,47500,semua,"tekstur , kulit , sehat , menyerap , kelembapa..."


## 3.4 Merging dataframe with allergen extraction

In [272]:
key = pd.DataFrame(df_idf['ingredients'].apply(lambda x:pre_process(x)))
key

Unnamed: 0,ingredients
0,aqua water niacinamide zinc pca tamarindus in...
1,aqua allantoin edta niacinamide hydroxyethylc...
2,aqua propylene glycol niacinamide ethyl alcoho...
3,kakadu plum lemon niacinamide papaya aloe vera
4,snail secretion filtrate betaine butylene gly...
...,...
204,aqua glyceryl stearate glycerin caprylic capri...
205,water disodium edta glycerin butylene glycol ...
206,water methylpropanediol butylene glycol slip ...
207,aqua glyceryl stearate cetyl alcohol stearyl a...


In [273]:
key['allergen'] = key['ingredients'].str.contains('benzyl alcohol|benzyl salicylate|citral|citronellol|eugenol|geraniol|linalool|methylisothiazolinone|methylchloroisothiazolinone|dmdm hydantoin|imidazolidinyl urea|sodium hydroxymethylglycinate|parfum|parfume').map({True: 'Ya', False: 'Tidak'})
key

Unnamed: 0,ingredients,allergen
0,aqua water niacinamide zinc pca tamarindus in...,Tidak
1,aqua allantoin edta niacinamide hydroxyethylc...,Tidak
2,aqua propylene glycol niacinamide ethyl alcoho...,Ya
3,kakadu plum lemon niacinamide papaya aloe vera,Tidak
4,snail secretion filtrate betaine butylene gly...,Tidak
...,...,...
204,aqua glyceryl stearate glycerin caprylic capri...,Tidak
205,water disodium edta glycerin butylene glycol ...,Tidak
206,water methylpropanediol butylene glycol slip ...,Tidak
207,aqua glyceryl stearate cetyl alcohol stearyl a...,Ya


In [274]:
key[key['allergen'].isna()]

Unnamed: 0,ingredients,allergen


In [275]:
len(key[key['allergen'] == 'Ya'])

38

In [276]:
df = pd.merge(df, key['allergen'], left_index=True, right_index=True, how='inner')
df.head()

Unnamed: 0,RATING,CATEGORY,PRODUCT,BRAND,DESC,PRICE,skin_type,keywords,allergen
0,4.0,serum,Niacinamide 10% + Zinc 1%,The Ordinary,The Ordinary Niacinamide 10% + Zinc 1% menyama...,157000,normal,"wajah , bekas jerawat , menyamarkan noda , bek...",Tidak
1,4.3,serum,Niacinamide + Moisture Beet Serum,Somethinc,Somethinc Niacinamide + Moisture Beet Serum me...,115000,semua,"mengatasi , kulit , menyamarkan noda , kulit k...",Tidak
2,4.7,serum,N10-Dose+ Brightening Serum,Whitelab,Whitelab N10-Dose+ Brightening Serum merupakan...,75000,semua,"kulit , menyamarkan , mencerahkan , wajah , be...",Ya
3,4.8,serum,C White Lightening Serum,Azarine Cosmetics,Azarine Cosmetics C White Lightening Serum ada...,220000,semua,"kulit , mencerahkan kulit , mengurangi , diper...",Tidak
4,4.3,serum,Advanced Snail 96 Mucin Power Essence,Cosrx,Cosrx Advanced Snail 96 Mucin Power Essence di...,47500,semua,"tekstur , kulit , sehat , menyerap , kelembapa...",Tidak


---

# **IV. Cosine Similarity**

In [277]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [278]:
# Record start time
import time
start = time.time()

skin_type = df['skin_type'].unique()
keywords_list = df['keywords'].unique()

def recommend(description, allergen=None):
    # Convert user input to lowercase
    description = description.lower()

    data = df.copy()

    # Extract skin type
    st_input = []
    for st in skin_type:
        if st in description:
            st_input.append(st)
            description = description.replace(st, "")

    res_st = ''.join(st_input)

    for i in description.split():
        if i == 'berjerawat':
            data = data[data['CATEGORY'] != 'peeling']

    # Extract keyword
    keyword_input = []
    for keyword in keywords_list:
        if keyword in description:
            keyword_input.append(keyword)
            description = description.replace(keyword, "")

    if keyword_input:
      data = data[data['keywords'].isin(keyword_input)]

    # Filtering allergen products
    if allergen == 'Ya':
      data = data[data['allergen'] == 'Tidak']

    # Init a TF-IDF vectorizer
    tfidfvec = TfidfVectorizer()

    #  Fit data on processed keywords
    vec = tfidfvec.fit(data["keywords"])
    features = vec.transform(data["keywords"])

    # Transform user input data based on fitted model
    description_vector =  vec.transform([description])

    # Calculate cosine similarities between users processed input and keywords
    cos_sim = cosine_similarity(description_vector, features)

    # Add similarities to data frame
    data['similarity'] = cos_sim[0]

    # Sort data frame by similarities
    data.sort_values(by='similarity', ascending=False, inplace=True)

    data = data[['RATING', 'CATEGORY', 'PRODUCT', 'BRAND', 'DESC', 'PRICE', 'skin_type', 'similarity', 'allergen']]

    result = data[(data['similarity'] > 0.25) & ((data['skin_type'].str.contains(res_st)) | (data['skin_type'] == 'semua'))]

    print(f'Hasil rekomendasi TF-IDF & Cosine Similarity: {len(result)}\n')

    return result

print("Time taken: %s seconds" % (time.time() - start))

Time taken: 0.002087116241455078 seconds


In [279]:
recommend('kulit kering mengangkat sel kulit mati')

Hasil rekomendasi TF-IDF & Cosine Similarity: 15



Unnamed: 0,RATING,CATEGORY,PRODUCT,BRAND,DESC,PRICE,skin_type,similarity,allergen
172,4.5,peeling,NovAge ProCeuticals 6% AHA Peel Solution,Oriflame,Oriflame NovAge ProCeuticals 6% AHA Peel Solut...,299000,semua,0.816592,Tidak
163,4.7,peeling,Dead Skin Cell Remover,Envygreen,Envygreen Dead Skin Cell Remover merupakan pee...,56000,semua,0.769356,Tidak
159,4.2,peeling,Clear C Peeling Serum,Laneige,Laneige Clear C Peeling Serum adalah serum den...,300000,kering,0.729279,Tidak
207,3.7,scrub,White Secret Exfoliating Lotion,Wardah,Wardah Exfoliating Lotion merupakan lotion yan...,70000,semua,0.688667,Ya
202,3.3,scrub,Lightening Gentle Exfoliator,Wardah,Wardah Lightening Gentle Exfoliator merupakan ...,98000,semua,0.65055,Tidak
197,4.2,scrub,AHA 7 Whitehead Power Liquid,Cosrx,Cosrx AHA 7 Whitehead Power Liquid merupakan c...,205000,semua,0.591485,Tidak
64,3.6,night cream,White Secret Night Cream - Discontinued,Wardah,Wardah White Secret Night Cream adalah krim ma...,98000,semua,0.55534,Tidak
7,4.1,serum,Facial Treatment Essence,SK-II,SK-II Facial Treatment Essence adalah Miracle ...,1825000,semua,0.450191,Tidak
170,4.1,peeling,Age Defy Deep Exfoliator,Safi,KASMARAN Triple Acid with Skinperf Peeling Sol...,46800,semua,0.442443,Tidak
194,4.3,scrub,Lemon Bright PHA Gauze Peeling,Neogen,Neoge Bio-Peel Gauze Peeling Lemon dikemas dal...,69000,semua,0.399517,Tidak


# **V. Validation**

## 5.1 Cleaning data for validation

In [280]:
st['DESC'] = st['DESC'].apply(lambda x:pre_process(x))
st.head()

Unnamed: 0,RATING,CATEGORY,PRODUCT,BRAND,DESC,PRICE,skin_type
0,4.0,serum,Niacinamide 10% + Zinc 1%,The Ordinary,the ordinary niacinamide zinc menyamarkan noda...,157000,normal
1,4.3,serum,Niacinamide + Moisture Beet Serum,Somethinc,somethinc niacinamide moisture beet serum meru...,115000,semua
2,4.7,serum,N10-Dose+ Brightening Serum,Whitelab,whitelab n dose brightening serum merupakan re...,75000,semua
3,4.8,serum,C White Lightening Serum,Azarine Cosmetics,azarine cosmetics c white lightening serum ada...,220000,semua
4,4.3,serum,Advanced Snail 96 Mucin Power Essence,Cosrx,cosrx advanced snail mucin power essence difor...,47500,semua


## 5.2 recommended_items

In [281]:
def recommended_items(a):
  lst = []
  output = recommend(a)
  kolom = output['PRODUCT']
  for i in kolom:
    lst.append(i)

  return lst

## 5.3 relevant_items

In [282]:
tipe_kulit = st['skin_type'].unique()

def relevant_items(deskripsi):

    deskripsi = deskripsi.lower().replace('kulit', '', 1).strip()

    df = st.copy()

    tipe_input = []
    for tk in tipe_kulit:
        if tk in deskripsi:
          tipe_input.append(tk)
          deskripsi = deskripsi.replace(tk, "")

    hasil = ''.join(tipe_input)

    lst = []
    
    output = df[((df['skin_type'].str.contains(hasil)) | (df['skin_type'] == 'semua')) & (df['DESC'].str.contains(deskripsi))]
    
    kolom = output['PRODUCT']
    for i in kolom:
        lst.append(i)
        
    print(f'Hasil rekomendasi Cosine Similarity: {len(output)}\n')
    
    return lst

## 5.4 validation

In [283]:
def validation(recommended_items, relevant_items):
    # Calculate true positives, false positives, true negatives, and false negatives
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for item in recommended_items:
        if item in relevant_items:
            TP += 1
        else:
            FP += 1

    for item in relevant_items:
        if item not in recommended_items:
            FN += 1

    # Calculate true negatives
    TN = len(relevant_items) - FN

    # Calculate precision, recall, F1-score, and accuracy
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F1 = 2 * (precision * recall) / (precision + recall)
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    
    print(f'TP: {TP}, FP: {FP}, TN: {TN}, FN: {FN}\n')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", F1)
    print("Accuracy:", accuracy)

In [284]:
validation(recommended_items('kulit kering menyamarkan noda'), relevant_items('kulit kering menyamarkan noda'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 19

Hasil rekomendasi Cosine Similarity: 14

TP: 14, FP: 5, TN: 14, FN: 0

Precision: 0.7368421052631579
Recall: 1.0
F1-score: 0.8484848484848484
Accuracy: 0.8484848484848485


In [285]:
validation(recommended_items('kulit berminyak menyamarkan noda'), relevant_items('kulit berminyak menyamarkan noda'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 17

Hasil rekomendasi Cosine Similarity: 12

TP: 12, FP: 5, TN: 12, FN: 0

Precision: 0.7058823529411765
Recall: 1.0
F1-score: 0.8275862068965517
Accuracy: 0.8275862068965517


In [286]:
validation(recommended_items('kulit normal menyamarkan noda hitam'), relevant_items('kulit normal menyamarkan noda hitam'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 17

Hasil rekomendasi Cosine Similarity: 8

TP: 8, FP: 9, TN: 8, FN: 0

Precision: 0.47058823529411764
Recall: 1.0
F1-score: 0.6399999999999999
Accuracy: 0.64


In [287]:
validation(recommended_items('kulit sensitif menyamarkan noda hitam'), relevant_items('kulit sensitif menyamarkan noda hitam'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 17

Hasil rekomendasi Cosine Similarity: 9

TP: 9, FP: 8, TN: 9, FN: 0

Precision: 0.5294117647058824
Recall: 1.0
F1-score: 0.6923076923076924
Accuracy: 0.6923076923076923


In [288]:
validation(recommended_items('kulit kombinasi menyamarkan noda hitam'), relevant_items('kulit kombinasi menyamarkan noda hitam'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 18

Hasil rekomendasi Cosine Similarity: 8

TP: 8, FP: 10, TN: 8, FN: 0

Precision: 0.4444444444444444
Recall: 1.0
F1-score: 0.6153846153846153
Accuracy: 0.6153846153846154


In [289]:
validation(recommended_items('kulit kering mencerahkan kulit'), relevant_items('kulit kering mencerahkan kulit'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 21

Hasil rekomendasi Cosine Similarity: 16

TP: 14, FP: 7, TN: 14, FN: 2

Precision: 0.6666666666666666
Recall: 0.875
F1-score: 0.7567567567567567
Accuracy: 0.7567567567567568


In [290]:
validation(recommended_items('kulit berminyak mencerahkan kulit'), relevant_items('kulit berminyak mencerahkan kulit'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 18

Hasil rekomendasi Cosine Similarity: 14

TP: 14, FP: 4, TN: 13, FN: 1

Precision: 0.7777777777777778
Recall: 0.9333333333333333
F1-score: 0.8484848484848485
Accuracy: 0.84375


In [291]:
validation(recommended_items('kulit normal mencerahkan kulit'), relevant_items('kulit normal mencerahkan kulit'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 16

Hasil rekomendasi Cosine Similarity: 15

TP: 14, FP: 2, TN: 14, FN: 1

Precision: 0.875
Recall: 0.9333333333333333
F1-score: 0.9032258064516129
Accuracy: 0.9032258064516129


In [292]:
validation(recommended_items('kulit sensitif mencerahkan kulit'), relevant_items('kulit sensitif mencerahkan kulit'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 14

Hasil rekomendasi Cosine Similarity: 14

TP: 12, FP: 2, TN: 12, FN: 2

Precision: 0.8571428571428571
Recall: 0.8571428571428571
F1-score: 0.8571428571428571
Accuracy: 0.8571428571428571


In [293]:
validation(recommended_items('kulit kombinasi mencerahkan kulit'), relevant_items('kulit kombinasi mencerahkan kulit'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 14

Hasil rekomendasi Cosine Similarity: 14

TP: 12, FP: 2, TN: 12, FN: 2

Precision: 0.8571428571428571
Recall: 0.8571428571428571
F1-score: 0.8571428571428571
Accuracy: 0.8571428571428571


In [294]:
validation(recommended_items('kulit berminyak minyak berlebih'), relevant_items('kulit berminyak minyak berlebih'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 10

Hasil rekomendasi Cosine Similarity: 10

TP: 10, FP: 0, TN: 10, FN: 0

Precision: 1.0
Recall: 1.0
F1-score: 1.0
Accuracy: 1.0


In [295]:
validation(recommended_items('kulit kering melembabkan'), relevant_items('kulit kering melembabkan'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 17

Hasil rekomendasi Cosine Similarity: 29

TP: 17, FP: 0, TN: 17, FN: 12

Precision: 1.0
Recall: 0.5862068965517241
F1-score: 0.7391304347826086
Accuracy: 0.7391304347826086


In [296]:
validation(recommended_items('kulit normal melembabkan'), relevant_items('kulit normal melembabkan'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 11

Hasil rekomendasi Cosine Similarity: 22

TP: 11, FP: 0, TN: 11, FN: 11

Precision: 1.0
Recall: 0.5
F1-score: 0.6666666666666666
Accuracy: 0.6666666666666666


In [297]:
validation(recommended_items('kulit sensitif melembabkan'), relevant_items('kulit sensitif melembabkan'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 11

Hasil rekomendasi Cosine Similarity: 24

TP: 11, FP: 0, TN: 11, FN: 13

Precision: 1.0
Recall: 0.4583333333333333
F1-score: 0.6285714285714286
Accuracy: 0.6285714285714286


In [298]:
validation(recommended_items('kulit kombinasi melembabkan'), relevant_items('kulit kombinasi melembabkan'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 10

Hasil rekomendasi Cosine Similarity: 21

TP: 10, FP: 0, TN: 10, FN: 11

Precision: 1.0
Recall: 0.47619047619047616
F1-score: 0.6451612903225806
Accuracy: 0.6451612903225806


In [299]:
validation(recommended_items('kulit kering jerawat'), relevant_items('kulit kering jerawat'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 15

Hasil rekomendasi Cosine Similarity: 20

TP: 15, FP: 0, TN: 15, FN: 5

Precision: 1.0
Recall: 0.75
F1-score: 0.8571428571428571
Accuracy: 0.8571428571428571


In [300]:
validation(recommended_items('kulit berminyak jerawat'), relevant_items('kulit berminyak jerawat'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 14

Hasil rekomendasi Cosine Similarity: 23

TP: 14, FP: 0, TN: 14, FN: 9

Precision: 1.0
Recall: 0.6086956521739131
F1-score: 0.7567567567567568
Accuracy: 0.7567567567567568


In [301]:
validation(recommended_items('kulit normal jerawat'), relevant_items('kulit normal jerawat'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 13

Hasil rekomendasi Cosine Similarity: 18

TP: 13, FP: 0, TN: 13, FN: 5

Precision: 1.0
Recall: 0.7222222222222222
F1-score: 0.8387096774193548
Accuracy: 0.8387096774193549


In [302]:
validation(recommended_items('kulit sensitif jerawat'), relevant_items('kulit sensitif jerawat'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 12

Hasil rekomendasi Cosine Similarity: 16

TP: 12, FP: 0, TN: 12, FN: 4

Precision: 1.0
Recall: 0.75
F1-score: 0.8571428571428571
Accuracy: 0.8571428571428571


In [303]:
validation(recommended_items('kulit kombinasi jerawat'), relevant_items('kulit kombinasi jerawat'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 10

Hasil rekomendasi Cosine Similarity: 13

TP: 10, FP: 0, TN: 10, FN: 3

Precision: 1.0
Recall: 0.7692307692307693
F1-score: 0.8695652173913044
Accuracy: 0.8695652173913043


In [304]:
validation(recommended_items('kulit kering mengangkat sel kulit mati'), relevant_items('kulit kering mengangkat sel kulit mati'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 15

Hasil rekomendasi Cosine Similarity: 5

TP: 5, FP: 10, TN: 5, FN: 0

Precision: 0.3333333333333333
Recall: 1.0
F1-score: 0.5
Accuracy: 0.5


In [305]:
validation(recommended_items('kulit berminyak mengangkat sel kulit mati'), relevant_items('kulit berminyak mengangkat sel kulit mati'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 22

Hasil rekomendasi Cosine Similarity: 8

TP: 8, FP: 14, TN: 8, FN: 0

Precision: 0.36363636363636365
Recall: 1.0
F1-score: 0.5333333333333333
Accuracy: 0.5333333333333333


In [306]:
validation(recommended_items('kulit normal mengangkat sel kulit mati'), relevant_items('kulit normal mengangkat sel kulit mati'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 16

Hasil rekomendasi Cosine Similarity: 6

TP: 6, FP: 10, TN: 6, FN: 0

Precision: 0.375
Recall: 1.0
F1-score: 0.5454545454545454
Accuracy: 0.5454545454545454


In [307]:
validation(recommended_items('kulit sensitif mengangkat sel kulit mati'), relevant_items('kulit sensitif mengangkat sel kulit mati'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 15

Hasil rekomendasi Cosine Similarity: 5

TP: 5, FP: 10, TN: 5, FN: 0

Precision: 0.3333333333333333
Recall: 1.0
F1-score: 0.5
Accuracy: 0.5


In [308]:
validation(recommended_items('kulit kombinasi mengangkat sel kulit mati'), relevant_items('kulit kombinasi mengangkat sel kulit mati'))

Hasil rekomendasi TF-IDF & Cosine Similarity: 15

Hasil rekomendasi Cosine Similarity: 5

TP: 5, FP: 10, TN: 5, FN: 0

Precision: 0.3333333333333333
Recall: 1.0
F1-score: 0.5
Accuracy: 0.5


صَدَقَ اللهُ الْعَظِيْمُ