In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the product catalogue (semicolon-separated CSV).
data = pd.read_csv('data_skinrec_fix.csv', sep=';')

# Replace missing descriptions with an empty string so every row can be vectorized.
# The result is assigned back instead of using fillna(inplace=True) on the selected
# column: the in-place form operates on an intermediate object and is not guaranteed
# to update `data` when pandas copy-on-write is active.
data['description'] = data['description'].fillna('')

# Keyword phrases for each skin complaint. The keys are reused below as label names
# and as the names of the binary indicator columns.
issues_keywords = {
    'kulit_kusam': ['kusam'],
    'jerawat': ['jerawat'],
    'bekas_jerawat': ['bekas jerawat'],
    'pori_pori_besar': ['pori-pori besar'],
    'flek_hitam': ['flek hitam'],
    'garis_halus_dan_kerutan': ['garis halus', 'kerutan'],
    'komedo': ['komedo'],
    'warna_kulit_tidak_merata': ['tidak merata'],
    'kemerahan': ['kemerahan'],
    'kulit_kendur': ['kendur']
}

# Complaint names in dictionary order; position i corresponds to column i of
# `similarities`. Built once instead of on every loop iteration.
issue_names = list(issues_keywords)

# One pseudo-document per complaint: its keyword phrases joined by spaces.
all_issue_keywords = [' '.join(keywords) for keywords in issues_keywords.values()]

# Product descriptions first, complaint pseudo-documents last, so both share one vocabulary.
all_text = list(data['description']) + all_issue_keywords

# Fit TF-IDF on the combined corpus.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_text)

# Split the matrix back into product rows and complaint rows.
tfidf_matrix_products = tfidf_matrix[:len(data)]
tfidf_matrix_issues = tfidf_matrix[len(data):]

# similarities[p, i] = cosine similarity between product p and complaint i.
similarities = cosine_similarity(tfidf_matrix_products, tfidf_matrix_issues)

# Minimum similarity for a complaint to be attached to a product (tunable).
threshold = 0.01

# Collect, per product, the list of complaint names whose score exceeds the threshold.
relevant_issues_per_product = []
for similarity in similarities:
    relevant_issues = [
        name for name, score in zip(issue_names, similarity) if score > threshold
    ]
    if not relevant_issues:
        # Nothing passed the threshold: fall back to the single best-scoring complaint.
        # NOTE(review): argmax() returns the first index on ties, so a product whose
        # scores are all 0 is labelled with the first complaint ('kulit_kusam').
        # Confirm that this default is intended.
        relevant_issues = [issue_names[similarity.argmax()]]
    relevant_issues_per_product.append(relevant_issues)

# Human-readable, comma-separated label string for the 'keluhan' column.
keluhan_relevan = [', '.join(issues) for issues in relevant_issues_per_product]
data['keluhan'] = keluhan_relevan

# One binary column per complaint. Membership is tested against the list of labels,
# not the joined string, so 'jerawat' is not falsely set by 'bekas_jerawat'.
for issue in issue_names:
    data[issue] = [int(issue in issues) for issues in relevant_issues_per_product]

# Persist the labelled data.
data.to_csv('data_keluhan.csv', index=False)

In [9]:
import pandas as pd

# Reload the labelled dataset from disk and discard its 'labels' column.
# Adjust the file name below if your output file is named differently.
data_kel = pd.read_csv('data_keluhan.csv')
data_kel = data_kel.drop(columns=['labels'])
data_kel.head()

Unnamed: 0,product_href,product_name,product_type,brand,notable_effects,skintype,price,description,picture_src,sensitive,...,kulit_kusam,jerawat,bekas_jerawat,pori_pori_besar,flek_hitam,garis_halus_dan_kerutan,komedo,warna_kulit_tidak_merata,kemerahan,kulit_kendur
0,https://www.beautyhaul.com/product/detail/bubb...,ACWELL Bubble Free PH Balancing Cleanser,Face Wash,ACWELL,"Acne-Free, Pore-Care, Brightening, Anti-Aging",Oily,209000,Mengangkat kotoran dan menghapus makeup dalam ...,https://www.beautyhaul.com/assets/uploads/prod...,0,...,0,1,1,0,0,0,0,0,0,0
1,https://www.sociolla.com/face-wash/62452-ph-ba...,ACWELL pH Balancing Soothing Cleansing Foam,Face Wash,ACWELL,"Soothing, Balancing","Normal, Dry, Combination",181800,Membersihkan dan menenangkan kulit sensitif de...,https://images.soco.id/8f08ced0-344d-41f4-a15e...,0,...,1,0,0,0,0,0,0,0,0,0
2,https://www.sociolla.com/toner/15871-licorice-...,Acwell Licorice pH Balancing Cleansing Toner,Toner,ACWELL,"Soothing, Balancing","Normal, Dry, Oily, Combination, Sensitive",149000,"Mengangkat sisa kotoran, debu, dan make up sek...","https://www.soco.id/cdn-cgi/image/w=73,format=...",1,...,1,0,0,0,0,0,0,0,0,0
3,https://www.beautyhaul.com/product/detail/aqua...,ACWELL Aquaseal Soothing Tonic,Toner,ACWELL,"Acne-Free, Pore-Care, Brightening, Anti-Aging",Oily,290000,Pre-essence yang diformulasikan dengan ekstrak...,https://www.beautyhaul.com/assets/uploads/prod...,0,...,1,0,0,0,0,0,0,0,0,0
4,https://www.sociolla.com/essence/38023-licoric...,Licorice pH Balancing Essence Mist,Toner,ACWELL,"Brightening, Soothing","Normal, Dry",194650,Essens mist dengan kelembapan tinggi yang memb...,"https://www.sociolla.com/cdn-cgi/image/w=425,f...",0,...,1,0,0,0,0,0,0,0,0,0


In [10]:
# Frequency table of each notable_effects combination, shown as a DataFrame.
data_kel['notable_effects'].value_counts().to_frame()

Unnamed: 0,notable_effects
"Pore-Care, Brightening, Anti-Aging",146
Anti-Aging,102
"Acne-Free, Oil-Control, Pore-Care",78
"Hydrating, Moisturizing",68
"Moisturizing, Brightening, Black-Spot",66
...,...
"Soothing, Moisturizing, UV-Protection",1
"Balancing, Acne-Free, Pore-Care",1
"Moisturizing, Acne-Free, Oil-Control",1
"Acne-Free, UV-Protection",1


In [11]:
# Number of products per product type.
product_type_col = data_kel['product_type']
product_type_col.value_counts()

Serum          287
Toner          244
Moisturizer    236
Sunscreen      207
Face Wash      193
Name: product_type, dtype: int64

In [12]:
# Number of products per skin-type combination.
skintype_col = data_kel['skintype']
skintype_col.value_counts()

Oily                                         303
Normal, Dry, Oily, Combination, Sensitive    243
Dry                                          164
Normal, Dry                                   91
Normal, Dry, Oily, Combination                62
Normal, Dry, Oily, Sensitive                  51
Sensitive                                     50
Oily, Combination, Sensitive                  37
Dry, Oily, Sensitive                          33
Dry, Sensitive                                27
Oily, Sensitive                               24
Oily, Combination                             24
Normal, Dry, Combination                      22
Normal, Combination                           18
Normal, Oily                                  18
Name: skintype, dtype: int64

In [13]:
# Number of products per assigned complaint combination.
keluhan_col = data_kel['keluhan']
keluhan_col.value_counts()

kulit_kusam                                                                              631
garis_halus_dan_kerutan                                                                   80
warna_kulit_tidak_merata                                                                  77
pori_pori_besar                                                                           48
jerawat, bekas_jerawat                                                                    42
                                                                                        ... 
jerawat, bekas_jerawat, pori_pori_besar, flek_hitam, garis_halus_dan_kerutan, komedo       1
kulit_kusam, jerawat, bekas_jerawat, garis_halus_dan_kerutan                               1
jerawat, bekas_jerawat, flek_hitam, garis_halus_dan_kerutan, warna_kulit_tidak_merata      1
jerawat, bekas_jerawat, pori_pori_besar, flek_hitam, kemerahan                             1
garis_halus_dan_kerutan, komedo                                       

In [14]:
# Replace missing notable_effects with an empty string. The result is assigned back
# instead of using fillna(inplace=True) on the selected column, which is not
# guaranteed to update `data_kel` when pandas copy-on-write is active.
data_kel['notable_effects'] = data_kel['notable_effects'].fillna('')


def _parse_effects(cell):
    """Return the normalized effect names contained in one comma-separated cell.

    Each fragment is stripped, lower-cased and has '-' replaced by '_'
    (e.g. 'Acne-Free' -> 'acne_free'); empty fragments are dropped.
    """
    fragments = (part.strip().lower().replace('-', '_') for part in cell.split(','))
    return [name for name in fragments if name]


# Parse every row once. The same normalized tokens are used both to discover the
# column names and to fill the columns, so hyphenated effects such as 'acne_free'
# actually match (comparing against the raw 'acne-free' text left them all 0).
effects_per_product = data_kel['notable_effects'].map(_parse_effects)

# Unique normalized effect names across all products.
all_effects = set()
for effects in effects_per_product:
    all_effects.update(effects)

# One binary column per effect; sorted so the column order is the same on every run.
for effect in sorted(all_effects):
    data_kel[effect] = [int(effect in effects) for effects in effects_per_product]

# Write the data with the new columns to a CSV file.
data_kel.to_csv('data_skinrec_bismillah.csv', index=False)

In [15]:
# Read the exported file back and print its column index to check what was written.
output_path = 'data_skinrec_bismillah.csv'
result = pd.read_csv(output_path)
print(result.columns)

Index(['product_href', 'product_name', 'product_type', 'brand',
       'notable_effects', 'skintype', 'price', 'description', 'picture_src',
       'sensitive', 'combination', 'oily', 'dry', 'normal', 'rating',
       'keluhan', 'kulit_kusam', 'jerawat', 'bekas_jerawat', 'pori_pori_besar',
       'flek_hitam', 'garis_halus_dan_kerutan', 'komedo',
       'warna_kulit_tidak_merata', 'kemerahan', 'kulit_kendur', 'acne_free',
       'skin_barrier', 'soothing', 'oil_control', 'hydrating', 'anti_aging',
       'moisturizing', 'brightening', 'uv_protection', 'balancing',
       'refreshing', 'black_spot', 'no_whitecast', 'pore_care'],
      dtype='object')
