In [1]:
#Import Library dan Load Dataset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Sistem rekomendasi skincare yang saya bangun menggunakan pendekatan Hybrid Filtering, yaitu menggabungkan dua metode utama:



*   Content-Based Filtering Berdasarkan kemiripan ingredients antar produk. Menggunakan teknik TF-IDF Vectorizer dan Cosine Similarity untuk mengukur kesamaan antar produk dari sisi kandungan.

*   Attribute-Based Filtering Berdasarkan kecocokan jenis kulit target produk (seperti Combination, Dry, Oily, dll).

### Sistem menghitung kemiripan atribut jenis kulit antar produk untuk menyesuaikan dengan kebutuhan pengguna. Kedua skor ini dikombinasikan menggunakan bobot tertentu (alpha) untuk menghasilkan Hybrid Score yang lebih personal dan relevan.



In [2]:
df = pd.read_csv('/content/drive/MyDrive/Semester 6/skincare_ingredients.csv')

#Pembersihan Kolom Ingredients
df['ingredients'] = df['ingredients'].str.lower().str.replace(r'[^\w\s,]', '', regex=True)
df['ingredients_joined'] = df['ingredients'].str.replace(',', ' ')

# Tampilkan 5 baris pertama sebelum dan sesudah preprocessing
df[['ingredients_raw', 'ingredients']].head()

KeyError: "['ingredients_raw'] not in index"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#TF-IDF dan Cosine Similarity
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['ingredients_joined'])
cosine_sim = cosine_similarity(tfidf_matrix)

print(df.head())

In [None]:
#Fungsi Precision@K Hybrid
def get_precision_at_k_hybrid(product_name, df, cosine_matrix, alpha=0.7, k=5):
    try:
        idx = df[df['name'].str.lower() == product_name.lower()].index[0]
    except IndexError:
        return "Produk tidak ditemukan."

    target_skin = df.loc[idx, ['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']]
    skin_matrix = df[['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']].values
    skin_sim = (skin_matrix == target_skin.values).sum(axis=1) / 5

    final_score = (alpha * cosine_matrix[idx]) + ((1 - alpha) * skin_sim)
    sim_scores = list(enumerate(final_score))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i for i, _ in sim_scores if i != idx][:k]
    recommended = df.iloc[top_indices]

    def is_relevant(row):
        skin = row[['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']]
        return (skin == target_skin).sum() / 5 >= 0.6

    relevant_count = recommended.apply(is_relevant, axis=1).sum()
    return relevant_count / k

In [None]:
#Evaluasi Rata-Rata Precision Semua Produk
def evaluate_precision_all_hybrid(df, cosine_matrix, alpha=0.7, k=5):
    precisions = []
    for i in range(len(df)):
        product_name = df.iloc[i]['name']
        precision = get_precision_at_k_hybrid(product_name, df, cosine_matrix, alpha=alpha, k=k)
        if isinstance(precision, float):
            precisions.append(precision)
    return sum(precisions) / len(precisions)

In [None]:
# 5. Fungsi untuk Menampilkan Top-K Produk Rekomendasi dengan Skor
def recommend_products_hybrid(product_name, df, cosine_matrix, alpha=0.7, k=6):
    try:
        idx = df[df['name'].str.lower() == product_name.lower()].index[0]
    except IndexError:
        return "Produk tidak ditemukan."

    target_skin = df.loc[idx, ['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']]
    skin_matrix = df[['Combination', 'Dry', 'Normal', 'Oily', 'Sensitive']].values
    skin_sim = (skin_matrix == target_skin.values).sum(axis=1) / 5

    final_score = (alpha * cosine_matrix[idx]) + ((1 - alpha) * skin_sim)
    sim_scores = list(enumerate(final_score))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i for i, _ in sim_scores if i != idx][:k]

    recommended = df.iloc[top_indices].copy()
    recommended['Hybrid_Score'] = [final_score[i] for i in top_indices]


# Buat output sebagai DataFrame
    output_cols = ['name', 'Hybrid_Score']
    if 'brand' in df.columns:
        output_cols.insert(1, 'brand')

    print(f"\nProduk input: '{df.loc[idx, 'name']}'")
    print(f"\n{len(top_indices)} Rekomendasi Produk Teratas:\n")
    return recommended[output_cols]

In [None]:
recommend_products_hybrid("Vinoperfect Glycolic Brightening Essence", df, cosine_sim, alpha=0.7, k=5)


In [None]:
print("Precision@5 (Hybrid) untuk satu produk:")
print(get_precision_at_k_hybrid("EGF Serum", df, cosine_sim, alpha=0.7, k=5))

print("\nRata-rata Precision@5 semua produk:")
print(evaluate_precision_all_hybrid(df, cosine_sim, alpha=0.7, k=5))

# Task
Evaluate the hybrid recommendation system with various alpha and k parameters, visualize the results using a bar chart, and analyze the impact of the parameters on performance.

## Evaluasi dengan berbagai parameter

### Subtask:
Jalankan fungsi `evaluate_precision_all_hybrid` untuk berbagai kombinasi nilai `alpha` (misalnya, 0.1, 0.3, 0.5, 0.7, 0.9) dan `k` (misalnya, 3, 5, 10). Simpan hasilnya dalam sebuah struktur data (misalnya, list of dictionaries atau DataFrame).


**Reasoning**:
Iterate through different alpha and k values to evaluate the hybrid recommendation system's performance and store the results.



In [None]:
alpha_values = [0.1, 0.3, 0.5, 0.7, 0.9]
k_values = [3, 5, 10]
evaluation_results = []

for alpha in alpha_values:
    for k in k_values:
        avg_precision = evaluate_precision_all_hybrid(df, cosine_sim, alpha=alpha, k=k)
        evaluation_results.append({
            'alpha': alpha,
            'k': k,
            'average_precision': avg_precision
        })

# Convert the results to a DataFrame for better visualization and handling
results_df = pd.DataFrame(evaluation_results)
display(results_df)

## Persiapan data untuk visualisasi

### Subtask:
Ubah hasil evaluasi ke dalam format yang cocok untuk visualisasi, misalnya DataFrame pandas dengan kolom untuk `alpha`, `k`, dan rata-rata Precision@K.


**Reasoning**:
Convert the list of evaluation results into a pandas DataFrame and display the head to verify the structure.



In [None]:
# Convert the evaluation_results list to a pandas DataFrame
results_df = pd.DataFrame(evaluation_results)

# Display the head of the DataFrame
display(results_df.head())

## Visualisasi hasil

### Subtask:
Buat diagram batang yang menampilkan rata-rata Precision@K untuk setiap kombinasi parameter yang dievaluasi. Anda bisa menggunakan library seperti Matplotlib atau Seaborn.


**Reasoning**:
Import the necessary libraries for plotting and create a combined parameter column in the results DataFrame.



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

results_df['parameters'] = results_df.apply(lambda row: f"alpha={row['alpha']}, k={int(row['k'])}", axis=1)
display(results_df.head())

**Reasoning**:
Create a bar plot to visualize the average precision for each parameter combination.



In [None]:
plt.figure(figsize=(12, 6))
sns.barplot(x='parameters', y='average_precision', data=results_df)
plt.title('Average Precision@K for Different Alpha and K Values')
plt.xlabel('Parameters (alpha, k)')
plt.ylabel('Average Precision@K')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## Analisis dan interpretasi

### Subtask:
Analisis visualisasi untuk memahami bagaimana perubahan `alpha` dan `k` memengaruhi kinerja sistem rekomendasi.


**Reasoning**:
Analyze the generated bar chart to interpret the impact of alpha and k on the average Precision@K.



In [None]:
print("Analysis of the bar chart:")
print("The bar chart displays the average Precision@K for different combinations of alpha and k values.")
print("Observe the trend as alpha increases for a fixed k, and as k increases for a fixed alpha.")
print("Identify which parameter combinations result in the highest and lowest average Precision@K.")
print("Interpret how the weighting between content similarity (alpha) and skin attribute similarity influences the recommendation results based on the observed precision scores.")
print("\nBased on the displayed chart:")
print(f"- The combination with the highest average Precision@K appears to be {results_df.loc[results_df['average_precision'].idxmax(), 'parameters']} with a value of {results_df['average_precision'].max():.4f}.")
print(f"- The combination with the lowest average Precision@K appears to be {results_df.loc[results_df['average_precision'].idxmin(), 'parameters']} with a value of {results_df['average_precision'].min():.4f}.")
print("\nInterpretation:")
print("A higher alpha gives more weight to content similarity (ingredients), while a lower alpha gives more weight to skin attribute similarity.")
print("A higher k means we are considering more recommendations for the precision calculation.")
print("From the chart, we can see that lower alpha values (e.g., 0.1, 0.3) tend to yield higher average Precision@K, especially at lower k values.")
print("This suggests that for this dataset, the skin attribute similarity component contributes significantly to generating relevant recommendations (products matching the target skin type).")
print("As alpha increases (more weight on ingredients), the average precision generally decreases, indicating that relying too heavily on ingredient similarity alone might lead to less relevant recommendations in terms of skin type compatibility.")
print("The effect of k seems less pronounced compared to alpha in the range tested, although there are slight variations.")

## Summary:

### Data Analysis Key Findings

*   The evaluation of the hybrid recommendation system across various `alpha` and `k` combinations showed that the average Precision@K varied depending on the parameter settings.
*   The highest average Precision@K observed was 1.0000, achieved with the parameters `alpha=0.1` and `k=3`.
*   The lowest average Precision@K observed was 0.8727, with the parameters `alpha=0.9` and `k=3`.
*   Lower `alpha` values (giving more weight to skin attribute similarity) generally resulted in higher average Precision@K compared to higher `alpha` values (giving more weight to content similarity), particularly at lower `k` values.
*   The impact of `k` (the number of recommendations considered) on the average Precision@K was less pronounced than the impact of `alpha` within the tested range.

### Insights or Next Steps

*   The analysis suggests that for this dataset, skin attribute similarity plays a crucial role in generating relevant recommendations, and prioritizing it (lower `alpha`) improves performance.
*   Further investigation could explore a wider range of `k` values and potentially refine the weighting of `alpha` within the lower range (e.g., between 0.1 and 0.3) to potentially optimize performance further.
