In [1]:
# %%
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold

# %%
# Load the destination catalogue and the per-user visit history.
df_wisata = pd.read_csv("../0. Dataset/Dataset_Toba_Text_Processed.csv")
df_history = pd.read_csv("../0. Dataset/userHistory.csv")

# Fit TF-IDF over the tags of every destination; row i of tfidf_matrix
# corresponds to row i (by position) of df_wisata.
# Missing tags are replaced by an empty string because TfidfVectorizer
# rejects NaN documents; astype(str) guards against non-string cells.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df_wisata['tags_joined'].fillna('').astype(str))

# %%
def hit_rate_fold(train_visits, test_visits, top_n=5):
    """Compute the Top-N hit rate of the content-based recommender for one fold.

    A user profile is built as the mean TF-IDF vector of the destinations
    listed in ``train_visits``. Every destination whose title is not in
    ``train_visits`` is ranked by cosine similarity to that profile, and the
    ``top_n`` highest-ranked titles are compared with ``test_visits``.

    Unlike a naive implementation, this function does not write a
    ``similarity`` column into the global ``df_wisata``.

    Args:
        train_visits: Titles used to build the user profile.
        test_visits: Held-out titles the recommendations should recover.
        top_n: Number of recommendations taken from the top of the ranking.

    Returns:
        The fraction of distinct titles in ``test_visits`` that appear among
        the top-N recommendations, or ``None`` when no profile can be built
        (no training title matches ``df_wisata['title']``) or when
        ``test_visits`` is empty.
    """
    # Work with row positions, not index labels: tfidf_matrix is positional,
    # so this stays correct even if df_wisata does not have a 0..n-1 index.
    train_mask = df_wisata['title'].isin(train_visits).to_numpy()
    visited_pos = np.flatnonzero(train_mask)
    if visited_pos.size == 0:
        return None  # No profile can be built

    # Deduplicate so numerator and denominator count the same thing
    # (distinct titles); an empty test set has no defined hit rate.
    test_titles = set(test_visits)
    if not test_titles:
        return None

    # User profile = mean TF-IDF vector of the visited destinations.
    # The sparse mean yields a numpy matrix; convert it to a (1, n_features) array.
    user_profile = np.asarray(tfidf_matrix[visited_pos].mean(axis=0)).reshape(1, -1)

    # Similarity of the profile to every destination.
    similarities = cosine_similarity(user_profile, tfidf_matrix).flatten()

    # Rank only destinations that are not part of the training visits.
    candidate_pos = np.flatnonzero(~train_mask)
    scores = pd.Series(similarities[candidate_pos], index=candidate_pos)
    top_pos = scores.sort_values(ascending=False).index[:top_n].to_numpy()
    top_titles = df_wisata['title'].iloc[top_pos].tolist()

    # Hit rate: share of distinct test titles present in the Top-N.
    hits = len(test_titles.intersection(top_titles))
    return hits / len(test_titles)

# %%
def evaluate_user_kfold(user_id, top_n=5, k=5):
    """Evaluate the recommender for one user with K-fold cross-validation.

    The user's visited places (column ``namaWisata`` of ``df_history``) are
    split into ``k`` shuffled folds with a fixed seed. For each fold the
    remaining visits build the profile and the held-out visits are scored
    with ``hit_rate_fold``.

    Args:
        user_id: Value matched against ``df_history['userID']``.
        top_n: Number of recommendations evaluated per fold.
        k: Number of folds.

    Returns:
        A dict with keys ``'user_id'``, ``'avg_hit_rate'`` (mean over the
        folds that produced a score), ``'folds'`` (the requested ``k``) and
        ``'fold_hit_rates'`` (per-fold scores), or ``None`` when the user has
        fewer than ``k`` distinct visits or no fold could be scored.
    """
    visits = df_history.loc[df_history['userID'] == user_id, 'namaWisata'].dropna()

    # Keep each place once (first occurrence order). A repeated place could
    # otherwise fall into both train and test, and train places are always
    # excluded from the recommendations, so that test item could never be hit.
    user_visits = list(dict.fromkeys(visits))

    if len(user_visits) < k:
        return None  # Too few visits for KFold

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_hit_rates = []

    for train_index, test_index in kf.split(user_visits):
        train_visits = [user_visits[i] for i in train_index]
        test_visits = [user_visits[i] for i in test_index]

        hr = hit_rate_fold(train_visits, test_visits, top_n=top_n)
        # Folds for which no profile could be built are skipped.
        if hr is not None:
            fold_hit_rates.append(hr)

    if not fold_hit_rates:
        return None

    return {
        'user_id': user_id,
        'avg_hit_rate': np.mean(fold_hit_rates),
        'folds': k,
        'fold_hit_rates': fold_hit_rates
    }

# %%
# Evaluate every user found in the history.
# The settings are named once so the calls and the printed report cannot drift apart.
TOP_N = 5
K_FOLDS = 4

user_ids = df_history['userID'].unique()
all_results = []

for uid in user_ids:
    result = evaluate_user_kfold(uid, top_n=TOP_N, k=K_FOLDS)
    # Users with too few visits (or no scorable fold) yield None and are skipped.
    if result is not None:
        all_results.append(result)

# Collect the per-user results in a DataFrame. The columns are given
# explicitly so the frame has the expected schema even when no user
# could be evaluated (otherwise the column selection below raises KeyError).
df_eval = pd.DataFrame(
    all_results,
    columns=['user_id', 'avg_hit_rate', 'folds', 'fold_hit_rates'],
)
print(df_eval[['user_id', 'avg_hit_rate']])

# Overall average across users (NaN when nobody could be evaluated).
average_hit_rate_all = df_eval['avg_hit_rate'].mean() if not df_eval.empty else float('nan')
print(f"\nRata-rata Hit Rate Top-{TOP_N} dengan {K_FOLDS} lipatan: {average_hit_rate_all:.2f}")


   user_id  avg_hit_rate
0        1          0.75
1        2          0.00

Rata-rata Hit Rate Top-5 dengan 4 lipatan: 0.38
