In [15]:
import pandas as pd

# Load the corpus CSV into a DataFrame.
# NOTE(review): a later cell re-reads the corpus from
# 'corpus_kemah_jateng_diy - Sheet1.csv' and rebinds df_corpus, so this
# 'Sheet2' frame is only used for the preview below - confirm which sheet is
# the intended source.
df_corpus = pd.read_csv('corpus_kemah_jateng_diy - Sheet2.csv')

# Plain-text preview of the first rows.
print(df_corpus.head())

   Doc_ID         Nama_Tempat                 Lokasi  Rating  \
0       1  Kuncen Camp Ground  Kab. Semarang, Jateng     5.0   
1       2  Kuncen Camp Ground  Kab. Semarang, Jateng     5.0   
2       3  Kuncen Camp Ground  Kab. Semarang, Jateng     5.0   
3       4  Kuncen Camp Ground  Kab. Semarang, Jateng     5.0   
4       5  Kuncen Camp Ground  Kab. Semarang, Jateng     5.0   

                                         Teks_Mentah  
0  Bagus banget tempatnya, terkonsep dan guide ny...  
1  Sangat menyenangkan untuk camping ceria.\r\nNy...  
2  Tempatnya asri dan sejuk, sudah lumayan ramai ...  
3  3 kali ke sini, sekali ikut acara, 2 kali biki...  
4  Tempat yang cocok untuk acara kemah, kami kema...  


In [16]:
!pip install Sastrawi

import re
import nltk
import math
import sys
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\Candra\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [17]:
# Import library untuk menghapus output di Colab/Jupyter
from IPython.display import clear_output

# --- SAFETY CHECKS & DUMMY CLASSES FOR ENVIRONMENT ISSUES ---
# Import Sastrawi, falling back to stand-in classes when it is not installed.
# NOTE(review): the previous cell imports StemmerFactory without a guard, so
# this fallback only takes effect when that cell is skipped or has failed.
try:
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
except ImportError:
    print("Warning: Sastrawi is not available. Using Dummy Stemmer.")
    # No-op stemmer: stem() returns its input unchanged, so preprocessing
    # still runs but words are not reduced to their stems.
    class DummyStemmer:
        def stem(self, text):
            return text
    # Stand-in exposing the same create_stemmer() entry point used below.
    class StemmerFactory:
        def create_stemmer(self):
            return DummyStemmer()
        
# Load the NLTK Indonesian stopword list as a set.
# ImportError covers a missing nltk package; LookupError is presumably what
# NLTK raises when the stopwords corpus has not been downloaded - confirm.
try:
    from nltk.corpus import stopwords
    stopwords_id = set(stopwords.words('indonesian')) 
except (LookupError, ImportError):
    print("Warning: Indonesian stopwords failed to load. Using minimal manual list.")
    # Minimal fallback: only eight very common Indonesian function words.
    stopwords_id = {"yang", "dan", "di", "ke", "adalah", "dengan", "saya", "ini"}

# Initialise the stemmer (real or dummy, depending on the import above).
stemmer = StemmerFactory().create_stemmer()

# --- 2. HELPER FUNCTION & VSM CLASS DEFINITIONS ---
# Fungsi Pembersihan Karakter Spesial
def remove_special_characters(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace.

    Matching characters are removed outright (not replaced by a space), so
    punctuation between two words with no surrounding whitespace joins them.

    Args:
        text: Value to clean.

    Returns:
        The cleaned string, or "" when ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        return re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return ""

# Fungsi Proses Penuh (Preprocessing)
def full_preprocessing(text):
    """Turn raw text into a list of cleaned, stemmed tokens.

    Steps, in order: strip special characters, strip digits, lowercase, split
    on whitespace, drop tokens found in ``stopwords_id``, stem each remaining
    token with ``stemmer``, and finally drop stems of length 1 or less.

    Args:
        text: Raw text to process.

    Returns:
        A list of stemmed tokens; an empty list when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return []

    # Symbols are removed first, then digits.
    letters_only = re.sub(r'\d', '', remove_special_characters(text))

    result = []
    for token in letters_only.lower().split():
        # Stopwords are filtered on the surface form, before stemming.
        if token in stopwords_id:
            continue
        stem = stemmer.stem(token)
        # Discard stems that are empty or a single character.
        if len(stem) > 1:
            result.append(stem)
    return result

# Inverted Index Classes
class Node:
    """One entry of a singly linked posting list.

    Attributes are set in ``__init__``:
        freq: Weight stored for the document (the index builder stores a
            TF-IDF weight here); ``None`` when not given.
        doc: Document id this entry refers to.
        nextval: Next ``Node`` in the list, or ``None`` at the end.
    """

    def __init__(self, docId, freq=None):
        # A fresh node is always unlinked; the caller sets nextval.
        self.freq, self.doc, self.nextval = freq, docId, None

class SlinkedList:
    """Singly linked list that is tracked only through its first node."""

    def __init__(self, head=None):
        # head: first node of the chain, or None when no node has been set.
        self.head = head


# --- 3. APPLY PREPROCESSING & COMPUTE DF / IDF (INDEXING PHASE 1) ---
# NOTE(review): the first cell previews 'corpus_kemah_jateng_diy - Sheet2.csv'
# while the index is built from 'Sheet1' here - confirm which file is intended.
df_corpus = pd.read_csv('corpus_kemah_jateng_diy - Sheet1.csv')

# Missing review text becomes "" so every row goes through preprocessing.
df_corpus['Teks_Mentah'] = df_corpus['Teks_Mentah'].fillna('')
df_corpus['Clean_Tokens'] = df_corpus['Teks_Mentah'].apply(full_preprocessing)

# N: total number of documents (rows) in the corpus.
N = len(df_corpus)

# Document frequency: in how many documents each term occurs at least once.
df_counts = {}
for doc_tokens in df_corpus['Clean_Tokens']:
    for unique_term in set(doc_tokens):
        df_counts[unique_term] = df_counts.get(unique_term, 0) + 1

# Inverse document frequency, idf(t) = log10(N / df(t)).
# A term present in every document therefore gets an IDF of 0.
idf_scores = {term: math.log10(N / doc_freq) for term, doc_freq in df_counts.items()}

# --- 4. BUILDING THE INVERTED INDEX WITH TF-IDF (INDEXING PHASE 2) ---
# linked_list_data maps each term to a SlinkedList. The head of every list is
# a sentinel Node (docId=0, freq=None); real postings start at head.nextval
# and appear in corpus row order.
unique_words_all = set(df_counts.keys())
linked_list_data = {
    word: SlinkedList(Node(docId=0, freq=None)) for word in unique_words_all
}

# Remember the last node of every posting list so that each append is O(1)
# instead of re-walking the whole list from the head for every document.
posting_tails = {word: postings.head for word, postings in linked_list_data.items()}

# Only Doc_ID and Clean_Tokens are needed, so iterate over those two columns
# directly rather than materialising a Series per row with iterrows().
for doc_id, tokens in zip(df_corpus['Doc_ID'], df_corpus['Clean_Tokens']):
    # Raw term frequency of each term inside this document.
    tf_in_doc = {}
    for word in tokens:
        tf_in_doc[word] = tf_in_doc.get(word, 0) + 1

    for term, tf in tf_in_doc.items():
        # Posting weight W(t, d) = tf(t, d) * idf(t).
        new_node = Node(docId=doc_id, freq=tf * idf_scores[term])
        posting_tails[term].nextval = new_node
        posting_tails[term] = new_node

# Lookup table from Doc_ID to place name, location, the review's own rating
# and the mean rating of all reviews that share the same place name.
avg_rating_per_place = (
    df_corpus.groupby('Nama_Tempat')['Rating']
    .mean()
    .reset_index()
    .rename(columns={'Rating': 'Avg_Rating'})
)

# Left merge keeps one row per review; Doc_ID becomes the index so results can
# be looked up with df_metadata.loc[doc_id].
df_metadata = (
    df_corpus[['Doc_ID', 'Nama_Tempat', 'Lokasi', 'Rating']]
    .merge(avg_rating_per_place, on='Nama_Tempat', how='left')
    .set_index('Doc_ID')
)


# --- 5. FUNGSI VSM RANKING MURNI ---
def search_by_keyword(query_text, region=None):
    """Rank places by how relevant their reviews are to a keyword query.

    The query is run through ``full_preprocessing``; each remaining term is
    weighted ``tf * idf``. Every review that contains at least one query term
    is scored with the dot product of its stored TF-IDF weights and the query
    weights. The score is NOT length-normalised, so it is a raw dot product
    rather than a true cosine similarity. Reviews are then collapsed to places:
    each place is represented by its highest-scoring review.

    Args:
        query_text: Free-text query.
        region: Optional case-insensitive substring that a review's 'Lokasi'
            value must contain. Falsy values disable the filter. Reviews whose
            'Lokasi' is not a string (e.g. missing) are skipped when a region
            is given.

    Returns:
        A list of dicts with keys 'name', 'location', 'avg_rating' and
        'top_vsm_score', ordered by descending score of the best review per
        place. Empty when the query has no usable tokens or none of its terms
        occur in the index.
    """
    # 1. Preprocess the query exactly like the indexed documents.
    query_tokens = full_preprocessing(query_text)
    if not query_tokens:
        return []

    # 2. Raw term frequencies of the query.
    query_tf = {}
    for word in query_tokens:
        query_tf[word] = query_tf.get(word, 0) + 1

    # 3. Dot product Sum(W(t, d) * W(t, q)), accumulated in a single walk over
    #    each query term's posting list. Terms unknown to the index are ignored.
    doc_scores = {}
    for term, tf in query_tf.items():
        if term not in idf_scores:
            continue
        query_weight = tf * idf_scores[term]
        # head is a sentinel node; real postings start at head.nextval.
        node = linked_list_data[term].head.nextval
        while node is not None:
            doc_scores[node.doc] = doc_scores.get(node.doc, 0) + node.freq * query_weight
            node = node.nextval

    if not doc_scores:
        return []

    # 4. Rank reviews: highest score first, ties broken by ascending doc id so
    #    the order does not depend on dict/set iteration order.
    ranked_results_by_doc = sorted(
        doc_scores.items(), key=lambda item: (-item[1], item[0])
    )

    # 5. Aggregate to place level, keeping the best-ranked review per place.
    region_key = region.lower() if region else None
    final_recommendations = []
    unique_names = set()

    for doc_id, vsm_score in ranked_results_by_doc:
        meta = df_metadata.loc[doc_id]
        location = meta['Lokasi']

        if region_key is not None:
            # A missing location is read as NaN (a float); it cannot match a
            # region and must not reach .lower().
            if not isinstance(location, str) or region_key not in location.lower():
                continue

        name = meta['Nama_Tempat']
        if name in unique_names:
            continue

        unique_names.add(name)
        final_recommendations.append({
            'name': name,
            'location': location,
            'avg_rating': meta['Avg_Rating'],
            'top_vsm_score': vsm_score,  # score of the place's most relevant review
        })

    return final_recommendations

In [18]:
# Interactive search session: read a query, show up to five ranked places,
# then ask whether to run another search.
print("\n==================================================")
print("MESIN PENCARIAN REKOMENDASI TEMPAT KEMAH VSM SIAP!")
print("==================================================")
print("Anda dapat memasukkan kata kunci untuk mencari rekomendasi.")

while True:
    # Read the query from the user.
    query_text = input("\nMasukkan kata kunci pencarian (atau ketik 'keluar' untuk berhenti): ").strip()

    # Any of these keywords ends the session immediately.
    if query_text.lower() in ('keluar', 'exit', 'berhenti', 'quit', 'stop', 'kembali'):
        print("\nSesi pencarian diakhiri. Terima kasih!")
        break

    # Blank input: prompt again without searching.
    if not query_text:
        continue

    # Run the VSM search (no region filter).
    vsm_ranking = search_by_keyword(query_text, region=None)

    print("\n--------------------------------------------------")
    print(f"HASIL PENCARIAN untuk: '{query_text}'")
    print(f"Kata Kunci Diproses: {full_preprocessing(query_text)}")
    print("--------------------------------------------------")

    if not vsm_ranking:
        print("Tidak ditemukan tempat kemah yang relevan dengan kata kunci ini.")
    else:
        print("Rekomendasi Tempat Kemah (Diurutkan berdasarkan Relevansi Ulasan Tertinggi):")

        # Show the top 5 results.
        for i, item in enumerate(vsm_ranking[:5]):
            print(f"{i+1}. {item['name']}")
            print(f"   | Lokasi: {item['location']}")
            print(f"   | Rata-rata Rating Tempat: {item['avg_rating']:.2f}")
            print(f"   | Skor Relevansi (VSM Score): {item['top_vsm_score']:.4f}")

    # Same follow-up question whether or not anything was found.
    continue_input = input("\nApakah Anda ingin melanjutkan pencarian? (ya/tidak): ").strip().lower()

    if continue_input not in ('ya', 'y'):
        print("\nSesi pencarian diakhiri. Terima kasih!")
        break

    # Clear the previous output before the next iteration.
    clear_output(wait=True)
    print("Mesin pencarian siap untuk query selanjutnya...")

Mesin pencarian siap untuk query selanjutnya...

--------------------------------------------------
HASIL PENCARIAN untuk: 'dataran tinggi atau dekat gunung'
Kata Kunci Diproses: ['datar', 'gunung']
--------------------------------------------------
Rekomendasi Tempat Kemah (Diurutkan berdasarkan Relevansi Ulasan Tertinggi):
1. camp ground
   | Lokasi: Gunungkidul, DIY
   | Rata-rata Rating Tempat: 4.70
   | Skor Relevansi (VSM Score): 2.8865
2. Camping Umbul Sidomukti
   | Lokasi: Kab. Semarang, Jateng
   | Rata-rata Rating Tempat: 4.60
   | Skor Relevansi (VSM Score): 1.4929

Sesi pencarian diakhiri. Terima kasih!
