# UTS_STKI — Bintang Rifky Ananta

**Notebook**: preprocessing → Boolean / VSM / BM25 → evaluasi & visualisasi

Dataset: `ulasan_enterkomputer_tokopedia.csv` + `ulasan_enterkomputer_gmaps.csv` (digabung).


In [None]:
# Imports & setup
import os, sys
sys.path.insert(0, os.path.abspath('..'))
print('Added project root to sys.path:', os.path.abspath('..'))

import pandas as pd, numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
%matplotlib inline

from src.preprocess import preprocess_text
from src.boolean_ir import build_inverted_index
from src import vsm_ir
from src import eval as ir_eval


In [None]:
# Load CSV datasets (Tokopedia + GMaps)
# Both exports are read in a fixed order: Tokopedia first, then Google Maps.
tok_path = '/mnt/data/ulasan_enterkomputer_tokopedia.csv'
gmaps_path = '/mnt/data/ulasan_enterkomputer_gmaps.csv'

df_tok, df_gmaps = map(pd.read_csv, (tok_path, gmaps_path))

# Print the raw headers of each source for a quick visual check
print('Tokopedia columns:', list(df_tok.columns))
print('GMaps columns:', list(df_gmaps.columns))

def find_text_column(df):
    """Return the label of the column that most likely holds the review text.

    Lookup order:

    1. an exact match against the known review-column names;
    2. a case- and whitespace-insensitive match against the same names, so
       headers such as ``Ulasan`` or ``REVIEW`` are recognised instead of
       falling through to an unrelated column (e.g. a user name);
    3. the first column whose dtype is ``object`` or a pandas string dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    The matching column label, exactly as it appears in ``df.columns``.

    Raises
    ------
    ValueError
        If no known name matches and no column has a text-like dtype.
    """
    candidates = ['review','ulasan','komentar','text','content','review_text','comment','body','Review Text','reviewText']

    # 1) Exact names, in priority order.
    for c in candidates:
        if c in df.columns:
            return c

    # 2) Same names, ignoring case and surrounding whitespace in the header.
    wanted = {c.lower() for c in candidates}
    for col in df.columns:
        if str(col).strip().lower() in wanted:
            return col

    # 3) Last resort: first text-like column. The explicit string-dtype check
    #    covers frames whose text columns are not stored as plain `object`.
    for col in df.columns:
        dtype = df[col].dtype
        if dtype == object or pd.api.types.is_string_dtype(dtype):
            return col

    raise ValueError('No text column found')

# Detect the review-text column of each source
txt_col_tok = find_text_column(df_tok)
txt_col_gmaps = find_text_column(df_gmaps)
print('Using text columns:', txt_col_tok, 'and', txt_col_gmaps)

# Pull the review text out as Series *before* renaming. Renaming first would
# leave two columns called 'text' whenever a source already has a 'text'
# column but a higher-priority name (e.g. 'review') was detected, and
# df[['text']] would then return both of them.
text_tok = df_tok[txt_col_tok].rename('text')
text_gmaps = df_gmaps[txt_col_gmaps].rename('text')

# Keep the per-source frames addressable through the unified 'text' name
df_tok = df_tok.rename(columns={txt_col_tok: 'text'})
df_gmaps = df_gmaps.rename(columns={txt_col_gmaps: 'text'})

# Stack both sources into one single-column frame; ignore_index already gives
# a fresh 0..n-1 index, so no extra reset_index is needed.
df = pd.concat([text_tok, text_gmaps], ignore_index=True).to_frame(name='text')
print('Combined dataframe shape:', df.shape)
df.head(5)


In [None]:
# Sample 15 documents from combined data (random but reproducible)
# Keep only non-null reviews longer than 10 characters. The length comparison
# is computed separately because `&` binds tighter than `>`: written inline
# without parentheses the mask is parsed as `(notna & length) > 10`, which
# rejects every row and leaves nothing to sample.
has_text = df['text'].notna()
long_enough = df['text'].astype(str).str.len() > 10
df = df[has_text & long_enough].copy()

# Fixed random_state so every run draws the same documents; min() keeps the
# request valid when fewer than 15 rows survive the filter.
df_sample = df.sample(n=min(15, len(df)), random_state=42).reset_index(drop=True)
print('Sampled', len(df_sample), 'documents.')

# Preview the first three sampled reviews, flattened onto one line each
for i, t in enumerate(df_sample['text'][:3], start=1):
    print(f'--- Doc {i} ---\n', t[:300].replace('\n',' '), '\n')


In [None]:
# Preprocess sampled docs using src.preprocess.preprocess_text
# processed_tokens keeps the token list per document; processed_texts keeps
# the same tokens joined by single spaces.
processed_tokens = [preprocess_text(raw) for raw in df_sample['text']]
processed_texts = [' '.join(tokens) for tokens in processed_tokens]

# The space-joined form is stored on the frame for the index/vectorizer cells
df_sample['processed'] = processed_texts

# Show before/after for first 2 docs (or fewer if the sample is smaller)
preview_count = min(len(df_sample), 2)
for i in range(preview_count):
    print(f'=== Document {i+1} ORIGINAL ===\n{df_sample.loc[i,"text"][:500]}\n')
    print(f'--- Processed tokens ({len(processed_tokens[i])}) ---\n{processed_tokens[i][:60]}\n\n')


In [None]:
# Build inverted index (Boolean retrieval)
# Document ids are 1-based and zero-padded: doc01.txt, doc02.txt, ...
docs = dict(
    (f'doc{number:02d}.txt', text)
    for number, text in enumerate(df_sample['processed'], start=1)
)
inv_index = build_inverted_index(docs)
print('Vocabulary size (inverted index):', len(inv_index))

# Peek at the first eight terms and whatever the index stores for them
for term in list(inv_index.keys())[:8]:
    print(term, '->', inv_index[term])


In [None]:
# Build TF-IDF matrix using scikit-learn for convenience
vectorizer = TfidfVectorizer(use_idf=True, norm='l2')
tfidf_matrix = vectorizer.fit_transform(df_sample['processed'])  # shape (n_docs, n_terms)
print('TF-IDF matrix shape:', tfidf_matrix.shape)

def search_top_k_vsm(query, k=5):
    """Rank the sampled documents against ``query`` by TF-IDF cosine similarity.

    The query is run through ``preprocess_text`` (the same step applied to the
    documents) and projected with the already-fitted ``vectorizer``.

    Parameters
    ----------
    query : str
        Raw query text.
    k : int, default 5
        Maximum number of documents to return. ``k <= 0`` yields an empty
        result; a ``k`` larger than the corpus returns every document.

    Returns
    -------
    (top_idx, scores)
        ``top_idx`` holds row positions in ``tfidf_matrix`` / ``df_sample``
        ordered by descending similarity; ``scores`` holds the matching
        cosine similarities.
    """
    q_toks = preprocess_text(query)
    q_text = ' '.join(q_toks)
    q_vec = vectorizer.transform([q_text])
    sims = cosine_similarity(q_vec, tfidf_matrix)[0]  # one score per document

    # Reverse the ascending argsort first, then cut. Slicing with [-k:] would
    # return *all* documents for k == 0 and an arbitrary tail for negative k.
    ranked = sims.argsort()[::-1]
    top_idx = ranked[:max(k, 0)]
    return top_idx, sims[top_idx]

# test search
query = 'aplikasi crypto'
top_idx, scores = search_top_k_vsm(query, k=5)
print('Top docs (VSM) for query:', query)
for idx, s in zip(top_idx, scores):
    print(idx, f'(score={s:.4f})', df_sample.loc[idx,'text'][:200].replace('\n',' '))


In [None]:
# BM25 ranking using src.vsm_ir implementation (expects processed tokens joined by space)
# `docs` maps each document id to its space-joined processed text.
bm25_index = vsm_ir.build_bm25(docs)

def search_top_k_bm25(query, k=5):
    """Return the ``k`` best BM25 matches for ``query``.

    The query is normalised with ``preprocess_text``, re-joined with single
    spaces and handed to ``vsm_ir.score_bm25`` together with the prebuilt
    ``bm25_index``. The result of ``score_bm25`` is returned unchanged; the
    demo code right after this function iterates it as ``(doc_id, score)``
    pairs.
    """
    normalised_query = ' '.join(preprocess_text(query))
    return vsm_ir.score_bm25(normalised_query, bm25_index, topk=k)

# test BM25
bm25_results = search_top_k_bm25('garansi rusak', k=5)
print('Top docs (BM25):')
for doc, sc in bm25_results:
    # 'docNN.txt' -> NN (1-based) -> zero-based row position in df_sample
    doc_number = int(doc.replace('doc', '').replace('.txt', ''))
    idx = doc_number - 1
    snippet = df_sample.loc[idx, 'text'][:200].replace('\n', ' ')
    print(doc, f'(score={sc:.4f})', snippet)


In [None]:
# Evaluation: create a simple gold set based on keywords
def _docs_mentioning(*keywords):
    """Row positions of sampled docs whose processed text contains any keyword.

    Matching is a plain substring test on the space-joined processed text.
    """
    return [
        pos
        for pos, text in enumerate(df_sample['processed'])
        if any(keyword in text for keyword in keywords)
    ]


def _doc_name(pos):
    """Map a zero-based row position to its 1-based 'docNN.txt' id."""
    return f'doc{pos+1:02d}.txt'


gold_queries = {
    'pelayanan baik': _docs_mentioning('pelayanan', 'layanan'),
    'pengiriman cepat': _docs_mentioning('kirim', 'pengiriman'),
    'garansi': _docs_mentioning('garansi'),
}
print('Gold set (example):', gold_queries)

# Evaluate for one query using VSM results
q = 'pelayanan baik'
top_idx, scores = search_top_k_vsm(q, k=5)
retrieved_docs = [_doc_name(pos) for pos in top_idx]
relevant_docs = [_doc_name(pos) for pos in gold_queries[q]]
print('Retrieved:', retrieved_docs)
print('Relevant (gold):', relevant_docs)

# All three metrics compare the same retrieved/relevant id lists
prec = ir_eval.precision(retrieved_docs, relevant_docs)
rec = ir_eval.recall(retrieved_docs, relevant_docs)
f1 = ir_eval.f1_score(retrieved_docs, relevant_docs)
print(f'Precision@5: {prec:.4f}, Recall@5: {rec:.4f}, F1: {f1:.4f}')


In [None]:
# Visualize VSM top-k results bar chart for the query used earlier
import matplotlib.pyplot as plt

# `top_idx`/`scores` were last assigned for the evaluation query `q`, while
# `query` still holds the earlier demo query. Re-run the search here so the
# bars and the title are guaranteed to describe the same query.
viz_query = q
viz_idx, viz_scores = search_top_k_vsm(viz_query, k=5)

plt.figure(figsize=(8,4))
labels = [f'doc{idx+1:02d}' for idx in viz_idx]  # 1-based ids, matching docNN.txt
plt.bar(labels, viz_scores)
plt.xlabel('Document ID')
plt.ylabel('Cosine similarity')
plt.title(f'VSM Top-{len(labels)} results for query: "{viz_query}"')
plt.show()


In [None]:
# Save sampled docs to data/raw for reproducibility (optional)
# The target is ../data/raw relative to the current working directory.
out_dir = os.path.abspath(os.path.join('..','data','raw'))
os.makedirs(out_dir, exist_ok=True)

# One UTF-8 file per sampled review, named doc01.txt, doc02.txt, ...
for number, body in enumerate(df_sample['text'], start=1):
    target = os.path.join(out_dir, f'doc{number:02d}.txt')
    with open(target, mode='w', encoding='utf-8') as handle:
        handle.write(body)

print('Saved sampled docs to', out_dir)


## Kesimpulan singkat

- Notebook ini menunjukkan pipeline STKI: preprocessing, Boolean retrieval (pembangunan inverted index), VSM (TF-IDF), dan BM25.
- Untuk evaluasi retrieval yang lebih valid, buat gold set manual (relevansi per query) dan jalankan metrik secara terstruktur (Precision@k, MAP@k, nDCG@k) untuk beberapa query.
- Selanjutnya, lengkapi laporan dengan screenshot output notebook, tabel metrik, dan diskusi perbandingan TF-IDF vs BM25.
