# üß† Mini Search Engine (Boolean & VSM)
Notebook ini membangun sistem _Information Retrieval_ sederhana berbasis:
- **Preprocessing teks (cleaning, tokenizing, stopword removal)**
- **Boolean Retrieval (AND, OR, NOT)**
- **Vector Space Model (TF-IDF Standard & Sublinear)**
- **Evaluasi dengan Precision, Recall, F1, MAP, dan nDCG**

Struktur folder:


In [23]:
import os
import re
import math
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from pathlib import Path
from tabulate import tabulate
from numpy.linalg import norm
from scipy.sparse import csr_matrix


In [24]:
# === [1] SET BASE PATH YANG PASTI BENAR ===
from pathlib import Path
import os
import re
import matplotlib.pyplot as plt

# cari folder STKI_UTS dari posisi file notebook
BASE_DIR = Path(os.getcwd())
while BASE_DIR.name.lower() != "stki_uts":
    if BASE_DIR.parent == BASE_DIR:
        raise RuntimeError("‚ùå Folder project STKI_UTS tidak ditemukan di struktur folder.")
    BASE_DIR = BASE_DIR.parent

# tentukan folder data/raw dan data/processed
RAW_PATH = BASE_DIR / "data" / "raw"
PROCESSED_PATH = BASE_DIR / "data" / "processed"
PROCESSED_PATH.mkdir(parents=True, exist_ok=True)

print(f"üìÇ RAW_PATH: {RAW_PATH}")
print(f"üìÇ PROCESSED_PATH: {PROCESSED_PATH}")


üìÇ RAW_PATH: d:\Semester_5\STKI_UTS\data\raw
üìÇ PROCESSED_PATH: d:\Semester_5\STKI_UTS\data\processed


In [25]:
import os
import re
import math
import matplotlib
matplotlib.use("Agg")  # Non-interaktif (tidak muncul jendela GUI)
import matplotlib.pyplot as plt
from pathlib import Path

# === [1] KONFIGURASI STOPWORDS ===
STOPWORDS = set([
    "judul", "dan", "atau", "serta",
    "di", "ke", "dari", "pada", "dalam", "antara",
    "ini", "itu", "tersebut",
    "adalah", "sebagai", "untuk", "sebuah", "seorang",
    "juga", "lebih", "tidak", "bukan", "saat", "hingga",
    "adanya", "agar", "karenanya", "sehingga", "per"
])

# === [2] PATH FOLDER INPUT & OUTPUT ===
RAW_PATH = Path(r"d:\Semester_5\STKI_UTS\data\raw")          # ‚Üê ubah ke path absolut
PROCESSED_PATH = Path(r"d:\Semester_5\STKI_UTS\data\processed")  # ‚Üê ubah ke path absolut
PROCESSED_PATH.mkdir(parents=True, exist_ok=True)

# === [3] PREPROCESSING FUNGSI ===
def preprocess_text(text: str):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    tokens = text.split()
    tokens = [t for t in tokens if t and t not in STOPWORDS]
    return tokens

# === [4] PROSES SEMUA FILE ===
def process_all_files():
    files = list(RAW_PATH.glob("*.txt"))
    if not files:
        print(f"‚ùå Folder {RAW_PATH} kosong atau tidak ditemukan.")
        return
    doc_lengths = {}
    for file in files:
        raw_text = file.read_text(encoding="utf-8", errors="ignore")
        tokens = preprocess_text(raw_text)
        doc_lengths[file.name] = len(tokens)
        out_path = PROCESSED_PATH / file.name
        out_path.write_text(" ".join(tokens), encoding="utf-8")

    # Ringkasan
    print(f"‚úÖ {len(files)} dokumen berhasil diproses & disimpan di '{PROCESSED_PATH}/'")
    print("Distribusi panjang dokumen (jumlah token):")
    for name, n_tok in doc_lengths.items():
        print(f"  {name:<25} : {n_tok:>5} token")

    # Visualisasi distribusi panjang dokumen
    plt.figure(figsize=(10, 5))
    plt.bar(range(len(doc_lengths)), list(doc_lengths.values()), tick_label=list(doc_lengths.keys()))
    plt.xticks(rotation=45, ha="right")
    plt.title("Distribusi Panjang Dokumen (Setelah Preprocessing)")
    plt.ylabel("Jumlah Token")
    plt.tight_layout()
    out_plot = PROCESSED_PATH / "distribusi_dokumen.png"
    plt.savefig(out_plot)
    print(f"üìä Grafik disimpan ke {out_plot}")

# === [5] EKSEKUSI ===
if __name__ == "__main__":
    process_all_files()


‚úÖ 15 dokumen berhasil diproses & disimpan di 'd:\Semester_5\STKI_UTS\data\processed/'
Distribusi panjang dokumen (jumlah token):
  itb.txt                   :   153 token
  stmik_bm_palu.txt         :   114 token
  ub.txt                    :   122 token
  udinus.txt                :   132 token
  ugm.txt                   :   144 token
  ui.txt                    :   174 token
  unand.txt                 :   122 token
  undip.txt                 :   144 token
  ung.txt                   :   104 token
  unima.txt                 :   132 token
  unimal.txt                :   126 token
  unmubuton.txt             :   119 token
  unpad.txt                 :   135 token
  unsri.txt                 :   102 token
  unud.txt                  :   107 token
üìä Grafik disimpan ke d:\Semester_5\STKI_UTS\data\processed\distribusi_dokumen.png


In [53]:
# Load dokumen dari folder processed
def load_documents(path):
    p = Path(path)
    docs = {}
    for f in p.glob("*.txt"):
        docs[f.name] = f.read_text(encoding="utf-8").split()
    return docs

PROCESSED_PATH = Path("d:\Semester_5\STKI_UTS\data\processed")
docs_boolean = load_documents(PROCESSED_PATH)
print(f"{len(docs_boolean)} dokumen berhasil dimuat.")


15 dokumen berhasil dimuat.


  PROCESSED_PATH = Path("d:\Semester_5\STKI_UTS\data\processed")


In [82]:
def load_docs(path=PROCESSED_PATH):
    docs = {}
    p = Path(path)
    if not p.exists():
        print(f"Folder {p} tidak ditemukan!")
        return docs
    for f in p.glob("*.txt"):
        docs[f.name] = f.read_text(encoding="utf-8").split()
    print(f"{len(docs)} dokumen dimuat dari {p}.")
    return docs


In [86]:
from collections import defaultdict
from pathlib import Path
import re
from tabulate import tabulate

# -----------------------------
# PATH ABSOLUT
# -----------------------------
PROCESSED_PATH = Path(r"D:\Semester_5\STKI_UTS\data\processed")
TOP_K = 3

# -----------------------------
# 1Ô∏è‚É£ Load dokumen
# -----------------------------
def load_docs(path=PROCESSED_PATH):
    docs = {}
    p = Path(path)
    if not p.exists():
        print(f"Folder {p} tidak ditemukan!")
        return docs
    for f in p.glob("*.txt"):
        docs[f.name] = f.read_text(encoding="utf-8").split()
    print(f"{len(docs)} dokumen dimuat dari {p}.")
    return docs

# -----------------------------
# 2Ô∏è‚É£ Build inverted index
# -----------------------------
def build_inverted_index(docs):
    inverted = defaultdict(set)
    for doc, toks in docs.items():
        for t in set(toks):
            inverted[t].add(doc)
    return dict(inverted)

# -----------------------------
# 3Ô∏è‚É£ Eval Boolean Query
# -----------------------------
def eval_boolean(query, inverted_index):
    q = query.lower().strip()
    tokens = re.findall(r'\band\b|\bor\b|\bnot\b|\w+', q)
    tokens = [t.upper() if t in ('and','or','not') else t for t in tokens]

    prec = {"NOT": 3, "AND": 2, "OR": 1}
    output, stack = [], []

    # Infix ‚Üí Postfix
    for tok in tokens:
        if tok in ("AND","OR","NOT"):
            while stack and prec.get(stack[-1],0) >= prec[tok]:
                output.append(stack.pop())
            stack.append(tok)
        else:
            output.append(tok)
    while stack:
        output.append(stack.pop())

    all_docs = set().union(*inverted_index.values()) if inverted_index else set()
    eval_stack = []

    for tok in output:
        if tok == "NOT":
            A = eval_stack.pop()
            eval_stack.append(all_docs - A)
        elif tok == "AND":
            B = eval_stack.pop()
            A = eval_stack.pop()
            eval_stack.append(A & B)
        elif tok == "OR":
            B = eval_stack.pop()
            A = eval_stack.pop()
            eval_stack.append(A | B)
        else:
            eval_stack.append(inverted_index.get(tok, set()))

    return eval_stack[-1] if eval_stack else set()

# -----------------------------
# 4Ô∏è‚É£ Fungsi snippet
# -----------------------------
def get_snippet(tokens, n=100):
    text = " ".join(tokens)
    return text[:n] + "..." if len(text) > n else text

# -----------------------------
# 5Ô∏è‚É£ Precision & Recall
# -----------------------------
def precision_recall(result, gold):
    if not result and not gold:
        return 1.0, 1.0
    if not result:
        return 0.0, 0.0
    tp = len(result & gold)
    precision = tp / len(result)
    recall = tp / len(gold) if gold else 0
    return round(precision,2), round(recall,2)

# -----------------------------
# 6Ô∏è‚É£ Boolean + snippet + evaluasi top-K
# -----------------------------
def boolean_with_eval(query, docs, inverted_index, gold_set=None, top_k=TOP_K):
    results = eval_boolean(query, inverted_index)
    if not results:
        print(f"\nüîé BOOLEAN QUERY: {query}\nHasil Dokumen: Tidak ada")
        return

    top_docs = sorted(results)[:top_k]
    table = [[i+1, doc, get_snippet(docs[doc])] for i, doc in enumerate(top_docs)]
    print(f"\nüîé BOOLEAN QUERY: {query}")
    print(tabulate(table, headers=["Rank","Doc","Snippet"], tablefmt="grid"))

    if gold_set is not None:
        P, R = precision_recall(set(top_docs), gold_set)
        print(f"\nEvaluasi | Precision: {P}, Recall: {R}")

# -----------------------------
# 7Ô∏è‚É£ Jalankan contoh
# -----------------------------
docs = load_docs(PROCESSED_PATH)
inverted_index = build_inverted_index(docs)

# Contoh gold set sederhana
gold_sets = {
    "universitas AND fakultas": {"itb.txt", "ub.txt", "ugm.txt"},
    "fakultas AND teknik": {"itb.txt", "stmik_bm_palu.txt", "ugm.txt"},
    "NOT bandung": {f for f in docs if "bandung" not in docs[f]},
    "universitas OR bandung": {"itb.txt","ub.txt","ugm.txt","ui.txt"}
}

queries = [
    "universitas AND fakultas",
    "NOT bandung",
]

for q in queries:
    boolean_with_eval(q, docs, inverted_index, gold_set=gold_sets.get(q))


15 dokumen dimuat dari D:\Semester_5\STKI_UTS\data\processed.

üîé BOOLEAN QUERY: universitas AND fakultas
+--------+------------+---------------------------------------------------------------------------------------------------------+
|   Rank | Doc        | Snippet                                                                                                 |
|      1 | itb.txt    | informasi umum institut teknologi bandung itb nama singkatan nama resmi institut teknologi bandung i... |
+--------+------------+---------------------------------------------------------------------------------------------------------+
|      2 | ub.txt     | informasi umum universitas brawijaya ub nama singkatan nama resmi universitas brawijaya ub singkatan... |
+--------+------------+---------------------------------------------------------------------------------------------------------+
|      3 | udinus.txt | informasi umum universitas dian nuswantoro udinus nama singkatan nama resmi universitas 

In [51]:
# Load dokumen sebagai token untuk VSM
docs_vsm = load_docs_as_tokens(PROCESSED_PATH)

# Build VSM
doc_ids, vocab, tfidf_std, idf, term_index = build_vsm(docs_vsm, scheme="standard")
_, _, tfidf_sub, _, _ = build_vsm(docs_vsm, scheme="sublinear")


In [42]:
# --- Fungsi snippet ---
def get_snippet(tokens, n=100):
    text = " ".join(tokens)
    return text[:n] + "..." if len(text) > n else text

# --- Fungsi evaluasi VSM ---
def evaluate_vsm(results, gold_set, k=5):
    r = [doc for doc,_ in results[:k]]
    hits = [1 if doc in gold_set else 0 for doc in r]
    precision = sum(hits)/k
    recall = sum(hits)/len(gold_set) if gold_set else 0
    f1 = 2*precision*recall/(precision+recall) if (precision+recall)>0 else 0
    # MAP@k
    ap = 0
    num_hits = 0
    for i, h in enumerate(hits,1):
        if h==1:
            num_hits+=1
            ap += num_hits/i
    ap = ap/len(gold_set) if gold_set else 0
    # nDCG@k
    dcg = sum(h/math.log2(i+1) for i,h in enumerate(hits,start=1))
    ideal_hits = min(len(gold_set),k)
    idcg = sum(1/math.log2(i+1) for i in range(1,ideal_hits+1))
    ndcg = dcg/idcg if idcg>0 else 0
    return round(precision,2), round(recall,2), round(f1,2), round(ap,2), round(ndcg,2)

# --- Query VSM + snippet + evaluasi ---
queries_vsm = ["universitas", "fakultas teknik", "bandung"]

for q in queries_vsm:
    print(f"\nüîé VSM QUERY: {q}")

    # Gold set untuk evaluasi
    gold_set = {f for f in docs_vsm if q.split()[0] in " ".join(docs_vsm[f]).lower()}

    # Standard & Sublinear TF-IDF
    res_std = search_vsm(q, idf, term_index, len(vocab), tfidf_std, doc_ids, scheme="standard")
    res_sub = search_vsm(q, idf, term_index, len(vocab), tfidf_sub, doc_ids, scheme="sublinear")

    # --- Tampilkan tabel dengan snippet ---
    table_std = [[i+1, doc, f"{score:.4f}", get_snippet(docs_vsm[doc])] 
                 for i, (doc, score) in enumerate(res_std)]
    table_sub = [[i+1, doc, f"{score:.4f}", get_snippet(docs_vsm[doc])] 
                 for i, (doc, score) in enumerate(res_sub)]

    print("\nTF-IDF Standard:")
    print(tabulate(table_std, headers=["Rank","Doc","Score","Snippet"], tablefmt="grid"))

    print("\nTF-IDF Sublinear:")
    print(tabulate(table_sub, headers=["Rank","Doc","Score","Snippet"], tablefmt="grid"))

    # --- Evaluasi VSM ---
    p, r, f, mapk, ndcg = evaluate_vsm(res_std, gold_set, k=5)
    print(f"\nEvaluasi VSM Standard | Precision: {p}, Recall: {r}, F1: {f}, MAP@5: {mapk}, nDCG@5: {ndcg}")
    
    p, r, f, mapk, ndcg = evaluate_vsm(res_sub, gold_set, k=5)
    print(f"Evaluasi VSM Sublinear | Precision: {p}, Recall: {r}, F1: {f}, MAP@5: {mapk}, nDCG@5: {ndcg}")



üîé VSM QUERY: universitas

TF-IDF Standard:
+--------+---------------+---------+---------------------------------------------------------------------------------------------------------+
|   Rank | Doc           |   Score | Snippet                                                                                                 |
|      1 | undip.txt     |  0.0158 | informasi umum universitas diponegoro undip nama singkatan nama resmi universitas diponegoro undip s... |
+--------+---------------+---------+---------------------------------------------------------------------------------------------------------+
|      2 | unud.txt      |  0.0133 | informasi umum universitas udayana unud nama singkatan nama resmi universitas udayana unud singkatan... |
+--------+---------------+---------+---------------------------------------------------------------------------------------------------------+
|      3 | ub.txt        |  0.0133 | informasi umum universitas brawijaya ub nama singkatan nam

In [95]:
# Tentukan path lengkap ke folder processed
PROCESSED_PATH = r"D:\Semester_5\STKI_UTS\data\processed"

# Load dokumen
docs = load_docs(PROCESSED_PATH)


15 dokumen dimuat dari D:\Semester_5\STKI_UTS\data\processed.


In [135]:
# ============================================================
# üîé SEARCH ENGINE: BOOLEAN + VSM (Standard vs Sublinear Fixed)
# ============================================================
from collections import defaultdict
from pathlib import Path
import re
import numpy as np
from tabulate import tabulate

TOP_K = 3
PROCESSED_PATH = "D:/Semester_5/STKI_UTS/data/processed"

# -----------------------------
# 1Ô∏è‚É£ Load dokumen hasil preprocessing
# -----------------------------
def load_docs(path=PROCESSED_PATH):
    docs = {}
    p = Path(path)
    if not p.exists():
        print(f"Folder {path} tidak ditemukan!")
        return docs
    for f in p.glob("*.txt"):
        docs[f.name] = f.read_text(encoding="utf-8").split()
    print(f"{len(docs)} dokumen dimuat.")
    return docs

# -----------------------------
# 2Ô∏è‚É£ Inverted index (Boolean)
# -----------------------------
def build_inverted_index(docs):
    inverted = defaultdict(set)
    for doc, toks in docs.items():
        for t in set(toks):
            inverted[t].add(doc)
    return dict(inverted)

# -----------------------------
# 3Ô∏è‚É£ Boolean query eval
# -----------------------------
def eval_boolean(query, inverted_index):
    q = query.lower().strip()
    tokens = re.findall(r'\band\b|\bor\b|\bnot\b|\w+', q)
    tokens = [t.upper() if t in ('and','or','not') else t for t in tokens]

    prec = {"NOT":3, "AND":2, "OR":1}
    output, stack = [], []

    for tok in tokens:
        if tok in ("AND","OR","NOT"):
            while stack and prec.get(stack[-1],0) >= prec[tok]:
                output.append(stack.pop())
            stack.append(tok)
        else:
            output.append(tok)
    while stack:
        output.append(stack.pop())

    all_docs = set().union(*inverted_index.values()) if inverted_index else set()
    eval_stack = []

    for tok in output:
        if tok == "NOT":
            A = eval_stack.pop()
            eval_stack.append(all_docs - A)
        elif tok == "AND":
            B = eval_stack.pop()
            A = eval_stack.pop()
            eval_stack.append(A & B)
        elif tok == "OR":
            B = eval_stack.pop()
            A = eval_stack.pop()
            eval_stack.append(A | B)
        else:
            eval_stack.append(inverted_index.get(tok,set()))

    return eval_stack[-1] if eval_stack else set()

# -----------------------------
# 4Ô∏è‚É£ Snippet
# -----------------------------
def get_snippet(tokens, n=100):
    text = " ".join(tokens)
    return text[:n] + "..." if len(text) > n else text

# -----------------------------
# 5Ô∏è‚É£ Boolean search + snippet
# -----------------------------
def boolean_with_snippet(query, docs, inverted_index, top_k=TOP_K):
    results = eval_boolean(query, inverted_index)
    if not results:
        return [], []
    top_docs = sorted(results)[:top_k]
    table = [[i+1, doc, get_snippet(docs[doc])] for i, doc in enumerate(top_docs)]
    return table, top_docs

# -----------------------------
# 6Ô∏è‚É£ Build VSM (TF-IDF standard & sublinear fixed)
# -----------------------------
def build_vsm(docs):
    vocab = sorted({t for toks in docs.values() for t in toks})
    term_index = {t:i for i,t in enumerate(vocab)}
    doc_ids = list(docs.keys())
    N = len(doc_ids)

    # Term frequency matrix
    tf = np.zeros((N,len(vocab)))
    for i, doc in enumerate(doc_ids):
        for t in docs[doc]:
            tf[i,term_index[t]] += 1

    # Document frequency & IDF
    df = np.sum(tf>0, axis=0)
    idf = np.log((N+1)/(df+1)) + 1

    # TF-IDF Standard
    tfidf_std = tf * idf

    # TF-IDF Sublinear fixed
    tf_sub = np.where(tf>0, 1 + np.log(tf), 0)  # hanya dokumen dengan tf>0
    tfidf_sub = tf_sub * idf

    return doc_ids, vocab, tfidf_std, tfidf_sub, idf, term_index

# -----------------------------
# 7Ô∏è‚É£ VSM query
# -----------------------------
def vsm_query(q, docs, doc_ids, vocab, tfidf_std, tfidf_sub, term_index, top_k=TOP_K):
    q_tokens = q.lower().split()
    q_vec_std = np.zeros(len(vocab))
    q_vec_sub = np.zeros(len(vocab))
    for t in q_tokens:
        if t in term_index:
            q_vec_std[term_index[t]] = 1
            q_vec_sub[term_index[t]] = 1  # sublinear query tf=1

    def cosine_sim(tfidf_matrix, q_vector):
        sim = tfidf_matrix @ q_vector
        doc_norm = np.linalg.norm(tfidf_matrix, axis=1)
        q_norm = np.linalg.norm(q_vector)
        return sim / (doc_norm*q_norm + 1e-8)

    sim_std = cosine_sim(tfidf_std, q_vec_std)
    sim_sub = cosine_sim(tfidf_sub, q_vec_sub)

    def top_k_table(sim):
        idx_sorted = np.argsort(-sim)
        table, pred = [], []
        for rank,i in enumerate(idx_sorted[:top_k],1):
            if sim[i] > 0:
                table.append([rank, doc_ids[i], round(sim[i],4), get_snippet(docs[doc_ids[i]])])
                pred.append(doc_ids[i])
        return table, pred

    return top_k_table(sim_std), top_k_table(sim_sub)

# -----------------------------
# 8Ô∏è‚É£ Evaluation metrics
# -----------------------------
def precision_recall_f1(pred, gold):
    pred_set, gold_set = set(pred), set(gold)
    tp = len(pred_set & gold_set)
    precision = tp/len(pred_set) if pred_set else 0
    recall = tp/len(gold_set) if gold_set else 0
    f1 = 2*precision*recall/(precision+recall) if (precision+recall)>0 else 0
    return round(precision,2), round(recall,2), round(f1,2)

def mean_average_precision(pred, gold):
    gold_set = set(gold)
    score = hits = 0
    for i, p in enumerate(pred,1):
        if p in gold_set:
            hits += 1
            score += hits/i
    return round(score/len(gold),2) if gold else 0

def ndcg(pred, gold):
    dcg = idcg = 0
    gold_set = set(gold)
    for i, p in enumerate(pred,1):
        if p in gold_set:
            dcg += 1/np.log2(i+1)
    for i in range(1, min(len(gold), len(pred))+1):
        idcg += 1/np.log2(i+1)
    return round(dcg/idcg,2) if idcg>0 else 0

# -----------------------------
# 9Ô∏è‚É£ Run search engine
# -----------------------------
if __name__ == "__main__":
    docs = load_docs(PROCESSED_PATH)
    if not docs:
        exit()

    inverted_index = build_inverted_index(docs)
    doc_ids, vocab, tfidf_std, tfidf_sub, idf, term_index = build_vsm(docs)

    # --- Boolean queries ---
    queries_boolean = [
        "universitas AND fakultas",
        "fakultas AND teknik",
        "NOT bandung",
        "universitas OR bandung"
    ]
    gold_boolean = {
        "universitas AND fakultas":["itb.txt","ub.txt","udinus.txt"],
        "fakultas AND teknik":["itb.txt","stmik_bm_palu.txt","ub.txt"],
        "NOT bandung":["stmik_bm_palu.txt","ub.txt","udinus.txt"],
        "universitas OR bandung":["itb.txt","ub.txt","udinus.txt"]
    }

    for q in queries_boolean:
        table, pred = boolean_with_snippet(q, docs, inverted_index)
        print(f"\nüîé BOOLEAN QUERY: {q}")
        if table: print(tabulate(table, headers=["Rank","Doc","Snippet"], tablefmt="grid"))
        else: print("Tidak ada hasil Boolean.")
        P,R,F1 = precision_recall_f1(pred, gold_boolean.get(q,[]))
        MAP = mean_average_precision(pred, gold_boolean.get(q,[]))
        nDCG = ndcg(pred, gold_boolean.get(q,[]))
        print(f"Precision:{P}, Recall:{R}, F1:{F1}, MAP@{TOP_K}:{MAP}, nDCG@{TOP_K}:{nDCG}")

    # --- VSM queries ---
    queries_vsm = ["universitas", "fakultas teknik", "bandung"]
    gold_vsm = {
        "universitas":["itb.txt","ub.txt","udinus.txt"],
        "fakultas teknik":["itb.txt","stmik_bm_palu.txt","ub.txt"],
        "bandung":["itb.txt"]
    }

    for q in queries_vsm:
        print(f"\nüîé VSM QUERY: {q}")
        (table_std, pred_std), (table_sub, pred_sub) = vsm_query(q, docs, doc_ids, vocab, tfidf_std, tfidf_sub, term_index)
        
        print("TF-IDF Standard:")
        if table_std: print(tabulate(table_std, headers=["Rank","Doc","Score","Snippet"], tablefmt="grid"))
        else: print("Tidak ada hasil.")
        
        print("TF-IDF Sublinear:")
        if table_sub: print(tabulate(table_sub, headers=["Rank","Doc","Score","Snippet"], tablefmt="grid"))
        else: print("Tidak ada hasil.")

        # Evaluasi
        P,R,F1 = precision_recall_f1(pred_std, gold_vsm.get(q,[]))
        MAP = mean_average_precision(pred_std, gold_vsm.get(q,[]))
        nDCG = ndcg(pred_std, gold_vsm.get(q,[]))
        print(f"[Standard] Precision:{P}, Recall:{R}, F1:{F1}, MAP@{TOP_K}:{MAP}, nDCG@{TOP_K}:{nDCG}")

        P,R,F1 = precision_recall_f1(pred_sub, gold_vsm.get(q,[]))
        MAP = mean_average_precision(pred_sub, gold_vsm.get(q,[]))
        nDCG = ndcg(pred_sub, gold_vsm.get(q,[]))
        print(f"[Sublinear] Precision:{P}, Recall:{R}, F1:{F1}, MAP@{TOP_K}:{MAP}, nDCG@{TOP_K}:{nDCG}")


15 dokumen dimuat.

üîé BOOLEAN QUERY: universitas AND fakultas
+--------+------------+---------------------------------------------------------------------------------------------------------+
|   Rank | Doc        | Snippet                                                                                                 |
|      1 | itb.txt    | informasi umum institut teknologi bandung itb nama singkatan nama resmi institut teknologi bandung i... |
+--------+------------+---------------------------------------------------------------------------------------------------------+
|      2 | ub.txt     | informasi umum universitas brawijaya ub nama singkatan nama resmi universitas brawijaya ub singkatan... |
+--------+------------+---------------------------------------------------------------------------------------------------------+
|      3 | udinus.txt | informasi umum universitas dian nuswantoro udinus nama singkatan nama resmi universitas dian nuswant... |
+--------+------------+--

  tf_sub = np.where(tf>0, 1 + np.log(tf), 0)  # hanya dokumen dengan tf>0
