In [1]:
"""
Information Retrieval System
Mencakup:
- Text Preprocessing dan Tokenisasi (15%)
- Representasi Dokumen (Bag of Words) (15%)
- Implementasi Indexing dengan Whoosh (25%)
- Pencarian dan Ranking menggunakan Cosine Similarity (25%)
"""

import re
import math
import os
import shutil
from collections import defaultdict, Counter
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from whoosh import scoring

# Optional dependency: Sastrawi supplies the stemmer for Bahasa Indonesia.
# SASTRAWI_AVAILABLE records whether the import succeeded.
try:
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    SASTRAWI_AVAILABLE = True
except ImportError:
    # Sastrawi is not installed: record that and tell the user how to get it.
    SASTRAWI_AVAILABLE = False
    print("Warning: Sastrawi not installed. Install with: pip install Sastrawi")
    print("Using simple stemming as fallback.\n")


In [2]:

class TextPreprocessor:
    """Text preprocessing and tokenization (15%).

    The pipeline implemented by :meth:`preprocess` is: case folding ->
    punctuation removal -> whitespace tokenization -> stopword removal ->
    stemming. Stemming uses Sastrawi when it is available and requested,
    otherwise a small suffix-stripping fallback.
    """

    # Shortest stem the fallback stemmer is allowed to leave behind. Without
    # this guard, tokens such as 'ing', 'ed' or 'kan' were reduced to '' and
    # 'king' to 'k'. The value matches the guard the original 's' and 'i'
    # rules already applied (len(token) > 3).
    MIN_STEM_LENGTH = 3

    # Suffixes tried in order by the fallback stemmer: English first, then
    # Indonesian. Only the first suffix that matches is considered.
    _FALLBACK_SUFFIXES = ('ing', 'ed', 's', 'kan', 'an', 'i')

    def __init__(self, use_sastrawi=True):
        """Set up the stopword list and the stemmer.

        Args:
            use_sastrawi: Use the Sastrawi stemmer if it could be imported;
                when False (or Sastrawi is missing) the fallback is used.
        """
        # Indonesian and English stopwords
        self.stopwords = {
            'dan', 'di', 'ke', 'dari', 'yang', 'untuk', 'pada', 'dengan', 'adalah',
            'ini', 'itu', 'atau', 'se', 'akan', 'telah', 'ada', 'sebagai',
            'a', 'an', 'the', 'is', 'are', 'was', 'were', 'and', 'or', 'but',
            'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'
        }

        # Initialise the Sastrawi stemmer only when both requested and installed
        self.use_sastrawi = use_sastrawi and SASTRAWI_AVAILABLE
        if self.use_sastrawi:
            factory = StemmerFactory()
            self.sastrawi_stemmer = factory.create_stemmer()
            print("✓ Sastrawi Stemmer initialized\n")
        else:
            self.sastrawi_stemmer = None
            print("✗ Using simple stemming (Sastrawi not available)\n")

    def case_folding(self, text):
        """Return ``text`` converted to lowercase."""
        return text.lower()

    def remove_punctuation(self, text):
        """Replace every character that is neither a word character nor
        whitespace with a single space."""
        return re.sub(r'[^\w\s]', ' ', text)

    def tokenize(self, text):
        """Split ``text`` on whitespace and return the non-empty tokens."""
        return [token for token in text.split() if token]

    def remove_stopwords(self, tokens):
        """Return ``tokens`` without the entries found in ``self.stopwords``."""
        return [token for token in tokens if token not in self.stopwords]

    def stemming_simple(self, token):
        """Fallback stemmer: strip one known suffix from ``token``.

        The first suffix in ``_FALLBACK_SUFFIXES`` that the token ends with is
        removed, but only if at least ``MIN_STEM_LENGTH`` characters remain;
        otherwise the token is returned unchanged.
        """
        for suffix in self._FALLBACK_SUFFIXES:
            if token.endswith(suffix):
                stem = token[:-len(suffix)]
                # Never shrink a token to an empty or near-empty stem.
                if len(stem) >= self.MIN_STEM_LENGTH:
                    return stem
                return token
        return token

    def stemming(self, tokens):
        """Stem every token and return the results as a new list.

        Uses Sastrawi when enabled, otherwise :meth:`stemming_simple`.
        """
        if self.use_sastrawi:
            # The stemmer is called once per token (one string at a time).
            return [self.sastrawi_stemmer.stem(token) for token in tokens]
        return [self.stemming_simple(token) for token in tokens]

    def preprocess(self, text, verbose=False):
        """Run the full preprocessing pipeline on ``text``.

        Args:
            text: Raw input string.
            verbose: When True, print the intermediate result of every step.

        Returns:
            The list of stemmed tokens.
        """
        if verbose:
            print(f"Original Text: {text}")

        # 1. Case folding
        text = self.case_folding(text)
        if verbose:
            print(f"Case Folding: {text}")

        # 2. Remove punctuation
        text = self.remove_punctuation(text)
        if verbose:
            print(f"Remove Punctuation: {text}")

        # 3. Tokenization
        tokens = self.tokenize(text)
        if verbose:
            print(f"Tokenization ({len(tokens)} tokens): {tokens}")

        # 4. Remove stopwords
        tokens_before_stopword = tokens
        tokens = self.remove_stopwords(tokens)
        if verbose:
            removed = set(tokens_before_stopword) - set(tokens)
            print(f"Remove Stopwords ({len(tokens)} remaining): {tokens}")
            if removed:
                print(f"  Removed: {removed}")

        # 5. Stemming
        tokens_before_stem = tokens
        tokens = self.stemming(tokens)
        if verbose:
            print(f"Stemming ({len(tokens)} tokens): {tokens}")
            # Show only the tokens the stemmer actually changed
            changes = [
                (before, after)
                for before, after in zip(tokens_before_stem, tokens)
                if before != after
            ]
            if changes:
                print("  Stemming changes:")
                for before, after in changes:
                    print(f"    {before} → {after}")

        return tokens



In [3]:

class BagOfWords:
    """Document representation as Bag-of-Words count vectors (15%).

    Attributes:
        preprocessor: Object whose ``preprocess(text, verbose=False)`` returns
            a list of tokens.
        vocabulary: Sorted list of the distinct terms learned by :meth:`fit`.
        doc_vectors: One count vector per document from the last
            :meth:`transform` call, aligned with ``vocabulary``.
    """

    def __init__(self, preprocessor):
        self.preprocessor = preprocessor
        # Kept as a list from the start so the attribute has one type before
        # and after fit().
        self.vocabulary = []
        self.doc_vectors = []

    def fit(self, documents):
        """Learn the vocabulary from ``documents``.

        The vocabulary is rebuilt from scratch on every call, so ``fit`` can
        be called repeatedly. Previously a second call failed because the
        vocabulary had been turned from a set into a list.
        """
        terms = set()
        for doc in documents:
            terms.update(self.preprocessor.preprocess(doc))

        self.vocabulary = sorted(terms)
        # Vectors built against an older vocabulary would be misaligned.
        self.doc_vectors = []
        print(f"\nVocabulary ({len(self.vocabulary)} terms): {self.vocabulary}\n")

    def transform(self, documents):
        """Convert ``documents`` into count vectors over the vocabulary.

        Replaces ``self.doc_vectors``, prints each vector, and returns the
        list of vectors.
        """
        self.doc_vectors = []

        for idx, doc in enumerate(documents):
            tokens = self.preprocessor.preprocess(doc, verbose=False)
            token_counts = Counter(tokens)

            # One count per vocabulary term, in vocabulary order
            vector = [token_counts.get(term, 0) for term in self.vocabulary]
            self.doc_vectors.append(vector)

            print(f"Document {idx + 1} Vector:")
            print(f"  Text: {doc}")
            print(f"  Tokens: {tokens}")
            print(f"  Vector: {vector}")
            print()

        return self.doc_vectors

    def fit_transform(self, documents):
        """Call :meth:`fit` and then :meth:`transform` on the same documents."""
        self.fit(documents)
        return self.transform(documents)

    def get_term_frequency(self, doc_idx, term):
        """Return the count of ``term`` in document ``doc_idx`` (0-based).

        Returns 0 when ``term`` is not in the vocabulary.
        """
        try:
            # Single scan instead of an `in` check followed by `.index`
            term_idx = self.vocabulary.index(term)
        except ValueError:
            return 0
        return self.doc_vectors[doc_idx][term_idx]



In [4]:
class WhooshIndexer:
    """Indexing and searching with Whoosh (25%).

    Attributes:
        index_dir: Directory that holds the on-disk index.
        schema: Whoosh schema with a stored ``doc_id`` and stored ``content``.
        ix: The open index, or None until it is created or opened.
    """

    def __init__(self, index_dir="indexdir"):
        self.index_dir = index_dir
        self.schema = Schema(
            doc_id=ID(stored=True),
            content=TEXT(stored=True)
        )
        self.ix = None

    def _ensure_index(self):
        """Return the open index, opening ``index_dir`` on first use."""
        if self.ix is None:
            self.ix = open_dir(self.index_dir)
        return self.ix

    def create_index(self):
        """Create a fresh, empty index, deleting any existing index directory."""
        # Remove the old index if there is one
        if os.path.exists(self.index_dir):
            shutil.rmtree(self.index_dir)

        # makedirs (not mkdir) so a nested index path also works
        os.makedirs(self.index_dir)
        self.ix = create_in(self.index_dir, self.schema)
        print(f"Index created in '{self.index_dir}'")

    def add_documents(self, documents):
        """Add ``documents`` to the index with 1-based string ids.

        The writer is used as a context manager: it commits when the block
        finishes and cancels (releasing the index write lock) if adding a
        document raises.
        """
        ix = self._ensure_index()

        with ix.writer() as writer:
            for idx, doc in enumerate(documents):
                writer.add_document(
                    doc_id=str(idx + 1),
                    content=doc
                )
                print(f"Added Document {idx + 1}: {doc[:50]}...")

        print(f"\n{len(documents)} documents indexed successfully!\n")

    def search_whoosh(self, query_text, limit=10):
        """Search the ``content`` field with BM25F scoring.

        Args:
            query_text: Query string parsed by Whoosh's ``QueryParser``.
            limit: Maximum number of hits requested from the searcher.

        Returns:
            A list of dicts with keys ``doc_id``, ``content`` and ``score``.
            The same information is printed.
        """
        ix = self._ensure_index()

        with ix.searcher(weighting=scoring.BM25F()) as searcher:
            query = QueryParser("content", ix.schema).parse(query_text)
            results = searcher.search(query, limit=limit)

            print(f"Whoosh Search Results for: '{query_text}'")
            print(f"Found {len(results)} results\n")

            # Copy stored fields out while the searcher is still open
            search_results = []
            for hit in results:
                search_results.append({
                    'doc_id': hit['doc_id'],
                    'content': hit['content'],
                    'score': hit.score
                })
                print(f"Doc ID: {hit['doc_id']}")
                print(f"Score: {hit.score:.4f}")
                print(f"Content: {hit['content'][:100]}...")
                print()

            return search_results



In [5]:

class CosineSimilarityRanker:
    """Search and ranking with cosine similarity (25%)."""

    def __init__(self, bow_model):
        self.bow_model = bow_model

    def calculate_cosine_similarity(self, vec1, vec2):
        """Return the cosine of the angle between ``vec1`` and ``vec2``.

        Returns 0.0 when either vector has zero magnitude.
        """
        inner = sum(x * y for x, y in zip(vec1, vec2))
        norm_first = math.sqrt(sum(x * x for x in vec1))
        norm_second = math.sqrt(sum(y * y for y in vec2))

        # A zero vector has no direction, so similarity is defined as 0.
        if not (norm_first and norm_second):
            return 0.0

        return inner / (norm_first * norm_second)

    def search(self, query, documents):
        """Rank ``documents`` against ``query`` by cosine similarity.

        The query is preprocessed with the model's preprocessor and turned
        into a count vector over the model's vocabulary. Every document
        vector in the model is scored, zero-similarity documents are dropped,
        and the rest are returned (and printed) in descending order.

        Returns:
            A list of dicts with keys ``doc_id`` (1-based), ``content``,
            ``similarity`` and ``score_percent``.
        """
        model = self.bow_model

        query_tokens = model.preprocessor.preprocess(query, verbose=False)
        term_counts = Counter(query_tokens)
        # Counter yields 0 for terms the query does not contain
        query_vector = [term_counts[term] for term in model.vocabulary]

        print(f"\nQuery: '{query}'")
        print(f"Query Tokens: {query_tokens}")
        print(f"Query Vector: {query_vector}\n")

        # Score every document first, then keep only the matching ones
        scored = []
        for doc_number, doc_vector in enumerate(model.doc_vectors, start=1):
            score = self.calculate_cosine_similarity(query_vector, doc_vector)
            scored.append({
                'doc_id': doc_number,
                'content': documents[doc_number - 1],
                'similarity': score,
                'score_percent': score * 100
            })

        # The stable sort keeps ties in document order
        ranked = sorted(
            (entry for entry in scored if entry['similarity'] > 0),
            key=lambda entry: entry['similarity'],
            reverse=True,
        )

        print("Cosine Similarity Ranking:")
        print("-" * 80)
        for position, entry in enumerate(ranked, start=1):
            print(f"Rank {position}:")
            print(f"  Doc ID: {entry['doc_id']}")
            print(f"  Similarity: {entry['similarity']:.4f} ({entry['score_percent']:.2f}%)")
            print(f"  Content: {entry['content'][:100]}...")
            print()

        return ranked



In [None]:

def main():
    """Run the full Information Retrieval demo.

    Walks through preprocessing, Bag-of-Words vectors, Whoosh indexing and
    search, and cosine-similarity ranking on a small sample corpus, then
    prints a comparison of the methods and a stemming example table.
    """
    rule = "=" * 80

    def banner(title, lead="\n", tail=""):
        # Section heading framed by two 80-character rules
        print(lead + rule)
        print(title)
        print(rule + tail)

    banner("INFORMATION RETRIEVAL SYSTEM WITH SASTRAWI STEMMER", lead="")

    # Sample corpus used by every stage of the demo
    documents = [
        "Pemrosesan bahasa alami adalah cabang dari kecerdasan buatan",
        "Algoritma pembelajaran mesin dapat memproses dan menganalisis dataset besar",
        "Sistem temu kembali informasi membantu pengguna menemukan dokumen yang relevan",
        "Python adalah bahasa pemrograman populer untuk ilmu data",
        "Model pembelajaran mendalam memerlukan data pelatihan dalam jumlah besar"
    ]

    # Stage 1: show every preprocessing step on the first document
    banner("1. TEXT PREPROCESSING DAN TOKENISASI (15%)")
    preprocessor = TextPreprocessor(use_sastrawi=True)
    sample_text = documents[0]
    print(f"\nSample Text: {sample_text}\n")
    preprocessor.preprocess(sample_text, verbose=True)

    # Stage 2: build the vocabulary and the document vectors
    banner("2. REPRESENTASI DOKUMEN - BAG OF WORDS (15%)")
    bow = BagOfWords(preprocessor)
    bow.fit_transform(documents)

    # Stage 3: index the corpus with Whoosh and run one query
    banner("3. IMPLEMENTASI INDEXING DENGAN WHOOSH (25%)", tail="\n")
    indexer = WhooshIndexer()
    indexer.create_index()
    indexer.add_documents(documents)
    indexer.search_whoosh("pembelajaran mesin")

    # Stage 4: rank the same corpus with cosine similarity
    banner("4. PENCARIAN DAN RANKING - COSINE SIMILARITY (25%)")
    ranker = CosineSimilarityRanker(bow)
    ranker.search("pembelajaran mesin data", documents)

    # Extra queries
    banner("DEMO PENCARIAN TAMBAHAN")
    for query in ("sistem informasi", "pemrograman python", "kecerdasan buatan"):
        print(f"\n--- Query: '{query}' ---")
        if not ranker.search(query, documents):
            print("No relevant documents found.")

    # Static closing text: menu, method comparison, stemmer notes
    closing_lines = (
        "=== INFORMATION RETRIEVAL SYSTEM ===",
        "[1] load & Index Dataset",
        "[2] Search Query",
        "[3] Exit",
        "PERBANDINGAN METODE & KEUNGGULAN SASTRAWI",
        rule,
        "\nWhoosh (BM25F) vs Cosine Similarity:",
        "- Whoosh: Menggunakan algoritma BM25F yang mempertimbangkan TF-IDF",
        "- Cosine Similarity: Mengukur kesamaan sudut antar vector dokumen",
        "\nKeunggulan Sastrawi Stemmer:",
        "- ✓ Akurasi tinggi untuk Bahasa Indonesia",
        "- ✓ Menangani imbuhan kompleks (me-, ber-, pe-, ter-, dll)",
        "- ✓ Kamus kata dasar yang lengkap",
        "- ✓ Algoritma Nazief & Adriani yang terbukti efektif",
        "\nContoh stemming dengan Sastrawi:",
    )
    for line in closing_lines:
        print(line)

    # Stemming table: original word -> stem (unchanged when Sastrawi is off)
    print(f"{'Kata Asli':<20} → {'Kata Dasar':<20}")
    print("-" * 42)
    for word in ("memproses", "pembelajaran", "menganalisis", "menemukan", "memerlukan"):
        if preprocessor.use_sastrawi:
            stemmed = preprocessor.sastrawi_stemmer.stem(word)
        else:
            stemmed = word
        print(f"{word:<20} → {stemmed:<20}")
    

# Run the demo only when this code is executed as the main module.
if __name__ == "__main__":
    main()

INFORMATION RETRIEVAL SYSTEM WITH SASTRAWI STEMMER

1. TEXT PREPROCESSING DAN TOKENISASI (15%)
✓ Sastrawi Stemmer initialized


Sample Text: Pemrosesan bahasa alami adalah cabang dari kecerdasan buatan

Original Text: Pemrosesan bahasa alami adalah cabang dari kecerdasan buatan
Case Folding: pemrosesan bahasa alami adalah cabang dari kecerdasan buatan
Remove Punctuation: pemrosesan bahasa alami adalah cabang dari kecerdasan buatan
Tokenization (8 tokens): ['pemrosesan', 'bahasa', 'alami', 'adalah', 'cabang', 'dari', 'kecerdasan', 'buatan']
Remove Stopwords (6 remaining): ['pemrosesan', 'bahasa', 'alami', 'cabang', 'kecerdasan', 'buatan']
  Removed: {'adalah', 'dari'}
Stemming (6 tokens): ['pemrosesan', 'bahasa', 'alami', 'cabang', 'cerdas', 'buat']
  Stemming changes:
    kecerdasan → cerdas
    buatan → buat

2. REPRESENTASI DOKUMEN - BAG OF WORDS (15%)

Vocabulary (32 terms): ['ajar', 'alami', 'algoritma', 'analis', 'bahasa', 'bantu', 'besar', 'buat', 'cabang', 'cerdas', 'dalam', 'da