In [26]:
!pip install pandas tabulate

Collecting tabulate
  Obtaining dependency information for tabulate from https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl.metadata
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [38]:
import pymongo
from pymongo import MongoClient, UpdateOne
import re
from tqdm import tqdm
from nltk.tokenize import word_tokenize
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [17]:
client = MongoClient("mongodb://localhost:27017")
db = client["arxiv_db"]
collection = db["papers"]

In [18]:
total_docs = collection.count_documents({})
print(f"Total dokumen dalam MongoDB: {total_docs}")

Total dokumen dalam MongoDB: 16799


In [19]:
print("\n SAMPLE DATA:")
sample_doc = collection.find_one({})
if sample_doc:
    print("Struktur dokumen:")
    for key, value in sample_doc.items():
        if isinstance(value, str) and len(value) > 100:
            print(f"  {key}: {value[:100]}...")
        else:
            print(f"  {key}: {value}")
else:
    print(" Tidak ada data dalam collection")


 SAMPLE DATA:
Struktur dokumen:
  _id: 684134f79cfee88b7605bd0a
  id: http://arxiv.org/abs/2001.12004v2
  title: Neural MMO v1.3: A Massively Multiagent Game Environment for Training and Evaluating Neural Networks
  authors: Joseph Suarez, Yilun Du, Igor Mordatch, Phillip Isola
  summary: Progress in multiagent intelligence research is fundamentally limited by the
number and quality of e...
  published: 2020-01-31
  updated: 2020-04-17
  primary_category: cs.LG
  categories: cs.LG, cs.AI, cs.MA, stat.ML
  pdf_url: http://arxiv.org/pdf/2001.12004v2
  is_english: True
  combined_text: 
  is_processed: False
  processed_summary: 
  processed_title: 
  text_length: 0


In [20]:
# Statistik kategori
print("\n STATISTIK KATEGORI:")
pipeline = [
    {"$group": {"_id": "$primary_category", "count": {"$sum": 1}}},
    {"$sort": {"count": -1}},
    {"$limit": 10}
]
top_categories = list(collection.aggregate(pipeline))
print("Top 10 kategori:")
for cat in top_categories:
    print(f"  {cat['_id']}: {cat['count']} papers")


 STATISTIK KATEGORI:
Top 10 kategori:
  cs.CV: 2466 papers
  cs.LG: 1968 papers
  eess.IV: 1072 papers
  cs.SE: 976 papers
  cs.CR: 970 papers
  cs.DS: 854 papers
  cs.DB: 852 papers
  eess.AS: 788 papers
  eess.SY: 764 papers
  cs.GT: 741 papers


In [21]:
class MongoDBPreprocessor:
    def __init__(self, collection):
        self.collection = collection
        
    def preprocess_text(self, text):
        """Preprocessing text for BERT model"""
        if not text or pd.isna(text):
            return ""
        
        text = str(text)
        text = text.lower()
        
        # Remove URLs dan email
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\S+@\S+', '', text)
        
        # Remove special characters
        text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
        
        # Remove extra whitespaces
        text = ' '.join(text.split())
        
        words = text.split()
        
        return ' '.join(words)
    
    def process_sample(self, limit=5):
        """Process sample data for testing"""
        print(f" Processing {limit} sample documents...")
        
        cursor = self.collection.find({}).limit(limit)
        results = []
        
        for doc in cursor:
            original_title = doc.get('title', '')
            original_summary = doc.get('summary', '')
            
            processed_title = self.preprocess_text(original_title)
            processed_summary = self.preprocess_text(original_summary)
            combined_text = f"{processed_title} {processed_summary}".strip()
            
            result = {
                '_id': doc['_id'],
                'original_title': original_title,
                'processed_title': processed_title,
                'original_summary': original_summary[:200] + "..." if len(original_summary) > 200 else original_summary,
                'processed_summary': processed_summary[:200] + "..." if len(processed_summary) > 200 else processed_summary,
                'combined_text': combined_text[:300] + "..." if len(combined_text) > 300 else combined_text,
                'text_length': len(combined_text.split())
            }
            results.append(result)
        
        return results

# Jalankan preprocessing sample
preprocessor = MongoDBPreprocessor(collection)
sample_results = preprocessor.process_sample(limit=3)

 Processing 3 sample documents...


In [22]:
print(" HASIL PREPROCESSING SAMPLE:")
for i, result in enumerate(sample_results, 1):
    print(f"\n--- DOKUMEN {i} ---")
    print(f"ID: {result['_id']}")
    print(f"Original Title: {result['original_title']}")
    print(f"Processed Title: {result['processed_title']}")
    print(f"Original Summary: {result['original_summary']}")
    print(f"Processed Summary: {result['processed_summary']}")
    print(f"Combined Text Length: {result['text_length']} words")
    print("-" * 50)

 HASIL PREPROCESSING SAMPLE:

--- DOKUMEN 1 ---
ID: 684134f79cfee88b7605bd0a
Original Title: Neural MMO v1.3: A Massively Multiagent Game Environment for Training and Evaluating Neural Networks
Processed Title: neural mmo v1 3 a massively multiagent game environment for training and evaluating neural networks
Original Summary: Progress in multiagent intelligence research is fundamentally limited by the
number and quality of environments available for study. In recent years,
simulated games have become a dominant research pl...
Processed Summary: progress in multiagent intelligence research is fundamentally limited by the number and quality of environments available for study in recent years simulated games have become a dominant research plat...
Combined Text Length: 165 words
--------------------------------------------------

--- DOKUMEN 2 ---
ID: 684134f79cfee88b7605bd0b
Original Title: Deontological Ethics By Monotonicity Shape Constraints
Processed Title: deontological ethics by m

## PRE=PROCESS SELURUH DOKUMEN

In [23]:
class MongoDBPreprocessor:
    def __init__(self, collection):
        self.collection = collection
        
    def preprocess_text(self, text):
        """Preprocessing minimal untuk BERT tanpa stopword removal/lemmatisasi"""
        if not text or pd.isna(text):
            return ""
        
        text = str(text)
        
        # 1. Remove URLs dan email
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\S+@\S+', '', text)
        
        # 2. Remove special characters (tapi pertahankan tanda baca dasar)
        text = re.sub(r'[^a-zA-Z0-9\s.,!?]', ' ', text)
        
        # 3. Remove extra whitespace
        text = ' '.join(text.split())
        
        return text.strip()
    
    def process_all_documents(self, batch_size=500):
        """Preprocess seluruh dokumen dan simpan ke MongoDB"""
        total_docs = self.collection.count_documents({})
        print(f"Memulai preprocessing untuk {total_docs} dokumen...")
        
        # Setup progress bar
        pbar = tqdm(total=total_docs)
        
        # Process in batches
        for i in range(0, total_docs, batch_size):
            batch = list(self.collection.find({}).skip(i).limit(batch_size))
            
            bulk_operations = []
            for doc in batch:
                # Preprocess title dan summary
                processed_title = self.preprocess_text(doc.get('title', ''))
                processed_summary = self.preprocess_text(doc.get('summary', ''))
                combined_text = f"{processed_title} {processed_summary}".strip()
                
                # Prepare update operation
                bulk_operations.append(
                    pymongo.UpdateOne(
                        {'_id': doc['_id']},
                        {'$set': {
                            'processed_title': processed_title,
                            'processed_summary': processed_summary,
                            'combined_text': combined_text,
                            'text_length': len(combined_text.split()),
                            'is_processed': True
                        }}
                    )
                )
            
            # Execute bulk write
            if bulk_operations:
                self.collection.bulk_write(bulk_operations)
            
            # Update progress bar
            pbar.update(len(batch))
        
        pbar.close()
        print(f"Preprocessing selesai. Total {total_docs} dokumen diproses.")

In [36]:
# Koneksi ke MongoDB
client = MongoClient("mongodb://localhost:27017")
db = client["arxiv_db"]
collection = db["papers"]

# Inisialisasi field untuk data yang sudah diproses
collection.update_many(
    {},
    {'$set': {
        'is_processed': False,
        'processed_title': '',
        'processed_summary': '',
        'combined_text': '',
        'text_length': 0
    }},
    upsert=False
)

# Jalankan preprocessing
preprocessor = MongoDBPreprocessor(collection)
preprocessor.process_all_documents()


def display_as_dataframe(num_samples=3):
    samples = list(collection.find({'is_processed': True}).limit(num_samples))
    
    data = []
    for sample in samples:
        data.append({
            'Type': 'Original',
            'Title': sample.get('title', '')[:80] + "..." if len(sample.get('title', '')) > 80 else sample.get('title', ''),
            'Summary': sample.get('summary', '')[:100] + "..." if len(sample.get('summary', '')) > 100 else sample.get('summary', ''),
            'Length': len(sample.get('summary', '').split())
        })
        data.append({
            'Type': 'Processed', 
            'Title': sample.get('processed_title', '')[:80] + "..." if len(sample.get('processed_title', '')) > 80 else sample.get('processed_title', ''),
            'Summary': sample.get('processed_summary', '')[:100] + "..." if len(sample.get('processed_summary', '')) > 100 else sample.get('processed_summary', ''),
            'Length': sample.get('text_length', 0)
        })
    
    df = pd.DataFrame(data)
    pd.set_option('display.max_colwidth', 60)
    print(df.to_markdown(tablefmt="grid", index=False))

display_as_dataframe()

Memulai preprocessing untuk 16799 dokumen...


100%|██████████| 16799/16799 [00:03<00:00, 5196.23it/s]

Preprocessing selesai. Total 16799 dokumen diproses.
+-----------+-------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------+----------+
| Type      | Title                                                                               | Summary                                                                                                 |   Length |
| Original  | Neural MMO v1.3: A Massively Multiagent Game Environment for Training and Evalua... | Progress in multiagent intelligence research is fundamentally limited by the                            |      147 |
|           |                                                                                     | number and quality of e...                                                                              |          |
+-----------+------------------------------------------------------------------




In [39]:
# Load model BERT ringan untuk efisiensi
model = SentenceTransformer('all-MiniLM-L6-v2')

# Ambil dokumen yang sudah diproses tapi belum punya embedding
cursor = collection.find({
    "is_processed": True,
    "embedding": {"$exists": False}
})

batch = list(cursor)
texts = [doc['combined_text'] for doc in batch]
embeddings = model.encode(texts, show_progress_bar=True)

# Simpan ke MongoDB
ops = []
for doc, embed in zip(batch, embeddings):
    ops.append(UpdateOne(
        {"_id": doc["_id"]},
        {"$set": {"embedding": embed.tolist()}}
    ))

if ops:
    collection.bulk_write(ops)
    print(f"{len(ops)} embedding berhasil disimpan.")
else:
    print("Tidak ada dokumen baru untuk diproses.")

Batches: 100%|██████████| 525/525 [08:31<00:00,  1.03it/s]


16799 embedding berhasil disimpan.


In [40]:
# Ambil beberapa dokumen yang sudah punya embedding
cursor = collection.find(
    {"embedding": {"$exists": True}},
    {
        "title": 1,
        "processed_title": 1,
        "processed_summary": 1,
        "combined_text": 1,
        "text_length": 1,
        "embedding": {"$slice": 5}, 
        "published": 1,
        "primary_category": 1,
        "pdf_url": 1
    }
).limit(3)

# Format tampilan
docs = list(cursor)

if not docs:
    print("Belum ada dokumen dengan embedding.")
else:
    for i, doc in enumerate(docs, 1):
        print(f"\n===== DOKUMEN #{i} =====")
        print(f"Title             : {doc.get('title', '')}")
        print(f"Kategori         : {doc.get('primary_category', '')}")
        print(f"Published         : {doc.get('published', '')}")
        print(f"PDF URL           : {doc.get('pdf_url', '')}")
        print(f"Processed Title   : {doc.get('processed_title', '')}")
        print(f"Processed Summary : {doc.get('processed_summary', '')[:200]}...")
        print(f"Combined Text     : {doc.get('combined_text', '')[:200]}...")
        print(f"Text Length       : {doc.get('text_length', 0)} words")
        print(f"Embedding Sample  : {doc.get('embedding', [])[:5]} ... (dimensi: {len(doc.get('embedding', []))} total)")
        print("=" * 70)


===== DOKUMEN #1 =====
Title             : Neural MMO v1.3: A Massively Multiagent Game Environment for Training and Evaluating Neural Networks
Kategori         : cs.LG
Published         : 2020-01-31
PDF URL           : http://arxiv.org/pdf/2001.12004v2
Processed Title   : Neural MMO v1.3 A Massively Multiagent Game Environment for Training and Evaluating Neural Networks
Processed Summary : Progress in multiagent intelligence research is fundamentally limited by the number and quality of environments available for study. In recent years, simulated games have become a dominant research pl...
Combined Text     : Neural MMO v1.3 A Massively Multiagent Game Environment for Training and Evaluating Neural Networks Progress in multiagent intelligence research is fundamentally limited by the number and quality of e...
Text Length       : 167 words
Embedding Sample  : [-0.02622828260064125, -0.05907943844795227, -0.013319294899702072, -0.021589957177639008, 0.011467973701655865] ... (dimensi: 

In [43]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def recommend_articles(query, top_k=5):
    # Encode query ke embedding
    query_embedding = model.encode([query])[0]
    
    # Ambil semua dokumen yang sudah punya embedding
    cursor = collection.find({"embedding": {"$exists": True}})
    
    results = []
    for doc in cursor:
        doc_embedding = np.array(doc["embedding"])
        similarity = cosine_similarity([query_embedding], [doc_embedding])[0][0]
        results.append((doc, similarity))
    
    # Urutkan berdasarkan similarity
    results = sorted(results, key=lambda x: x[1], reverse=True)[:top_k]
    
    for i, (doc, score) in enumerate(results, 1):
        print(f"\n== Artikel #{i} ==")
        print(f"Title      : {doc['title']}")
        print(f"Score      : {score:.4f}")
        print(f"Authors    : {doc['authors']}")
        print(f"Published  : {doc['published']}")
        print(f"Category   : {doc['primary_category']}")
        print(f"PDF URL    : {doc['pdf_url']}")
        print(f"Summary    : {doc['summary'][:300]}...\n")

# Contoh penggunaan
recommend_articles("Convolutional Neural Networks for Image Classification")


== Artikel #1 ==
Title      : Convolutional Neural Networks as a Model of the Visual System: Past, Present, and Future
Score      : 0.5899
Authors    : Grace W. Lindsay
Published  : 2020-01-20
Category   : q-bio.NC
PDF URL    : http://arxiv.org/pdf/2001.07092v2
Summary    : Convolutional neural networks (CNNs) were inspired by early findings in the
study of biological vision. They have since become successful tools in computer
vision and state-of-the-art models of both neural activity and behavior on
visual tasks. This review highlights what, in the context of CNNs, it...


== Artikel #2 ==
Title      : Research Progress of Convolutional Neural Network and its Application in Object Detection
Score      : 0.5846
Authors    : Wei Zhang, Zuoxiang Zeng
Published  : 2020-07-27
Category   : cs.CV
PDF URL    : http://arxiv.org/pdf/2007.13284v1
Summary    : With the improvement of computer performance and the increase of data volume,
the object detection based on convolutional neural network 