# Install dan import library + Setting Google Drive

In [None]:
# Authenticate this notebook session with the Hugging Face Hub through the
# interactive login widget.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install biopython
!pip install nltk
!pip install sentence-transformers
!pip install -U transformers torch sentencepiece accelerate bitsandbytes
!pip install torch torchvision torchaudio --upgrade

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvid

In [None]:
import nltk
# Download the NLTK data packages for this notebook. The first three calls
# suppress NLTK's progress output; the last one keeps it.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from Bio import Entrez
from Bio import Medline
import time
import pandas as pd
import numpy as np
import re
import os
from google.colab import drive
from typing import List, Optional, Any, Tuple, Dict

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
print("Mounting Google Drive...")
drive.mount('/content/drive')

Mounting Google Drive...
Mounted at /content/drive


In [None]:
# Project directory on the mounted Drive; created if it does not exist yet.
gdrive_base_dir = "/content/drive/MyDrive/pubmed-chatbot-using-rag-llm"
os.makedirs(gdrive_base_dir, exist_ok=True)
print(f"File akan disimpan di: {gdrive_base_dir}")

File akan disimpan di: /content/drive/MyDrive/pubmed-chatbot-using-rag-llm


# Membangun dataset

## Fungsi untuk meng-scraping dataset dari PubMed

Untuk dataset, saya menggunakan artikel dari PubMed yang dicari dengan daftar keyword yang telah dikurasi. Pengambilan data dilakukan melalui modul `Entrez` (beserta parser `Medline`) dari library Biopython.

In [None]:
# Set the contact e-mail on Biopython's Entrez module before any request is made.
# NOTE(review): hard-coded personal address — consider loading it from configuration.
Entrez.email = "afiqilyasakmal@gmail.com"

def _article_from_medline_record(record, keyword_query: str) -> Optional[dict[str, str]]:
    """Convert one parsed MEDLINE record into an article dict, or None if it has no abstract."""
    abstract = record.get("AB", "No abstract available")
    if abstract == "No abstract available":
        return None

    authors_list = record.get("AU", [])
    journal_ta = record.get("TA", "No journal TA available")
    journal_jt = record.get("JT", "No journal Title")
    return {
        "pmid": record.get("PMID", ""),
        "title": record.get("TI", "No title available"),
        "abstract": abstract,
        "authors": ", ".join(authors_list) if authors_list else "No authors listed",
        # Prefer the TA field; fall back to JT when TA is absent.
        "journal": journal_ta if journal_ta != "No journal TA available" else journal_jt,
        "publication_date": record.get("DP", "No publication date available"),
        "retrieved_with_keyword": keyword_query,
    }


def fetch_pubmed_abstracts_multi_keyword(
    keyword_list: list[str],
    max_results_per_keyword: int,
    request_delay_seconds: float = 0.34,
) -> list[dict[str, str]]:
    """
    Search PubMed for every keyword and collect the details of the matching articles.

    Results of all keywords are merged and de-duplicated by PMID: an article is
    stored once, tagged with the first keyword that returned it. Articles
    without an abstract are skipped.

    Args:
        keyword_list (list[str]): Search terms to query, one PubMed search each.
        max_results_per_keyword (int): Maximum number of search hits requested per keyword.
        request_delay_seconds (float): Pause inserted between consecutive Entrez
            requests. Defaults to 0.34.

    Returns:
        list[dict[str, str]]: One dict per article with the keys pmid, title,
            abstract, authors, journal, publication_date and
            retrieved_with_keyword; an empty list if nothing was retrieved.
    """
    all_articles_data: list[dict[str, str]] = []
    # Every PMID already parsed, stored or not, so that later keywords do not
    # download the same record again.
    seen_pmids: set[str] = set()

    for keyword_query in keyword_list:
        handle_search = None
        handle_fetch = None
        try:
            handle_search = Entrez.esearch(
                db="pubmed",
                term=keyword_query,
                retmax=str(max_results_per_keyword),
                sort="relevance",
            )
            record_search = Entrez.read(handle_search)
            ids: list[str] = record_search["IdList"]

            if not ids:
                print(f"Tidak ada artikel yang ditemukan untuk kata kunci: '{keyword_query}'")
                continue

            new_ids_to_fetch = [pmid for pmid in ids if pmid not in seen_pmids]
            if not new_ids_to_fetch:
                continue

            # Throttle between requests (esearch -> efetch), not between parsed
            # records: parsing a downloaded record sends nothing to the server.
            time.sleep(request_delay_seconds)
            handle_fetch = Entrez.efetch(db="pubmed", id=new_ids_to_fetch, rettype="medline", retmode="text")

            for record in Medline.parse(handle_fetch):
                pmid = record.get("PMID", "")
                if not pmid or pmid in seen_pmids:
                    continue
                seen_pmids.add(pmid)

                article = _article_from_medline_record(record, keyword_query)
                if article is not None:
                    all_articles_data.append(article)

        except Exception as e:
            # Best effort: a failure for one keyword must not abort the whole run.
            print(f"Terjadi kesalahan saat memproses kata kunci '{keyword_query}': {e}")
            time.sleep(1)
        finally:
            if handle_search:
                handle_search.close()
            if handle_fetch:
                handle_fetch.close()
            # Pause before the next keyword's search request.
            time.sleep(request_delay_seconds)

    if not all_articles_data:
        print("Tidak ada artikel yang berhasil diambil dari semua kata kunci yang diberikan.")
    else:
        print(f"Total {len(all_articles_data)} artikel unique discraping dari semua keyword.")

    return all_articles_data

## Melakukan scraping dari PubMed dan membuat DataFrame
Scraping dilakukan dengan keyword yang didefinisikan pada `search_keywords_list`, kemudian hasilnya dimasukkan ke dalam DataFrame `df_pubmed`. Hasil scraping disimpan ke file `pubmed_dataset.csv` di folder proyek pada Google Drive.

In [None]:
# Curated PubMed search terms, one query per entry.
search_keywords_list = [
    "Hypertension AND Indonesia",
    "Stroke AND Indonesia",
    "Ischemic Heart Disease AND Indonesia",
    "Type 2 Diabetes Mellitus AND Indonesia",
    "Acute Respiratory Infection AND Indonesia",
    "Dengue Fever AND Indonesia",
    "Tuberculosis AND Indonesia",
    "COVID-19 AND Indonesia",
    "Avian Influenza H5N1 AND Indonesia",
    "Mpox OR Monkeypox AND Indonesia",
    "Typhoid Fever AND Indonesia",
    "Acute Diarrhea AND Indonesia",
    "Measles AND Indonesia",
    "Hepatitis A OR Hepatitis B AND Indonesia",
    "Cancer AND Indonesia",
    "Childhood Cancer AND Indonesia",
    "Helminthiasis AND Indonesia",
    "Skin Diseases AND Indonesia",
    "Diphtheria AND Indonesia",
    "Stunting AND Indonesia",
]
articles_per_keyword = 100

print("Memulai proses pengambilan artikel PubMed...")
fetched_articles: list[dict[str, str]] = fetch_pubmed_abstracts_multi_keyword(
    search_keywords_list,
    articles_per_keyword,
)

if not fetched_articles:
    print("Tidak ada artikel yang berhasil diambil atau diproses dari semua kata kunci.")
else:
    print(f"Total artikel unik {len(fetched_articles)}")
    df_pubmed = pd.DataFrame(fetched_articles)

    # Quick look at the collected data.
    print("Contoh data:")
    print(df_pubmed.head(2))
    print("Informasi column:")
    df_pubmed.info()

    # Persist the raw dataset in the project folder on Google Drive.
    output_filename_gdrive = os.path.join(gdrive_base_dir, "pubmed_dataset.csv")
    df_pubmed.to_csv(output_filename_gdrive, index=False, encoding='utf-8')
    print(f"Dataset disimpan ke Google Drive di: {output_filename_gdrive}")

    # Number of stored articles attributed to each search term.
    if 'retrieved_with_keyword' in df_pubmed.columns:
        print("Jumlah artikel per kata kunci sumber:")
        print(df_pubmed['retrieved_with_keyword'].value_counts())

# Preprocessing

## Fungsi untuk melakukan preprocessing
Preprocessing yang dilakukan:
1. Mengembalikan string kosong jika input kosong (null/NaN) atau bukan string
2. Mengubah teks menjadi huruf kecil
3. Menghapus tanda baca
4. Melakukan tokenisasi dengan `word_tokenize` dari NLTK
5. Melakukan *lemmatization* dengan `WordNetLemmatizer`

In [None]:
# Shared WordNetLemmatizer instance used by preprocess_single_text.
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess_single_text(text: Optional[str]) -> str:
    """
    Normalize a single text string.

    Steps:
    1. Return an empty string for any input that is not a string (None, NaN,
       numbers, lists, ...).
    2. Lowercase the text.
    3. Remove punctuation.
    4. Tokenize the text.
    5. Lemmatize every token.
    Stopwords are not removed by this function.

    Args:
        text (Optional[str]): Text to process. May be None or a non-string value.

    Returns:
        str: The processed tokens joined by single spaces, or an empty string
            when the input is not a string.
    """
    # A plain isinstance check already covers None and NaN. The previous
    # pd.isna() call in front of it raised ValueError for list/array inputs.
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    # Punctuation is deleted rather than replaced by a space, so hyphenated
    # words are fused ("health-care" -> "healthcare").
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)

    return " ".join(
        lemmatizer.lemmatize(token) for token in word_tokenize(without_punctuation)
    )

In [None]:
def batch_preprocess_dataframe_texts(
    input_csv_path: str,
    output_csv_path: str,
    columns_to_process: List[str]
) -> Optional[pd.DataFrame]:
    """
    Load a CSV dataset, preprocess the requested text columns and save the result.

    Args:
        input_csv_path (str): Path of the input CSV file.
        output_csv_path (str): Path where the processed CSV file is written.
        columns_to_process (List[str]): Names of the text columns to preprocess.
            For each one found, a new column named processed_<original_name>
            is added; names missing from the file only produce a warning.

    Returns:
        Optional[pd.DataFrame]: The processed DataFrame, or None if the input
            file could not be loaded. An empty input is returned as-is without
            writing the output file.
    """
    try:
        df = pd.read_csv(input_csv_path)
        print(f"Dataset '{input_csv_path}' berhasil dimuat. Jumlah baris: {len(df)}")
    except FileNotFoundError:
        print(f"Error: File '{input_csv_path}' tidak ditemukan. Pastikan path file benar.")
        return None
    except Exception as load_error:
        print(f"Error saat memuat CSV '{input_csv_path}': {load_error}")
        return None

    if df.empty:
        print("DataFrame kosong, tidak ada data untuk diproses.")
        return df

    for source_column in columns_to_process:
        if source_column not in df.columns:
            print(f"Peringatan: Kolom '{source_column}' tidak ditemukan dalam DataFrame.")
            continue
        print(f"Preprocessing untuk kolom '{source_column}'...")
        target_column = f"processed_{source_column}"
        df[target_column] = df[source_column].apply(preprocess_single_text)
        print(f"Kolom '{target_column}' telah dibuat.")

    try:
        df.to_csv(output_csv_path, index=False, encoding='utf-8')
        print(f"Dataset yang sudah dipreprocess disimpan di '{output_csv_path}'")
    except Exception as save_error:
        # Saving failed, but the in-memory result is still handed back.
        print(f"Error saat menyimpan CSV ke '{output_csv_path}': {save_error}")
        return df

    if len(df) == 0 or not columns_to_process:
        return df

    # Show a before/after sample for the first requested column, if it was processed.
    sample_column = columns_to_process[0]
    sample_processed_column = f"processed_{sample_column}"
    if sample_column in df.columns and sample_processed_column in df.columns:
        print("Before and after preprocessing")
        print(f"Original:\n{df[sample_column].iloc[0]}\n")
        print(f"Preprocessing:\n{df[sample_processed_column].iloc[0]}")

    return df

In [None]:
# Bare file names of the raw and the cleaned dataset.
# NOTE(review): the preprocessing cell further down builds full Google Drive
# paths itself and does not appear to use these two variables — confirm.
input_file = "pubmed_dataset.csv"
output_file_cleaned = "pubmed_dataset_cleaned.csv"

## Melakukan preprocessing

In [None]:
# Build the Google Drive paths of the raw and the cleaned dataset, then
# preprocess the 'abstract' and 'title' columns and keep the returned DataFrame.
gdrive_base_dir = "/content/drive/MyDrive/pubmed-chatbot-using-rag-llm"
input_file_gdrive = os.path.join(gdrive_base_dir, "pubmed_dataset.csv")
output_file_cleaned_gdrive = os.path.join(gdrive_base_dir, "pubmed_dataset_cleaned.csv")
df_processed = batch_preprocess_dataframe_texts(input_file_gdrive, output_file_cleaned_gdrive, ['abstract', 'title'])

Dataset '/content/drive/MyDrive/pubmed-chatbot-using-rag-llm/pubmed_dataset.csv' berhasil dimuat. Jumlah baris: 1738
Preprocessing untuk kolom 'abstract'...
Kolom 'processed_abstract' telah dibuat.
Preprocessing untuk kolom 'title'...
Kolom 'processed_title' telah dibuat.
Dataset yang sudah dipreprocess disimpan di '/content/drive/MyDrive/pubmed-chatbot-using-rag-llm/pubmed_dataset_cleaned.csv'
Before and after preprocessing
Original:
BACKGROUND: Hypertension can be detected at the primary health-care level and low-cost treatments can effectively control hypertension. We aimed to measure the prevalence of hypertension and progress in its detection, treatment, and control from 1990 to 2019 for 200 countries and territories. METHODS: We used data from 1990 to 2019 on people aged 30-79 years from population-representative studies with measurement of blood pressure and data on blood pressure treatment. We defined hypertension as having systolic blood pressure 140 mm Hg or greater, diastoli

# Embedding

## Fungsi untuk melakukan embedding
Untuk mengubah teks menjadi vektor, saya menggunakan model `all-MiniLM-L6-v2`. Model ini dipilih karena sangat populer dan sering direkomendasikan untuk menghasilkan sentence embeddings.

In [None]:
def load_cleaned_data(csv_file_path: str, text_column: str = 'processed_abstract') -> pd.DataFrame:
    """
    Load the preprocessed article data from a CSV file.

    Rows whose value in `text_column` is NaN or contains only whitespace are
    removed. The original row index is kept.

    Args:
        csv_file_path (str): Path of the CSV file holding the cleaned data.
        text_column (str): Name of the column with the processed text that will
            be embedded (for example 'processed_abstract').

    Returns:
        pd.DataFrame: The filtered data, or an empty DataFrame if the file
            cannot be read, the column is missing, or no row has usable text.
    """
    try:
        frame = pd.read_csv(csv_file_path)
        print(f"Dataset '{csv_file_path}' berhasil dimuat. Jumlah baris awal: {len(frame)}")

        if text_column not in frame.columns:
            print(f"Error: Kolom '{text_column}' tidak ditemukan di '{csv_file_path}'.")
            return pd.DataFrame()

        # First drop missing values, then values that are blank once stripped.
        frame = frame.dropna(subset=[text_column])
        has_text = frame[text_column].astype(str).str.strip() != ''
        frame = frame[has_text]

        if frame.empty:
            print(f"Tidak ada data valid di kolom '{text_column}' setelah dibersihkan.")
            return pd.DataFrame()

        print(f"Jumlah baris setelah memfilter kolom '{text_column}' yang kosong: {len(frame)}")
        return frame

    except FileNotFoundError:
        print(f"Error: File '{csv_file_path}' tidak ditemukan. Pastikan path file benar.")
        return pd.DataFrame()
    except Exception as load_error:
        print(f"Terjadi kesalahan saat memuat data dari '{csv_file_path}': {load_error}")
        return pd.DataFrame()

In [None]:
def load_sentence_transformer_model(model_name: str = 'all-MiniLM-L6-v2') -> Optional[SentenceTransformer]:
    """
    Load a Sentence Transformer model by name.

    Args:
        model_name (str): Name or path of the Sentence Transformer model, e.g.
            'all-MiniLM-L6-v2' or 'paraphrase-multilingual-MiniLM-L12-v2'.

    Returns:
        Optional[SentenceTransformer]: The loaded model, or None if loading failed.
    """
    print(f"Memuat model Sentence Transformer: '{model_name}'...")
    try:
        sentence_model = SentenceTransformer(model_name)
    except Exception as load_error:
        print(f"Gagal memuat model '{model_name}': {load_error}")
        return None
    else:
        print(f"Model '{model_name}' berhasil dimuat.")
        return sentence_model

In [None]:
def generate_embeddings_with_model(
    texts: List[str],
    model: SentenceTransformer,
    batch_size: int = 32
) -> Optional[np.ndarray]:
    """
    Encode a list of texts with an already loaded Sentence Transformer model.

    Args:
        texts (List[str]): Texts to embed.
        model (SentenceTransformer): Loaded SentenceTransformer instance.
        batch_size (int): Batch size passed to the encoder.

    Returns:
        Optional[np.ndarray]: The embeddings returned by the model, or None if
            there is nothing to encode, the model is not a SentenceTransformer,
            or encoding fails.
    """
    if not texts:
        print("Tidak ada teks yang diberikan untuk menghasilkan embeddings.")
        return None
    if not isinstance(model, SentenceTransformer):
        print("Model yang diberikan bukan instance SentenceTransformer yang valid.")
        return None

    # Use the first module's `name` in the log line when it is available,
    # otherwise fall back to a generic label.
    model_label = 'yang sudah dimuat'
    if hasattr(model, '_first_module') and callable(model._first_module):
        first_module = model._first_module()
        if hasattr(first_module, 'name'):
            model_label = first_module.name

    print(f"Menghasilkan embeddings untuk {len(texts)} teks menggunakan model '{model_label}'...")
    try:
        vectors = model.encode(texts, show_progress_bar=True, batch_size=batch_size)
        print("Embeddings berhasil dihasilkan.")
        return vectors
    except Exception as encode_error:
        print(f"Terjadi error saat menghasilkan embeddings: {encode_error}")
        return None


## Melakukan embedding
Hasil embedding disimpan dalam file `pubmed_article_embeddings.npy`. Saya memakai NumPy untuk menyimpan vektor karena jumlah dokumen tidak terlalu banyak (sekitar 1.700-an) dan kecil kemungkinannya untuk diperbarui. Jika dataset membesar atau perlu sering diperbarui, saya kemungkinan besar akan menggunakan vector database seperti Pinecone.

In [None]:
def run_embedding_pipeline():
    """
    Embed the processed abstracts and store the vectors next to their article rows.

    Reads the cleaned dataset from Google Drive, encodes the
    'processed_abstract' column with 'all-MiniLM-L6-v2', then saves the
    embeddings as .npy and a row-aligned CSV with the pmid, processed title and
    processed abstract. Stops with a message as soon as one step fails.
    """
    # Paths on Google Drive.
    base_dir = "/content/drive/MyDrive/pubmed-chatbot-using-rag-llm"  # example GDrive path
    cleaned_csv_path = os.path.join(base_dir, "pubmed_dataset_cleaned.csv")
    embeddings_path = os.path.join(base_dir, "pubmed_article_embeddings.npy")
    retrieval_csv_path = os.path.join(base_dir, "pubmed_articles_for_retrieval.csv")

    # Load the preprocessed data.
    articles_df = load_cleaned_data(cleaned_csv_path, text_column='processed_abstract')
    if articles_df.empty:
        print("Pipeline dihentikan karena gagal memuat data atau data kosong.")
        return

    # Load the embedding model.
    encoder = load_sentence_transformer_model(model_name='all-MiniLM-L6-v2')
    if encoder is None:
        print("Pipeline dihentikan karena gagal memuat model embedding.")
        return

    # Embed the abstracts.
    abstracts = articles_df['processed_abstract'].astype(str).tolist()
    if not abstracts:
        print("Tidak ada abstract yang valid untuk di-embed.")
        return

    embeddings = generate_embeddings_with_model(abstracts, encoder)
    if embeddings is None:
        print("Gagal menghasilkan embeddings artikel.")
        return

    print(f"Shape dari array embeddings artikel: {embeddings.shape}")
    np.save(embeddings_path, embeddings)
    print(f"Embeddings artikel disimpan ke '{embeddings_path}'")

    # Save the rows in embedding order; the index is reset so that row i
    # corresponds to embedding i.
    retrieval_columns = ['pmid', 'processed_title', 'processed_abstract']
    retrieval_df = articles_df[retrieval_columns].reset_index(drop=True)
    retrieval_df.to_csv(retrieval_csv_path, index=False)
    print(f"Data artikel yang sesuai dengan embeddings disimpan ke '{retrieval_csv_path}'")


if __name__ == "__main__":
    run_embedding_pipeline()

Dataset '/content/drive/MyDrive/pubmed-chatbot-using-rag-llm/pubmed_dataset_cleaned.csv' berhasil dimuat. Jumlah baris awal: 1738
Jumlah baris setelah memfilter kolom 'processed_abstract' yang kosong: 1738
Memuat model Sentence Transformer: 'all-MiniLM-L6-v2'...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model 'all-MiniLM-L6-v2' berhasil dimuat.
Menghasilkan embeddings untuk 1738 teks menggunakan model 'yang sudah dimuat'...


Batches:   0%|          | 0/55 [00:00<?, ?it/s]

Embeddings berhasil dihasilkan.
Shape dari array embeddings artikel: (1738, 384)
Embeddings artikel disimpan ke '/content/drive/MyDrive/pubmed-chatbot-using-rag-llm/pubmed_article_embeddings.npy'
Data artikel yang sesuai dengan embeddings disimpan ke '/content/drive/MyDrive/pubmed-chatbot-using-rag-llm/pubmed_articles_for_retrieval.csv'


# Retrieval
Melakukan retrieval pada dokumen dan query menggunakan cosine similarity. Akan diperoleh top 20 dokumen yang paling mirip dengan query.

In [None]:
def load_retrieval_assets(
    embeddings_file_path: str,
    data_file_path: str
) -> Tuple[Optional[np.ndarray], Optional[pd.DataFrame]]:
    """
    Load the article embeddings and the matching article table from disk.

    Args:
        embeddings_file_path (str): Path of the .npy file with the article embeddings.
        data_file_path (str): Path of the .csv file with the article data
            (e.g. PMID, processed title, processed abstract).

    Returns:
        Tuple[Optional[np.ndarray], Optional[pd.DataFrame]]:
        (article_embeddings, df_retrieval) on success. (None, None) if a file
        cannot be loaded or the number of embeddings differs from the number
        of data rows.
    """
    try:
        embeddings = np.load(embeddings_file_path)
        articles = pd.read_csv(data_file_path)
        print(f"Embeddings berhasil dimuat dari '{embeddings_file_path}'. Shape: {embeddings.shape}")
        print(f"Data artikel berhasil dimuat dari '{data_file_path}'. Jumlah baris: {len(articles)}")

        embedding_count = embeddings.shape[0]
        row_count = len(articles)
        if embedding_count == row_count:
            return embeddings, articles

        # Embedding i must describe row i, so differing counts make both unusable.
        print("Peringatan: Jumlah embeddings tidak cocok dengan jumlah baris data artikel!")
        print(f"Embeddings: {embedding_count}, Data: {row_count}")
        return None, None
    except FileNotFoundError:
        print("Error: Salah satu atau kedua file tidak ditemukan.")
        print(f"  - Path embeddings: '{embeddings_file_path}'")
        print(f"  - Path data: '{data_file_path}'")
        return None, None
    except Exception as load_error:
        print(f"Terjadi kesalahan saat memuat embeddings atau data: {load_error}")
        return None, None

In [None]:
def find_top_relevant_articles(
    query_embedding: np.ndarray,
    article_embeddings: np.ndarray,
    df_reference_articles: pd.DataFrame,
    top_n: int = 20
) -> List[Dict[str, Any]]:
    """
    Return the `top_n` articles most similar to the query by cosine similarity.

    Args:
        query_embedding (np.ndarray): Embedding of the query; a 1-D vector is
            reshaped into a single-row matrix.
        article_embeddings (np.ndarray): 2-D array with the embeddings of all
            articles in the corpus.
        df_reference_articles (pd.DataFrame): Article details in the same row
            order as `article_embeddings`. The columns 'pmid',
            'processed_title' and 'processed_abstract' are read.
        top_n (int): Number of most similar articles to return. Default = 20.

    Returns:
        List[Dict[str, Any]]: One dict per article with the keys 'rank',
            'pmid', 'title', 'abstract' and 'similarity', best match first.
            An empty list on invalid input or if an error occurs.
    """
    if any(item is None for item in (query_embedding, article_embeddings, df_reference_articles)):
        print("Error: Input embeddings atau DataFrame tidak boleh None.")
        return []

    if query_embedding.ndim == 1:
        query_embedding = query_embedding.reshape(1, -1)

    query_dim = query_embedding.shape[1]
    article_dim = article_embeddings.shape[1]
    if query_dim != article_dim:
        print(f"Error: Dimensi embedding query ({query_dim}) tidak cocok dengan dimensi embedding artikel ({article_dim}).")
        return []

    try:
        # Similarity of the single query row against every article.
        scores = cosine_similarity(query_embedding, article_embeddings)[0]

        # Article positions ordered from most to least similar.
        ranking = np.argsort(scores)[::-1]
        limit = max(min(top_n, len(ranking), len(df_reference_articles)), 0)

        results: List[Dict[str, Any]] = []
        for rank, position in enumerate(ranking[:limit], start=1):
            row = df_reference_articles.iloc[position]
            results.append({
                "rank": rank,
                "pmid": row.get('pmid', 'N/A'),
                "title": row.get('processed_title', 'No Title Available'),
                "abstract": row.get('processed_abstract', 'No Abstract Available'),
                "similarity": float(scores[position])
            })
        return results
    except Exception as search_error:
        print(f"Terjadi kesalahan saat mencari artikel relevan: {search_error}")
        return []

# Re-rank
Re-rank menggunakan cross encoder model `ms-marco-MiniLM-L-6-v2` untuk meningkatkan akurasi dan relevansi dokumen/artikel yang akhirnya diberikan sebagai konteks kepada Large Language Model (LLM).

In [None]:
def load_cross_encoder_model(model_name: str = 'cross-encoder/ms-marco-MiniLM-L-6-v2') -> Optional[CrossEncoder]:
    """
    Load a Cross-Encoder model through sentence-transformers.

    Args:
        model_name (str): Name of the cross-encoder model to load. Another
            example: 'cross-encoder/ms-marco-TinyBERT-L-2-v2'.

    Returns:
        Optional[CrossEncoder]: The loaded CrossEncoder, or None if loading failed.
    """
    print(f"Memuat model Cross-Encoder: '{model_name}'...")
    try:
        reranker = CrossEncoder(model_name)
    except Exception as load_error:
        print(f"Gagal memuat model Cross-Encoder '{model_name}': {load_error}")
        return None
    else:
        print(f"Model Cross-Encoder '{model_name}' berhasil dimuat.")
        return reranker

In [None]:
def rerank_retrieved_articles(
    original_query: str,
    retrieved_articles: List[Dict[str, Any]],
    cross_encoder_model: CrossEncoder,
    top_k_after_rerank: int = 5,
    text_field_for_reranking: str = 'abstract'
) -> List[Dict[str, Any]]:
    """
    Re-rank already retrieved articles with a cross-encoder model.

    Args:
        original_query (str): The user's query (raw or lightly processed text).
        retrieved_articles (List[Dict[str, Any]]): Article dicts from the first
            retrieval stage. Each should carry the field named by
            `text_field_for_reranking`.
        cross_encoder_model (CrossEncoder): Loaded cross-encoder; its
            `predict` method is called with [query, passage] pairs.
        top_k_after_rerank (int): Number of top articles to keep after
            re-ranking. Values below zero are treated as zero.
        text_field_for_reranking (str): Key of the article dict whose text is
            paired with the query, e.g. 'abstract' or 'title'.

    Returns:
        List[Dict[str, Any]]: Copies of the best-scoring articles, highest
            score first, each extended with 'rerank_score' and 'rerank_rank'.
            An empty list if the input is empty, no article has usable text,
            or the model raises. The input dicts are not modified.
    """
    if not original_query or not retrieved_articles or cross_encoder_model is None:
        print("Error: Query, daftar artikel, atau model cross-encoder tidak boleh kosong/None.")
        return []

    # Select the scorable articles once. A passage must be a non-blank string:
    # values such as NaN (an empty CSV cell read back by pandas) are truthy
    # and would otherwise reach the model and fail the whole batch.
    scorable_articles: List[Dict[str, Any]] = []
    for article in retrieved_articles:
        passage_text = article.get(text_field_for_reranking, "")
        if not isinstance(passage_text, str) or not passage_text.strip():
            print(f"Peringatan: Artikel PMID {article.get('pmid', 'N/A')} tidak memiliki teks di field '{text_field_for_reranking}'. Dilewati.")
            continue
        scorable_articles.append(article)

    if not scorable_articles:
        print("Tidak ada pasangan query-passage yang valid untuk di-re-rank.")
        return []

    # [query, passage] pairs, in the same order as scorable_articles.
    query_passage_pairs: List[List[str]] = [
        [original_query, article[text_field_for_reranking]]
        for article in scorable_articles
    ]

    print(f"Melakukan prediksi skor relevansi untuk {len(query_passage_pairs)} pasangan query-passage...")
    try:
        # Cross-encoder score: higher means more relevant.
        scores = cross_encoder_model.predict(query_passage_pairs, show_progress_bar=True)
    except Exception as e:
        print(f"Error saat prediksi dengan cross-encoder: {e}")
        return []

    # Attach each score to a copy of its article so the caller's dicts stay untouched.
    articles_with_rerank_scores: List[Dict[str, Any]] = []
    for article, score in zip(scorable_articles, scores):
        article_copy = article.copy()
        article_copy['rerank_score'] = float(score)
        articles_with_rerank_scores.append(article_copy)

    # Only reached if the model returned fewer scores than pairs.
    for article in scorable_articles[len(articles_with_rerank_scores):]:
        print(f"Peringatan: Kehabisan skor untuk artikel PMID {article.get('pmid', 'N/A')}.")

    articles_with_rerank_scores.sort(key=lambda item: item['rerank_score'], reverse=True)
    final_reranked_articles = articles_with_rerank_scores[:max(top_k_after_rerank, 0)]

    # Rank according to the new order.
    for position, article in enumerate(final_reranked_articles, start=1):
        article['rerank_rank'] = position

    return final_reranked_articles

# Answer generation
Me-load model LLM yang akan digunakan untuk generate jawaban.

In [None]:
# Module-level holders for the LLM and its tokenizer, initialised empty.
hf_llm_model: Optional[AutoModelForCausalLM] = None
hf_llm_tokenizer: Optional[AutoTokenizer] = None
# Starts as False. NOTE(review): presumably switched to True once the LLM has
# been loaded — confirm against the loading code.
USE_HF_LLM_FLAG = False

In [None]:
def _load_llm_from_source(
    source: str,
    quantization_config: Any,
) -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load a tokenizer and causal LM from `source` and normalise pad tokens.

    Args:
        source: Anything accepted by ``from_pretrained`` — here either a local
            directory on Google Drive or a Hugging Face Hub model ID.
        quantization_config: Quantization settings forwarded unchanged to
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` pair.

    Raises:
        Exception: Whatever ``from_pretrained`` raises; nothing is caught here
            so the caller can decide how to fall back.
    """
    tokenizer = AutoTokenizer.from_pretrained(source)
    model = AutoModelForCausalLM.from_pretrained(
        source,
        quantization_config=quantization_config,
        device_map="auto",
        torch_dtype=torch.float16,
    )

    # Fall back to the EOS token when no dedicated padding token is defined,
    # on both the tokenizer and the model config.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    if model.config.pad_token_id is None and hasattr(model.config, 'eos_token_id'):
        model.config.pad_token_id = model.config.eos_token_id

    return model, tokenizer


def load_or_download_llm(
    model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct",
    gdrive_save_dir_parent: str = "/content/drive/MyDrive/pubmed-chatbot-using-rag-llm/model"
) -> Tuple[Optional[AutoModelForCausalLM], Optional[AutoTokenizer], bool]:
    """Load the LLM and tokenizer from Google Drive, or download and cache them.

    The model-specific folder is ``gdrive_save_dir_parent`` joined with
    ``model_id`` where every "/" is replaced by "_". If that folder exists it
    is tried first; on any failure (or if it does not exist) the model is
    fetched from the Hugging Face Hub and then saved into that folder.

    Saving is best-effort: a failure while writing to Google Drive is reported
    but does NOT discard the model that was just downloaded.

    Args:
        model_id: Model ID on the Hugging Face Hub
            (e.g. "meta-llama/Meta-Llama-3-8B-Instruct").
        gdrive_save_dir_parent: Parent directory under which the
            model-specific folder is read from / written to.

    Returns:
        ``(model, tokenizer, success)``. On failure the first two items are
        ``None`` and ``success`` is ``False``.
    """
    model_save_name = model_id.replace("/", "_")
    gdrive_model_path = os.path.join(gdrive_save_dir_parent, model_save_name)

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    # 1. Try the copy cached on Google Drive.
    # NOTE(review): the cached copy was saved from a model loaded with this
    # same quantization config, and it is passed again on reload — confirm the
    # installed transformers/bitsandbytes versions support that round-trip.
    if os.path.exists(gdrive_model_path):
        print(f"Mencoba memuat model dan tokenizer dari Google Drive: '{gdrive_model_path}'...")
        try:
            loaded_model, loaded_tokenizer = _load_llm_from_source(
                gdrive_model_path, quantization_config
            )
        except Exception as e:  # deliberately broad: any failure falls back to the Hub
            print(f"Gagal memuat dari Google Drive ('{gdrive_model_path}'): {e}. Akan mencoba mengunduh dari Hub.")
        else:
            print(f"Model dan tokenizer '{model_id}' berhasil dimuat dari Google Drive.")
            return loaded_model, loaded_tokenizer, True

    # 2. Download from the Hugging Face Hub.
    print(f"Mencoba mengunduh model '{model_id}' dari Hugging Face Hub...")
    try:
        loaded_model, loaded_tokenizer = _load_llm_from_source(model_id, quantization_config)
    except Exception as e:  # deliberately broad: callers only get the success flag
        print(f"Gagal download atau memproses model '{model_id}' dari Hugging Face Hub: {e}")
        print("LLM akan disimulasikan karena model gagal dimuat.")
        return None, None, False
    print(f"Model dan tokenizer '{model_id}' berhasil diunduh dari Hub.")

    # 3. Cache to Google Drive. Kept in its own try-block so that a storage
    # problem (Drive not mounted, quota, I/O error) cannot throw away a model
    # that is already loaded and usable in memory.
    print(f"Menyimpan model dan tokenizer ke Google Drive: '{gdrive_model_path}'...")
    try:
        os.makedirs(gdrive_model_path, exist_ok=True)
        loaded_model.save_pretrained(gdrive_model_path)
        loaded_tokenizer.save_pretrained(gdrive_model_path)
    except Exception as e:
        # A partially written folder may remain; the next run will fail to
        # load it, fall back to the Hub, and attempt the save again.
        print(f"Peringatan: Gagal menyimpan model ke Google Drive ('{gdrive_model_path}'): {e}. Model yang sudah diunduh tetap digunakan.")
    else:
        print("Model dan tokenizer berhasil disimpan ke Google Drive.")

    return loaded_model, loaded_tokenizer, True

# See it live!
Contoh demo program 😀

In [None]:
if __name__ == "__main__":
    # Demo entry point: load the retrieval assets and the three models, then
    # run an interactive question/answer loop until the user types 'exit'.

    # Google Drive configuration: choose where the assets and the cached LLM live.
    use_gdrive_for_assets = True
    gdrive_base_dir = "/content/drive/MyDrive/pubmed-chatbot-using-rag-llm"

    if use_gdrive_for_assets:
        path_article_embeddings = os.path.join(gdrive_base_dir, "pubmed_article_embeddings.npy")
        path_df_retrieval = os.path.join(gdrive_base_dir, "pubmed_articles_for_retrieval.csv")
        llm_model_storage_parent = os.path.join(gdrive_base_dir, "model")
    else:
        path_article_embeddings = "pubmed_article_embeddings.npy"
        path_df_retrieval = "pubmed_articles_for_retrieval.csv"
        llm_model_storage_parent = "./model"

    # Load the article embeddings and the article table used for retrieval.
    article_embeddings, df_retrieval = load_retrieval_assets(
        embeddings_file_path=path_article_embeddings,
        data_file_path=path_df_retrieval
    )
    if article_embeddings is None or df_retrieval is None:
        print("Gagal memuat data embeddings atau artikel. Program berhenti.")
        # SystemExit halts right here; the interactive `exit()` helper is not
        # meant for programs and may not stop a notebook cell at this line.
        raise SystemExit(1)

    # MiniLM model for query embeddings, a cross-encoder for re-ranking, and
    # Llama-3-8B for answer generation.
    sbert_model_name = 'all-MiniLM-L6-v2'
    sbert_model = load_sentence_transformer_model(sbert_model_name)
    if sbert_model is None:
        print(f"Gagal memuat model Sentence Transformer '{sbert_model_name}'. Program berhenti.")
        raise SystemExit(1)
    # The cross-encoder is optional: when it is falsy the loop below skips
    # re-ranking and uses the initial retrieval order instead.
    cross_encoder_model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
    cross_encoder_model = load_cross_encoder_model(model_name=cross_encoder_model_name)

    HF_LLM_MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
    hf_model, hf_tokenizer, USE_HF_LLM = load_or_download_llm(
        model_id=HF_LLM_MODEL_ID,
        gdrive_save_dir_parent=llm_model_storage_parent
    )

    if not USE_HF_LLM:
        print(f"Gagal memuat atau mengkonfigurasi model LLM '{HF_LLM_MODEL_ID}'. Program berhenti.")
        raise SystemExit(1)

    # Main program
    print(f"Chatbot ini pakai {HF_LLM_MODEL_ID}")
    print("Ketik 'exit' untuk berhenti.")

    # Number of candidates fetched by the embedding search, and how many of
    # them are finally handed to the LLM as context.
    NUM_ARTICLES_RETRIEVED_INITIAL = 20
    NUM_ARTICLES_FOR_LLM_CONTEXT = 5

    while True:
        try:
            # Strip once so that "exit " and " exit" are recognised as the
            # exit command instead of being sent through the whole pipeline.
            user_query_original = input("\nEnter your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input / Ctrl-C: leave the loop without a traceback.
            print("\nGoodbye!")
            break

        if user_query_original.lower() == 'exit':
            print("Goodbye!")
            break
        if not user_query_original:
            print("Question cannot be empty.")
            continue

        # Default: no context articles (used when preprocessing leaves nothing
        # to embed, or retrieval finds nothing).
        final_articles_for_llm = []

        # Preprocess the user's question before embedding it.
        processed_user_query = preprocess_single_text(user_query_original)
        if processed_user_query:
            query_embedding = sbert_model.encode(processed_user_query)

            # Initial retrieval
            retrieved_articles_list = find_top_relevant_articles(
                query_embedding,
                article_embeddings,
                df_retrieval,
                top_n=NUM_ARTICLES_RETRIEVED_INITIAL
            )

            if retrieved_articles_list:
                # Re-ranking uses the original (un-preprocessed) question text.
                if cross_encoder_model:
                    final_articles_for_llm = rerank_retrieved_articles(
                        original_query=user_query_original,
                        retrieved_articles=retrieved_articles_list,
                        cross_encoder_model=cross_encoder_model,
                        top_k_after_rerank=NUM_ARTICLES_FOR_LLM_CONTEXT,
                        text_field_for_reranking='abstract'
                    )
                # No cross-encoder, or re-ranking returned nothing: fall back
                # to the top of the initial retrieval order.
                if not final_articles_for_llm:
                    final_articles_for_llm = retrieved_articles_list[:NUM_ARTICLES_FOR_LLM_CONTEXT]

        # Generation
        qa_result = generate_llm_answer(
            user_query=user_query_original,
            re_ranked_articles=final_articles_for_llm,
            llm_model=hf_model,
            llm_tokenizer=hf_tokenizer,
            use_actual_llm=USE_HF_LLM
        )

        print("Answer:")
        print(qa_result['answer'])

Embeddings berhasil dimuat dari '/content/drive/MyDrive/pubmed-chatbot-using-rag-llm/pubmed_article_embeddings.npy'. Shape: (1738, 384)
Data artikel berhasil dimuat dari '/content/drive/MyDrive/pubmed-chatbot-using-rag-llm/pubmed_articles_for_retrieval.csv'. Jumlah baris: 1738
Memuat model Sentence Transformer: 'all-MiniLM-L6-v2'...
Model 'all-MiniLM-L6-v2' berhasil dimuat.
Memuat model Cross-Encoder: 'cross-encoder/ms-marco-MiniLM-L-6-v2'...
Model Cross-Encoder 'cross-encoder/ms-marco-MiniLM-L-6-v2' berhasil dimuat.
Mencoba memuat model dan tokenizer dari Google Drive: '/content/drive/MyDrive/pubmed-chatbot-using-rag-llm/model/meta-llama_Meta-Llama-3-8B-Instruct'...




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model dan tokenizer 'meta-llama/Meta-Llama-3-8B-Instruct' berhasil dimuat dari Google Drive.
Chatbot ini pakai meta-llama/Meta-Llama-3-8B-Instruct
Ketik 'exit' untuk berhenti.

Enter your question: what diabetes is
Melakukan prediksi skor relevansi untuk 20 pasangan query-passage...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer:
Based on the provided article excerpts, diabetes is a metabolic disease characterized by high blood glucose levels, which can be caused by various factors such as genetics, environmental factors, and lifestyle. The excerpts suggest that diabetes is a complex condition that can be influenced by multiple factors, including diet, insulin resistance, and genetic predisposition.

The excerpts also highlight the importance of maintaining optimal glycemic control, as high blood sugar levels can lead to complications and worsen the condition. Additionally, the excerpts suggest that depression and anxiety can have a negative impact on the management of diabetes, and that telemedicine and remote continuous glucose monitoring can be useful tools in managing the condition.

References:
1. PMID: 32172486, Title: "the genetic basis of highcarbohydrate and highmonosodium glutamate diet related to the increase of likelihood of type 2 diabetes mellitus a review"
2. PMID: 38450390, Title: "assoc