In [1]:
!pip install datasets
!pip install transformers==4.45.2
!pip install -U sentence-transformers
!pip install wandb

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [2]:
import os
import pickle
import pandas as pd
import time
import logging
import json
import torch
import re
import numpy as np
import nltk
from sentence_transformers import (SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments, util)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.datasets import NoDuplicatesDataLoader
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel
from datasets import Dataset, load_dataset, DatasetDict, concatenate_datasets
from sklearn.model_selection import train_test_split
from tqdm.autonotebook import tqdm, trange
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Log in to Weights & Biases; the training run below reports to this account.
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Mount Google Drive at /content/drive; the dataset, model and result paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dataset

In [None]:
dataset_path_1 = '/content/drive/MyDrive/myData/dataset/dataset1.csv'
dataset_path_2 = '/content/drive/MyDrive/myData/dataset/dataset2.csv'

# Read both CSV parts; df1 and df2 are kept because later cells use df1 on its own.
df1, df2 = (pd.read_csv(csv_path) for csv_path in (dataset_path_1, dataset_path_2))

# Stack the two parts with a fresh 0..n-1 index, then discard the trailing column
# (selected by position, whatever its name is).
df = pd.concat([df1, df2], ignore_index=True)
df = df.drop(columns=df.columns[-1])
df

Unnamed: 0,question,answer
0,Apakah Kita Memahami Perubahan Iklim Lebih Bai...,"Ya, jauh lebih baik. Laporan IPCC pertama, yan..."
1,Di Mana Perubahan Iklim Paling Nyata?,Tanda-tanda perubahan iklim tidak dapat dipung...
2,Apa yang Dapat Diajarkan Iklim Masa Lalu kepad...,"Di masa lalu, Bumi telah mengalami periode ber..."
3,Suhu Bumi Pernah Berubah Sebelumnya. Bagaimana...,Meskipun iklim dapat dicirikan oleh banyak var...
4,Apa Bukti Perubahan Iklim?,Kita telah lama mengamati perubahan iklim kita...
...,...,...
9225,Apa nama konsep yang melihat perubahan iklim s...,Mengurangi jarak psikologis tidak secara andal...
9226,Apa saja masalah operasional yang dihadapi saa...,Beberapa masalah operasional yang dilaporkan t...
9227,Apa tujuan dari standar Rainforest Alliance?,Standar Rainforest Alliance bertujuan untuk me...
9228,Apa saja kebutuhan makanan hewan herbivora dal...,Kebutuhan makanan hewan herbivora dalam petern...


# Pre-processing data

In [None]:
# Fetch the NLTK resources used below: the stopword corpus and the punkt_tab tokenizer data.
nltk.download('stopwords')
nltk.download('punkt_tab')

# Indonesian stopwords held in a set; remove_stopwords() tests membership against it.
stop_words = set(stopwords.words('indonesian'))

def clean_text(text):
    """Strip punctuation from ``text`` and normalise its whitespace.

    Every character that is neither a word character nor whitespace is removed
    (not replaced by a space, so ``"a-b"`` becomes ``"ab"``); runs of whitespace
    are then collapsed to a single space and the ends are trimmed.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

def case_folding(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

def remove_stopwords(text):
    """Tokenize ``text`` and re-join, with single spaces, the tokens not in ``stop_words``.

    Membership is tested against the module-level ``stop_words`` set exactly as
    the tokens come out of ``word_tokenize`` (no case conversion happens here).
    """
    kept_tokens = (token for token in word_tokenize(text) if token not in stop_words)
    return ' '.join(kept_tokens)

def preprocess_text(text):
    """Run the full cleaning pipeline on one string.

    Steps are applied in this order: ``clean_text`` -> ``case_folding`` ->
    ``remove_stopwords``. Lower-casing happens before stopword removal, so the
    stopword test sees lowercase tokens.
    """
    for step in (clean_text, case_folding, remove_stopwords):
        text = step(text)
    return text

# Build a preprocessed copy of df; df itself keeps the raw text.
data = df.assign(
    question=df['question'].apply(preprocess_text),
    answer=df['answer'].apply(preprocess_text),
)
data

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,question,answer
0,memahami perubahan iklim dibandingkan ipcc,ya laporan ipcc dirilis 1990 menyimpulkan peru...
1,perubahan iklim nyata,tandatanda perubahan iklim dipungkiri skala gl...
2,diajarkan iklim,bumi mengalami periode berkepanjangan peningka...
3,suhu bumi berubah pemanasan global berbeda,iklim dicirikan variabel suhu indikator utama ...
4,bukti perubahan iklim,mengamati perubahan iklim ilmuwan pengamatan m...
...,...,...
9225,nama konsep perubahan iklim mengakibatkan aksi...,mengurangi jarak psikologis andal meningkatkan...
9226,operasional dihadapi minyak sayur bahan bakar ...,operasional dilaporkan terkait penggunaan miny...
9227,tujuan standar rainforest alliance,standar rainforest alliance bertujuan melestar...
9228,kebutuhan makanan hewan herbivora peternakan,kebutuhan makanan hewan herbivora peternakan d...


# Split dataset

In [None]:
# Hold out 30% of the rows, then cut that hold-out in half: roughly 70% train / 15% val / 15% test.
# random_state is fixed on both calls so the split is the same on every run.
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

In [None]:
# Wrap each pandas split as a `datasets.Dataset`. The shuffled pandas index is
# reset first so it is not carried over into the dataset.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name, frame in (
        ("train", train_data),
        ("val", val_data),
        ("test", test_data),
    )
})

dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 6461
    })
    val: Dataset({
        features: ['question', 'answer'],
        num_rows: 1384
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1385
    })
})

In [None]:
# Convenience names for the three splits.
train_dataset, eval_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "val", "test")
)
train_dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 6461
})

# Finetuning indoSBERT model



In [None]:
# 1. Load a model to finetune with
# The checkpoint is referenced by its Hugging Face Hub id and downloaded on first use.
model = SentenceTransformer('denaya/indoSBERT-large')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.23k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/709k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

In [None]:
# Display the module stack of the loaded model.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 1024, 'out_features': 256, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

In [None]:
# 2. Load a dataset to finetune on
# (re-binds the same three splits of `dataset` that an earlier cell already assigned)
train_dataset = dataset["train"]
eval_dataset = dataset["val"]
test_dataset = dataset["test"]

# 3. Define a loss function
# The loss is constructed around `model`, so the model cell must run before this one.
loss = MultipleNegativesRankingLoss(model)

# 4. Specify training arguments
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/denaya-indoSBERT-large_e5b16",
    # training parameters:
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # tracking/debugging parameters:
    # (evaluation, checkpointing and logging all use the same 100-step interval)
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    run_name="indoSBERT-large_e5bs16",  # Will be used in W&B if `wandb` is installed
)

In [None]:
# 5. Create an evaluator & evaluate the base model
# Ids are the stringified row positions of the validation split: every question
# is a query and every answer is a corpus document.
corpus = {str(doc_id): text for doc_id, text in enumerate(dataset["val"]["answer"])}
queries = {str(query_id): text for query_id, text in enumerate(dataset["val"]["question"])}
# Exactly one relevant document per query: the answer stored under the same id.
relevant_docs = {query_id: {query_id} for query_id in queries}

dev_evaluator = InformationRetrievalEvaluator(
    name="indoSBERT-large-eval",
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
)
dev_evaluator(model)

{'indoSBERT-large-eval_cosine_accuracy@1': 0.48627167630057805,
 'indoSBERT-large-eval_cosine_accuracy@3': 0.6329479768786127,
 'indoSBERT-large-eval_cosine_accuracy@5': 0.684971098265896,
 'indoSBERT-large-eval_cosine_accuracy@10': 0.7601156069364162,
 'indoSBERT-large-eval_cosine_precision@1': 0.48627167630057805,
 'indoSBERT-large-eval_cosine_precision@3': 0.21098265895953758,
 'indoSBERT-large-eval_cosine_precision@5': 0.13699421965317918,
 'indoSBERT-large-eval_cosine_precision@10': 0.07601156069364161,
 'indoSBERT-large-eval_cosine_recall@1': 0.48627167630057805,
 'indoSBERT-large-eval_cosine_recall@3': 0.6329479768786127,
 'indoSBERT-large-eval_cosine_recall@5': 0.684971098265896,
 'indoSBERT-large-eval_cosine_recall@10': 0.7601156069364162,
 'indoSBERT-large-eval_cosine_ndcg@10': 0.6183852893298069,
 'indoSBERT-large-eval_cosine_mrr@10': 0.5737329686209749,
 'indoSBERT-large-eval_cosine_map@100': 0.5805544119423703}

In [None]:
# 6. Create a trainer & train
# The trainer receives the model, the training arguments, the train/val splits,
# the loss and the retrieval evaluator built in the previous cells.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33m1122am[0m ([33m1122am-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Indosbert-large-eval Cosine Accuracy@1,Indosbert-large-eval Cosine Accuracy@3,Indosbert-large-eval Cosine Accuracy@5,Indosbert-large-eval Cosine Accuracy@10,Indosbert-large-eval Cosine Precision@1,Indosbert-large-eval Cosine Precision@3,Indosbert-large-eval Cosine Precision@5,Indosbert-large-eval Cosine Precision@10,Indosbert-large-eval Cosine Recall@1,Indosbert-large-eval Cosine Recall@3,Indosbert-large-eval Cosine Recall@5,Indosbert-large-eval Cosine Recall@10,Indosbert-large-eval Cosine Ndcg@10,Indosbert-large-eval Cosine Mrr@10,Indosbert-large-eval Cosine Map@100
100,0.3999,0.186907,0.679913,0.811416,0.856936,0.903179,0.679913,0.270472,0.171387,0.090318,0.679913,0.811416,0.856936,0.903179,0.790855,0.754911,0.75802
200,0.1581,0.105982,0.778902,0.882948,0.905347,0.931358,0.778902,0.294316,0.181069,0.093136,0.778902,0.882948,0.905347,0.931358,0.858049,0.834262,0.836854
300,0.1107,0.088434,0.801301,0.898121,0.916908,0.938584,0.801301,0.299374,0.183382,0.093858,0.801301,0.898121,0.916908,0.938584,0.873816,0.852672,0.854755
400,0.1028,0.082163,0.817919,0.910405,0.931358,0.949422,0.817919,0.303468,0.186272,0.094942,0.817919,0.910405,0.931358,0.949422,0.887159,0.866815,0.868495
500,0.0784,0.069419,0.814306,0.91185,0.934249,0.95448,0.814306,0.30395,0.18685,0.095448,0.814306,0.91185,0.934249,0.95448,0.888636,0.867124,0.868802
600,0.015,0.076403,0.821532,0.909682,0.935694,0.95159,0.821532,0.303227,0.187139,0.095159,0.821532,0.909682,0.935694,0.95159,0.889138,0.868744,0.870456
700,0.0052,0.075724,0.822254,0.918353,0.935694,0.95448,0.822254,0.306118,0.187139,0.095448,0.822254,0.918353,0.935694,0.95448,0.892105,0.8717,0.873328
800,0.0061,0.069137,0.820809,0.910405,0.931358,0.95448,0.820809,0.303468,0.186272,0.095448,0.820809,0.910405,0.931358,0.95448,0.891385,0.870807,0.872528
900,0.0051,0.072295,0.824422,0.916185,0.937139,0.955202,0.824422,0.305395,0.187428,0.09552,0.824422,0.916185,0.937139,0.955202,0.89427,0.874259,0.875796
1000,0.0052,0.070892,0.825867,0.913295,0.940029,0.95737,0.825867,0.304432,0.188006,0.095737,0.825867,0.913295,0.940029,0.95737,0.89498,0.874562,0.875875


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=2020, training_loss=0.04428339495074631, metrics={'train_runtime': 1988.1867, 'train_samples_per_second': 16.248, 'train_steps_per_second': 1.016, 'total_flos': 0.0, 'train_loss': 0.04428339495074631, 'epoch': 5.0})

In [None]:
# 7. Save the trained model
# Written under the mounted Drive path; the Hub upload cell later reads this same folder.
model.save_pretrained("/content/drive/MyDrive/myqas/models1/finetuned-indoSBERT-large-e5bs16")

In [None]:
# 8. Evaluate the trained model on the test set
# Same id scheme as the validation evaluator: stringified row positions, with
# each question's only relevant document being the answer at the same position.
corpus_test = {str(doc_id): text for doc_id, text in enumerate(dataset["test"]["answer"])}
queries_test = {str(query_id): text for query_id, text in enumerate(dataset["test"]["question"])}
relevant_docs_test = {query_id: {query_id} for query_id in queries_test}

test_evaluator = InformationRetrievalEvaluator(
    name="indoSBERT-large-test-e5b16",
    queries=queries_test,
    corpus=corpus_test,
    relevant_docs=relevant_docs_test,
)
test_evaluator(model)

{'indoSBERT-large-test-e5b16_cosine_accuracy@1': 0.8541516245487365,
 'indoSBERT-large-test-e5b16_cosine_accuracy@3': 0.9299638989169675,
 'indoSBERT-large-test-e5b16_cosine_accuracy@5': 0.9444043321299639,
 'indoSBERT-large-test-e5b16_cosine_accuracy@10': 0.9574007220216606,
 'indoSBERT-large-test-e5b16_cosine_precision@1': 0.8541516245487365,
 'indoSBERT-large-test-e5b16_cosine_precision@3': 0.30998796630565584,
 'indoSBERT-large-test-e5b16_cosine_precision@5': 0.18888086642599275,
 'indoSBERT-large-test-e5b16_cosine_precision@10': 0.09574007220216606,
 'indoSBERT-large-test-e5b16_cosine_recall@1': 0.8541516245487365,
 'indoSBERT-large-test-e5b16_cosine_recall@3': 0.9299638989169675,
 'indoSBERT-large-test-e5b16_cosine_recall@5': 0.9444043321299639,
 'indoSBERT-large-test-e5b16_cosine_recall@10': 0.9574007220216606,
 'indoSBERT-large-test-e5b16_cosine_ndcg@10': 0.9093403811689553,
 'indoSBERT-large-test-e5b16_cosine_mrr@10': 0.8935192825626042,
 'indoSBERT-large-test-e5b16_cosine_map

# Close the wandb run


In [None]:
# End the active W&B run.
wandb.finish()

VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/indoSBERT-large-eval_cosine_accuracy@1,▁▅▇▇▇▇▇▇▇▇██████████████████████████████
eval/indoSBERT-large-eval_cosine_accuracy@10,▁▅▆▇▇▇▇▇▇█▇█████████████████████████████
eval/indoSBERT-large-eval_cosine_accuracy@3,▁▅▇▇█▇█▇▇███████████████████████████████
eval/indoSBERT-large-eval_cosine_accuracy@5,▁▅▇▇▇▇█▇▇███████████████████████████████
eval/indoSBERT-large-eval_cosine_map@100,▁▅▇▇▇▇▇▇▇▇██████████████████████████████
eval/indoSBERT-large-eval_cosine_mrr@10,▁▅▇▇▇▇▇▇▇▇██████████████████████████████
eval/indoSBERT-large-eval_cosine_ndcg@10,▁▅▇▇▇▇▇▇▇▇██████████████████████████████
eval/indoSBERT-large-eval_cosine_precision@1,▁▅▇▇▇▇▇▇▇▇██████████████████████████████
eval/indoSBERT-large-eval_cosine_precision@10,▁▅▆▇▇▇▇▇▇█▇█████████████████████████████
eval/indoSBERT-large-eval_cosine_precision@3,▁▅▇▇█▇█▇▇███████████████████████████████

0,1
eval/indoSBERT-large-eval_cosine_accuracy@1,0.8396
eval/indoSBERT-large-eval_cosine_accuracy@10,0.95954
eval/indoSBERT-large-eval_cosine_accuracy@3,0.92269
eval/indoSBERT-large-eval_cosine_accuracy@5,0.94003
eval/indoSBERT-large-eval_cosine_map@100,0.88541
eval/indoSBERT-large-eval_cosine_mrr@10,0.88406
eval/indoSBERT-large-eval_cosine_ndcg@10,0.90262
eval/indoSBERT-large-eval_cosine_precision@1,0.8396
eval/indoSBERT-large-eval_cosine_precision@10,0.09595
eval/indoSBERT-large-eval_cosine_precision@3,0.30756


# QAS

In [None]:
def get_or_create_embeddings_qa(data, model, embeddings_path):
    """Return question/answer texts and their embeddings, cached in a pickle file.

    If ``embeddings_path`` does not exist, the 'question' and 'answer' columns of
    ``data`` are encoded with ``model`` and the texts plus CPU copies of both
    embedding tensors are pickled to that path. If the file exists, everything
    (texts and embeddings) is read back from it and ``model`` is not called.

    Args:
        data: DataFrame with 'question' and 'answer' columns.
        model: Object with ``encode(texts, show_progress_bar=..., convert_to_tensor=...)``
            returning something that has ``.cpu()`` (e.g. a SentenceTransformer).
        embeddings_path: Path of the pickle cache file.

    Returns:
        Tuple ``(corpus_questions, corpus_answers, question_embeddings,
        answer_embeddings)``. The embeddings are on the CPU in both branches, so
        the first (cache-miss) run and later (cache-hit) runs behave the same.

    Warns:
        UserWarning: when the cached texts differ from ``data``. The cached values
        are still returned, but they are then not row-aligned with ``data``.
    """
    import warnings

    corpus_questions = data['question'].tolist()
    corpus_answers = data['answer'].tolist()

    if not os.path.exists(embeddings_path):
        # Create the cache folder *before* encoding: otherwise a missing folder
        # makes open() fail only after the expensive encode has already run.
        cache_dir = os.path.dirname(embeddings_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)

        question_embeddings = model.encode(
            corpus_questions, show_progress_bar=True, convert_to_tensor=True
        ).cpu()
        answer_embeddings = model.encode(
            corpus_answers, show_progress_bar=True, convert_to_tensor=True
        ).cpu()

        with open(embeddings_path, 'wb') as fOut:
            pickle.dump(
                {
                    'questions': corpus_questions,
                    'answers': corpus_answers,
                    'question_embeddings': question_embeddings,
                    'answer_embeddings': answer_embeddings
                },
                fOut
            )
        print("Embeddings saved to:", embeddings_path)
    else:
        print("Embeddings file already exists. Loading embeddings from file...")
        # SECURITY: pickle.load can execute arbitrary code. Only point this at
        # cache files that this notebook (or another trusted source) produced.
        with open(embeddings_path, 'rb') as fIn:
            cached_data = pickle.load(fIn)

        # A cache built from another frame silently misaligns search results with
        # the caller's DataFrame, so make the mismatch visible.
        if cached_data['questions'] != corpus_questions or cached_data['answers'] != corpus_answers:
            warnings.warn(
                f"Cached texts in {embeddings_path!r} differ from the supplied data; "
                "delete the file to rebuild the embeddings.",
                stacklevel=2,
            )

        corpus_questions = cached_data['questions']
        corpus_answers = cached_data['answers']
        question_embeddings = cached_data['question_embeddings'].cpu()
        answer_embeddings = cached_data['answer_embeddings'].cpu()

    return corpus_questions, corpus_answers, question_embeddings, answer_embeddings


In [None]:
def semantic_search(queries, corpus, model, top_k=5, corpus_embeddings=None):
    """Retrieve the ``top_k`` best-matching corpus rows for each query.

    Args:
        queries: A single query string or a list of query strings.
        corpus: DataFrame with 'question' and 'answer' columns; row *i* must
            correspond to embedding *i* in ``corpus_embeddings``.
        model: Object whose ``encode(queries, convert_to_tensor=True)`` returns
            the query embeddings (e.g. a SentenceTransformer).
        top_k: Number of hits to keep per query.
        corpus_embeddings: Embeddings to search over. When omitted, the
            module-level ``answer_embeddings`` is used, which is what this
            function always did before the parameter existed. Passing it
            explicitly avoids depending on which cell last set that global.

    Returns:
        Dict mapping each query string to a list of
        ``{"score", "question", "answer"}`` dicts in the order returned by
        ``util.semantic_search``. Duplicate query strings share one entry.

    Raises:
        ValueError: if ``corpus_embeddings`` and ``corpus`` differ in length, since
            hits could then not be mapped back to the right rows.
    """
    if corpus_embeddings is None:
        # Backward-compatible fallback to the notebook-level variable.
        corpus_embeddings = answer_embeddings

    if len(corpus_embeddings) != len(corpus):
        raise ValueError(
            f"corpus has {len(corpus)} rows but corpus_embeddings has "
            f"{len(corpus_embeddings)} entries; they must be row-aligned."
        )

    if isinstance(queries, str):
        queries = [queries]

    query_embeddings = model.encode(queries, convert_to_tensor=True)
    hits_batch = util.semantic_search(query_embeddings, corpus_embeddings, top_k=top_k)

    results_dict = {}
    for query, hits in zip(queries, hits_batch):
        matches = []
        for hit in hits:
            # corpus_id is the position of the hit within corpus_embeddings.
            row = corpus.iloc[hit['corpus_id']]
            matches.append({
                "score": hit['score'],
                "question": row['question'],
                "answer": row['answer']
            })
        results_dict[query] = matches

    return results_dict


In [None]:
queries_path = '/content/drive/MyDrive/myData/dataset/pengujian_sistem.xlsx'

queries_pd = pd.read_excel(queries_path)
# The first ten entries of the 'kueri' column are the test queries.
queries = queries_pd['kueri'].head(10).tolist()
queries

['apa yang bisa kita pelajari dari iklim di masa lalu ',
 'kapan dampak aktivitas manusia terhadap iklim mulai terlihat secara nyata pada tingkat lokal',
 'apakah model iklim semakin berkembang',
 'apa yang perlu diketahui manusia dalam adaptasi terhadap dampak bahaya  iklim',
 'kenapa kota-kota yang berada di sekitar tepi laut yang paling beresiko terkena dampak perubahan iklim',
 'apakah ada pengaruh terhadap rotasi planet apabila es di daerah kutub di bumi mencair ',
 'apa kontribusi yang bisa saya lakukan dalam menghentikan perubahan iklim',
 'bagaimana peluang dan tantangan dalam mitigasi berbeda di setiap wilayah?',
 'cara mengetahui sehangat dan sedingin apa cuaca di jaman dulu',
 'apa yang terjadi pada kehidupan anak-anak di masa depan apabila tidak ada aksi cepat dalam mengurangi gas emisi rumah kaca']

# QAS with the base indoSBERT model

In [None]:
# Load the original Hub checkpoint again as the baseline.
# NOTE: this rebinds `model`, which until now referred to the model trained above.
model = SentenceTransformer('denaya/indoSBERT-large')
model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.23k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/709k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 1024, 'out_features': 256, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

In [None]:
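# One-off cache build, kept for reference only; the next cell reads the same cache file.
# NOTE: `get_or_create_embeddings_qa_nopre` is not defined in the visible notebook
# (only `get_or_create_embeddings_qa` is), so un-commenting these lines as-is raises NameError.
# embeddings_path_model = '/content/drive/MyDrive/myqas/models1/embeddings/embeddings_denaya-indoSBERT-large.pkl'
# corpus_questions, corpus_answers, question_embeddings, answer_embeddings = get_or_create_embeddings_qa_nopre(df1, model, embeddings_path_model)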

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Embeddings saved to: /content/drive/MyDrive/myqas/models1/embeddings/embeddings_denaya-indoSBERT-large.pkl


In [None]:
# Load (or build) the df1 embeddings for the base model.
# The module-level `answer_embeddings` bound here is what semantic_search() searches over.
embeddings_path_model = '/content/drive/MyDrive/myqas/models1/embeddings/embeddings_denaya-indoSBERT-large.pkl'
corpus_questions, corpus_answers, question_embeddings, answer_embeddings = get_or_create_embeddings_qa(df1, model, embeddings_path_model)

Embeddings file already exists. Loading embeddings from file...


In [None]:
# Retrieve the top-5 df1 rows for every test query, encoding the queries with the base model.
# NOTE: semantic_search() reads the module-level `answer_embeddings`, so the
# embedding-loading cell for this model must have been run immediately before.
results = semantic_search(queries, df1, model, top_k=5)

# Print every hit (score, matched question, answer), grouped by query.
for query, result_list in results.items():
    print(f"\nQuery: '{query}'")
    for res in result_list:
        print(f"Score: {res['score']:.3f}, Q: {res['question']}, A: {res['answer']}")
    print()


Query: 'apa yang bisa kita pelajari dari iklim di masa lalu '
Score: 0.534, Q: Bagaimana kita mengetahui tingkat gas rumah kaca dan suhu di masa lalu?, A: Inti es merupakan sumber terbaik bagi para ilmuwan untuk mendapatkan data iklim historis. Alat lain untuk mempelajari atmosfer Bumi purba meliputi lingkaran pertumbuhan pada pohon, yang menyimpan catatan kasar suhu, kelembapan, dan tingkat kekeruhan setiap musim tanam yang telah berlangsung sekitar 2.000 tahun. Karang juga membentuk lingkaran pertumbuhan yang memberikan informasi tentang suhu dan nutrisi di lautan tropis. Proksi lainnya, seperti inti bentik, memperluas pengetahuan kita tentang iklim masa lalu hingga sekitar satu miliar tahun yang lalu.
Score: 0.501, Q: Apa yang Dapat Diajarkan Iklim Masa Lalu kepada Kita tentang Masa Depan?, A: Di masa lalu, Bumi telah mengalami periode berkepanjangan dengan peningkatan konsentrasi gas rumah kaca yang menyebabkan suhu global dan permukaan laut naik. Mempelajari periode hangat di mas

In [None]:
# Ground truth per query: map each 'kueri' text to its expected 'question' in queries_pd.
# The slice matches the one used to build `queries` (first 10 rows), so an extra
# row can never overwrite the mapping of one of the evaluated queries.
correct_answers = {row['kueri']: row['question'] for _, row in queries_pd.iloc[0:10].iterrows()}

rows = []
for query, result_list in results.items():
    correct_answer = correct_answers[query]
    rank = 0  # 0 means "expected question not found in the top 5"
    questions = []

    # Keep the top-5 retrieved questions for each query
    for i, res in enumerate(result_list[:5]):
        questions.append(f"{i + 1}. {res['question']}")
        # Reciprocal rank uses the FIRST (best-ranked) match; without the
        # `rank == 0` guard a duplicate question further down would overwrite it.
        if rank == 0 and res["question"] == correct_answer:
            rank = i + 1

    reciprocal_rank = 1 / rank if rank > 0 else 0

    question_text = "\n".join(questions)

    rows.append({
        "Query": query,
        "Questions": question_text,
        "Rank": rank,
        "Reciprocal Rank": reciprocal_rank,
    })

mrr_df = pd.DataFrame(rows)

# Overall MRR = mean of the per-query reciprocal ranks
mrr_overall = mrr_df["Reciprocal Rank"].mean()

print(mrr_df)
print(f"\nMean Reciprocal Rank (MRR): {mrr_overall:.3f}")


                                               Query  \
0  apa yang bisa kita pelajari dari iklim di masa...   
1  kapan dampak aktivitas manusia terhadap iklim ...   
2              apakah model iklim semakin berkembang   
3  apa yang perlu diketahui manusia dalam adaptas...   
4  kenapa kota-kota yang berada di sekitar tepi l...   
5  apakah ada pengaruh terhadap rotasi planet apa...   
6  apa kontribusi yang bisa saya lakukan dalam me...   
7  bagaimana peluang dan tantangan dalam mitigasi...   
8  cara mengetahui sehangat dan sedingin apa cuac...   
9  apa yang terjadi pada kehidupan anak-anak di m...   

                                           Questions  Rank  Reciprocal Rank  
0  1. Bagaimana kita mengetahui tingkat gas rumah...     2         0.500000  
1  1. Mengapa sulit untuk memastikan peran peruba...     3         0.333333  
2  1. Apakah sudah terlambat untuk mencegah perub...     0         0.000000  
3  1. Apa itu adaptasi perubahan iklim?\n2. Apa s...     4         0.25

In [None]:
# Export the per-query ranking table of the base model to Drive.
mrr_df.to_excel('/content/drive/MyDrive/myData/dataset/hasilq-a1.xlsx')

# QAS with the fine-tuned indoSBERT model

In [None]:
# Load the fine-tuned model from the exact folder that model.save_pretrained(...) wrote
# earlier (and that the upload cell reads). That folder name has no ".h5" suffix; no
# visible cell creates the suffixed path, so loading it fails on a fresh run.
model2 = SentenceTransformer('/content/drive/MyDrive/myqas/models1/finetuned-indoSBERT-large-e5bs16')
model2

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Dense({'in_features': 1024, 'out_features': 256, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)

In [None]:
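# One-off cache build, kept for reference only; the next cell reads the same cache file.
# NOTE: `get_or_create_embeddings_qa_nopre` is not defined in the visible notebook
# (only `get_or_create_embeddings_qa` is), so un-commenting these lines as-is raises NameError.
# embeddings_path_model2 = '/content/drive/MyDrive/myqas/models1/embeddings/embeddings_indoSBERT-e5b16.pkl'
# corpus_questions, corpus_answers, question_embeddings, answer_embeddings = get_or_create_embeddings_qa_nopre(df1, model2, embeddings_path_model2)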

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Embeddings saved to: /content/drive/MyDrive/myqas/models1/embeddings/embeddings_indoSBERT-e5b16.pkl


In [None]:
# Load (or build) the df1 embeddings for the fine-tuned model.
# This rebinds the module-level `answer_embeddings` that semantic_search() searches over,
# replacing the base-model embeddings loaded earlier.
embeddings_path_model2 = '/content/drive/MyDrive/myqas/models1/embeddings/embeddings_indoSBERT-e5b16.pkl'
corpus_questions, corpus_answers, question_embeddings, answer_embeddings = get_or_create_embeddings_qa(df1, model2, embeddings_path_model2)

Embeddings file already exists. Loading embeddings from file...


In [None]:
# Retrieve the top-5 df1 rows for every test query, encoding the queries with the fine-tuned model.
# NOTE: semantic_search() reads the module-level `answer_embeddings`, so the
# embedding-loading cell for model2 must have been run immediately before.
results = semantic_search(queries, df1, model2, top_k=5)

# Print every hit (score, matched question, answer), grouped by query.
for query, result_list in results.items():
    print(f"\nQuery: '{query}'")
    for res in result_list:
        print(f"Score: {res['score']:.3f}, Q: {res['question']}, A: {res['answer']}")
    print()


Query: 'apa yang bisa kita pelajari dari iklim di masa lalu '
Score: 0.501, Q: Apa yang Dapat Diajarkan Iklim Masa Lalu kepada Kita tentang Masa Depan?, A: Di masa lalu, Bumi telah mengalami periode berkepanjangan dengan peningkatan konsentrasi gas rumah kaca yang menyebabkan suhu global dan permukaan laut naik. Mempelajari periode hangat di masa lalu ini memberi tahu kita tentang potensi konsekuensi jangka panjang dari peningkatan gas rumah kaca di atmosfer. Meningkatnya konsentrasi gas rumah kaca mendorong perubahan besar pada sistem Bumi, termasuk pemanasan global, kenaikan permukaan laut, peningkatan iklim dan cuaca ekstrem, pengasaman laut, dan pergeseran ekologi (FAQ 2.2 dan FAQ 7.1). Sebagian besar pengamatan instrumental iklim dimulai pada abad ke-20, ketika emisi gas rumah kaca dari aktivitas manusia menjadi pendorong utama perubahan iklim Bumi (FAQ 3.1). Ketika para ilmuwan berusaha menyempurnakan pemahaman kita tentang sistem iklim Bumi dan bagaimana sistem itu dapat berevo

In [None]:
# Ground truth per query: map each 'kueri' text to its expected 'question' in queries_pd.
# The slice matches the one used to build `queries` (first 10 rows), so an extra
# row can never overwrite the mapping of one of the evaluated queries.
correct_answers = {row['kueri']: row['question'] for _, row in queries_pd.iloc[0:10].iterrows()}


rows = []
for query, result_list in results.items():
    correct_answer = correct_answers[query]
    rank = 0  # 0 means "expected question not found in the top 5"
    questions = []

    # Keep the top-5 retrieved questions for each query
    for i, res in enumerate(result_list[:5]):
        questions.append(f"{i + 1}. {res['question']}")
        # Reciprocal rank uses the FIRST (best-ranked) match; without the
        # `rank == 0` guard a duplicate question further down would overwrite it.
        if rank == 0 and res["question"] == correct_answer:
            rank = i + 1

    reciprocal_rank = 1 / rank if rank > 0 else 0

    question_text = "\n".join(questions)

    rows.append({
        "Query": query,
        "Questions": question_text,
        "Rank": rank,
        "Reciprocal Rank": reciprocal_rank,
    })

mrr_df = pd.DataFrame(rows)

# Overall MRR = mean of the per-query reciprocal ranks
mrr_overall = mrr_df["Reciprocal Rank"].mean()

print(mrr_df)
print(f"\nMean Reciprocal Rank (MRR): {mrr_overall:.3f}")


                                               Query  \
0  apa yang bisa kita pelajari dari iklim di masa...   
1  kapan dampak aktivitas manusia terhadap iklim ...   
2              apakah model iklim semakin berkembang   
3  apa yang perlu diketahui manusia dalam adaptas...   
4  kenapa kota-kota yang berada di sekitar tepi l...   
5  apakah ada pengaruh terhadap rotasi planet apa...   
6  apa kontribusi yang bisa saya lakukan dalam me...   
7  bagaimana peluang dan tantangan dalam mitigasi...   
8  cara mengetahui sehangat dan sedingin apa cuac...   
9  apa yang terjadi pada kehidupan anak-anak di m...   

                                           Questions  Rank  Reciprocal Rank  
0  1. Apa yang Dapat Diajarkan Iklim Masa Lalu ke...     1              1.0  
1  1. Kapan Pengaruh Manusia terhadap Iklim Menja...     1              1.0  
2  1. Mengapa Begitu Banyak Model dan Skenario ya...     2              0.5  
3  1. Apa itu adaptasi perubahan iklim?\n2. Bagai...     2             

In [None]:
# Export the per-query ranking table of the fine-tuned model to Drive.
mrr_df.to_excel('/content/drive/MyDrive/myData/dataset/hasil2.xlsx')

# Upload the model to Hugging Face

In [None]:
!pip install huggingface_hub



In [None]:
# Interactive Hugging Face login widget; the token is entered at run time, not stored in the notebook.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import create_repo

repo_name = "indosbert-climate-faq"
# exist_ok=True keeps this cell re-runnable: without it, create_repo raises
# once the repository has already been created by a previous run.
create_repo(repo_name, repo_type="model", exist_ok=True)

RepoUrl('https://huggingface.co/annisamukhri/indosbert-climate-faq', endpoint='https://huggingface.co', repo_type='model', repo_id='annisamukhri/indosbert-climate-faq')

In [None]:
from huggingface_hub import upload_folder

# Push the folder written by model.save_pretrained(...) to the Hub repository.
# NOTE(review): repo_id hard-codes the account name, while the create_repo cell
# only uses the bare repo name — confirm both refer to the same repository.
upload_folder(
    folder_path="/content/drive/MyDrive/myqas/models1/finetuned-indoSBERT-large-e5bs16",
    repo_id="annisamukhri/indosbert-climate-faq"
)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/annisamukhri/indosbert-climate-faq/commit/8617564216ad34f4ebedff6237e2142dc1cfc5f8', commit_message='Upload folder using huggingface_hub', commit_description='', oid='8617564216ad34f4ebedff6237e2142dc1cfc5f8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/annisamukhri/indosbert-climate-faq', endpoint='https://huggingface.co', repo_type='model', repo_id='annisamukhri/indosbert-climate-faq'), pr_revision=None, pr_num=None)