In [1]:
pip install transformers torch pandas nltk

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import json
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import random

In [7]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
!wget https://raw.githubusercontent.com/brogangin/Chatbot_Soekarno_DL/refs/heads/main/dataset.json -O data.json

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('indonesian'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in string.punctuation and token not in stop_words]
    return " ".join(tokens)

# Load data from JSON file
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Preprocess and flatten the data
preprocessed_data = []
for intent in data['intents']:
    for pattern in intent['patterns']:
        preprocessed_data.append({
            'tag': intent['tag'],
            'pattern': preprocess_text(pattern),
            'response': intent['responses'][0]  # We'll use the first response for simplicity
        })

# Create DataFrame
df = pd.DataFrame(preprocessed_data)

# Create a mapping of tags to integers
tag_to_index = {tag: i for i, tag in enumerate(df['tag'].unique())}
index_to_tag = {i: tag for tag, i in tag_to_index.items()}

# Convert tags to indices
df['tag_index'] = df['tag'].map(tag_to_index)

# Save to CSV
df.to_csv('qa_data.csv', index=False)

# Load pre-trained BERT tokenizer for Indonesian
tokenizer = BertTokenizer.from_pretrained('indolem/indobert-base-uncased')

# Tokenize and encode the patterns
encoded_data = tokenizer(
    df['pattern'].tolist(),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

# Save encoded data
torch.save(encoded_data, 'encoded_data.pt')

# Save tag mappings
torch.save(tag_to_index, 'tag_to_index.pt')
torch.save(index_to_tag, 'index_to_tag.pt')

print("Data preparation completed.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


--2025-02-08 05:41:37--  https://raw.githubusercontent.com/brogangin/Chatbot_Soekarno_DL/refs/heads/main/dataset.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14009 (14K) [text/plain]
Saving to: ‘data.json’


2025-02-08 05:41:37 (5.62 MB/s) - ‘data.json’ saved [14009/14009]



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/234k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Data preparation completed.


In [8]:
# Load encoded data and DataFrame
encoded_data = torch.load('encoded_data.pt')
df = pd.read_csv('qa_data.csv')
tag_to_index = torch.load('tag_to_index.pt', weights_only=True)

# Prepare dataset
dataset = TensorDataset(
    encoded_data['input_ids'],
    encoded_data['attention_mask'],
    torch.tensor(df['tag_index'].values)
)

# Create data loader
batch_size = 16
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Load pre-trained BERT model
num_labels = len(tag_to_index)
model = BertForSequenceClassification.from_pretrained('indolem/indobert-base-uncased', num_labels=num_labels)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} completed. Average loss: {avg_loss:.4f}")

# Save the fine-tuned model
torch.save(model.state_dict(), 'fine_tuned_bert.pt')

print("Model training completed and saved.")

  encoded_data = torch.load('encoded_data.pt')


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Epoch 1/100 completed. Average loss: 2.9961
Epoch 2/100 completed. Average loss: 2.9285
Epoch 3/100 completed. Average loss: 2.9145
Epoch 4/100 completed. Average loss: 2.8383
Epoch 5/100 completed. Average loss: 2.8110
Epoch 6/100 completed. Average loss: 2.7982
Epoch 7/100 completed. Average loss: 2.7525
Epoch 8/100 completed. Average loss: 2.7338
Epoch 9/100 completed. Average loss: 2.7003
Epoch 10/100 completed. Average loss: 2.6395
Epoch 11/100 completed. Average loss: 2.5034
Epoch 12/100 completed. Average loss: 2.4164
Epoch 13/100 completed. Average loss: 2.3019
Epoch 14/100 completed. Average loss: 2.2523
Epoch 15/100 completed. Average loss: 2.0829
Epoch 16/100 completed. Average loss: 2.0024
Epoch 17/100 completed. Average loss: 1.9337
Epoch 18/100 completed. Average loss: 1.7954
Epoch 19/100 completed. Average loss: 1.6925
Epoch 20/100 completed. Average loss: 1.5788
Epoch 21/100 completed. Average loss: 1.4598
Epoch 22/100 completed. Average loss: 1.3413
Epoch 23/100 comple

In [None]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('indonesian'))

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('indolem/indobert-base-uncased')

# Load tag mappings
tag_to_index = torch.load('tag_to_index.pt', weights_only=True)
index_to_tag = torch.load('index_to_tag.pt', weights_only=True)

# Load pre-trained BERT model
num_labels = len(tag_to_index)
model = BertForSequenceClassification.from_pretrained('indolem/indobert-base-uncased', num_labels=num_labels)

# Load fine-tuned model weights
model.load_state_dict(torch.load('fine_tuned_bert.pt', weights_only=True))
model.eval()

# Load intents data
with open('dataset.json', 'r', encoding='utf-8') as f:
    intents = json.load(f)

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in string.punctuation and token not in stop_words]
    return " ".join(tokens)

def get_response(text):
    preprocessed_text = preprocess_text(text)
    encoded_input = tokenizer(
        preprocessed_text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )

    with torch.no_grad():
        output = model(**encoded_input)

    predicted_label = torch.argmax(output.logits, dim=1).item()
    predicted_tag = index_to_tag[predicted_label]

    # Find the corresponding intent
    for intent in intents['intents']:
        if intent['tag'] == predicted_tag:
            return random.choice(intent['responses'])

    return "Maaf, saya tidak mengerti pertanyaan Anda."

def chat():
    print("Chatbot: Halo! Saya adalah chatbot yang berspesialisasi tentang biografi Ir. Soekarno. Apa yang ingin Anda tanyakan? (Ketik 'keluar' untuk mengakhiri)")

    while True:
        user_input = input("Anda: ")
        if user_input.lower() == 'keluar':
            print("Chatbot: Terima kasih atas percakapannya. Sampai jumpa!")
            break

        response = get_response(user_input)
        print("Chatbot:", response)

# Jalankan chatbot
if __name__ == "__main__":
    chat()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Chatbot: Halo! Saya adalah chatbot yang berspesialisasi tentang biografi Ir. Soekarno. Apa yang ingin Anda tanyakan? (Ketik 'keluar' untuk mengakhiri)
Anda: Hai
Chatbot: Hai! Senang bertemu dengan Anda. Apa yang ingin anda ketahui tentang Ir. Soekarno?
Anda: Siapakah itu soekarno
Chatbot: Ir. Soekarno adalah presiden pertama Republik Indonesia, menjabat dari 1945 hingga 1967. Beliau dikenal sebagai Proklamator Kemerdekaan bersama Mohammad Hatta pada 17 Agustus 1945.
Anda: Kapan soekarno lahir
Chatbot: Lahir pada 6 Juni 1901 di Surabaya, Soekarno awalnya bernama Kusno Sosrodihardjo. Nama ini diganti karena ia sering sakit-sakitan.
Anda: bagaimana kehidupan keluarganya
Chatbot: Soekarno menikah dengan beberapa wanita, termasuk Fatmawati dan Hartini. Salah satu anaknya, Megawati Soekarnoputri, juga pernah menjadi Presiden Indonesia.
Anda: Apa perannya dalam kemerdekaan indonesia
Chatbot: Indonesia merdeka pada tanggal 17 Agustus 1945, yang ditandai dengan pembacaan teks proklamasi oleh So