In [9]:
pip install transformers torch pandas nltk



In [1]:
import json
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import random

In [4]:
# Fetch every NLTK resource used by the preprocessing step (tokenizer models,
# stopword lists and the WordNet data behind the lemmatizer).
for resource_name in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

# Shared preprocessing state: the Indonesian stopword set and a WordNet lemmatizer.
stop_words = set(stopwords.words('indonesian'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized tokens.

    The text is lower-cased and split with ``word_tokenize``. A token is
    dropped when it is found in ``string.punctuation`` or in ``stop_words``;
    every remaining token is passed through ``lemmatizer.lemmatize``.

    Note: ``string.punctuation`` is a ``str``, so the punctuation test is a
    substring check; a multi-character token such as "..." is therefore kept.

    Args:
        text: The raw input string.

    Returns:
        The surviving tokens joined by single spaces ("" when none survive).
    """
    kept = []
    for word in word_tokenize(text.lower()):
        if word in string.punctuation or word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return " ".join(kept)

# Load data from JSON file
with open('dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Preprocess and flatten the data: one row per (intent, pattern) pair
preprocessed_data = []
for intent in data['intents']:
    for pattern in intent['patterns']:
        preprocessed_data.append({
            'tag': intent['tag'],
            'pattern': preprocess_text(pattern),
            'response': intent['responses'][0]  # We'll use the first response for simplicity
        })

# Create DataFrame
df = pd.DataFrame(preprocessed_data)

# Create a mapping of tags to integers (and its inverse for decoding predictions)
tag_to_index = {tag: i for i, tag in enumerate(df['tag'].unique())}
index_to_tag = {i: tag for tag, i in tag_to_index.items()}

# Convert tags to indices
df['tag_index'] = df['tag'].map(tag_to_index)

# Save to CSV
df.to_csv('qa_data.csv', index=False)

# Load pre-trained BERT tokenizer for Indonesian
tokenizer = BertTokenizer.from_pretrained('indolem/indobert-base-uncased')

# Tokenize and encode the patterns
encoded_data = tokenizer(
    df['pattern'].tolist(),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

# Save encoded data
# Persist a plain dict of tensors instead of the tokenizer's BatchEncoding
# wrapper: a pickled BatchEncoding is rejected by torch.load when
# weights_only=True (the default starting with PyTorch 2.6), while a dict of
# tensors loads under either setting and is still indexed by the same keys.
torch.save(dict(encoded_data), 'encoded_data.pt')

# Save tag mappings
torch.save(tag_to_index, 'tag_to_index.pt')
torch.save(index_to_tag, 'index_to_tag.pt')

print("Data preparation completed.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/234k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Data preparation completed.


In [7]:
# Load encoded data and DataFrame
# encoded_data.pt is written by this notebook's data-preparation cell and may
# hold a pickled tokenizer output object rather than bare tensors. torch.load
# refuses such objects when weights_only=True (the default from PyTorch 2.6),
# so opt out explicitly for this trusted, locally produced file. Passing the
# argument explicitly also avoids the FutureWarning about the implicit default.
encoded_data = torch.load('encoded_data.pt', weights_only=False)
df = pd.read_csv('qa_data.csv')
tag_to_index = torch.load('tag_to_index.pt', weights_only=True)

# Prepare dataset: (input_ids, attention_mask, label) per pattern
dataset = TensorDataset(
    encoded_data['input_ids'],
    encoded_data['attention_mask'],
    torch.tensor(df['tag_index'].values)
)

# Create data loader
batch_size = 16
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Load pre-trained BERT model with one output logit per intent tag
num_labels = len(tag_to_index)
model = BertForSequenceClassification.from_pretrained('indolem/indobert-base-uncased', num_labels=num_labels)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        # Passing labels makes the model compute the classification loss itself
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} completed. Average loss: {avg_loss:.4f}")

# Save the fine-tuned model
torch.save(model.state_dict(), 'fine_tuned_bert.pt')

print("Model training completed and saved.")

  encoded_data = torch.load('encoded_data.pt')
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/100 completed. Average loss: 3.0002
Epoch 2/100 completed. Average loss: 2.8806
Epoch 3/100 completed. Average loss: 2.8807
Epoch 4/100 completed. Average loss: 2.8895
Epoch 5/100 completed. Average loss: 2.8576
Epoch 6/100 completed. Average loss: 2.8008
Epoch 7/100 completed. Average loss: 2.7155
Epoch 8/100 completed. Average loss: 2.6520
Epoch 9/100 completed. Average loss: 2.6644
Epoch 10/100 completed. Average loss: 2.6063
Epoch 11/100 completed. Average loss: 2.4626
Epoch 12/100 completed. Average loss: 2.3933
Epoch 13/100 completed. Average loss: 2.2535
Epoch 14/100 completed. Average loss: 2.1935
Epoch 15/100 completed. Average loss: 2.0839
Epoch 16/100 completed. Average loss: 1.9888
Epoch 17/100 completed. Average loss: 1.8855
Epoch 18/100 completed. Average loss: 1.8727
Epoch 19/100 completed. Average loss: 1.7378
Epoch 20/100 completed. Average loss: 1.5670
Epoch 21/100 completed. Average loss: 1.4604
Epoch 22/100 completed. Average loss: 1.3758
Epoch 23/100 comple

In [8]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('indonesian'))

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('indolem/indobert-base-uncased')

# Load tag mappings
tag_to_index = torch.load('tag_to_index.pt', weights_only=True)
index_to_tag = torch.load('index_to_tag.pt', weights_only=True)

# Load pre-trained BERT model
num_labels = len(tag_to_index)
model = BertForSequenceClassification.from_pretrained('indolem/indobert-base-uncased', num_labels=num_labels)

# Load fine-tuned model weights
# The training cell saves the state dict from a model that is moved to CUDA
# when a GPU is available. map_location='cpu' lets the same checkpoint be
# restored on a machine without a GPU; this model instance is never moved off
# the CPU in this cell, so the weights end up where the model lives.
model.load_state_dict(torch.load('fine_tuned_bert.pt', map_location='cpu', weights_only=True))
model.eval()

# Load intents data
with open('dataset.json', 'r', encoding='utf-8') as f:
    intents = json.load(f)

def preprocess_text(text):
    """Lower-case, tokenize, filter and lemmatize ``text``.

    Tokens produced by ``word_tokenize`` are discarded when they are found in
    ``string.punctuation`` or in ``stop_words``; the rest are lemmatized with
    ``lemmatizer`` and joined with single spaces.

    Note: the punctuation test is a substring check against the
    ``string.punctuation`` string, so multi-character tokens such as "..."
    are not removed.

    Args:
        text: The raw input string.

    Returns:
        The cleaned text as one space-separated string ("" if nothing is left).
    """
    lemmas = []
    for candidate in word_tokenize(text.lower()):
        is_punctuation = candidate in string.punctuation
        is_stopword = candidate in stop_words
        if is_punctuation or is_stopword:
            continue
        lemmas.append(lemmatizer.lemmatize(candidate))
    return " ".join(lemmas)

def get_response(text):
    """Return a response for the intent the model predicts for ``text``.

    The text is preprocessed with ``preprocess_text``, encoded with
    ``tokenizer`` and classified by ``model``. The arg-max label is mapped
    back to a tag through ``index_to_tag``, and a random entry from that
    intent's ``responses`` list is returned.

    Args:
        text: The raw user message.

    Returns:
        A response string. The fallback apology is returned when the message
        is empty after preprocessing, when the predicted label or tag is
        unknown, or when the matching intent has no responses.
    """
    fallback = "Maaf, saya tidak mengerti pertanyaan Anda."

    preprocessed_text = preprocess_text(text)
    if not preprocessed_text:
        # Nothing left to classify (blank input or only stopwords/punctuation);
        # running the model on an empty sequence would pick an arbitrary intent.
        return fallback

    encoded_input = tokenizer(
        preprocessed_text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt'
    ).to(model.device)  # keep the inputs on the same device as the model

    with torch.no_grad():
        output = model(**encoded_input)

    predicted_label = torch.argmax(output.logits, dim=1).item()
    predicted_tag = index_to_tag.get(predicted_label)

    # Find the corresponding intent; skip intents without any response so
    # random.choice is never called on an empty list.
    for intent in intents['intents']:
        if intent['tag'] == predicted_tag and intent['responses']:
            return random.choice(intent['responses'])

    return fallback

def chat():
    """Run an interactive console chat loop.

    Prints a greeting, then repeatedly reads a line from the user, passes it
    to ``get_response`` and prints the answer. The loop ends when the user
    types 'keluar' (case-insensitive, surrounding whitespace ignored), or when
    input is no longer available (end of input or an interrupt).
    """
    farewell = "Chatbot: Terima kasih atas percakapannya. Sampai jumpa!"

    print("Chatbot: Halo! Saya adalah chatbot yang berspesialisasi tentang biografi Ir. Soekarno. Apa yang ingin Anda tanyakan? (Ketik 'keluar' untuk mengakhiri)")

    while True:
        try:
            user_input = input("Anda: ")
        except (EOFError, KeyboardInterrupt):
            # Input stream closed or the session was interrupted: leave the
            # loop with the normal farewell instead of a traceback.
            print(farewell)
            break

        # Trim whitespace so that e.g. "keluar " still ends the conversation.
        if user_input.strip().lower() == 'keluar':
            print(farewell)
            break

        response = get_response(user_input)
        print("Chatbot:", response)

# Run the chatbot: start the interactive loop only when this code is executed
# as the main module (not when it is imported).
if __name__ == "__main__":
    chat()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Chatbot: Halo! Saya adalah chatbot yang berspesialisasi tentang biografi Ir. Soekarno. Apa yang ingin Anda tanyakan? (Ketik 'keluar' untuk mengakhiri)
Anda: Hai apa kabar
Chatbot: Selamat datang! silahkan bertanya, saya bisa menjelaskan tentang Ir. Soekarno.
Anda: siapa itu soekarno?
Chatbot: Presiden pertama Indonesia, Ir. Soekarno, memimpin negara dari 1945 hingga 1967. Beliau adalah Proklamator Kemerdekaan yang memproklamasikan Indonesia merdeka pada 17 Agustus 1945.
Anda: Dimana dan kapan kelahiran soekarno?
Chatbot: Soekarno lahir pada 6 Juni 1901 di Surabaya, Jawa Timur. Nama kecilnya adalah Kusno Sosrodihardjo, tetapi kemudian diganti menjadi Soekarno karena ia sering sakit-sakitan.
Anda: Bagaimana kehidupan keluarga soekarno
Chatbot: Soekarno menikah dengan beberapa wanita, termasuk Fatmawati dan Hartini. Salah satu anaknya, Megawati Soekarnoputri, juga pernah menjadi Presiden Indonesia.
Anda: Dimana Soekarno bersekolah?
Chatbot: Ir. Soekarno memperoleh gelar insinyur dari Tech