In [None]:
# Instal pustaka yang dibutuhkan
!pip install pandas numpy scikit-learn matplotlib seaborn
!pip install torch # Torch sudah terinstal di Colab, tapi ini memastikan versi terbaru jika perlu
!pip install transformers accelerate datasets
!pip install nltk Sastrawi # Untuk Bahasa Indonesia

In [None]:
# Download the NLTK data packages used by this notebook.
import nltk
# Tokenizer data; presumably what word_tokenize relies on — TODO confirm.
nltk.download('punkt')
# Stop-word corpora (read elsewhere in this notebook via nltk.corpus.stopwords).
nltk.download('stopwords')
# NOTE(review): presumably the newer packaging of the punkt data required by
# recent NLTK releases — confirm against the installed NLTK version.
nltk.download('punkt_tab')

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import json # Tambahkan ini untuk menyimpan label_mapping

# --- 1. Load the dataset (simulated) ---
# Each record pairs one review text with its sentiment label; the two
# columns of `data` are derived from these records in the same order.
labelled_reviews = [
    ("Aplikasi ini sangat membantu sekali, navigasinya akurat!", "Positif"),
    ("Driver sering nyasar, bikin telat terus. Payah!", "Negatif"),
    ("Fitur barunya lumayan, tapi loadingnya agak lambat.", "Netral"),
    ("Harga kadang naik terlalu tinggi saat jam sibuk.", "Negatif"),
    ("Mudah digunakan dan responsif, suka sekali!", "Positif"),
    ("Pesanan saya dibatalkan sepihak, kecewa berat.", "Negatif"),
    ("Oke lah, tidak ada keluhan khusus.", "Netral"),
    ("Pelayanan bagus, ramah-ramah drivernya.", "Positif"),
    ("Promo makin sedikit, jadi kurang menarik.", "Negatif"),
    ("Antarmuka perlu diperbarui, terlalu jadul.", "Negatif"),
    ("Selalu jadi andalan kalau mau bepergian.", "Positif"),
    ("Notifikasi sering terlambat masuk, jadi gak update.", "Negatif"),
    ("Lumayan, tapi sering error kalau bayar pakai dompet digital.", "Netral"),
    ("Cepat sampai dan drivernya profesional.", "Positif"),
    ("Kenapa rating saya tiba-tiba turun padahal tidak ada masalah?", "Negatif"),
    ("Mantap jiwa! Aplikasi terbaik sejauh ini.", "Positif"),
    ("Saya tidak suka dengan sistem rating baru mereka.", "Negatif"),
    ("Netral saja, tidak ada yang spesial.", "Netral"),
    ("Kurang puas dengan fitur chat driver.", "Negatif"),
    ("Gila sih, sering banget dapat promo diskon!", "Positif"),
]
data = {
    'Ulasan': [review for review, _ in labelled_reviews],
    'Sentimen': [sentiment for _, sentiment in labelled_reviews],
}
df = pd.DataFrame(data)

# Preview the first rows and the class balance.
print("--- Data Awal ---")
display(df.head())
print("\nDistribusi Sentimen:")
display(df['Sentimen'].value_counts())

# --- 2. Text preprocessing ---
# Sastrawi exposes its stop-word list on the factory via get_stop_words();
# the StopWordRemover returned by create_stop_word_remover() has no
# get_stopwords() method, so asking the remover for the list raises
# AttributeError.
factory = StopWordRemoverFactory()
stop_words_remover = factory.create_stop_word_remover()
stop_words_id = factory.get_stop_words()
# A set gives constant-time membership checks during token filtering.
custom_stop_words = set(stop_words_id)

def preprocess_text(text):
    """Normalize a single review string for modelling.

    The text is lowercased, every character other than ``a-z`` and
    whitespace is turned into a space, the result is tokenized with
    ``word_tokenize`` and tokens found in the module-level
    ``custom_stop_words`` set are dropped.

    Args:
        text: The raw review string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = text.lower()
    # Substitute a space (not the empty string) so that words joined only by
    # punctuation, e.g. "ramah-ramah" or "bagus,ramah", stay separate tokens
    # instead of being fused into "ramahramah" / "bagusramah".
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)
    # NOTE(review): stop-word lists may contain negations (e.g. "tidak");
    # confirm that dropping them is acceptable for sentiment classification.
    filtered_tokens = [word for word in tokens if word not in custom_stop_words]
    return ' '.join(filtered_tokens)

# Clean every review and keep the result next to the raw text.
df['cleaned_ulasan'] = df['Ulasan'].map(preprocess_text)
print("\n--- Data Setelah Preprocessing ---")
display(df.head())

# --- 3. Encode the sentiment labels ---
# Fit the encoder on the label column, then translate the labels to integer ids.
le = LabelEncoder()
df['sentimen_encoded'] = le.fit(df['Sentimen']).transform(df['Sentimen'])
# Position i of label_mapping holds the class name for encoded id i.
label_mapping = [*le.classes_]
print(f"\nLabel Encoding Mapping: {label_mapping}")
display(df.head())  # Preview including the encoded column

# --- 4. Split the data ---
X = df['cleaned_ulasan']
y = df['sentimen_encoded']

# First split: 60% training, 40% held out, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# Second split: halve the held-out part into validation and test sets.
# A stratified split needs at least two samples of every class in y_temp and
# at least one slot per class in each half; otherwise train_test_split raises
# ValueError. On a dataset this small that can happen, so fall back to a plain
# random split when the requirement is not met.
temp_class_counts = y_temp.value_counts()
can_stratify_temp = (
    temp_class_counts.min() >= 2
    and len(y_temp) // 2 >= len(temp_class_counts)
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp if can_stratify_temp else None,
)

print(f"\nUkuran Training Set: {len(X_train)}")
print(f"Ukuran Validation Set: {len(X_val)}")
print(f"Ukuran Test Set: {len(X_test)}")

# X_train, X_val, X_test, y_train, y_val and y_test are notebook globals once
# this cell has run; the tokenization/training cells read them.

In [None]:
# --- 7. Evaluate the model on the test set ---
print("\n--- Evaluasi Model pada Test Set ---")
# The Trainer needs the tokenized split (input_ids / attention_mask columns);
# the raw `test_dataset` only carries 'text' and 'label'.
results = trainer.evaluate(tokenized_test_dataset)
print(f"Hasil evaluasi pada test set: {results}")

# Predict class ids for the classification report and confusion matrix.
predictions_output = trainer.predict(tokenized_test_dataset)
predicted_labels = np.argmax(predictions_output.predictions, axis=-1)

# Pin the full list of encoded class ids. The test split is tiny, so a class
# may be absent from both y_test and the predictions; without `labels`,
# classification_report raises because target_names no longer matches the
# classes found, and the confusion-matrix tick labels would be misaligned.
label_ids = list(range(len(label_mapping)))

print("\n--- Classification Report ---")
print(classification_report(
    y_test,
    predicted_labels,
    labels=label_ids,
    target_names=label_mapping,
    zero_division=0,  # report 0 instead of warning for classes without samples
))

print("\n--- Confusion Matrix ---")
cm = confusion_matrix(y_test, predicted_labels, labels=label_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_mapping, yticklabels=label_mapping)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# --- 8. Save the model and tokenizer ---
# The relative path below lives in the current session's working directory;
# on Google Colab that storage is temporary, so copy the folder to Google
# Drive if it has to persist.
model_save_path = "./model_sentimen_ulasan_aplikasi"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"\nModel dan tokenizer berhasil disimpan di: {model_save_path}")

# Store the label mapping (index -> class name) next to the model files.
with open(f"{model_save_path}/label_mapping.json", "w") as f:
    f.write(json.dumps(label_mapping))
print(f"Label mapping berhasil disimpan di: {model_save_path}/label_mapping.json")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset

# --- 5. Tokenize the data ---
# The tokenizer is loaded from the 'indobenchmark/indobert-base-p1'
# checkpoint, the Indonesian pre-trained model used in this notebook.
tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the module-level tokenizer.

    Sequences are truncated and padded to ``max_length=128``.
    """
    return tokenizer(
        examples['text'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )

# Wrap each split in a Hugging Face Dataset with 'text' and 'label' columns.
train_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_train.tolist(), 'label': y_train.tolist()}))
val_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_val.tolist(), 'label': y_val.tolist()}))
test_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_test.tolist(), 'label': y_test.tolist()}))

# Tokenize every split in batches.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# --- 6. Fine-tune the model (Transformer) ---
# Load the pre-trained checkpoint with a classification head that has one
# output per sentiment class.
model = AutoModelForSequenceClassification.from_pretrained(
    'indobenchmark/indobert-base-p1',
    num_labels=len(label_mapping)
)

num_train_epochs = 3
train_batch_size = 8
# Optimizer steps for the whole run (ceil division per epoch). Assumes a
# single device and no gradient accumulation.
total_train_steps = -(-len(tokenized_train_dataset) // train_batch_size) * num_train_epochs

training_args = TrainingArguments(
    output_dir='./results',                          # where checkpoints are written
    num_train_epochs=num_train_epochs,               # total number of training epochs
    per_device_train_batch_size=train_batch_size,    # batch size per device during training
    per_device_eval_batch_size=8,                    # batch size for evaluation
    # A fixed 500-step warmup is longer than the whole run on a small
    # dataset, which keeps the learning rate near zero for every step.
    # Cap the warmup at 10% of the run instead.
    warmup_steps=min(500, total_train_steps // 10),
    weight_decay=0.01,                               # strength of weight decay
    logging_dir='./logs',                            # directory for storing logs
    logging_steps=10,
    eval_strategy="epoch"                            # evaluate at the end of each epoch (current name of `evaluation_strategy`)
)

# NOTE(review): recent transformers releases prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,                                     # the model to be fine-tuned
    args=training_args,                              # training arguments defined above
    train_dataset=tokenized_train_dataset,           # training split
    eval_dataset=tokenized_val_dataset,              # validation split
    tokenizer=tokenizer,                             # tokenizer used in preprocessing
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

# Start training.
print("\n--- Memulai Training Model ---")
trainer.train()
print("--- Training Selesai ---")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset

# --- 5. Tokenize the data ---
# The tokenizer is loaded from the 'indobenchmark/indobert-base-p1'
# checkpoint, the Indonesian pre-trained model used in this notebook.
tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the module-level tokenizer.

    Sequences are truncated and padded to ``max_length=128``.
    """
    return tokenizer(
        examples['text'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )

# Wrap each split in a Hugging Face Dataset with 'text' and 'label' columns.
train_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_train.tolist(), 'label': y_train.tolist()}))
val_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_val.tolist(), 'label': y_val.tolist()}))
test_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_test.tolist(), 'label': y_test.tolist()}))

# Tokenize every split in batches.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# --- 6. Fine-tune the model (Transformer) ---
# Load the pre-trained checkpoint with a classification head that has one
# output per sentiment class.
model = AutoModelForSequenceClassification.from_pretrained(
    'indobenchmark/indobert-base-p1',
    num_labels=len(label_mapping)
)

num_train_epochs = 3
train_batch_size = 8
# Optimizer steps for the whole run (ceil division per epoch). Assumes a
# single device and no gradient accumulation.
total_train_steps = -(-len(tokenized_train_dataset) // train_batch_size) * num_train_epochs

training_args = TrainingArguments(
    output_dir='./results',                          # where checkpoints are written
    num_train_epochs=num_train_epochs,               # total number of training epochs
    per_device_train_batch_size=train_batch_size,    # batch size per device during training
    per_device_eval_batch_size=8,                    # batch size for evaluation
    # A fixed 500-step warmup is longer than the whole run on a small
    # dataset, which keeps the learning rate near zero for every step.
    # Cap the warmup at 10% of the run instead.
    warmup_steps=min(500, total_train_steps // 10),
    weight_decay=0.01,                               # strength of weight decay
    logging_dir='./logs',                            # directory for storing logs
    logging_steps=10,
    eval_strategy="epoch"                            # evaluate at the end of each epoch (current name of `evaluation_strategy`)
)

# NOTE(review): recent transformers releases prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,                                     # the model to be fine-tuned
    args=training_args,                              # training arguments defined above
    train_dataset=tokenized_train_dataset,           # training split
    eval_dataset=tokenized_val_dataset,              # validation split
    tokenizer=tokenizer,                             # tokenizer used in preprocessing
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

# Start training.
print("\n--- Memulai Training Model ---")
trainer.train()
print("--- Training Selesai ---")

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory # Keep import in case other parts of Sastrawi are used later
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import json # Tambahkan ini untuk menyimpan label_mapping

# --- 1. Load the dataset (simulated) ---
# Each record pairs one review text with its sentiment label; the two
# columns of `data` are derived from these records in the same order.
labelled_reviews = [
    ("Aplikasi ini sangat membantu sekali, navigasinya akurat!", "Positif"),
    ("Driver sering nyasar, bikin telat terus. Payah!", "Negatif"),
    ("Fitur barunya lumayan, tapi loadingnya agak lambat.", "Netral"),
    ("Harga kadang naik terlalu tinggi saat jam sibuk.", "Negatif"),
    ("Mudah digunakan dan responsif, suka sekali!", "Positif"),
    ("Pesanan saya dibatalkan sepihak, kecewa berat.", "Negatif"),
    ("Oke lah, tidak ada keluhan khusus.", "Netral"),
    ("Pelayanan bagus, ramah-ramah drivernya.", "Positif"),
    ("Promo makin sedikit, jadi kurang menarik.", "Negatif"),
    ("Antarmuka perlu diperbarui, terlalu jadul.", "Negatif"),
    ("Selalu jadi andalan kalau mau bepergian.", "Positif"),
    ("Notifikasi sering terlambat masuk, jadi gak update.", "Negatif"),
    ("Lumayan, tapi sering error kalau bayar pakai dompet digital.", "Netral"),
    ("Cepat sampai dan drivernya profesional.", "Positif"),
    ("Kenapa rating saya tiba-tiba turun padahal tidak ada masalah?", "Negatif"),
    ("Mantap jiwa! Aplikasi terbaik sejauh ini.", "Positif"),
    ("Saya tidak suka dengan sistem rating baru mereka.", "Negatif"),
    ("Netral saja, tidak ada yang spesial.", "Netral"),
    ("Kurang puas dengan fitur chat driver.", "Negatif"),
    ("Gila sih, sering banget dapat promo diskon!", "Positif"),
]
data = {
    'Ulasan': [review for review, _ in labelled_reviews],
    'Sentimen': [sentiment for _, sentiment in labelled_reviews],
}
df = pd.DataFrame(data)

# Preview the first rows and the class balance.
print("--- Data Awal ---")
display(df.head())
print("\nDistribusi Sentimen:")
display(df['Sentimen'].value_counts())

# --- 2. Text preprocessing ---
# The stop words come from NLTK's Indonesian list; this replaces the
# Sastrawi-based lookup, which was causing issues.
stop_words_id = stopwords.words('indonesian')
# Keep them in a set for the membership checks done while filtering tokens.
custom_stop_words = {*stop_words_id}

def preprocess_text(text):
    """Normalize a single review string for modelling.

    The text is lowercased, every character other than ``a-z`` and
    whitespace is turned into a space, the result is tokenized with
    ``word_tokenize`` and tokens found in the module-level
    ``custom_stop_words`` set are dropped.

    Args:
        text: The raw review string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    text = text.lower()
    # Substitute a space (not the empty string) so that words joined only by
    # punctuation, e.g. "tiba-tiba" or "bagus,ramah", stay separate tokens
    # instead of being fused into "tibatiba" / "bagusramah".
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)
    # NOTE(review): stop-word lists may contain negations (e.g. "tidak");
    # confirm that dropping them is acceptable for sentiment classification.
    filtered_tokens = [word for word in tokens if word not in custom_stop_words]
    return ' '.join(filtered_tokens)

# Clean every review and keep the result next to the raw text.
df['cleaned_ulasan'] = df['Ulasan'].map(preprocess_text)
print("\n--- Data Setelah Preprocessing ---")
display(df.head())

# --- 3. Encode the sentiment labels ---
# Fit the encoder on the label column, then translate the labels to integer ids.
le = LabelEncoder()
df['sentimen_encoded'] = le.fit(df['Sentimen']).transform(df['Sentimen'])
# Position i of label_mapping holds the class name for encoded id i.
label_mapping = [*le.classes_]
print(f"\nLabel Encoding Mapping: {label_mapping}")
display(df.head())  # Preview including the encoded column

# --- 4. Split the data ---
X, y = df['cleaned_ulasan'], df['sentimen_encoded']

# 60% training / 40% held out, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
# The held-out part is halved into validation and test sets; this second
# split is intentionally not stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print(
    f"\nUkuran Training Set: {len(X_train)}",
    f"Ukuran Validation Set: {len(X_val)}",
    f"Ukuran Test Set: {len(X_test)}",
    sep="\n",
)

# X_train, X_val, X_test, y_train, y_val and y_test are notebook globals once
# this cell has run; the tokenization/training cells read them.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset

# --- 5. Tokenize the data ---
# The tokenizer is loaded from the 'indobenchmark/indobert-base-p1'
# checkpoint, the Indonesian pre-trained model used in this notebook.
tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the module-level tokenizer.

    Sequences are truncated and padded to ``max_length=128``.
    """
    return tokenizer(
        examples['text'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )

# Wrap each split in a Hugging Face Dataset with 'text' and 'label' columns.
train_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_train.tolist(), 'label': y_train.tolist()}))
val_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_val.tolist(), 'label': y_val.tolist()}))
test_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_test.tolist(), 'label': y_test.tolist()}))

# Tokenize every split in batches.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# --- 6. Fine-tune the model (Transformer) ---
# Load the pre-trained checkpoint with a classification head that has one
# output per sentiment class.
model = AutoModelForSequenceClassification.from_pretrained(
    'indobenchmark/indobert-base-p1',
    num_labels=len(label_mapping)
)

num_train_epochs = 3
train_batch_size = 8
# Optimizer steps for the whole run (ceil division per epoch). Assumes a
# single device and no gradient accumulation.
total_train_steps = -(-len(tokenized_train_dataset) // train_batch_size) * num_train_epochs

training_args = TrainingArguments(
    output_dir='./results',                          # where checkpoints are written
    num_train_epochs=num_train_epochs,               # total number of training epochs
    per_device_train_batch_size=train_batch_size,    # batch size per device during training
    per_device_eval_batch_size=8,                    # batch size for evaluation
    # A fixed 500-step warmup is longer than the whole run on a small
    # dataset, which keeps the learning rate near zero for every step.
    # Cap the warmup at 10% of the run instead.
    warmup_steps=min(500, total_train_steps // 10),
    weight_decay=0.01,                               # strength of weight decay
    logging_dir='./logs',                            # directory for storing logs
    logging_steps=10,
    eval_strategy="epoch"                            # evaluate at the end of each epoch (current name of `evaluation_strategy`)
)

# NOTE(review): recent transformers releases prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,                                     # the model to be fine-tuned
    args=training_args,                              # training arguments defined above
    train_dataset=tokenized_train_dataset,           # training split
    eval_dataset=tokenized_val_dataset,              # validation split
    tokenizer=tokenizer,                             # tokenizer used in preprocessing
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

# Start training.
print("\n--- Memulai Training Model ---")
trainer.train()
print("--- Training Selesai ---")

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset

# --- 5. Tokenize the data ---
# The tokenizer is loaded from the 'indobenchmark/indobert-base-p1'
# checkpoint, the Indonesian pre-trained model used in this notebook.
tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the module-level tokenizer.

    Sequences are truncated and padded to ``max_length=128``.
    """
    return tokenizer(
        examples['text'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )

# Wrap each split in a Hugging Face Dataset with 'text' and 'label' columns.
train_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_train.tolist(), 'label': y_train.tolist()}))
val_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_val.tolist(), 'label': y_val.tolist()}))
test_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_test.tolist(), 'label': y_test.tolist()}))

# Tokenize every split in batches.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# --- 6. Fine-tune the model (Transformer) ---
# Load the pre-trained checkpoint with a classification head that has one
# output per sentiment class.
model = AutoModelForSequenceClassification.from_pretrained(
    'indobenchmark/indobert-base-p1',
    num_labels=len(label_mapping)
)

num_train_epochs = 3
train_batch_size = 8
# Optimizer steps for the whole run (ceil division per epoch). Assumes a
# single device and no gradient accumulation.
total_train_steps = -(-len(tokenized_train_dataset) // train_batch_size) * num_train_epochs

training_args = TrainingArguments(
    output_dir='./results',                          # where checkpoints are written
    num_train_epochs=num_train_epochs,               # total number of training epochs
    per_device_train_batch_size=train_batch_size,    # batch size per device during training
    per_device_eval_batch_size=8,                    # batch size for evaluation
    # A fixed 500-step warmup is longer than the whole run on a small
    # dataset, which keeps the learning rate near zero for every step.
    # Cap the warmup at 10% of the run instead.
    warmup_steps=min(500, total_train_steps // 10),
    weight_decay=0.01,                               # strength of weight decay
    logging_dir='./logs',                            # directory for storing logs
    logging_steps=10,
    eval_strategy="epoch"                            # evaluate at the end of each epoch
)

# NOTE(review): recent transformers releases prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,                                     # the model to be fine-tuned
    args=training_args,                              # training arguments defined above
    train_dataset=tokenized_train_dataset,           # training split
    eval_dataset=tokenized_val_dataset,              # validation split
    tokenizer=tokenizer,                             # tokenizer used in preprocessing
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

# Start training.
print("\n--- Memulai Training Model ---")
trainer.train()
print("--- Training Selesai ---")

In [None]:
# --- 7. Evaluate the model on the test set ---
print("\n--- Evaluasi Model pada Test Set ---")
# The Trainer is given the tokenized test split.
results = trainer.evaluate(tokenized_test_dataset)
print(f"Hasil evaluasi pada test set: {results}")

# Predict class ids for the classification report and confusion matrix.
predictions_output = trainer.predict(tokenized_test_dataset)
predicted_labels = np.argmax(predictions_output.predictions, axis=-1)

# Pin the full list of encoded class ids. The test split is tiny, so a class
# may be absent from both y_test and the predictions; without `labels`,
# classification_report raises because target_names no longer matches the
# classes found, and the confusion-matrix tick labels would be misaligned.
label_ids = list(range(len(label_mapping)))

print("\n--- Classification Report ---")
print(classification_report(
    y_test,
    predicted_labels,
    labels=label_ids,
    target_names=label_mapping,
    zero_division=0,  # report 0 instead of warning for classes without samples
))

print("\n--- Confusion Matrix ---")
cm = confusion_matrix(y_test, predicted_labels, labels=label_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_mapping, yticklabels=label_mapping)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# --- 8. Save the model and tokenizer ---
# The relative path below lives in the current session's working directory;
# on Google Colab that storage is temporary, so copy the folder to Google
# Drive if it has to persist.
model_save_path = "./model_sentimen_ulasan_aplikasi"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"\nModel dan tokenizer berhasil disimpan di: {model_save_path}")

# Store the label mapping (index -> class name) next to the model files.
with open(f"{model_save_path}/label_mapping.json", "w") as f:
    f.write(json.dumps(label_mapping))
print(f"Label mapping berhasil disimpan di: {model_save_path}/label_mapping.json")