In [1]:
import kagglehub

# Fetch the latest version of the DANA Play Store review dataset through
# kagglehub and report the directory the files were placed in.
path = kagglehub.dataset_download(
    "alexmariosimanjuntak/dana-app-sentiment-review-on-playstore-indonesia"
)
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/alexmariosimanjuntak/dana-app-sentiment-review-on-playstore-indonesia?dataset_version_number=1...


100%|██████████| 1.68M/1.68M [00:00<00:00, 2.16MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/alexmariosimanjuntak/dana-app-sentiment-review-on-playstore-indonesia/versions/1





In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1. LOAD DATA
# Path to the labelled review CSV (adjust it when running locally or on Kaggle).
file_path = 'review_dana_labelled.csv'
df = pd.read_csv(file_path)

# Drop rows whose review text or sentiment label is missing.
df = df.dropna(subset=['content', 'sentimen'])

# 2. PREPROCESSING
# Encode the string labels as integers. LabelEncoder orders the classes by
# sorting them, so NEGATIVE / NEUTRAL / POSITIVE become 0 / 1 / 2.
le = LabelEncoder()
df['label'] = le.fit_transform(df['sentimen'])
num_classes = len(le.classes_)

# Split the data (80% train, 20% test). The split is stratified on the label
# so both parts keep the same class proportions, matching the other
# train/test splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df['content'].values,
    df['label'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['label'].values
)

# Vectorization (turn raw text into integer token ids)
vocab_size = 20000  # maximum vocabulary size
maxlen = 100        # every review is padded/truncated to this many tokens
vectorize_layer = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=maxlen
)
# Learn the vocabulary from the training texts only.
vectorize_layer.adapt(X_train)

# 3. TRANSFORMER ARCHITECTURE
class TransformerBlock(layers.Layer):
    """Transformer encoder block: multi-head self-attention plus a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Output width of the feed-forward network and ``key_dim`` of
            every attention head. Because of the residual additions it has to
            match the size of the last axis of the block's input.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after the attention and feed-forward
            sub-layers.
        **kwargs: Base-layer arguments such as ``name`` or ``dtype``; forwarded
            to ``layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        # Forward base-layer kwargs so the layer can be named and can be
        # re-created from the config returned by get_config().
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply self-attention and the feed-forward net to ``inputs``.

        Args:
            inputs: Tensor that is used as both query and value of the
                attention layer.
            training: Passed to the dropout layers to switch dropout on/off.

        Returns:
            The layer-normalized output of the second residual connection.
        """
        # Self-attention: the same tensor is passed as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so Keras can serialize the layer."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table. Positions are
            numbered from 0 along the last axis of the input, so that axis is
            expected to be at most ``maxlen`` long.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
        **kwargs: Base-layer arguments such as ``name`` or ``dtype``; forwarded
            to ``layers.Layer``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        # Forward base-layer kwargs so the layer can be named and can be
        # re-created from the config returned by get_config().
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add the matching position embeddings.

        Args:
            x: Tensor of integer token ids; its last axis is the sequence axis.

        Returns:
            Token embeddings plus position embeddings (the position term is
            broadcast over the leading axes).
        """
        # Positions 0 .. length-1, where length is the size of the last axis.
        length = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so Keras can serialize the layer."""
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

# Build the model
embed_dim = 32  # embedding width
num_heads = 2   # number of attention heads
ff_dim = 32     # hidden width of the feed-forward network

inputs = layers.Input(shape=(maxlen,))
x = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(inputs)
x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# 4. TRAINING
# Vectorize each split exactly once and reuse the tensors for training,
# validation and prediction instead of re-tokenizing the test texts twice.
X_train_vec = vectorize_layer(X_train)
X_test_vec = vectorize_layer(X_test)

print("Memulai pelatihan model...")
# NOTE: the test split is also passed as validation_data, so the val_* values
# printed during training are test-set metrics.
model.fit(
    X_train_vec, y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test_vec, y_test)
)

# 5. LABEL THE TEST DATA (INFERENCE)
# Predict class probabilities for the test split and take the most likely class.
predictions = model.predict(X_test_vec)
predicted_labels = np.argmax(predictions, axis=1)

# Map the integer ids back to the original string labels.
predicted_sentimen = le.inverse_transform(predicted_labels)
actual_sentimen = le.inverse_transform(y_test)

# Collect text, true label and predicted label in one DataFrame for analysis.
results_df = pd.DataFrame({
    'Teks_Review': X_test,
    'Sentimen_Asli': actual_sentimen,
    'Sentimen_Prediksi_Transformer': predicted_sentimen
})

# Save the labelled results to CSV.
results_df.to_csv('hasil_pelabelan_transformer.csv', index=False)
print("\nHasil pelabelan telah disimpan ke 'hasil_pelabelan_transformer.csv'")

# Show the first 10 results.
print("\nContoh Hasil Pelabelan:")
print(results_df.head(10))

Memulai pelatihan model...
Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.6595 - loss: 0.7861 - val_accuracy: 0.8396 - val_loss: 0.3992
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8590 - loss: 0.3608 - val_accuracy: 0.8456 - val_loss: 0.3844
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8977 - loss: 0.2674 - val_accuracy: 0.8463 - val_loss: 0.4057
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9184 - loss: 0.2214 - val_accuracy: 0.8391 - val_loss: 0.4302
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9307 - loss: 0.1807 - val_accuracy: 0.8348 - val_loss: 0.4431
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step

Hasil pelabelan telah disimpan ke 'hasil_pelabelan_transformer.csv'

Contoh Hasil Pelabelan

In [12]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1. LOAD & SPLIT DATA (column names match the CSV header)
# Read the labelled reviews and drop rows missing the text or the label.
df = pd.read_csv('/content/review_dana_labelled.csv')
df = df.dropna(subset=['content', 'sentimen'])

# Features are the review text in 'content'; the target is 'sentimen'.
X, y = df['content'], df['sentimen']

# Encode the string labels as integers. LabelEncoder orders the classes by
# sorting them, i.e. NEGATIVE -> 0, NEUTRAL -> 1, POSITIVE -> 2.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Stratified 80/20 split so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# 2. VECTORIZATION
maxlen = 100        # every review is padded/truncated to this many tokens
vocab_size = 15000  # maximum vocabulary size
vectorize_layer = layers.TextVectorization(
    max_tokens=vocab_size, output_mode='int', output_sequence_length=maxlen
)
# Learn the vocabulary from the training texts only.
vectorize_layer.adapt(X_train.values)

# 3. TRANSFORMER COMPONENTS
class TransformerBlock(layers.Layer):
    """Transformer encoder block: multi-head self-attention plus a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Output width of the feed-forward network and ``key_dim`` of
            every attention head. Because of the residual additions it has to
            match the size of the last axis of the block's input.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after the attention and feed-forward
            sub-layers.
        **kwargs: Base-layer arguments such as ``name`` or ``dtype``; forwarded
            to ``layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        # Forward base-layer kwargs so the layer can be named and can be
        # re-created from the config returned by get_config().
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply self-attention and the feed-forward net to ``inputs``.

        Args:
            inputs: Tensor that is used as both query and value of the
                attention layer.
            training: Passed to the dropout layers to switch dropout on/off.

        Returns:
            The layer-normalized output of the second residual connection.
        """
        # Self-attention: the same tensor is passed as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so Keras can serialize the layer."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table. Positions are
            numbered from 0 along the last axis of the input, so that axis is
            expected to be at most ``maxlen`` long.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
        **kwargs: Base-layer arguments such as ``name`` or ``dtype``; forwarded
            to ``layers.Layer``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        # Forward base-layer kwargs so the layer can be named and can be
        # re-created from the config returned by get_config().
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add the matching position embeddings.

        Args:
            x: Tensor of integer token ids; its last axis is the sequence axis.

        Returns:
            Token embeddings plus position embeddings (the position term is
            broadcast over the leading axes).
        """
        # Positions 0 .. length-1, where length is the size of the last axis.
        length = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so Keras can serialize the layer."""
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

# 4. MODEL BUILDING
embed_dim = 32  # embedding width
num_heads = 2   # number of attention heads
ff_dim = 32     # hidden width of the feed-forward network

# Chain the hidden layers in order; `x` ends up as the output of the last one.
inputs = layers.Input(shape=(maxlen,))
x = inputs
for _stage in (
    TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim),
    TransformerBlock(embed_dim, num_heads, ff_dim),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(32, activation="relu"),
):
    x = _stage(x)
# One softmax unit per class known to the label encoder.
outputs = layers.Dense(len(le.classes_), activation="softmax")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# 5. TRAINING & LABELLING
# Turn the raw texts into integer token-id tensors once, up front.
X_train_vec = vectorize_layer(X_train.values)
X_test_vec = vectorize_layer(X_test.values)

# The test split is also used as validation data during training.
model.fit(
    X_train_vec,
    y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_test_vec, y_test),
)

# Label the test data: take the most probable class for every review.
predictions = model.predict(X_test_vec)
pred_labels = predictions.argmax(axis=1)

# Put text, true label and predicted label side by side for verification.
hasil_akhir = pd.DataFrame({
    'Teks_Review': X_test,
    'Label_Asli': le.inverse_transform(y_test),
    'Prediksi_Transformer': le.inverse_transform(pred_labels),
})

print("\nContoh Hasil Pelabelan Otomatis:")
print(hasil_akhir.head())

Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.6279 - loss: 0.8722 - val_accuracy: 0.7446 - val_loss: 0.5333
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.7990 - loss: 0.4773 - val_accuracy: 0.8355 - val_loss: 0.4002
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8659 - loss: 0.3460 - val_accuracy: 0.8441 - val_loss: 0.3975
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8956 - loss: 0.2739 - val_accuracy: 0.8426 - val_loss: 0.4088
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9109 - loss: 0.2321 - val_accuracy: 0.8421 - val_loss: 0.4356
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step

Contoh Hasil Pelabelan Otomatis:
                                             Teks_Review Label_Asli  \
35325  Jelek..

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the CSV file into a DataFrame using the correct path
df = pd.read_csv('/content/review_dana_labelled.csv')

# Display the first 5 rows of the DataFrame
print("First 5 rows of the DataFrame:")
print(df.head())

# Display the DataFrame's information to check data types and missing values
print("\nDataFrame Info:")
df.info()

# Drop rows with a missing review text or label, as the other loading cells
# in this notebook do; the stratified split below and the later tokenizer
# call both need a real string in every row.
df = df.dropna(subset=['content', 'sentimen'])

# Separate features ('content') and target ('sentimen')
X = df['content']
y = df['sentimen']

# Split the data into training and testing sets (80% train, 20% test)
# Stratify by 'sentimen' so both splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("\nShape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

# Class shares per split, to confirm the stratification worked
print("\nSentiment distribution in training set:")
print(y_train.value_counts(normalize=True))
print("\nSentiment distribution in testing set:")
print(y_test.value_counts(normalize=True))

First 5 rows of the DataFrame:
          userName  score                   at  \
0     Elisya Kasni      5  2024-02-15 11:24:56   
1       Rusman Man      2  2024-02-15 11:24:03   
2     Qiliw Sadega      1  2024-02-15 11:23:34   
3  Kijutjrv2 Kijut      3  2024-02-15 11:22:46   
4     Fifi Alfiyah      1  2024-02-15 11:21:34   

                                             content  sentimen  
0                                              Bagus  POSITIVE  
1                             Dana mmg keren mantap.  POSITIVE  
2  Saya ngajuin upgrade dana premium krna ktp say...  NEGATIVE  
3  Kocak mana diskon nya ml malah eror segala kag...  NEGATIVE  
4  Saldo hilang karena no lama Hilang ganti no sa...  NEGATIVE  

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   userName  50000 non-null  object
 1   score     50000 non-null  int64 
 

In [15]:
from sklearn.preprocessing import LabelEncoder

# 1. Create the label encoder.
le = LabelEncoder()

# 2. Learn the classes from the training labels, then reuse that same mapping
# for the test labels. LabelEncoder numbers the classes in sorted order
# (NEGATIVE -> 0, NEUTRAL -> 1, POSITIVE -> 2 for the string labels); if the
# labels are already integers the mapping is simply the identity.
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Show a few encoded labels and the class -> id mapping.
print("Contoh label setelah di-encode:", y_train_enc[:5])
print(
    "Mapping label:",
    {cls: code for cls, code in zip(le.classes_, le.transform(le.classes_))},
)

# 3. Run the training.
# NOTE(review): `model`, `train_inputs` and `test_inputs` are not created in
# this cell and no visible cell defines `train_inputs` / `test_inputs`; they
# presumably come from cells that are no longer in the notebook - confirm
# before relying on Restart & Run All.
history = model.fit(
    train_inputs,
    y_train_enc,
    validation_data=(test_inputs, y_test_enc),
    batch_size=32,
    epochs=3,
)

Contoh label setelah di-encode: [1 0 0 1 0]
Mapping label: {np.int64(0): np.int64(0), np.int64(1): np.int64(1), np.int64(2): np.int64(2)}
Epoch 1/3
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 10ms/step - accuracy: 0.6747 - loss: 0.7418 - val_accuracy: 0.8099 - val_loss: 0.4496
Epoch 2/3
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - accuracy: 0.8107 - loss: 0.4423 - val_accuracy: 0.8338 - val_loss: 0.3996
Epoch 3/3
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.8431 - loss: 0.3804 - val_accuracy: 0.8395 - val_loss: 0.3871


In [7]:
import tensorflow as tf
from transformers import AutoTokenizer

# Load the pre-trained tokenizer that belongs to the Indonesian BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

# Encoding options shared by both splits: pad or truncate every review to
# exactly 128 tokens and return TensorFlow tensors.
_encode_options = dict(
    padding='max_length',
    truncation=True,
    max_length=128,
    return_tensors='tf',
)

# Tokenize and encode the training and the testing texts.
X_train_encoded = tokenizer(list(X_train.values), **_encode_options)
X_test_encoded = tokenizer(list(X_test.values), **_encode_options)

# Report the tensor shapes for the training data; token_type_ids is only
# printed when the tokenizer returned it.
print("Shape of X_train_encoded input_ids:", X_train_encoded['input_ids'].shape)
print("Shape of X_train_encoded attention_mask:", X_train_encoded['attention_mask'].shape)
if 'token_type_ids' in X_train_encoded:
    print("Shape of X_train_encoded token_type_ids:", X_train_encoded['token_type_ids'].shape)

# Same report for the testing data.
print("\nShape of X_test_encoded input_ids:", X_test_encoded['input_ids'].shape)
print("Shape of X_test_encoded attention_mask:", X_test_encoded['attention_mask'].shape)
if 'token_type_ids' in X_test_encoded:
    print("Shape of X_test_encoded token_type_ids:", X_test_encoded['token_type_ids'].shape)

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


Shape of X_train_encoded input_ids: (40000, 128)
Shape of X_train_encoded attention_mask: (40000, 128)
Shape of X_train_encoded token_type_ids: (40000, 128)

Shape of X_test_encoded input_ids: (10000, 128)
Shape of X_test_encoded attention_mask: (10000, 128)
Shape of X_test_encoded token_type_ids: (10000, 128)


In [8]:
import numpy as np

# Build the label <-> id lookup tables from the sorted unique training labels.
_classes = np.unique(y_train)
label_to_id = dict(zip(_classes, range(len(_classes))))
id_to_label = dict(enumerate(_classes))


def _encode_labels(labels):
    """Map every label to its integer id; an unknown label raises KeyError."""
    return np.array([label_to_id[label] for label in labels])


# Convert the string labels of both splits to numerical ids.
y_train_encoded = _encode_labels(y_train)
y_test_encoded = _encode_labels(y_test)

num_classes = len(label_to_id)

# Summarize the mapping and the encoded arrays.
print("Label to ID mapping:", label_to_id)
print("ID to Label mapping:", id_to_label)
print("Number of classes:", num_classes)
print("\nShape of y_train_encoded:", y_train_encoded.shape)
print("Shape of y_test_encoded:", y_test_encoded.shape)
print("First 5 encoded training labels:", y_train_encoded[:5])
print("First 5 encoded testing labels:", y_test_encoded[:5])

Label to ID mapping: {'NEGATIVE': 0, 'NEUTRAL': 1, 'POSITIVE': 2}
ID to Label mapping: {0: 'NEGATIVE', 1: 'NEUTRAL', 2: 'POSITIVE'}
Number of classes: 3

Shape of y_train_encoded: (40000,)
Shape of y_test_encoded: (10000,)
First 5 encoded training labels: [1 0 0 1 0]
First 5 encoded testing labels: [0 1 2 0 2]


In [9]:
from transformers import TFAutoModelForSequenceClassification
import tensorflow as tf

# Training setup: Adam with a small learning rate, sparse categorical
# cross-entropy for integer class ids, and accuracy as the reported metric.
# from_logits=True presumably reflects that the classification head returns
# unnormalized scores - confirm against the model documentation.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")]

# Load the pre-trained checkpoint with a sequence-classification head;
# num_labels sets how many output classes the head has.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1", num_labels=num_classes
)

# Compile the model for training.
model.compile(
    optimizer=optimizer,
    loss=loss_fn,
    metrics=metrics,
)

print("Transformer model loaded and compiled successfully.")
model.summary()

tf_model.h5:   0%|          | 0.00/656M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Transformer model loaded and compiled successfully.
Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  124441344 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
Total params: 124443651 (474.71 MB)
Trainable params: 124443651 (474.71 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
import tensorflow as tf

def _to_dataset(encoded, labels):
    """Pair the three tokenizer tensors with their labels as a tf.data.Dataset."""
    features = {
        key: encoded[key]
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    return tf.data.Dataset.from_tensor_slices((features, labels))


# Training pipeline: shuffle with a 1000-element buffer, then batch by 32.
train_dataset = (
    _to_dataset(X_train_encoded, y_train_encoded)
    .shuffle(1000)
    .batch(32)
)

# Evaluation pipeline: batched only, original order kept.
test_dataset = _to_dataset(X_test_encoded, y_test_encoded).batch(32)

# Train the model; the test pipeline doubles as validation data.
history = model.fit(train_dataset, validation_data=test_dataset, epochs=3)

print("Model training completed.")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Model training completed.
