In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Set seed for reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 1. Load data
df = pd.read_csv('df_cleaned.csv')  # Sesuaikan nama file
X = df['Text']  # Ganti dengan nama kolom teks Anda
y = df.drop(columns='Text').idxmax(axis=1)

# 2. Encode target labels
label_mapping = {label: idx for idx, label in enumerate(y.unique())}
y = y.map(label_mapping)

# 3. Tokenization and padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# Padding sequences
max_length = max(len(seq) for seq in X_seq)
X_padded = pad_sequences(X_seq, maxlen=max_length, padding='post')

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y, test_size=0.2, random_state=SEED, stratify=y
)

# 5. Convert target labels to one-hot encoding
y_train_encoded = to_categorical(y_train, num_classes=len(label_mapping))
y_test_encoded = to_categorical(y_test, num_classes=len(label_mapping))

# 6. Build model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(len(label_mapping), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 7. Train model
history = model.fit(
    X_train, y_train_encoded,
    epochs=15,
    batch_size=4,
    validation_data=(X_test, y_test_encoded)
)

# 8. Evaluate model
loss, accuracy = model.evaluate(X_test, y_test_encoded)
print(f"Test Accuracy: {accuracy:.4f}")

# 9. Save the model
model.save('text_classification_model.h5')


Epoch 1/15




[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.2429 - loss: 2.0271 - val_accuracy: 0.6330 - val_loss: 1.2673
Epoch 2/15
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6728 - loss: 1.0268 - val_accuracy: 0.7615 - val_loss: 0.6376
Epoch 3/15
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8355 - loss: 0.5159 - val_accuracy: 0.8624 - val_loss: 0.4396
Epoch 4/15
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8633 - loss: 0.3761 - val_accuracy: 0.8440 - val_loss: 0.3898
Epoch 5/15
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8760 - loss: 0.3173 - val_accuracy: 0.8257 - val_loss: 0.3848
Epoch 6/15
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8896 - loss: 0.2857 - val_accuracy: 0.8073 - val_loss: 0.3962
Epoch 7/15
[1m108/108[0m [32m━━━━━━━



Test Accuracy: 0.7890
