In [1]:
import json
from datasets import load_from_disk

dataset_train_loaded = load_from_disk("./topic_classification/train_dataset")
dataset_test_loaded = load_from_disk("./topic_classification/test_dataset")

print("Dataset Train loaded:", dataset_train_loaded)
print("Dataset Test loaded:", dataset_test_loaded)

with open("label_mapping.json", "r") as f:
    label_mapping_loaded = json.load(f)

print("Label Mapping loaded:", label_mapping_loaded)

with open("class_weight.json", "r") as f:
    class_weight_loaded = json.load(f)

class_weight_dict = {int(k): v for k, v in class_weight_loaded.items()}

print("Class Weights loaded (re-casted):", class_weight_dict)

  from .autonotebook import tqdm as notebook_tqdm


Dataset Train loaded: Dataset({
    features: ['final_text', 'label'],
    num_rows: 86941
})
Dataset Test loaded: Dataset({
    features: ['final_text', 'label'],
    num_rows: 21736
})
Label Mapping loaded: {'astro-ph': 0, 'cond-mat': 1, 'cs': 2, 'econ': 3, 'eess': 4, 'gr-qc': 5, 'hep-ex': 6, 'hep-lat': 7, 'hep-ph': 8, 'hep-th': 9, 'math': 10, 'math-ph': 11, 'nlin': 12, 'nucl-ex': 13, 'nucl-th': 14, 'physics': 15, 'q-bio': 16, 'q-fin': 17, 'quant-ph': 18, 'stat': 19}
Class Weights loaded (re-casted): {0: 0.9724944071588367, 1: 0.7553518679409209, 2: 0.19485633600788918, 3: 12.673615160349854, 4: 1.8127814845704755, 5: 6.792265625, 6: 6.792265625, 7: 6.792265625, 8: 6.813557993730408, 9: 6.792265625, 10: 0.2343423180592992, 11: 6.792265625, 12: 1.552517857142857, 13: 6.802895148669797, 14: 6.792265625, 15: 0.3266003005259204, 16: 0.8681945276612743, 17: 1.1803013847407005, 18: 6.802895148669797, 19: 1.69806640625}


In [2]:
import pandas as pd

print("\n5 Data Pertama (Train)")
df_sample_train = pd.DataFrame(dataset_train_loaded[:5])
display(df_sample_train)

print("\n5 Data Pertama (Test)")
df_sample_test = pd.DataFrame(dataset_test_loaded[:5])
display(df_sample_test)


5 Data Pertama (Train)


Unnamed: 0,final_text,label
0,ante forecast outcome interpreted counterfactu...,17
1,gaming customizing individual character create...,2
2,framework european research project meteomet l...,15
3,software product quality defined feature chara...,2
4,optimizing communication imperative large scal...,2



5 Data Pertama (Test)


Unnamed: 0,final_text,label
0,work propose use dropout bayesian estimator in...,4
1,given simple polygon [eq] consisting [eq] vert...,2
2,pattern stored within pre trained deep neural ...,2
3,ride sharing service gaining popularity crucia...,2
4,perform experiment phase simulation ring netwo...,1


In [3]:
from datasets import ClassLabel

num_classes = len(label_mapping_loaded)

dataset_train_loaded = dataset_train_loaded.cast_column(
    "label", 
    ClassLabel(num_classes=num_classes)
)

print("Tipe kolom label sekarang:", dataset_train_loaded.features['label'])

split_result = dataset_train_loaded.train_test_split(
    test_size=0.2, 
    seed=42, 
    stratify_by_column="label"
)

dataset_train_final = split_result['train']
dataset_val_final = split_result['test']

print("="*30)
print(f"Original Train: {len(dataset_train_loaded)}")
print(f"New Train (80%): {len(dataset_train_final)}")
print(f"New Val   (20%): {len(dataset_val_final)}")
print(f"Original Test : {len(dataset_test_loaded)}")
print("="*30)

Tipe kolom label sekarang: ClassLabel(names=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19'])
Original Train: 86941
New Train (80%): 69552
New Val   (20%): 17389
Original Test : 21736


In [4]:
n_train_limit = min(1000, len(dataset_train_final))
dataset_train_final_samples = dataset_train_final.select(range(n_train_limit))

n_val_limit = min(200, len(dataset_val_final))
dataset_val_final_samples = dataset_val_final.select(range(n_val_limit))

n_test_limit = min(200, len(dataset_val_final))
dataset_test_final_samples = dataset_test_loaded.select(range(n_val_limit))

print("\n" + "="*30)
print("STATUS DATASET (MODE DEBUG)")
print("="*30)
print(f"Train Size : {len(dataset_train_final_samples)}")
print(f"Val Size   : {len(dataset_val_final_samples)}")
print(f"Test Size : {len(dataset_test_final_samples)}")
print("="*30)


STATUS DATASET (MODE DEBUG)
Train Size : 1000
Val Size   : 200
Test Size : 200


In [5]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Iskandar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
df_train = dataset_train_final.to_pandas()
df_val = dataset_val_final.to_pandas()
df_test = dataset_test_loaded.to_pandas()

In [8]:
def clean_and_tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)
    return tokens

In [9]:
df_train['tokens'] = df_train['final_text'].apply(clean_and_tokenize)
df_val['tokens'] = df_val['final_text'].apply(clean_and_tokenize)
df_test['tokens'] = df_test['final_text'].apply(clean_and_tokenize)

In [10]:
num_classes = 20

In [11]:
from gensim.models import Word2Vec, FastText, KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [12]:
sentences = df_train['tokens'].tolist()

In [13]:
# Word2Vec
w2v_model = Word2Vec(sentences, vector_size=300, window=5, workers=4, epochs=10, min_count=2)

In [14]:
# FastText
ft_custom_model = FastText(sentences, vector_size=300, window=5, workers=4, epochs=10, min_count=2)

In [15]:
# FastText Pretrained
try:
    ft_pretrained = KeyedVectors.load_word2vec_format('cc.en.300.vec', binary=False) 
    print("FastText Pretrained loaded.")
except Exception as e:
    print(f"Gagal load FastText Pretrained: {e}. Menggunakan custom model sebagai fallback.")
    ft_pretrained = ft_custom_model.wv

FastText Pretrained loaded.


In [16]:
# GloVe
glove_embeddings = {}
try:
    with open('./glove.6B.100d.txt', encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            glove_embeddings[word] = coefs
    print(f"GloVe loaded. Found {len(glove_embeddings)} words.")
except FileNotFoundError:
    print("File GloVe tidak ditemukan. Pastikan path benar.")

GloVe loaded. Found 400000 words.


In [17]:
# TF-IDF + SVD
print("Vectorizing TF-IDF + SVD...")
tfidf = TfidfVectorizer(max_features=100000)
X_train_tfidf = tfidf.fit_transform(df_train['final_text'])
X_val_tfidf = tfidf.transform(df_val['final_text'])

svd = TruncatedSVD(n_components=2000, random_state=42)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_val_svd = svd.transform(X_val_tfidf)
print(f"Shape TF-IDF SVD: {X_train_svd.shape}")

X_train_svd = X_train_svd.astype(np.float32)
X_val_svd = X_val_svd.astype(np.float32)

Vectorizing TF-IDF + SVD...
Shape TF-IDF SVD: (69552, 2000)


In [18]:
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 200000
max_len = 200

# tokenizer = Tokenizer(num_words=max_words)
# tokenizer.fit_on_texts(df_train['final_text'])
# word_index = tokenizer.word_index

# Simpan tokenizer
# with open("tokenizer_sl.pkl", "wb") as f:
#     pickle.dump(tokenizer, f)

with open("tokenizer_sl.pkl", "rb") as f:
    tokenizer = pickle.load(f)
 
word_index = tokenizer.word_index

In [19]:
X_train_seq = tokenizer.texts_to_sequences(df_train['tokens'])
X_val_seq = tokenizer.texts_to_sequences(df_val['tokens'])
X_test_seq = tokenizer.texts_to_sequences(df_test['tokens'])

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

In [20]:
print(f"Contoh input asli: {df_train['tokens'].iloc[0][:5]}")
print(f"Contoh input angka: {X_train_seq[0][:5]}")

Contoh input asli: ['bound', 'quasiparticle', 'negatively', 'charged', 'trions']
Contoh input angka: [116, 3443, 4401, 1362, 23950]


In [21]:
y_train = df_train['label'].values
y_val = df_val['label'].values
y_test = df_test['label'].values

In [22]:
def create_embedding_matrix(word_index, embedding_source, dim, type='gensim'):
    vocab_size = len(word_index) + 1
    matrix = np.zeros((vocab_size, dim))
    hits = 0
    
    for word, i in word_index.items():
        if i >= max_words: continue
        
        try:
            if type == 'gensim':
                if word in embedding_source.wv: 
                    matrix[i] = embedding_source.wv[word]
                    hits += 1
            elif type == 'keyedvectors':
                if word in embedding_source:
                    matrix[i] = embedding_source[word]
                    hits += 1
            elif type == 'dict':
                embedding_vector = embedding_source.get(word)
                if embedding_vector is not None:
                    matrix[i] = embedding_vector
                    hits += 1
        except:
            continue
            
    print(f"Embedding coverage: {hits}/{len(word_index)} words found.")
    return matrix

print("\nCreating W2V Matrix...")
matrix_w2v = create_embedding_matrix(word_index, w2v_model, 300, type='gensim')

print("\nCreating FastText Custom Matrix...")
matrix_ft_custom = create_embedding_matrix(word_index, ft_custom_model, 300, type='gensim')

print("\nCreating FastText Pretrained Matrix...")
matrix_ft_pre = create_embedding_matrix(word_index, ft_pretrained, 300, type='keyedvectors')

print("\nCreating GloVe Matrix...")
matrix_glove = create_embedding_matrix(word_index, glove_embeddings, 100, type='dict')

embedding_configs = {
    "Word2Vec_300": (matrix_w2v, 300),
    "FT_Custom_300": (matrix_ft_custom, 300),
    "FT_Pre_300": (matrix_ft_pre, 300),
    "GloVe_100": (matrix_glove, 100)
}


Creating W2V Matrix...
Embedding coverage: 54072/88208 words found.

Creating FastText Custom Matrix...
Embedding coverage: 88208/88208 words found.

Creating FastText Pretrained Matrix...
Embedding coverage: 51634/88208 words found.

Creating GloVe Matrix...
Embedding coverage: 48928/88208 words found.


In [23]:
import os
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, SimpleRNN, Dense, Dropout, Bidirectional, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

if not os.path.exists("models_baru"):
    os.makedirs("models_baru")

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights_dict = dict(enumerate(class_weights))
print("Class weights dictionary:", class_weights_dict)

# Dictionary matriks embedding untuk loop
embedding_dict = {
    "Word2Vec_300": matrix_w2v,
    # "FastText_Custom_300": matrix_ft_custom,
    # "FastText_Pre_300": matrix_ft_pre,
    "GloVe_100": matrix_glove
}

def build_cnn(embedding_matrix, max_len, num_classes):
    model = Sequential()
    model.add(Embedding(input_dim=embedding_matrix.shape[0],
                        output_dim=embedding_matrix.shape[1],
                        weights=[embedding_matrix],
                        input_length=max_len,
                        trainable=True))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(Conv1D(64, 3, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_rnn(embedding_matrix, max_len, num_classes):
    model = Sequential()
    model.add(Embedding(input_dim=embedding_matrix.shape[0],
                        output_dim=embedding_matrix.shape[1],
                        weights=[embedding_matrix],
                        input_length=max_len,
                        trainable=True))
    model.add(Bidirectional(SimpleRNN(128, return_sequences=False)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_lstm(embedding_matrix, max_len, num_classes):
    model = Sequential()
    model.add(Embedding(input_dim=embedding_matrix.shape[0],
                        output_dim=embedding_matrix.shape[1],
                        weights=[embedding_matrix],
                        input_length=max_len,
                        trainable=True))
    model.add(Bidirectional(LSTM(64, return_sequences=False))) 
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_ann(embedding_matrix, max_len, num_classes):
    model = Sequential()
    model.add(Embedding(input_dim=embedding_matrix.shape[0],
                        output_dim=embedding_matrix.shape[1],
                        weights=[embedding_matrix],
                        input_length=max_len,
                        trainable=True))
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 
    return model

model_functions = {
    "CNN": build_cnn,
    "RNN": build_rnn,
    "LSTM": build_lstm,
    "ANN": build_ann
}

results = {}

try:
    target_names = dataset_train_final.features['label'].names
except:
    target_names = [str(i) for i in range(num_classes)]

print('target names:', target_names)

print(f"Mulai Training Loop untuk {len(embedding_dict)} Embeddings x {len(model_functions)} Models...")

for emb_name, emb_matrix in embedding_dict.items():
    for model_name, model_func in model_functions.items():

        early_stop = EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )
        
        full_name = f"{model_name}_{emb_name}"
        print(f"\n{'='*10} Training {full_name} {'='*10}")
        
        # Build Model
        model = model_func(emb_matrix, max_len, num_classes)
        
        # Train
        history = model.fit(
            X_train_pad, y_train,
            epochs=30, 
            batch_size=64,
            validation_data=(X_val_pad, y_val),
            class_weight=class_weights_dict,
            callbacks=[early_stop],
            verbose=1 
        )
        
        # Save Model
        model_path = f"models_baru/{full_name}.h5"
        model.save(model_path)
        print(f"Model saved to {model_path}")

        # Predict & Evaluate
        y_pred_prob = model.predict(X_val_pad)
        y_pred = np.argmax(y_pred_prob, axis=1)

        acc = accuracy_score(y_val, y_pred)
        report = classification_report(
            y_val, 
            y_pred, 
            labels=range(len(target_names)),
            target_names=target_names, 
            zero_division=0
        )

        results[full_name] = {"accuracy": acc, "report": report}
        print(f"Accuracy: {acc:.4f}")
        print(report)

print("\n" + "="*40)
print("FINAL RESULTS SUMMARY")
print("="*40)
sorted_results = sorted(results.items(), key=lambda x: x[1]['accuracy'], reverse=True)

for name, res in sorted_results:
    print(f"{name}: {res['accuracy']:.4f}")

Class weights dictionary: {0: np.float64(0.9724832214765101), 1: np.float64(0.7553431798436142), 2: np.float64(0.19485627836611194), 3: np.float64(12.645818181818182), 4: np.float64(1.813138686131387), 5: np.float64(6.7921875), 6: np.float64(6.7921875), 7: np.float64(6.7921875), 8: np.float64(6.818823529411764), 9: np.float64(6.7921875), 10: np.float64(0.23433962264150943), 11: np.float64(6.7921875), 12: np.float64(1.5525), 13: np.float64(6.8054794520547945), 14: np.float64(6.7921875), 15: np.float64(0.32659654395191584), 16: np.float64(0.8680978532201697), 17: np.float64(1.1804480651731162), 18: np.float64(6.8054794520547945), 19: np.float64(1.698046875)}
target names: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19']
Mulai Training Loop untuk 2 Embeddings x 4 Models...





Epoch 1/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m291s[0m 266ms/step - accuracy: 0.4403 - loss: 1.6749 - val_accuracy: 0.5611 - val_loss: 1.3053
Epoch 2/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m290s[0m 266ms/step - accuracy: 0.5679 - loss: 1.1490 - val_accuracy: 0.6016 - val_loss: 1.2158
Epoch 3/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m285s[0m 262ms/step - accuracy: 0.6137 - loss: 0.9602 - val_accuracy: 0.5787 - val_loss: 1.1918
Epoch 4/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m286s[0m 263ms/step - accuracy: 0.6487 - loss: 0.8045 - val_accuracy: 0.6293 - val_loss: 1.0868
Epoch 5/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m286s[0m 263ms/step - accuracy: 0.6802 - loss: 0.6629 - val_accuracy: 0.6337 - val_loss: 1.1001
Epoch 6/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m287s[0m 264ms/step - accuracy: 0.7037 - loss: 0.5595 - val_accuracy: 0.6753 - val_loss:



Model saved to models_baru/CNN_Word2Vec_300.h5
[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step
Accuracy: 0.6875
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       894
           1       0.65      0.76      0.70      1151
           2       0.85      0.70      0.77      4462
           3       0.18      0.29      0.22        68
           4       0.31      0.63      0.41       480
           5       0.41      0.66      0.51       128
           6       0.53      0.84      0.65       128
           7       0.82      0.80      0.81       128
           8       0.66      0.63      0.65       128
           9       0.46      0.71      0.55       128
          10       0.89      0.78      0.83      3710
          11       0.09      0.37      0.15       128
          12       0.42      0.72      0.53       560
          13       0.49      0.60      0.54       128
          14       0.54      0.67      0.60      



Epoch 1/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 300ms/step - accuracy: 0.2360 - loss: 2.4169 - val_accuracy: 0.3506 - val_loss: 1.9611
Epoch 2/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m332s[0m 305ms/step - accuracy: 0.3185 - loss: 2.0392 - val_accuracy: 0.2897 - val_loss: 2.0230
Epoch 3/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 298ms/step - accuracy: 0.3553 - loss: 1.9168 - val_accuracy: 0.3525 - val_loss: 1.8323
Epoch 4/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 295ms/step - accuracy: 0.3483 - loss: 1.9542 - val_accuracy: 0.3639 - val_loss: 1.8133
Epoch 5/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 297ms/step - accuracy: 0.3666 - loss: 1.8788 - val_accuracy: 0.3289 - val_loss: 1.9406
Epoch 6/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 296ms/step - accuracy: 0.3690 - loss: 1.8304 - val_accuracy: 0.3002 - val_loss:



Model saved to models_baru/RNN_Word2Vec_300.h5
[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 24ms/step
Accuracy: 0.3639
              precision    recall  f1-score   support

           0       0.59      0.41      0.49       894
           1       0.51      0.37      0.43      1151
           2       0.65      0.17      0.27      4462
           3       0.03      0.25      0.05        68
           4       0.10      0.64      0.17       480
           5       0.08      0.65      0.14       128
           6       0.17      0.60      0.26       128
           7       0.11      0.10      0.11       128
           8       0.08      0.27      0.13       128
           9       0.12      0.11      0.12       128
          10       0.68      0.79      0.73      3710
          11       0.05      0.09      0.06       128
          12       0.24      0.28      0.26       560
          13       0.10      0.09      0.10       128
          14       0.13      0.11      0.12     



Epoch 1/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m384s[0m 351ms/step - accuracy: 0.4469 - loss: 1.7769 - val_accuracy: 0.5924 - val_loss: 1.1744
Epoch 2/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m391s[0m 360ms/step - accuracy: 0.6021 - loss: 1.1060 - val_accuracy: 0.6590 - val_loss: 1.0086
Epoch 3/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 358ms/step - accuracy: 0.6687 - loss: 0.8468 - val_accuracy: 0.6789 - val_loss: 0.9745
Epoch 4/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m397s[0m 365ms/step - accuracy: 0.7235 - loss: 0.6463 - val_accuracy: 0.6858 - val_loss: 0.9770
Epoch 5/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 357ms/step - accuracy: 0.7743 - loss: 0.4796 - val_accuracy: 0.7041 - val_loss: 0.9557
Epoch 6/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m392s[0m 361ms/step - accuracy: 0.8182 - loss: 0.3534 - val_accuracy: 0.7133 - val_loss:



Model saved to models_baru/LSTM_Word2Vec_300.h5
[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 32ms/step
Accuracy: 0.7041
              precision    recall  f1-score   support

           0       0.84      0.87      0.86       894
           1       0.64      0.77      0.70      1151
           2       0.87      0.69      0.77      4462
           3       0.11      0.47      0.18        68
           4       0.33      0.65      0.44       480
           5       0.42      0.68      0.52       128
           6       0.63      0.80      0.71       128
           7       0.83      0.80      0.81       128
           8       0.68      0.67      0.68       128
           9       0.46      0.70      0.56       128
          10       0.92      0.77      0.84      3710
          11       0.13      0.46      0.20       128
          12       0.52      0.69      0.59       560
          13       0.49      0.75      0.59       128
          14       0.57      0.60      0.58    



Epoch 1/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m398s[0m 364ms/step - accuracy: 0.3798 - loss: 2.0731 - val_accuracy: 0.5667 - val_loss: 1.3221
Epoch 2/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m395s[0m 363ms/step - accuracy: 0.4885 - loss: 1.5520 - val_accuracy: 0.5523 - val_loss: 1.3101
Epoch 3/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m396s[0m 365ms/step - accuracy: 0.5572 - loss: 1.2677 - val_accuracy: 0.6692 - val_loss: 1.0828
Epoch 4/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m398s[0m 367ms/step - accuracy: 0.6042 - loss: 1.0490 - val_accuracy: 0.6374 - val_loss: 1.1361
Epoch 5/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m396s[0m 364ms/step - accuracy: 0.6497 - loss: 0.9082 - val_accuracy: 0.6604 - val_loss: 1.0709
Epoch 6/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m396s[0m 364ms/step - accuracy: 0.6830 - loss: 0.7836 - val_accuracy: 0.6827 - val_loss:



Model saved to models_baru/ANN_Word2Vec_300.h5
[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step
Accuracy: 0.7255
              precision    recall  f1-score   support

           0       0.84      0.85      0.85       894
           1       0.73      0.66      0.70      1151
           2       0.84      0.74      0.79      4462
           3       0.15      0.06      0.09        68
           4       0.33      0.55      0.42       480
           5       0.45      0.72      0.55       128
           6       0.63      0.77      0.70       128
           7       0.71      0.81      0.76       128
           8       0.68      0.38      0.48       128
           9       0.58      0.41      0.48       128
          10       0.88      0.84      0.86      3710
          11       0.09      0.01      0.01       128
          12       0.47      0.65      0.54       560
          13       0.56      0.59      0.57       128
          14       0.54      0.48      0.51       



Epoch 1/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 97ms/step - accuracy: 0.3340 - loss: 2.0145 - val_accuracy: 0.5065 - val_loss: 1.4144
Epoch 2/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 98ms/step - accuracy: 0.5487 - loss: 1.2556 - val_accuracy: 0.5597 - val_loss: 1.2898
Epoch 3/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 98ms/step - accuracy: 0.6222 - loss: 0.9595 - val_accuracy: 0.6226 - val_loss: 1.1366
Epoch 4/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 99ms/step - accuracy: 0.6715 - loss: 0.7584 - val_accuracy: 0.6605 - val_loss: 1.0602
Epoch 5/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 99ms/step - accuracy: 0.7175 - loss: 0.5839 - val_accuracy: 0.6507 - val_loss: 1.1032
Epoch 6/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 99ms/step - accuracy: 0.7527 - loss: 0.4687 - val_accuracy: 0.6767 - val_loss: 1.060



Model saved to models_baru/CNN_GloVe_100.h5
[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step
Accuracy: 0.6605
              precision    recall  f1-score   support

           0       0.72      0.91      0.81       894
           1       0.75      0.53      0.62      1151
           2       0.82      0.69      0.75      4462
           3       0.13      0.38      0.20        68
           4       0.26      0.62      0.37       480
           5       0.38      0.81      0.52       128
           6       0.71      0.70      0.70       128
           7       0.72      0.84      0.78       128
           8       0.57      0.66      0.61       128
           9       0.43      0.64      0.51       128
          10       0.92      0.74      0.82      3710
          11       0.11      0.23      0.15       128
          12       0.48      0.61      0.54       560
          13       0.41      0.80      0.54       128
          14       0.42      0.53      0.47       128



[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 118ms/step - accuracy: 0.1093 - loss: 2.8219 - val_accuracy: 0.1165 - val_loss: 2.6403
Epoch 2/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 128ms/step - accuracy: 0.1863 - loss: 2.4998 - val_accuracy: 0.2453 - val_loss: 2.2958
Epoch 3/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 129ms/step - accuracy: 0.1478 - loss: 2.5509 - val_accuracy: 0.0649 - val_loss: 2.8252
Epoch 4/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 131ms/step - accuracy: 0.0641 - loss: 2.8219 - val_accuracy: 0.1175 - val_loss: 2.7337
Epoch 5/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 131ms/step - accuracy: 0.0832 - loss: 2.6147 - val_accuracy: 0.1056 - val_loss: 2.7437
Epoch 6/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 131ms/step - accuracy: 0.0878 - loss: 2.5924 - val_accuracy: 0.1378 - val_loss: 2.5262
Epo



Model saved to models_baru/RNN_GloVe_100.h5
[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 15ms/step
Accuracy: 0.2453
              precision    recall  f1-score   support

           0       0.21      0.14      0.16       894
           1       0.33      0.03      0.06      1151
           2       0.55      0.04      0.08      4462
           3       0.01      0.38      0.02        68
           4       0.10      0.22      0.13       480
           5       0.04      0.03      0.04       128
           6       0.09      0.20      0.13       128
           7       0.07      0.09      0.08       128
           8       0.03      0.11      0.05       128
           9       0.09      0.22      0.13       128
          10       0.67      0.71      0.69      3710
          11       0.02      0.02      0.02       128
          12       0.14      0.03      0.05       560
          13       0.04      0.64      0.07       128
          14       0.02      0.05      0.03       12



[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 204ms/step - accuracy: 0.3278 - loss: 2.1327 - val_accuracy: 0.4860 - val_loss: 1.4828
Epoch 2/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 226ms/step - accuracy: 0.5503 - loss: 1.3241 - val_accuracy: 0.6459 - val_loss: 1.1264
Epoch 3/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 210ms/step - accuracy: 0.6692 - loss: 0.9347 - val_accuracy: 0.6868 - val_loss: 0.9957
Epoch 4/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 211ms/step - accuracy: 0.7489 - loss: 0.6776 - val_accuracy: 0.6843 - val_loss: 0.9978
Epoch 5/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 194ms/step - accuracy: 0.8017 - loss: 0.4945 - val_accuracy: 0.7153 - val_loss: 1.0087
Epoch 6/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 211ms/step - accuracy: 0.8431 - loss: 0.3691 - val_accuracy: 0.7157 - val_loss: 1.0419
Epo



Model saved to models_baru/LSTM_GloVe_100.h5
[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 27ms/step
Accuracy: 0.6868
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       894
           1       0.68      0.66      0.67      1151
           2       0.86      0.69      0.76      4462
           3       0.08      0.66      0.14        68
           4       0.32      0.67      0.44       480
           5       0.42      0.75      0.54       128
           6       0.63      0.76      0.69       128
           7       0.73      0.87      0.79       128
           8       0.54      0.58      0.56       128
           9       0.57      0.67      0.61       128
          10       0.90      0.80      0.85      3710
          11       0.14      0.42      0.21       128
          12       0.46      0.70      0.56       560
          13       0.50      0.49      0.50       128
          14       0.49      0.55      0.52       



[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 125ms/step - accuracy: 0.2216 - loss: 2.4829 - val_accuracy: 0.3376 - val_loss: 1.8269
Epoch 2/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 127ms/step - accuracy: 0.3402 - loss: 1.8649 - val_accuracy: 0.5115 - val_loss: 1.4365
Epoch 3/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 127ms/step - accuracy: 0.4215 - loss: 1.5468 - val_accuracy: 0.5521 - val_loss: 1.3405
Epoch 4/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 128ms/step - accuracy: 0.4853 - loss: 1.2891 - val_accuracy: 0.5740 - val_loss: 1.1994
Epoch 5/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 126ms/step - accuracy: 0.5408 - loss: 1.1022 - val_accuracy: 0.6206 - val_loss: 1.1096
Epoch 6/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 126ms/step - accuracy: 0.5773 - loss: 0.9670 - val_accuracy: 0.6236 - val_loss: 1.0826
Epo



Model saved to models_baru/ANN_GloVe_100.h5
[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step
Accuracy: 0.6846
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       894
           1       0.56      0.75      0.64      1151
           2       0.85      0.66      0.75      4462
           3       0.42      0.16      0.23        68
           4       0.27      0.58      0.37       480
           5       0.44      0.62      0.52       128
           6       0.76      0.66      0.70       128
           7       0.81      0.68      0.74       128
           8       0.54      0.49      0.52       128
           9       0.45      0.55      0.50       128
          10       0.87      0.83      0.85      3710
          11       0.18      0.19      0.18       128
          12       0.36      0.65      0.46       560
          13       0.43      0.68      0.53       128
          14       0.50      0.52      0.51       128

In [24]:
import os
import pickle
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

with open("tokenizer_sl.pkl", "rb") as f:
    tokenizer = pickle.load(f)

print("Testing saved models...")
for model_file in os.listdir("models_baru"):
    if model_file.endswith(".h5"):
        model_path = os.path.join("models_baru", model_file)
        print(f"Loading {model_file} ...")
        loaded_model = load_model(model_path)
        
        y_pred_prob = loaded_model.predict(X_test_pad)
        y_pred = np.argmax(y_pred_prob, axis=1)
        
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, labels=range(len(target_names)), target_names=target_names, zero_division=0)
        print(f"Loaded model {model_file} accuracy: {acc}")
        print(report)
        print("="*50)

Testing saved models...
Loading ANN_FastText_Custom_300.h5 ...




[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step
Loaded model ANN_FastText_Custom_300.h5 accuracy: 0.6978744939271255
              precision    recall  f1-score   support

           0       0.85      0.81      0.83      1117
           1       0.60      0.72      0.66      1439
           2       0.84      0.72      0.78      5578
           3       0.33      0.06      0.10        86
           4       0.35      0.52      0.42       599
           5       0.42      0.59      0.49       160
           6       0.61      0.82      0.70       160
           7       0.65      0.79      0.71       160
           8       0.57      0.34      0.42       160
           9       0.41      0.68      0.51       160
          10       0.89      0.81      0.84      4637
          11       0.13      0.16      0.15       160
          12       0.41      0.58      0.48       700
          13       0.47      0.56      0.51       160
          14       0.42      0.41      0.41  



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step
Loaded model ANN_FastText_Pre_300.h5 accuracy: 0.6525579683474421
              precision    recall  f1-score   support

           0       0.83      0.83      0.83      1117
           1       0.63      0.75      0.68      1439
           2       0.87      0.59      0.71      5578
           3       0.25      0.31      0.28        86
           4       0.26      0.64      0.37       599
           5       0.40      0.69      0.50       160
           6       0.74      0.72      0.73       160
           7       0.77      0.79      0.78       160
           8       0.58      0.56      0.57       160
           9       0.55      0.59      0.57       160
          10       0.90      0.73      0.81      4637
          11       0.09      0.52      0.15       160
          12       0.36      0.69      0.48       700
          13       0.33      0.73      0.45       160
          14       0.49      0.53      0.50     



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step
Loaded model ANN_GloVe_100.h5 accuracy: 0.6800699300699301
              precision    recall  f1-score   support

           0       0.81      0.84      0.82      1117
           1       0.56      0.76      0.64      1439
           2       0.85      0.65      0.74      5578
           3       0.31      0.16      0.21        86
           4       0.27      0.59      0.37       599
           5       0.46      0.67      0.54       160
           6       0.80      0.67      0.73       160
           7       0.80      0.66      0.72       160
           8       0.57      0.52      0.54       160
           9       0.52      0.66      0.58       160
          10       0.86      0.83      0.85      4637
          11       0.19      0.17      0.18       160
          12       0.36      0.65      0.46       700
          13       0.43      0.68      0.53       160
          14       0.49      0.49      0.49       160
 



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step
Loaded model ANN_Word2Vec_300.h5 accuracy: 0.7188535149061465
              precision    recall  f1-score   support

           0       0.84      0.84      0.84      1117
           1       0.73      0.63      0.68      1439
           2       0.84      0.74      0.79      5578
           3       0.12      0.03      0.05        86
           4       0.34      0.54      0.41       599
           5       0.42      0.66      0.51       160
           6       0.61      0.76      0.68       160
           7       0.74      0.78      0.76       160
           8       0.64      0.43      0.52       160
           9       0.58      0.44      0.50       160
          10       0.88      0.84      0.86      4637
          11       0.33      0.02      0.04       160
          12       0.44      0.67      0.53       700
          13       0.52      0.49      0.50       160
          14       0.53      0.49      0.51       16



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step
Loaded model CNN_FastText_Custom_300.h5 accuracy: 0.6740430622009569
              precision    recall  f1-score   support

           0       0.81      0.84      0.83      1117
           1       0.63      0.72      0.67      1439
           2       0.86      0.65      0.74      5578
           3       0.14      0.38      0.21        86
           4       0.28      0.67      0.39       599
           5       0.38      0.74      0.50       160
           6       0.53      0.87      0.66       160
           7       0.69      0.88      0.78       160
           8       0.56      0.55      0.56       160
           9       0.57      0.57      0.57       160
          10       0.90      0.78      0.84      4637
          11       0.14      0.39      0.21       160
          12       0.47      0.69      0.56       700
          13       0.39      0.54      0.46       160
          14       0.44      0.59      0.51  



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step
Loaded model CNN_FastText_Pre_300.h5 accuracy: 0.6658538829591462
              precision    recall  f1-score   support

           0       0.87      0.83      0.85      1117
           1       0.65      0.68      0.67      1439
           2       0.84      0.64      0.73      5578
           3       0.16      0.44      0.24        86
           4       0.26      0.62      0.36       599
           5       0.50      0.59      0.54       160
           6       0.78      0.61      0.69       160
           7       0.76      0.87      0.81       160
           8       0.48      0.71      0.57       160
           9       0.59      0.49      0.54       160
          10       0.90      0.75      0.82      4637
          11       0.11      0.44      0.18       160
          12       0.46      0.70      0.56       700
          13       0.38      0.82      0.52       160
          14       0.39      0.51      0.44     



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step
Loaded model CNN_GloVe_100.h5 accuracy: 0.6561924917188076
              precision    recall  f1-score   support

           0       0.73      0.89      0.80      1117
           1       0.75      0.54      0.63      1439
           2       0.82      0.68      0.74      5578
           3       0.12      0.37      0.18        86
           4       0.26      0.61      0.37       599
           5       0.37      0.78      0.50       160
           6       0.74      0.68      0.71       160
           7       0.73      0.81      0.77       160
           8       0.53      0.61      0.56       160
           9       0.40      0.65      0.50       160
          10       0.92      0.74      0.82      4637
          11       0.13      0.28      0.18       160
          12       0.48      0.65      0.56       700
          13       0.40      0.78      0.53       160
          14       0.40      0.56      0.47       160
 



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step
Loaded model CNN_Word2Vec_300.h5 accuracy: 0.6799779168200221
              precision    recall  f1-score   support

           0       0.79      0.87      0.83      1117
           1       0.63      0.74      0.68      1439
           2       0.85      0.69      0.76      5578
           3       0.22      0.38      0.28        86
           4       0.31      0.62      0.41       599
           5       0.42      0.69      0.52       160
           6       0.51      0.81      0.63       160
           7       0.80      0.82      0.81       160
           8       0.55      0.62      0.58       160
           9       0.45      0.69      0.54       160
          10       0.88      0.77      0.82      4637
          11       0.10      0.41      0.17       160
          12       0.41      0.76      0.53       700
          13       0.46      0.59      0.52       160
          14       0.54      0.64      0.58       16



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 42ms/step
Loaded model LSTM_FastText_Custom_300.h5 accuracy: 0.6886271623113729
              precision    recall  f1-score   support

           0       0.84      0.84      0.84      1117
           1       0.66      0.72      0.69      1439
           2       0.85      0.68      0.76      5578
           3       0.16      0.38      0.22        86
           4       0.31      0.65      0.42       599
           5       0.43      0.74      0.55       160
           6       0.69      0.77      0.73       160
           7       0.70      0.90      0.79       160
           8       0.69      0.52      0.59       160
           9       0.58      0.64      0.61       160
          10       0.93      0.73      0.82      4637
          11       0.12      0.51      0.19       160
          12       0.48      0.66      0.56       700
          13       0.43      0.62      0.51       160
          14       0.47      0.71      0.5



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 40ms/step
Loaded model LSTM_FastText_Pre_300.h5 accuracy: 0.6619433198380567
              precision    recall  f1-score   support

           0       0.83      0.86      0.85      1117
           1       0.61      0.72      0.66      1439
           2       0.85      0.67      0.75      5578
           3       0.14      0.64      0.23        86
           4       0.26      0.67      0.37       599
           5       0.41      0.68      0.52       160
           6       0.60      0.84      0.70       160
           7       0.80      0.79      0.80       160
           8       0.52      0.50      0.51       160
           9       0.56      0.60      0.58       160
          10       0.93      0.68      0.78      4637
          11       0.08      0.49      0.14       160
          12       0.47      0.68      0.55       700
          13       0.41      0.69      0.52       160
          14       0.52      0.53      0.52  



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 26ms/step
Loaded model LSTM_GloVe_100.h5 accuracy: 0.6851766654398234
              precision    recall  f1-score   support

           0       0.82      0.85      0.83      1117
           1       0.68      0.67      0.68      1439
           2       0.85      0.68      0.75      5578
           3       0.09      0.73      0.15        86
           4       0.33      0.66      0.44       599
           5       0.41      0.71      0.52       160
           6       0.66      0.75      0.70       160
           7       0.71      0.81      0.76       160
           8       0.53      0.61      0.57       160
           9       0.51      0.65      0.57       160
          10       0.91      0.80      0.85      4637
          11       0.15      0.48      0.23       160
          12       0.48      0.72      0.58       700
          13       0.55      0.55      0.55       160
          14       0.51      0.59      0.55       16



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 38ms/step
Loaded model LSTM_Word2Vec_300.h5 accuracy: 0.7046374677953625
              precision    recall  f1-score   support

           0       0.85      0.87      0.86      1117
           1       0.65      0.77      0.70      1439
           2       0.87      0.68      0.76      5578
           3       0.13      0.56      0.21        86
           4       0.34      0.68      0.46       599
           5       0.45      0.71      0.55       160
           6       0.67      0.86      0.75       160
           7       0.86      0.81      0.83       160
           8       0.64      0.64      0.64       160
           9       0.51      0.77      0.61       160
          10       0.92      0.77      0.84      4637
          11       0.14      0.45      0.21       160
          12       0.51      0.72      0.60       700
          13       0.47      0.68      0.56       160
          14       0.54      0.65      0.59      



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 26ms/step
Loaded model RNN_FastText_Custom_300.h5 accuracy: 0.37067537725432465
              precision    recall  f1-score   support

           0       0.66      0.65      0.65      1117
           1       0.45      0.28      0.35      1439
           2       0.61      0.11      0.18      5578
           3       0.05      0.29      0.09        86
           4       0.11      0.53      0.18       599
           5       0.19      0.41      0.26       160
           6       0.20      0.64      0.30       160
           7       0.23      0.41      0.30       160
           8       0.14      0.08      0.10       160
           9       0.15      0.41      0.22       160
          10       0.76      0.69      0.72      4637
          11       0.07      0.22      0.11       160
          12       0.19      0.30      0.23       700
          13       0.13      0.22      0.17       160
          14       0.17      0.24      0.2



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 25ms/step
Loaded model RNN_FastText_Pre_300.h5 accuracy: 0.14869341185130658
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1117
           1       0.07      0.00      0.00      1439
           2       1.00      0.00      0.00      5578
           3       0.00      0.00      0.00        86
           4       0.00      0.00      0.00       599
           5       0.03      0.17      0.05       160
           6       0.00      0.00      0.00       160
           7       0.00      0.00      0.00       160
           8       0.00      0.00      0.00       160
           9       0.00      0.00      0.00       160
          10       0.43      0.53      0.48      4637
          11       0.02      0.10      0.03       160
          12       0.06      0.04      0.05       700
          13       0.06      0.83      0.11       160
          14       0.00      0.00      0.00  



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 16ms/step
Loaded model RNN_GloVe_100.h5 accuracy: 0.24682554287817446
              precision    recall  f1-score   support

           0       0.17      0.11      0.13      1117
           1       0.23      0.03      0.05      1439
           2       0.52      0.04      0.08      5578
           3       0.01      0.38      0.02        86
           4       0.11      0.23      0.15       599
           5       0.04      0.03      0.04       160
           6       0.09      0.19      0.12       160
           7       0.10      0.12      0.11       160
           8       0.03      0.11      0.05       160
           9       0.08      0.17      0.11       160
          10       0.68      0.73      0.70      4637
          11       0.07      0.04      0.05       160
          12       0.17      0.04      0.06       700
          13       0.04      0.64      0.07       160
          14       0.05      0.09      0.06       16



[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 25ms/step
Loaded model RNN_Word2Vec_300.h5 accuracy: 0.3590357011409643
              precision    recall  f1-score   support

           0       0.58      0.39      0.47      1117
           1       0.48      0.35      0.40      1439
           2       0.67      0.17      0.27      5578
           3       0.05      0.44      0.09        86
           4       0.10      0.61      0.17       599
           5       0.09      0.68      0.16       160
           6       0.15      0.56      0.24       160
           7       0.15      0.16      0.15       160
           8       0.07      0.23      0.10       160
           9       0.13      0.11      0.12       160
          10       0.69      0.78      0.74      4637
          11       0.06      0.11      0.08       160
          12       0.24      0.24      0.24       700
          13       0.17      0.16      0.16       160
          14       0.12      0.10      0.11       

In [None]:
import os
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

folder_path = "models_baru_tf_idf"
if not os.path.exists("models_baru_tf_idf"):
    os.makedirs("models_baru_tf_idf")

print(f"\n{'='*20} MULAI TRAINING SHALLOW LEARNING (PKL) {'='*20}")

scaler = MinMaxScaler()
X_train_svd_scaled = scaler.fit_transform(X_train_svd)
X_val_svd_scaled = scaler.transform(X_val_svd)

joblib.dump(tfidf, f"{folder_path}/tfidf_vectorizer.pkl")
joblib.dump(svd, f"{folder_path}/svd_model.pkl")
joblib.dump(scaler, f"{folder_path}/tfidf_svd_scaler.pkl")

class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))

try:
    target_names = dataset_train_final.features['label'].names
except:
    target_names = [str(i) for i in range(len(np.unique(y_train)))]

shallow_models = {
    "LogisticRegression_SVD": LogisticRegression(max_iter=2000, class_weight='balanced', solver='lbfgs'),
    # "SVM_SVD": SVC(kernel='rbf', class_weight='balanced', cache_size=1000)
}

all_results = {}

for name, model in shallow_models.items():
    print(f"\nTraining {name}...")
    
    model.fit(X_train_svd_scaled, y_train)
    
    save_path = f"{folder_path}/{name}.pkl"
    joblib.dump(model, save_path)
    print(f"Model saved to {save_path}")
    
    y_pred = model.predict(X_val_svd_scaled)
    acc = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred, target_names=target_names, zero_division=0)
    
    all_results[name] = {"accuracy": acc, "report": report}
    print(f"Accuracy: {acc:.4f}")



Training LogisticRegression_SVD...
Model saved to models_baru_tf_idf/LogisticRegression_SVD.pkl
Accuracy: 0.7304

Training SVM_SVD...


In [24]:
import os
import joblib
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, GlobalMaxPooling1D, LSTM, SimpleRNN, Bidirectional, Reshape
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

folder_path = "models_baru_tf_idf"
if not os.path.exists("models_baru_tf_idf"):
    os.makedirs("models_baru_tf_idf")

class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))

try:
    target_names = dataset_train_final.features['label'].names
except:
    target_names = [str(i) for i in range(len(np.unique(y_train)))]

all_results = {}

print(f"\n{'='*20} TRAINING DEEP LEARNING (H5) {'='*20}")

input_dim = X_train_svd.shape[1]
num_classes = len(np.unique(y_train))

def build_tfidf_ann(input_dim, num_classes):
    model = Sequential()
    model.add(Dense(512, input_shape=(input_dim,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_tfidf_cnn(input_dim, num_classes):
    model = Sequential()
    model.add(Reshape((input_dim, 1), input_shape=(input_dim,)))
    model.add(Conv1D(64, 5, activation='relu', padding='same'))
    model.add(Conv1D(32, 3, activation='relu', padding='same'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_tfidf_rnn(input_dim, num_classes):
    model = Sequential()
    model.add(Reshape((1, input_dim), input_shape=(input_dim,)))
    model.add(Bidirectional(SimpleRNN(128, return_sequences=False)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_tfidf_lstm(input_dim, num_classes):
    model = Sequential()
    model.add(Reshape((1, input_dim), input_shape=(input_dim,)))
    model.add(Bidirectional(LSTM(64, return_sequences=False)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

tfidf_dl_models = {
    "ANN_TFIDF": build_tfidf_ann,
    "CNN_TFIDF": build_tfidf_cnn,
    "RNN_TFIDF": build_tfidf_rnn,
    "LSTM_TFIDF": build_tfidf_lstm
}

for model_name, model_func in tfidf_dl_models.items():
    print(f"\nTraining {model_name}...")
    
    early_stop = EarlyStopping(
        monitor='val_loss', 
        patience=5, 
        restore_best_weights=True
    )
    
    model = model_func(input_dim, num_classes)
    
    history = model.fit(
        X_train_svd, y_train,
        epochs=30, 
        batch_size=64,
        validation_data=(X_val_svd, y_val),
        class_weight=class_weights_dict,
        callbacks=[early_stop],
        verbose=1 
    )
    
    save_path = f"{folder_path}/{model_name}.h5"
    model.save(save_path)
    print(f"Model saved to {save_path}")
    
    y_pred_prob = model.predict(X_val_svd)
    y_pred = np.argmax(y_pred_prob, axis=1)
    
    acc = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred, target_names=target_names, zero_division=0)
    
    all_results[model_name] = {"accuracy": acc, "report": report}
    print(f"Accuracy: {acc:.4f}")


print("\n" + "="*40)
print("FINAL LEADERBOARD (TF-IDF MODELS) 🏆")
print("="*40)

sorted_results = sorted(all_results.items(), key=lambda x: x[1]['accuracy'], reverse=True)

for i, (name, res) in enumerate(sorted_results):
    print(f"{i+1}. {name:<30} | Accuracy: {res['accuracy']:.4f}")



Training ANN_TFIDF...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 11ms/step - accuracy: 0.4952 - loss: 1.4287 - val_accuracy: 0.6343 - val_loss: 1.1284
Epoch 2/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - accuracy: 0.6660 - loss: 0.7774 - val_accuracy: 0.6884 - val_loss: 0.9329
Epoch 3/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.7061 - loss: 0.6141 - val_accuracy: 0.7105 - val_loss: 0.8835
Epoch 4/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.7300 - loss: 0.5050 - val_accuracy: 0.7048 - val_loss: 0.9007
Epoch 5/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.7519 - loss: 0.4283 - val_accuracy: 0.7159 - val_loss: 0.8770
Epoch 6/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - accuracy: 0.7693 - loss: 0.3723 - val_accuracy: 0.7245 - val_loss: 0.8646
Epoc



Model saved to models_baru_tf_idf/ANN_TFIDF.h5
[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy: 0.7245

Training CNN_TFIDF...
Epoch 1/30


  super().__init__(**kwargs)


[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 89ms/step - accuracy: 0.2130 - loss: 2.6826 - val_accuracy: 0.3251 - val_loss: 2.1431
Epoch 2/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 89ms/step - accuracy: 0.3343 - loss: 2.2111 - val_accuracy: 0.4188 - val_loss: 1.8726
Epoch 3/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 85ms/step - accuracy: 0.3695 - loss: 2.0568 - val_accuracy: 0.4470 - val_loss: 1.7663
Epoch 4/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 86ms/step - accuracy: 0.3859 - loss: 1.9935 - val_accuracy: 0.4030 - val_loss: 1.8454
Epoch 5/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 85ms/step - accuracy: 0.3885 - loss: 1.9561 - val_accuracy: 0.4574 - val_loss: 1.7498
Epoch 6/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 85ms/step - accuracy: 0.3963 - loss: 1.9308 - val_accuracy: 0.4221 - val_loss: 1.7671
Epoch 7/30
[1m



Model saved to models_baru_tf_idf/CNN_TFIDF.h5
[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step
Accuracy: 0.4689

Training RNN_TFIDF...


  super().__init__(**kwargs)


Epoch 1/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.4915 - loss: 1.4866 - val_accuracy: 0.6720 - val_loss: 1.0650
Epoch 2/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.6639 - loss: 0.8206 - val_accuracy: 0.6973 - val_loss: 0.9584
Epoch 3/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.6905 - loss: 0.6780 - val_accuracy: 0.6968 - val_loss: 0.9284
Epoch 4/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.7008 - loss: 0.6067 - val_accuracy: 0.6982 - val_loss: 0.9291
Epoch 5/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.7105 - loss: 0.5539 - val_accuracy: 0.7067 - val_loss: 0.8970
Epoch 6/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.7207 - loss: 0.5128 - val_accuracy: 0.6882 - val_loss: 0.9446
Epoch 7/30
[1m1



Model saved to models_baru_tf_idf/RNN_TFIDF.h5
[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy: 0.7115

Training LSTM_TFIDF...
Epoch 1/30


  super().__init__(**kwargs)


[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 10ms/step - accuracy: 0.3375 - loss: 1.8608 - val_accuracy: 0.5612 - val_loss: 1.3462
Epoch 2/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.5448 - loss: 1.0386 - val_accuracy: 0.6248 - val_loss: 1.1103
Epoch 3/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.5978 - loss: 0.8817 - val_accuracy: 0.6400 - val_loss: 1.0520
Epoch 4/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.6235 - loss: 0.7937 - val_accuracy: 0.6754 - val_loss: 0.9720
Epoch 5/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.6428 - loss: 0.7276 - val_accuracy: 0.6721 - val_loss: 0.9683
Epoch 6/30
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 10ms/step - accuracy: 0.6518 - loss: 0.6785 - val_accuracy: 0.6621 - val_loss: 0.9810
Epoch 7/30
[1m



Model saved to models_baru_tf_idf/LSTM_TFIDF.h5
[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy: 0.7175

FINAL LEADERBOARD (TF-IDF MODELS) 🏆
1. ANN_TFIDF                      | Accuracy: 0.7245
2. LSTM_TFIDF                     | Accuracy: 0.7175
3. RNN_TFIDF                      | Accuracy: 0.7115
4. CNN_TFIDF                      | Accuracy: 0.4689


In [27]:
import os
import joblib
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report, accuracy_score

FOLDER_MODEL = "models_baru_tf_idf"
TARGET_TEXT = df_test['final_text'] 
TARGET_LABEL = y_test

try:
    target_names = dataset_train_final.features['label'].names
except:
    target_names = [str(i) for i in range(len(np.unique(TARGET_LABEL)))]

print(f"Loading preprocessing tools from {FOLDER_MODEL}...")

try:
    tfidf_vectorizer = joblib.load(os.path.join(FOLDER_MODEL, "tfidf_vectorizer.pkl"))
    svd_model = joblib.load(os.path.join(FOLDER_MODEL, "svd_model.pkl"))
    scaler = joblib.load(os.path.join(FOLDER_MODEL, "tfidf_svd_scaler.pkl"))
    print("Tools (TFIDF, SVD, Scaler) loaded successfully.")
except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Pastikan file .pkl tools ada di folder models_baru_tf_idf")
    exit()

print("\nTransforming test data...")

X_test_tfidf = tfidf_vectorizer.transform(TARGET_TEXT)
X_test_svd = svd_model.transform(X_test_tfidf)
X_test_svd_scaled = scaler.transform(X_test_svd)

print(f"Shape Test Data: {X_test_svd.shape}")
print("="*50)

print("Testing saved models...")

for model_file in os.listdir(FOLDER_MODEL):
    model_path = os.path.join(FOLDER_MODEL, model_file)
    
    if "scaler" in model_file or "vectorizer" in model_file or "svd" in model_file:
        continue

    print(f"\nTesting Model: {model_file} ...")
    
    try:
        y_pred = None
        
        if model_file.endswith(".h5"):
            loaded_model = load_model(model_path)
            input_data = X_test_svd 
            y_pred_prob = loaded_model.predict(input_data, verbose=0)
            y_pred = np.argmax(y_pred_prob, axis=1)

        elif model_file.endswith(".pkl"):
            loaded_model = joblib.load(model_path)
            y_pred = loaded_model.predict(X_test_svd_scaled)
        
        else:
            continue

        if y_pred is not None:
            acc = accuracy_score(TARGET_LABEL, y_pred)
            report = classification_report(
                TARGET_LABEL, 
                y_pred, 
                labels=range(len(target_names)), 
                target_names=target_names, 
                zero_division=0
            )
            
            print(f"Accuracy: {acc:.4f}")
            print(report)
            print("-" * 50)

    except Exception as e:
        print(f"Gagal load/test {model_file}: {e}")
        if model_file.endswith(".h5"):
            print(f"   Expected Input Shape: {loaded_model.input_shape}")
            print(f"   Actual Data Shape:    {input_data.shape}")

print("\nTesting Selesai.")

Loading preprocessing tools from models_baru_tf_idf...
Tools (TFIDF, SVD, Scaler) loaded successfully.

Transforming test data...




Shape Test Data: (21736, 2000)
Testing saved models...

Testing Model: ANN_TFIDF.h5 ...




Accuracy: 0.7236
              precision    recall  f1-score   support

           0       0.88      0.85      0.86      1117
           1       0.70      0.75      0.72      1439
           2       0.89      0.67      0.77      5578
           3       0.20      0.45      0.28        86
           4       0.32      0.74      0.45       599
           5       0.49      0.79      0.61       160
           6       0.64      0.86      0.73       160
           7       0.77      0.91      0.83       160
           8       0.77      0.46      0.58       160
           9       0.54      0.72      0.62       160
          10       0.88      0.83      0.86      4637
          11       0.18      0.46      0.26       160
          12       0.54      0.69      0.61       700
          13       0.48      0.75      0.59       160
          14       0.55      0.69      0.61       160
          15       0.77      0.53      0.63      3327
          16       0.67      0.82      0.74      1252
          



Accuracy: 0.7132
              precision    recall  f1-score   support

           0       0.83      0.87      0.85      1117
           1       0.67      0.77      0.72      1439
           2       0.86      0.67      0.76      5578
           3       0.22      0.42      0.29        86
           4       0.31      0.72      0.44       599
           5       0.52      0.71      0.60       160
           6       0.70      0.78      0.74       160
           7       0.78      0.85      0.81       160
           8       0.61      0.67      0.64       160
           9       0.59      0.69      0.64       160
          10       0.89      0.80      0.85      4637
          11       0.17      0.47      0.25       160
          12       0.50      0.73      0.59       700
          13       0.58      0.71      0.64       160
          14       0.60      0.70      0.65       160
          15       0.76      0.50      0.61      3327
          16       0.66      0.81      0.73      1252
          