In [None]:
!pip install pandas numpy scikit-learn transformers tensorflow
!pip uninstall torch torchvision torchaudio -y
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Found existing installation: torch 2.5.1+cu121
Uninstalling torch-2.5.1+cu121:
  Successfully uninstalled torch-2.5.1+cu121
Found existing installation: torchvision 0.20.1+cu121
Uninstalling torchvision-0.20.1+cu121:
  Successfully uninstalled torchvision-0.20.1+cu121
Found existing installation: torchaudio 2.5.1+cu121
Uninstalling torchaudio-2.5.1+cu121:
  Successfully uninstalled torchaudio-2.5.1+cu121
Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.5.1%2Bcpu-cp310-cp310-linux_x86_64.whl (174.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.20.1%2Bcpu-cp310-cp310-linux_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaud

# **BI-LSTM**

In [None]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import torch
import os
from sklearn.metrics import classification_report, confusion_matrix

# Load Dataset
file_path = '/content/input data.xlsx'  # Replace with your file path
df = pd.read_excel(file_path)

# Print the column names to check for correctness
print("Columns in dataset:", df.columns)

# Clean Data: both 'review' and 'label' are used below, so verify both up front
# instead of failing later with a KeyError on 'label'.
missing_columns = [name for name in ('review', 'label') if name not in df.columns]
if missing_columns:
    print(f"Error: required column(s) not found: {missing_columns}")
else:
    # Drop rows without review text, then keep only rows whose label is written
    # as a non-negative integer.
    # NOTE(review): str(x).isdigit() also rejects float-formatted labels such as
    # "1.0" -- confirm the label column is read as integers.
    df_cleaned = df.dropna(subset=['review'])
    has_integer_label = df_cleaned['label'].apply(lambda x: str(x).isdigit())
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of the original data.
    df_cleaned = df_cleaned[has_integer_label].copy()
    df_cleaned['label'] = df_cleaned['label'].astype(int)

    # Print the number of reviews being used
    print(f"Total reviews used for training: {len(df_cleaned)}")

    # Load Tokenizer and Model
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

    # Generate Mini-Batch Embeddings with Progress Print
    def generate_embeddings_batch(text_list, tokenizer, model, batch_size=32, max_length=64):
        """Embed texts in mini-batches and return one vector per text.

        Each batch is tokenized (padded, truncated to ``max_length`` tokens) and
        passed through ``model`` with gradient tracking disabled. The hidden
        state at sequence position 0 is kept as the embedding of each text.

        Args:
            text_list: Iterable of texts (for example a pandas Series or a list).
                Items are converted with ``str`` before tokenization.
            tokenizer: Callable tokenizer that returns PyTorch tensors.
            model: Callable whose output exposes ``last_hidden_state``.
            batch_size: Number of texts per forward pass.
            max_length: Maximum number of tokens kept per text.

        Returns:
            A NumPy array with one row per input text, in input order.

        Raises:
            ValueError: If ``text_list`` is empty or ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer.")

        # Materialize once so Series, lists and arrays are all sliced positionally.
        texts = [str(text) for text in text_list]
        if not texts:
            raise ValueError("text_list is empty; there is nothing to embed.")

        # Ceiling division: an exact multiple of batch_size must not be reported
        # as having one extra batch.
        total_batches = (len(texts) + batch_size - 1) // batch_size

        embeddings = []
        for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
            encoded = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            )
            with torch.no_grad():
                output = model(**encoded)
            embeddings.append(output.last_hidden_state[:, 0, :].numpy())

            # Print progress for every batch
            print(f"Processed batch {batch_number}/{total_batches}")
        return np.vstack(embeddings)

    # Create Embeddings
    reviews = df_cleaned['review']
    embeddings = generate_embeddings_batch(reviews, tokenizer, bert_model)

    # Prepare Labels: one-hot encode. The fitted binarizer is reused below to
    # decode predictions and ground truth back to the original label values.
    labels = df_cleaned['label'].values
    label_binarizer = LabelBinarizer()
    labels = label_binarizer.fit_transform(labels)

    # Split Data. train_test_split shuffles the rows, so the review texts are
    # split in the same call; every test row stays paired with its own review.
    X_train, X_test, y_train, y_test, reviews_train, reviews_test = train_test_split(
        embeddings, labels, reviews.to_numpy(), test_size=0.104, random_state=42
    )

    # Print how much data is used for training and testing
    print(f"Reviews used for training: {len(X_train)}")
    print(f"Reviews used for testing: {len(X_test)}")

    # Reshape Embeddings for LSTM: (samples, sequence_length=1, embedding_dim)
    X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
    X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

    # Check if model exists
    model_path = '/content/lightweight_suicide_ideation_model.h5'
    if os.path.exists(model_path):
        print("Model found, loading the model...")
        model = load_model(model_path)
    else:
        # Build Bi-LSTM Model
        print("Training new model...")
        model = Sequential([
            # Keras warns when input_shape is passed to a layer; the input shape
            # is declared with an explicit Input layer instead.
            tf.keras.Input(shape=(1, X_train.shape[2])),
            Bidirectional(LSTM(64)),  # Bi-LSTM layer
            Dropout(0.2),  # Regularization to prevent overfitting
            Dense(32, activation='relu'),  # Fully connected layer with ReLU activation
            Dropout(0.2),  # Additional dropout for robustness
            Dense(labels.shape[1], activation='softmax')  # Output layer for classification
        ])

        # Compile Model
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

        # Train Model with Progress Print (one fit call per epoch so the custom
        # progress lines are printed around each epoch)
        epochs = 100
        for epoch in range(epochs):
            print(f"Epoch {epoch + 1}/{epochs}")
            model.fit(X_train, y_train, epochs=1, batch_size=16, validation_data=(X_test, y_test))
            print(f"Epoch {epoch + 1} complete.")

        # Save the Model
        model.save(model_path)
        print("Model saved!")

    # Evaluate Model: decode predictions and ground truth with the same
    # binarizer so both columns are expressed in the original label values.
    predictions = model.predict(X_test)
    predicted_labels = label_binarizer.inverse_transform(predictions)
    actual_labels = label_binarizer.inverse_transform(y_test)

    # Create DataFrame to save predictions, using the reviews that were split
    # together with the test embeddings.
    verify_df = pd.DataFrame({
        'review': reviews_test,
        'actual_label': actual_labels,
        'predicted_label': predicted_labels
    })

    # Save Predictions to Excel
    verify_df.to_excel('/content/hasil.xlsx', index=False)
    print("Predictions saved to 'hasil.xlsx'")

    # Show accuracy, precision, recall, and confusion matrix
    print("Classification Report:")
    print(classification_report(actual_labels, predicted_labels))
    print("Confusion Matrix:")
    print(confusion_matrix(actual_labels, predicted_labels))

Columns in dataset: Index(['label', 'review'], dtype='object')
Total reviews used for training: 38901


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Processed batch 1/1216
Processed batch 2/1216
Processed batch 3/1216
Processed batch 4/1216
Processed batch 5/1216
Processed batch 6/1216
Processed batch 7/1216
Processed batch 8/1216
Processed batch 9/1216
Processed batch 10/1216
Processed batch 11/1216
Processed batch 12/1216
Processed batch 13/1216
Processed batch 14/1216
Processed batch 15/1216
Processed batch 16/1216
Processed batch 17/1216
Processed batch 18/1216
Processed batch 19/1216
Processed batch 20/1216
Processed batch 21/1216
Processed batch 22/1216
Processed batch 23/1216
Processed batch 24/1216
Processed batch 25/1216
Processed batch 26/1216
Processed batch 27/1216
Processed batch 28/1216
Processed batch 29/1216
Processed batch 30/1216
Processed batch 31/1216
Processed batch 32/1216
Processed batch 33/1216
Processed batch 34/1216
Processed batch 35/1216
Processed batch 36/1216
Processed batch 37/1216
Processed batch 38/1216
Processed batch 39/1216
Processed batch 40/1216
Processed batch 41/1216
Processed batch 42/1216
P

  super().__init__(**kwargs)


Epoch 1/100
[1m2179/2179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 11ms/step - accuracy: 0.7322 - loss: 0.6426 - val_accuracy: 0.8085 - val_loss: 0.4575
Epoch 1 complete.
Epoch 2/100
[1m2179/2179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 9ms/step - accuracy: 0.8125 - loss: 0.4803 - val_accuracy: 0.8250 - val_loss: 0.4291
Epoch 2 complete.
Epoch 3/100
[1m2179/2179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 10ms/step - accuracy: 0.8267 - loss: 0.4503 - val_accuracy: 0.8188 - val_loss: 0.4393
Epoch 3 complete.
Epoch 4/100
[1m2179/2179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 10ms/step - accuracy: 0.8315 - loss: 0.4391 - val_accuracy: 0.8356 - val_loss: 0.4230
Epoch 4 complete.
Epoch 5/100
[1m2179/2179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 10ms/step - accuracy: 0.8430 - loss: 0.4087 - val_accuracy: 0.8371 - val_loss: 0.4214
Epoch 5 complete.
Epoch 6/100
[1m2179/2179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s



Epoch 100 complete.
Model saved!
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
Predictions saved to 'hasil.xlsx'
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.95      0.94      1440
           1       0.93      0.90      0.91      1283
           2       0.90      0.89      0.90      1323

    accuracy                           0.92      4046
   macro avg       0.92      0.91      0.92      4046
weighted avg       0.92      0.92      0.92      4046

Confusion Matrix:
[[1374   24   42]
 [  42 1150   91]
 [  72   69 1182]]


# **BI-RNN**

In [None]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Dropout, Bidirectional, SimpleRNN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import torch
import os
from sklearn.metrics import classification_report, confusion_matrix

# Load Dataset
file_path = '/content/input data.xlsx'  # Replace with your file path
df = pd.read_excel(file_path)

# Print the column names to check for correctness
print("Columns in dataset:", df.columns)

# Clean Data: both 'review' and 'label' are used below, so verify both up front
# instead of failing later with a KeyError on 'label'.
missing_columns = [name for name in ('review', 'label') if name not in df.columns]
if missing_columns:
    print(f"Error: required column(s) not found: {missing_columns}")
else:
    # Drop rows without review text, then keep only rows whose label is written
    # as a non-negative integer.
    # NOTE(review): str(x).isdigit() also rejects float-formatted labels such as
    # "1.0" -- confirm the label column is read as integers.
    df_cleaned = df.dropna(subset=['review'])
    has_integer_label = df_cleaned['label'].apply(lambda x: str(x).isdigit())
    # .copy() gives an independent frame, so the column assignment below does
    # not write into a slice of the original data.
    df_cleaned = df_cleaned[has_integer_label].copy()
    df_cleaned['label'] = df_cleaned['label'].astype(int)

    # Print the number of reviews being used
    print(f"Total reviews used for training: {len(df_cleaned)}")

    # Load Tokenizer and Model
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

    # Generate Mini-Batch Embeddings with Progress Print
    def generate_embeddings_batch(text_list, tokenizer, model, batch_size=32, max_length=64):
        """Embed texts in mini-batches and return one vector per text.

        Each batch is tokenized (padded, truncated to ``max_length`` tokens) and
        passed through ``model`` with gradient tracking disabled. The hidden
        state at sequence position 0 is kept as the embedding of each text.

        Args:
            text_list: Iterable of texts (for example a pandas Series or a list).
                Items are converted with ``str`` before tokenization.
            tokenizer: Callable tokenizer that returns PyTorch tensors.
            model: Callable whose output exposes ``last_hidden_state``.
            batch_size: Number of texts per forward pass.
            max_length: Maximum number of tokens kept per text.

        Returns:
            A NumPy array with one row per input text, in input order.

        Raises:
            ValueError: If ``text_list`` is empty or ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer.")

        # Materialize once so Series, lists and arrays are all sliced positionally.
        texts = [str(text) for text in text_list]
        if not texts:
            raise ValueError("text_list is empty; there is nothing to embed.")

        # Ceiling division: an exact multiple of batch_size must not be reported
        # as having one extra batch.
        total_batches = (len(texts) + batch_size - 1) // batch_size

        embeddings = []
        for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
            encoded = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            )
            with torch.no_grad():
                output = model(**encoded)
            embeddings.append(output.last_hidden_state[:, 0, :].numpy())

            # Print progress for every batch
            print(f"Processed batch {batch_number}/{total_batches}")
        return np.vstack(embeddings)

    # Create Embeddings
    reviews = df_cleaned['review']
    embeddings = generate_embeddings_batch(reviews, tokenizer, bert_model)

    # Prepare Labels: one-hot encode. The fitted binarizer is reused below to
    # decode predictions and ground truth back to the original label values.
    labels = df_cleaned['label'].values
    label_binarizer = LabelBinarizer()
    labels = label_binarizer.fit_transform(labels)

    # Split Data. train_test_split shuffles the rows, so the review texts are
    # split in the same call; every test row stays paired with its own review.
    X_train, X_test, y_train, y_test, reviews_train, reviews_test = train_test_split(
        embeddings, labels, reviews.to_numpy(), test_size=0.104, random_state=42
    )

    # Print how much data is used for training and testing
    print(f"Reviews used for training: {len(X_train)}")
    print(f"Reviews used for testing: {len(X_test)}")

    # Reshape Embeddings for the RNN: (samples, sequence_length=1, embedding_dim).
    # Each review is one time step carrying its whole embedding vector, which is
    # the layout the model's input declaration below describes.
    # NOTE: a model file saved with the previous (embedding_dim, 1) layout has
    # incompatible weights; remove it so a new model is trained.
    X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
    X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

    # Check if model exists
    model_path = '/content/lightweight_suicide_ideation_model2.h5'
    if os.path.exists(model_path):
        print("Model found, loading the model...")
        model = load_model(model_path)
    else:
        # Build Bi-RNN Model
        print("Training new model...")
        model = Sequential([
            # Keras warns when input_shape is passed to a layer; the input shape
            # is declared with an explicit Input layer instead.
            tf.keras.Input(shape=(1, X_train.shape[2])),
            Bidirectional(SimpleRNN(64)),  # Bi-RNN layer
            Dropout(0.2),  # Regularization to prevent overfitting
            Dense(32, activation='relu'),  # Fully connected layer with ReLU activation
            Dropout(0.2),  # Additional dropout for robustness
            Dense(labels.shape[1], activation='softmax')  # Output layer for classification
        ])

        # Compile Model
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

        # Train Model with Progress Print (one fit call per epoch so the custom
        # progress lines are printed around each epoch)
        epochs = 25
        for epoch in range(epochs):
            print(f"Epoch {epoch + 1}/{epochs}")
            model.fit(X_train, y_train, epochs=1, batch_size=16, validation_data=(X_test, y_test))
            print(f"Epoch {epoch + 1} complete.")

        # Save the Model
        model.save(model_path)
        print("Model saved!")

    # Evaluate Model: decode predictions and ground truth with the same
    # binarizer so both columns are expressed in the original label values.
    predictions = model.predict(X_test)
    predicted_labels = label_binarizer.inverse_transform(predictions)
    actual_labels = label_binarizer.inverse_transform(y_test)

    # Create DataFrame to save predictions, using the reviews that were split
    # together with the test embeddings.
    verify_df = pd.DataFrame({
        'review': reviews_test,
        'actual_label': actual_labels,
        'predicted_label': predicted_labels
    })

    # Save Predictions to Excel
    verify_df.to_excel('/content/hasil2.xlsx', index=False)
    print("Predictions saved to 'hasil2.xlsx'")

    # Show accuracy, precision, recall, and confusion matrix
    print("Classification Report:")
    print(classification_report(actual_labels, predicted_labels))
    print("Confusion Matrix:")
    print(confusion_matrix(actual_labels, predicted_labels))


Columns in dataset: Index(['label', 'review'], dtype='object')
Total reviews used for training: 38901


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Processed batch 1/1216
Processed batch 2/1216
Processed batch 3/1216
Processed batch 4/1216
Processed batch 5/1216
Processed batch 6/1216
Processed batch 7/1216
Processed batch 8/1216
Processed batch 9/1216
Processed batch 10/1216
Processed batch 11/1216
Processed batch 12/1216
Processed batch 13/1216
Processed batch 14/1216
Processed batch 15/1216
Processed batch 16/1216
Processed batch 17/1216
Processed batch 18/1216
Processed batch 19/1216
Processed batch 20/1216
Processed batch 21/1216
Processed batch 22/1216
Processed batch 23/1216
Processed batch 24/1216
Processed batch 25/1216
Processed batch 26/1216
Processed batch 27/1216
Processed batch 28/1216
Processed batch 29/1216
Processed batch 30/1216
Processed batch 31/1216
Processed batch 32/1216
Processed batch 33/1216
Processed batch 34/1216
Processed batch 35/1216
Processed batch 36/1216
Processed batch 37/1216
Processed batch 38/1216
Processed batch 39/1216
Processed batch 40/1216
Processed batch 41/1216
Processed batch 42/1216
P

  super().__init__(**kwargs)


Epoch 1/25
[1m2179/2179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 147ms/step - accuracy: 0.6041 - loss: 0.8489 - val_accuracy: 0.6930 - val_loss: 0.6807
Epoch 1 complete.
Epoch 2/25
[1m2179/2179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 144ms/step - accuracy: 0.7168 - loss: 0.6746 - val_accuracy: 0.7444 - val_loss: 0.6077
Epoch 2 complete.
Epoch 3/25
[1m2179/2179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 144ms/step - accuracy: 0.7428 - loss: 0.6300 - val_accuracy: 0.7610 - val_loss: 0.5859
Epoch 3 complete.
Epoch 4/25
[1m2179/2179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m312s[0m 143ms/step - accuracy: 0.7465 - loss: 0.6190 - val_accuracy: 0.7375 - val_loss: 0.6271
Epoch 4 complete.
Epoch 5/25
[1m2179/2179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m313s[0m 144ms/step - accuracy: 0.7543 - loss: 0.5998 - val_accuracy: 0.7509 - val_loss: 0.5916
Epoch 5 complete.
Epoch 6/25
[1m2179/2179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [



Epoch 25 complete.
Model saved!
[1m127/127[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 37ms/step
Predictions saved to 'hasil2.xlsx'
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.89      0.83      1440
           1       0.74      0.85      0.79      1283
           2       0.82      0.57      0.67      1323

    accuracy                           0.77      4046
   macro avg       0.78      0.77      0.77      4046
weighted avg       0.78      0.77      0.77      4046

Confusion Matrix:
[[1282   95   63]
 [  83 1091  109]
 [ 285  280  758]]


# **Transformer embeddings (DistilBERT) + class-weighted Bi-LSTM**

In [None]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
import torch
import os

# Load Dataset
file_path = '/content/input data.xlsx'  # Replace with your file path
df = pd.read_excel(file_path)

# Print the column names to check for correctness
print("Columns in dataset:", df.columns)

# Clean Data: both columns are required by every step below, so fail early.
if 'review' not in df.columns or 'label' not in df.columns:
    raise ValueError("Dataset must contain 'review' and 'label' columns.")

# Drop rows without review text, then keep only rows whose label is written as
# a non-negative integer.
# NOTE(review): str(x).isdigit() also rejects float-formatted labels such as
# "1.0" -- confirm the label column is read as integers.
df_cleaned = df.dropna(subset=['review'])
has_integer_label = df_cleaned['label'].apply(lambda x: str(x).isdigit())
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the original data.
df_cleaned = df_cleaned[has_integer_label].copy()
df_cleaned['label'] = df_cleaned['label'].astype(int)

# Print the number of reviews being used
print(f"Total reviews used for training: {len(df_cleaned)}")

# Load Tokenizer and Model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Generate Mini-Batch Embeddings with Progress Print
def generate_embeddings_batch(text_list, tokenizer, model, batch_size=32, max_length=64):
    """Embed texts in mini-batches and return one vector per text.

    Each batch is tokenized (padded, truncated to ``max_length`` tokens) and
    passed through ``model`` with gradient tracking disabled. The hidden state
    at sequence position 0 is kept as the embedding of each text.

    Args:
        text_list: Iterable of texts (for example a pandas Series or a list).
            Items are converted with ``str`` before tokenization.
        tokenizer: Callable tokenizer that returns PyTorch tensors.
        model: Callable whose output exposes ``last_hidden_state``.
        batch_size: Number of texts per forward pass.
        max_length: Maximum number of tokens kept per text.

    Returns:
        A NumPy array with one row per input text, in input order.

    Raises:
        ValueError: If ``text_list`` is empty or ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer.")

    # Materialize once so Series, lists and arrays are all sliced positionally.
    texts = [str(text) for text in text_list]
    if not texts:
        raise ValueError("text_list is empty; there is nothing to embed.")

    # Ceiling division: an exact multiple of batch_size must not be reported as
    # having one extra batch.
    total_batches = (len(texts) + batch_size - 1) // batch_size

    embeddings = []
    for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
        encoded = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        with torch.no_grad():
            output = model(**encoded)
        embeddings.append(output.last_hidden_state[:, 0, :].numpy())

        # Print progress for every batch
        print(f"Processed batch {batch_number}/{total_batches}")
    return np.vstack(embeddings)

# Create Embeddings
reviews = df_cleaned['review']
embeddings = generate_embeddings_batch(reviews, tokenizer, bert_model)

# Prepare Labels (one-hot encoded)
labels = df_cleaned['label'].values
label_binarizer = LabelBinarizer()
labels = label_binarizer.fit_transform(labels)

# Split Data. train_test_split shuffles the rows, so the review texts are split
# in the same call; every test row stays paired with its own review.
X_train, X_test, y_train, y_test, reviews_train, reviews_test = train_test_split(
    embeddings, labels, reviews.to_numpy(), test_size=0.1, random_state=42
)

# Print how much data is used for training and testing
print(f"Reviews used for training: {len(X_train)}")
print(f"Reviews used for testing: {len(X_test)}")

# Reshape Embeddings for LSTM: (samples, sequence_length=1, embedding_dim)
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

# Compute Class Weights. Keys are the class indices actually present in the
# training split, so a weight is never attached to the wrong class.
train_class_ids = y_train.argmax(axis=1)
present_classes = np.unique(train_class_ids)
class_weights = compute_class_weight('balanced', classes=present_classes, y=train_class_ids)
class_weights_dict = {
    int(class_id): float(weight)
    for class_id, weight in zip(present_classes, class_weights)
}

# Check if model exists. This cell uses its own file name: the Bi-LSTM cell
# saves to 'lightweight_suicide_ideation_model.h5', and reusing that path here
# would load that model instead of training the class-weighted one.
model_path = '/content/lightweight_suicide_ideation_model3.h5'
if os.path.exists(model_path):
    print("Model found, loading the model...")
    model = load_model(model_path)
else:
    # Build Bi-LSTM Model
    print("Training new model...")
    model = Sequential([
        # Keras warns when input_shape is passed to a layer; the input shape is
        # declared with an explicit Input layer instead.
        tf.keras.Input(shape=(1, X_train.shape[2])),
        Bidirectional(LSTM(64)),  # Bi-LSTM layer
        Dropout(0.2),  # Regularization to prevent overfitting
        Dense(32, activation='relu'),  # Fully connected layer with ReLU activation
        Dropout(0.2),  # Additional dropout for robustness
        Dense(labels.shape[1], activation='softmax')  # Output layer for classification
    ])

    # Compile Model
    optimizer = tf.keras.optimizers.Adam(clipvalue=1.0)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    # Train Model with Early Stopping. The stopping criterion is monitored on a
    # slice held out from the training data; selecting the best weights on the
    # test set would make the test metrics reported below optimistic.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(
        X_train, y_train,
        validation_split=0.1,
        class_weight=class_weights_dict,
        callbacks=[early_stopping],
        epochs=100, batch_size=16
    )

    # Save the Model
    model.save(model_path)
    print("Model saved!")

# Evaluate Model (labels are compared as class indices)
predictions = model.predict(X_test)
predicted_labels = np.argmax(predictions, axis=1)
true_labels = np.argmax(y_test, axis=1)

# Create DataFrame to save predictions, using the reviews that were split
# together with the test embeddings.
verify_df = pd.DataFrame({
    'review': reviews_test,
    'actual_label': true_labels,
    'predicted_label': predicted_labels
})

# Save Predictions to Excel (separate file so the Bi-LSTM cell's 'hasil.xlsx'
# is not overwritten)
output_path = '/content/hasil3.xlsx'
verify_df.to_excel(output_path, index=False)
print(f"Predictions saved to '{output_path}'")

# Show accuracy, precision, recall, and confusion matrix
print("Classification Report:")
print(classification_report(true_labels, predicted_labels))
print("Confusion Matrix:")
print(confusion_matrix(true_labels, predicted_labels))


Columns in dataset: Index(['label', 'review'], dtype='object')
Total reviews used for training: 38901


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Processed batch 1/1216
Processed batch 2/1216
Processed batch 3/1216
Processed batch 4/1216
Processed batch 5/1216
Processed batch 6/1216
Processed batch 7/1216
Processed batch 8/1216
Processed batch 9/1216
Processed batch 10/1216
Processed batch 11/1216
Processed batch 12/1216
Processed batch 13/1216
Processed batch 14/1216
Processed batch 15/1216
Processed batch 16/1216
Processed batch 17/1216
Processed batch 18/1216
Processed batch 19/1216
Processed batch 20/1216
Processed batch 21/1216
Processed batch 22/1216
Processed batch 23/1216
Processed batch 24/1216
Processed batch 25/1216
Processed batch 26/1216
Processed batch 27/1216
Processed batch 28/1216
Processed batch 29/1216
Processed batch 30/1216
Processed batch 31/1216
Processed batch 32/1216
Processed batch 33/1216
Processed batch 34/1216
Processed batch 35/1216
Processed batch 36/1216
Processed batch 37/1216
Processed batch 38/1216
Processed batch 39/1216
Processed batch 40/1216
Processed batch 41/1216
Processed batch 42/1216
P

  super().__init__(**kwargs)


Epoch 1/100
[1m2189/2189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 9ms/step - accuracy: 0.7240 - loss: 0.6579 - val_accuracy: 0.7882 - val_loss: 0.4921
Epoch 2/100
[1m2189/2189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 9ms/step - accuracy: 0.8136 - loss: 0.4889 - val_accuracy: 0.8144 - val_loss: 0.4666
Epoch 3/100
[1m2189/2189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 9ms/step - accuracy: 0.8240 - loss: 0.4585 - val_accuracy: 0.8309 - val_loss: 0.4347
Epoch 4/100
[1m2189/2189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 9ms/step - accuracy: 0.8354 - loss: 0.4319 - val_accuracy: 0.8404 - val_loss: 0.4116
Epoch 5/100
[1m2189/2189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 9ms/step - accuracy: 0.8412 - loss: 0.4183 - val_accuracy: 0.8322 - val_loss: 0.4062
Epoch 6/100
[1m2189/2189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 9ms/step - accuracy: 0.8468 - loss: 0.4093 - val_accuracy: 0.8520 - val_loss: 0.3875
Epoc



Model saved!
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step
Predictions saved to '/content/hasil.xlsx'
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      1374
           1       0.88      0.89      0.88      1240
           2       0.89      0.79      0.84      1277

    accuracy                           0.88      3891
   macro avg       0.88      0.88      0.88      3891
weighted avg       0.88      0.88      0.88      3891

Confusion Matrix:
[[1299   35   40]
 [  46 1107   87]
 [ 139  123 1015]]
