# Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

from tensorflow.keras.layers import Bidirectional, Dense, Dropout, Embedding, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load Dataset

## Read CSV

In [2]:
dir_ = "dataset/"
file_input = dir_ + "oshibe_spv_comments_2025-01-15_labeled_lexicon.csv"
data = pd.read_csv(file_input)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11650 entries, 0 to 11649
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   11650 non-null  object
 1   Username             11650 non-null  object
 2   Comment              11650 non-null  object
 3   LikeCount            11650 non-null  int64 
 4   ReplyCount           11650 non-null  int64 
 5   Date                 11650 non-null  object
 6   Comment_clean        11650 non-null  object
 7   Comment_clean_words  11504 non-null  object
 8   Sentiment_score      11650 non-null  int64 
 9   Sentiment            11650 non-null  object
dtypes: int64(3), object(7)
memory usage: 910.3+ KB


- Hasil stem dari tahap sebelumnya mengandung missing values pada 'Comment_stem', karena kolom ini akan menjadi X maka row dengan missing values disini akan dihapus

In [3]:
data = data[pd.notnull(data['Comment_clean_words']) & (data['Comment_clean_words'] != '')].copy()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11504 entries, 0 to 11649
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   11504 non-null  object
 1   Username             11504 non-null  object
 2   Comment              11504 non-null  object
 3   LikeCount            11504 non-null  int64 
 4   ReplyCount           11504 non-null  int64 
 5   Date                 11504 non-null  object
 6   Comment_clean        11504 non-null  object
 7   Comment_clean_words  11504 non-null  object
 8   Sentiment_score      11504 non-null  int64 
 9   Sentiment            11504 non-null  object
dtypes: int64(3), object(7)
memory usage: 988.6+ KB


In [4]:
data[['Comment_clean_words', 'Sentiment']].head(10)

Unnamed: 0,Comment_clean_words,Sentiment
0,teman-teman lagu lgbt gadis muda yang beranjak...,negative
1,performance videonya kaya memberitahu dampak b...,positive
2,persatu member kesempatan menunjukan potensiny...,positive
3,fiks kedepan jkt48 release single mvnya kostum...,positive
4,malam rahasia bilang siapasiapa rahasia ah cah...,negative
5,terlepas kontroversi sejujurnya lagu represent...,positive
6,terlepas hate comen jujur kemajuan banget jkt4...,positive
7,gila konsep mv keren banget good job jkt48,positive
8,congrats jkt48 new era mini albumnya jkt48 jay...,positive
9,buay yang bilang lesbi salah tuh makna lumayan...,positive


## Define X & y

In [5]:
X = data['Comment_clean_words']
y = data['Sentiment']

# SVM with TF-IDF

## Feature Extraction

In [38]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

## Train-Test Split

In [51]:
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(X_tfidf, y, test_size=0.2, random_state=48)

## Train & Evaluation

In [69]:
# Train SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_svm, y_train_svm)

# Make predictions
y_pred_svm_train = svm_model.predict(X_train_svm)
y_pred_svm_test = svm_model.predict(X_test_svm)

# Evaluate the model
accuracy_svm_train = accuracy_score(y_train_svm, y_pred_svm_train)
accuracy_svm_test = accuracy_score(y_test_svm, y_pred_svm_test)

print("SVM with TF-IDF Classification Report:")
print(classification_report(y_test_svm, y_pred_svm_test))

print(f"SVM with TF-IDF Train Accuracy: {accuracy_svm_train*100:.2f}%")
print(f"SVM with TF-IDF Test Accuracy: {accuracy_svm_test*100:.2f}%")

SVM with TF-IDF Classification Report:
              precision    recall  f1-score   support

    negative       0.92      0.68      0.79       241
     neutral       0.88      0.96      0.92      1195
    positive       0.95      0.91      0.93       865

    accuracy                           0.91      2301
   macro avg       0.92      0.85      0.88      2301
weighted avg       0.91      0.91      0.91      2301

SVM with TF-IDF Train Accuracy: 96.82%
SVM with TF-IDF Test Accuracy: 91.09%


# Random Forest with BoW

## Feature Extraction

In [83]:
# Initialize CountVectorizer without limiting features
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)

# Get the number of unique words
total_unique_words = len(vectorizer.get_feature_names_out())
print(f"Total unique words: {total_unique_words}")

# Define percentage of unique words to include
percentage = 0.5  # Example: use 50% of the unique words
max_features = int(total_unique_words * percentage)

print(f"Using {percentage * 100:.0f}% of unique words: {max_features} features")

Total unique words: 11536
Using 50% of unique words: 5768 features


In [84]:
# Create Bag of Words (BoW) representation
vectorizer = CountVectorizer(max_features=max_features)
X_bow = vectorizer.fit_transform(X)

## Train-Test Split

In [85]:
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_bow, y, test_size=0.3, random_state=48)

## Train & Evaluate

In [86]:
# Train Random Forest model
rf_model_bow = RandomForestClassifier(n_estimators=48, random_state=48)
rf_model_bow.fit(X_train_bow, y_train_bow)

# Make predictions
y_pred_bow_train = rf_model_bow.predict(X_train_bow)
y_pred_bow_test = rf_model_bow.predict(X_test_bow)

# Evaluate the model
accuracy_bow_train = accuracy_score(y_train_bow, y_pred_bow_train)
accuracy_bow_test = accuracy_score(y_test_bow, y_pred_bow_test)

print("Random Forest with Bag of Words Classification Report:")
print(classification_report(y_test_bow, y_pred_bow_test))

print(f"Random Forest with Bag of Words Train Accuracy: {accuracy_bow_train * 100:.2f}%")
print(f"Random Forest with Bag of Words Test Accuracy: {accuracy_bow_test * 100:.2f}%")

Random Forest with Bag of Words Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.69      0.76       370
     neutral       0.90      0.92      0.91      1737
    positive       0.90      0.93      0.92      1345

    accuracy                           0.90      3452
   macro avg       0.88      0.84      0.86      3452
weighted avg       0.90      0.90      0.89      3452

Random Forest with Bag of Words Train Accuracy: 99.89%
Random Forest with Bag of Words Test Accuracy: 89.63%


# Deep Learning

## Feature Extraction

In [23]:
X = data['Comment_clean_words'].values
y = data['Sentiment'].values

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [25]:
data['Word_count'] = data['Comment_clean_words'].apply(lambda x: len(str(x).split()))

data[['Comment_clean_words', 'Word_count']].sort_values('Word_count', ascending=False)

Unnamed: 0,Comment_clean_words,Word_count
9554,malam rahasia bilang siapasiapa rahasia ah cah...,406
256,tari perut belly dance dikenal sebutan raqs sh...,297
1,performance videonya kaya memberitahu dampak b...,246
729,lyrics japanese naisho konya atta koto dare in...,197
3508,resep niku udon ala marugame bahan g udon basa...,185
...,...,...
2427,freya,1
6633,ngeri,1
6638,mantap,1
6640,meresahkan,1


In [31]:
# Calculate the 90th percentile of the word count
percentile_95 = np.percentile(data['Word_count'], 95)

print(f"The 95th percentile of word count is: {percentile_95:.0f}")

The 95th percentile of word count is: 20


In [None]:
# Tokenization
max_words = 10000
max_length = int(percentile_95)  # Maximum number of words in a comment
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)

X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=max_length)

## Train-Test Split

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=48)

## Train & Evaluation

In [36]:
# Define the model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(64)),
    Dropout(0.2),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test), batch_size=32)

Epoch 1/5
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 52ms/step - accuracy: 0.6780 - loss: 0.7414 - val_accuracy: 0.8848 - val_loss: 0.3203
Epoch 2/5
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 64ms/step - accuracy: 0.9351 - loss: 0.2041 - val_accuracy: 0.9144 - val_loss: 0.2358
Epoch 3/5
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 50ms/step - accuracy: 0.9787 - loss: 0.0733 - val_accuracy: 0.9148 - val_loss: 0.2808
Epoch 4/5
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 51ms/step - accuracy: 0.9845 - loss: 0.0442 - val_accuracy: 0.9322 - val_loss: 0.2300
Epoch 5/5
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 51ms/step - accuracy: 0.9941 - loss: 0.0224 - val_accuracy: 0.9352 - val_loss: 0.2718


In [37]:
# Make predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Classification report
print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_))

# Evaluate the model
accuracy_dl = model.evaluate(X_test, y_test, verbose=0)[1]
print(f"Deep Learning Accuracy: {accuracy_dl * 100:.2f}%")

[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step
              precision    recall  f1-score   support

    negative       0.94      0.80      0.87       241
     neutral       0.93      0.95      0.94      1195
    positive       0.95      0.95      0.95       865

    accuracy                           0.94      2301
   macro avg       0.94      0.90      0.92      2301
weighted avg       0.94      0.94      0.93      2301

Deep Learning Accuracy: 93.52%


# Inference Test with Deep Learning