# Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

from tensorflow.keras.layers import Bidirectional, Dense, Dropout, Embedding, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load Dataset

## Read CSV

In [2]:
dir_ = "dataset/"
file_input = dir_ + "oshibe_spv_comments_2025-01-15_labeled_lexicon.csv"
data = pd.read_csv(file_input)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11650 entries, 0 to 11649
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   11650 non-null  object
 1   Username             11650 non-null  object
 2   Comment              11650 non-null  object
 3   LikeCount            11650 non-null  int64 
 4   ReplyCount           11650 non-null  int64 
 5   Date                 11650 non-null  object
 6   Comment_clean        11650 non-null  object
 7   Comment_clean_words  11504 non-null  object
 8   Sentiment_score      11650 non-null  int64 
 9   Sentiment            11650 non-null  object
dtypes: int64(3), object(7)
memory usage: 910.3+ KB


- Hasil stem dari tahap sebelumnya mengandung missing values pada 'Comment_stem', karena kolom ini akan menjadi X maka row dengan missing values disini akan dihapus

In [3]:
data = data[pd.notnull(data['Comment_clean_words']) & (data['Comment_clean_words'] != '')].copy()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11504 entries, 0 to 11649
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   11504 non-null  object
 1   Username             11504 non-null  object
 2   Comment              11504 non-null  object
 3   LikeCount            11504 non-null  int64 
 4   ReplyCount           11504 non-null  int64 
 5   Date                 11504 non-null  object
 6   Comment_clean        11504 non-null  object
 7   Comment_clean_words  11504 non-null  object
 8   Sentiment_score      11504 non-null  int64 
 9   Sentiment            11504 non-null  object
dtypes: int64(3), object(7)
memory usage: 988.6+ KB


In [4]:
data[['Comment_clean_words', 'Sentiment']].head(10)

Unnamed: 0,Comment_clean_words,Sentiment
0,teman-teman lagu lgbt gadis muda yang beranjak...,negative
1,performance videonya kaya memberitahu dampak b...,positive
2,persatu member kesempatan menunjukan potensiny...,positive
3,fiks kedepan jkt48 release single mvnya kostum...,positive
4,malam rahasia bilang siapasiapa rahasia ah cah...,negative
5,terlepas kontroversi sejujurnya lagu represent...,positive
6,terlepas hate comen jujur kemajuan banget jkt4...,positive
7,gila konsep mv keren banget good job jkt48,positive
8,congrats jkt48 new era mini albumnya jkt48 jay...,positive
9,buay yang bilang lesbi salah tuh makna lumayan...,positive


## Define X & y

In [5]:
X = data['Comment_clean_words']
y = data['Sentiment']

# SVM with TF-IDF

## Feature Extraction

In [6]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

## Train-Test Split

In [7]:
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(X_tfidf, y, test_size=0.2, random_state=48)

## Train & Evaluate

In [8]:
# Train SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_svm, y_train_svm)

# Make predictions
y_pred_svm_train = svm_model.predict(X_train_svm)
y_pred_svm_test = svm_model.predict(X_test_svm)

# Evaluate the model
accuracy_svm_train = accuracy_score(y_train_svm, y_pred_svm_train)
accuracy_svm_test = accuracy_score(y_test_svm, y_pred_svm_test)

print(f"SVM with TF-IDF Train Accuracy: {accuracy_svm_train*100:.2f}%")
print(f"SVM with TF-IDF Test Accuracy: {accuracy_svm_test*100:.2f}%")
print()
print("SVM with TF-IDF Classification Report:")
print(classification_report(y_test_svm, y_pred_svm_test))

SVM with TF-IDF Train Accuracy: 96.82%
SVM with TF-IDF Test Accuracy: 91.09%

SVM with TF-IDF Classification Report:
              precision    recall  f1-score   support

    negative       0.92      0.68      0.79       241
     neutral       0.88      0.96      0.92      1195
    positive       0.95      0.91      0.93       865

    accuracy                           0.91      2301
   macro avg       0.92      0.85      0.88      2301
weighted avg       0.91      0.91      0.91      2301



# Random Forest with TF-IDF

## Feature Extraction

In [9]:
# Let's just use X_tfidf from above

## Train-Test Split

In [10]:
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_tfidf, y, test_size=0.2, random_state=48)

## Train & Evaluate

In [11]:
# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=48, random_state=48)
rf_model.fit(X_train_rf, y_train_rf)

# Make predictions
y_pred_rf_train = rf_model.predict(X_train_rf)
y_pred_rf_test = rf_model.predict(X_test_rf)

# Evaluate the model
accuracy_rf_train = accuracy_score(y_train_rf, y_pred_rf_train)
accuracy_rf_test = accuracy_score(y_test_rf, y_pred_rf_test)

print(f"Random Forest with TF-IDF Train Accuracy: {accuracy_rf_train * 100:.2f}%")
print(f"Random Forest with TF-IDF Test Accuracy: {accuracy_rf_test * 100:.2f}%")
print()
print("Random Forest with TF-IDF Classification Report:")
print(classification_report(y_test_rf, y_pred_rf_test))

Random Forest with TF-IDF Train Accuracy: 99.97%
Random Forest with TF-IDF Test Accuracy: 88.09%

Random Forest with TF-IDF Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.60      0.72       241
     neutral       0.88      0.92      0.90      1195
    positive       0.88      0.91      0.89       865

    accuracy                           0.88      2301
   macro avg       0.89      0.81      0.84      2301
weighted avg       0.88      0.88      0.88      2301



# Random Forest with BoW

## Feature Extraction

In [12]:
# Create Bag of Words (BoW) representation
vectorizer = CountVectorizer(max_features=5000)
X_bow = vectorizer.fit_transform(X)

## Train-Test Split

In [13]:
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_bow, y, test_size=0.3, random_state=48)

## Train & Evaluate

In [14]:
# Train Random Forest model
rf_model_bow = RandomForestClassifier(n_estimators=48, random_state=48)
rf_model_bow.fit(X_train_bow, y_train_bow)

# Make predictions
y_pred_bow_train = rf_model_bow.predict(X_train_bow)
y_pred_bow_test = rf_model_bow.predict(X_test_bow)

# Evaluate the model
accuracy_bow_train = accuracy_score(y_train_bow, y_pred_bow_train)
accuracy_bow_test = accuracy_score(y_test_bow, y_pred_bow_test)

print(f"Random Forest with Bag of Words Train Accuracy: {accuracy_bow_train * 100:.2f}%")
print(f"Random Forest with Bag of Words Test Accuracy: {accuracy_bow_test * 100:.2f}%")
print()
print("Random Forest with Bag of Words Classification Report:")
print(classification_report(y_test_bow, y_pred_bow_test))

Random Forest with Bag of Words Train Accuracy: 99.88%
Random Forest with Bag of Words Test Accuracy: 90.09%

Random Forest with Bag of Words Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.70      0.77       370
     neutral       0.90      0.92      0.91      1737
    positive       0.91      0.93      0.92      1345

    accuracy                           0.90      3452
   macro avg       0.89      0.85      0.87      3452
weighted avg       0.90      0.90      0.90      3452



# Deep Learning

## Feature Extraction

In [15]:
X = data['Comment_clean_words'].values
y = data['Sentiment'].values

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [16]:
data['Word_count'] = data['Comment_clean_words'].apply(lambda x: len(str(x).split()))

data[['Comment_clean_words', 'Word_count']].sort_values('Word_count', ascending=False)

Unnamed: 0,Comment_clean_words,Word_count
9554,malam rahasia bilang siapasiapa rahasia ah cah...,406
256,tari perut belly dance dikenal sebutan raqs sh...,297
1,performance videonya kaya memberitahu dampak b...,246
729,lyrics japanese naisho konya atta koto dare in...,197
3508,resep niku udon ala marugame bahan g udon basa...,185
...,...,...
2427,freya,1
6633,ngeri,1
6638,mantap,1
6640,meresahkan,1


In [17]:
# Calculate the 90th percentile of the word count
percentile_95 = np.percentile(data['Word_count'], 95)

print(f"The 95th percentile of word count is: {percentile_95:.0f}")

The 95th percentile of word count is: 20


In [18]:
# Tokenization
max_words = 10000
max_length = int(percentile_95)  # Maximum number of words in a comment
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)

X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=max_length)

## Train-Test Split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=48)

## Train & Evaluation

In [20]:
# Define the model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(64)),
    Dropout(0.2),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test), batch_size=32)

Epoch 1/5
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 49ms/step - accuracy: 0.6621 - loss: 0.7599 - val_accuracy: 0.8896 - val_loss: 0.3141
Epoch 2/5
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 45ms/step - accuracy: 0.9316 - loss: 0.1985 - val_accuracy: 0.8996 - val_loss: 0.2959
Epoch 3/5
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 44ms/step - accuracy: 0.9685 - loss: 0.0935 - val_accuracy: 0.9392 - val_loss: 0.2029
Epoch 4/5
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 44ms/step - accuracy: 0.9922 - loss: 0.0309 - val_accuracy: 0.9387 - val_loss: 0.2321
Epoch 5/5
[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 46ms/step - accuracy: 0.9930 - loss: 0.0268 - val_accuracy: 0.9344 - val_loss: 0.2250


In [None]:
# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
y_pred_classes = np.argmax(y_pred_test, axis=1)

# Evaluate the model
accuracy_dl_train = model.evaluate(X_train, y_train, verbose=0)[1]
accuracy_dl_test = model.evaluate(X_test, y_test, verbose=0)[1]
print(f"Deep Learning Train Accuracy: {accuracy_dl_train * 100:.2f}%")
print(f"Deep Learning Test Accuracy: {accuracy_dl_test * 100:.2f}%")
print()

# Classification report
print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_))

[1m288/288[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
Deep Learning Train Accuracy: 99.50%
Deep Learning Test Accuracy: 93.44%

              precision    recall  f1-score   support

    negative       0.90      0.86      0.88       241
     neutral       0.93      0.95      0.94      1195
    positive       0.96      0.93      0.94       865

    accuracy                           0.93      2301
   macro avg       0.93      0.91      0.92      2301
weighted avg       0.93      0.93      0.93      2301



# Inference Test with Deep Learning

In [22]:
# Create some new comments to predict the sentiment
new_comments = [
    "Marsha muah muah",
    "Aku nggak mau mikir kejauhan, cukup dinikmati aja, kalau nggak suka tinggal skip",
    "Dih, kok kesel ya liat yang pada komen negatif, kek udah paling bener aja hidupnya"
]

# Preprocess the new comments (tokenize and pad)
new_sequences = tokenizer.texts_to_sequences(new_comments)
new_padded = pad_sequences(new_sequences, maxlen=max_length)

# Make predictions
predictions = model.predict(new_padded)
predicted_classes = np.argmax(predictions, axis=1)

# Decode the predicted classes into sentiment labels
predicted_labels = label_encoder.inverse_transform(predicted_classes)

# Output results
for comment, label in zip(new_comments, predicted_labels):
    print(f"New Comment: {comment}")
    print(f"Predicted Sentiment: {label}")
    print("-" * 30)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
New Comment: Marsha muah muah
Predicted Sentiment: positive
------------------------------
New Comment: Aku nggak mau mikir kejauhan, cukup dinikmati aja, kalau nggak suka tinggal skip
Predicted Sentiment: neutral
------------------------------
New Comment: Dih, kok kesel ya liat yang pada komen negatif, kek udah paling bener aja hidupnya
Predicted Sentiment: negative
------------------------------
