# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

from tensorflow.keras.layers import Bidirectional, Dense, Dropout, LSTM, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load Dataset

## Read CSV

In [6]:
dir_ = "dataset/"
file_input = dir_ + "oshibe_spv_comments_2025-01-15_labeled_stem.csv"
data = pd.read_csv(file_input)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11650 entries, 0 to 11649
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             11650 non-null  object 
 1   Username       11650 non-null  object 
 2   Comment        11650 non-null  object 
 3   LikeCount      11650 non-null  int64  
 4   ReplyCount     11650 non-null  int64  
 5   Date           11650 non-null  object 
 6   Comment_clean  11650 non-null  object 
 7   Comment_size   11650 non-null  int64  
 8   Sentiment      11650 non-null  object 
 9   Confidence     11650 non-null  float64
 10  Comment_stem   11339 non-null  object 
dtypes: float64(1), int64(3), object(7)
memory usage: 1001.3+ KB


- Hasil stem dari tahap sebelumnya mengandung missing values pada 'Comment_stem', karena kolom ini akan menjadi X maka row dengan missing values disini akan dihapus

In [7]:
data = data[pd.notnull(data['Comment_stem']) & (data['Comment_stem'] != '')]
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11339 entries, 0 to 11649
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             11339 non-null  object 
 1   Username       11339 non-null  object 
 2   Comment        11339 non-null  object 
 3   LikeCount      11339 non-null  int64  
 4   ReplyCount     11339 non-null  int64  
 5   Date           11339 non-null  object 
 6   Comment_clean  11339 non-null  object 
 7   Comment_size   11339 non-null  int64  
 8   Sentiment      11339 non-null  object 
 9   Confidence     11339 non-null  float64
 10  Comment_stem   11339 non-null  object 
dtypes: float64(1), int64(3), object(7)
memory usage: 1.0+ MB


In [35]:
data[['Comment_stem', 'Sentiment']].head(10)

Unnamed: 0,Comment_stem,Sentiment
0,lagu gadis muda yg anjak dewasa hilang polos p...,neutral
1,performance video kaya memberitahu dampak buru...,neutral
2,satu member sempat tunjuk biar gk iri wotanya ...,positive
3,depan jkt48 release single mvnya gak konsep ce...,positive
4,malam rahasia ya bilang rahasia cahaya awan hi...,neutral
5,lepas kontroversi jujur lagu representasi real...,neutral
6,lepas hate comen jujur maju banget konsep nger...,positive
7,gila konsep mv nya keren good job jkt48,positive
8,congrats jkt48 new era mini jkt48 jaya jaya jaya,positive
9,buay yg blg lesbi salah tuh makna lumayan bera...,positive


## Define X & y

In [36]:
X = data['Comment_stem']
y = data['Sentiment']

# SVM with TF-IDF

## Feature Extraction

In [37]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X)

## Train-Test Split

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=48)

## Train & Evaluation

In [39]:
# Train SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Make predictions
y_pred = svm_model.predict(X_test)

# Evaluate the model
print("SVM with TF-IDF Classification Report:")
print(classification_report(y_test, y_pred))
print("SVM with TF-IDF Accuracy:", accuracy_score(y_test, y_pred))

SVM with TF-IDF Classification Report:
              precision    recall  f1-score   support

    negative       0.71      0.66      0.69       745
     neutral       0.63      0.54      0.58       455
    positive       0.75      0.83      0.79      1068

    accuracy                           0.72      2268
   macro avg       0.70      0.68      0.69      2268
weighted avg       0.71      0.72      0.71      2268

SVM with TF-IDF Accuracy: 0.718694885361552


# Random Forest with Word2Vec

## Feature Extraction

In [40]:
# Tokenize the comments
tokenized_comments = [comment.split() for comment in X]

# Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_comments, vector_size=100, window=5, min_count=1, workers=4)

# Create feature vectors for each comment by averaging word vectors
def get_vector(comment):
    words = comment.split()
    word_vecs = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    return np.mean(word_vecs, axis=0) if word_vecs else np.zeros(word2vec_model.vector_size)

X_word2vec = np.array([get_vector(comment) for comment in X])

## Train-Test Split

In [41]:
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(X_word2vec, y, test_size=0.2, random_state=48)

## Train & Evaluation

In [42]:
# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=150)
rf_model.fit(X_train_w2v, y_train_w2v)

# Make predictions
y_pred_w2v = rf_model.predict(X_test_w2v)

# Evaluate the model
print("Random Forest with Word2Vec Classification Report:")
print(classification_report(y_test_w2v, y_pred_w2v))
print("Random Forest with Word2Vec Accuracy:", accuracy_score(y_test_w2v, y_pred_w2v))

Random Forest with Word2Vec Classification Report:
              precision    recall  f1-score   support

    negative       0.60      0.55      0.57       745
     neutral       0.58      0.39      0.46       455
    positive       0.66      0.79      0.72      1068

    accuracy                           0.63      2268
   macro avg       0.61      0.58      0.59      2268
weighted avg       0.62      0.63      0.62      2268

Random Forest with Word2Vec Accuracy: 0.6305114638447972


# Deep Learning with TensorFlow

## Feature Extraction

In [None]:
X = data['Comment_stem'].values
y = data['Sentiment'].values

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Tokenization
max_words = 10000
max_length = 100  # Maximum number of words in a comment
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)

X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=max_length)

## Train-Test Split

In [None]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=48)

## Train & Evaluation

In [58]:
# Define the model with more layers
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_length),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(64)),
    Dropout(0.2),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), batch_size=32)

Epoch 1/10




[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 176ms/step - accuracy: 0.5488 - loss: 0.9317 - val_accuracy: 0.6781 - val_loss: 0.7128
Epoch 2/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 187ms/step - accuracy: 0.8024 - loss: 0.5186 - val_accuracy: 0.6861 - val_loss: 0.7393
Epoch 3/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 195ms/step - accuracy: 0.8832 - loss: 0.3364 - val_accuracy: 0.7015 - val_loss: 0.7680
Epoch 4/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 186ms/step - accuracy: 0.9138 - loss: 0.2551 - val_accuracy: 0.7130 - val_loss: 0.8774
Epoch 5/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 187ms/step - accuracy: 0.9371 - loss: 0.1911 - val_accuracy: 0.7103 - val_loss: 0.9086
Epoch 6/10
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 180ms/step - accuracy: 0.9460 - loss: 0.1619 - val_accuracy: 0.7094 - val_loss: 0.9706
Epoch 7/10
[1m284/28

In [59]:
# Make predictions
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# Classification report
print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_))

# Evaluate the model
accuracy_dl = model.evaluate(X_test, y_test, verbose=0)[1]
print(f"Deep Learning Accuracy: {accuracy_dl * 100:.2f}%")

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step
              precision    recall  f1-score   support

    negative       0.67      0.65      0.66       713
     neutral       0.67      0.54      0.60       467
    positive       0.75      0.82      0.78      1088

    accuracy                           0.71      2268
   macro avg       0.69      0.67      0.68      2268
weighted avg       0.70      0.71      0.70      2268

Deep Learning Accuracy: 70.86%
