In [1]:
import pandas as pd
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [2]:
# LOAD DATASET
df = pd.read_csv(r"C:\Users\A S U S\OneDrive\Documents\TUGASSS\Proyek-Web-Scraping-Shopee-Perbandingan-Dua-Produk\KlasifikasiData\DataSetParfumeKlasifikasi.csv")

# PREPROCESSING
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['clean'] = df['comment'].apply(clean_text)

In [3]:
# STEMMING (SASTRAWI)
factory = StemmerFactory()
stemmer = factory.create_stemmer()

df['stemmed'] = df['clean'].apply(stemmer.stem)
df['lemmas'] = df['stemmed']   # placeholder lemmatization

In [4]:
# BoW
bow = CountVectorizer()
X_bow = bow.fit_transform(df['stemmed'])
y = df['label_asli']


In [5]:

# TF-IDF + SVM
X_tfidf = TfidfVectorizer().fit_transform(df['stemmed'])
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2)

svm_model = svm.SVC(kernel='linear')
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

print("TF-IDF x SVM")
print(classification_report(y_test, svm_pred))

TF-IDF x SVM
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00        17
     neutral       1.00      1.00      1.00        13
    positive       1.00      1.00      1.00        20

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50



In [6]:
# WORD2VEC
tokens = [text.split() for text in df['stemmed']]
w2v = Word2Vec(sentences=tokens, vector_size=100, window=5, min_count=1)

def sent_vector(words):
    return np.mean([w2v.wv[w] for w in words if w in w2v.wv], axis=0)

X_vec = np.array([sent_vector(t) for t in tokens])

In [8]:
# LSTM input preparation
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['stemmed'])
seq = tokenizer.texts_to_sequences(df['stemmed'])
padded = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=20)

label_map = {'positive':0, 'negative':1, 'neutral':2}
y_int = df['label_asli'].map(label_map)

X_train2, X_test2, y_train2, y_test2 = train_test_split(padded, y_int, test_size=0.2)

model_lstm = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=20),
    LSTM(64),
    Dense(3, activation='softmax')
])

model_lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model_lstm.fit(X_train2, y_train2, epochs=100)

Epoch 1/100




[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.3850 - loss: 1.0790
Epoch 2/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5350 - loss: 1.0144
Epoch 3/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7100 - loss: 0.9378
Epoch 4/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7250 - loss: 0.8089
Epoch 5/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8000 - loss: 0.5939
Epoch 6/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8850 - loss: 0.3871
Epoch 7/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9400 - loss: 0.2346
Epoch 8/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9950 - loss: 0.1529
Epoch 9/100
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0

<keras.src.callbacks.history.History at 0x22b25ae8950>