In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [76]:
df1 = pd.read_csv('./dataset_komentar_ig.csv')
df1 = df1.drop(columns=['Id'])
df1 = df1.rename(columns={'Instagram Comment Text': 'review', 'Sentiment': 'sentiment'})
df1.head()

Unnamed: 0,sentiment,review
0,negative,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...
1,negative,Geblek lo tata...cowo bgt dibela2in balikan......
2,negative,Kmrn termewek2 skr lengket lg duhhh kok labil ...
3,negative,"Intinya kalau kesel dengan ATT nya, gausah ke ..."
4,negative,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha..."


In [77]:
df2 = pd.read_csv('./dataset_opini_film.csv')
df2 = df2.drop(columns=['Id'])
df2 = df2.rename(columns={'Text Tweet': 'review', 'Sentiment': 'sentiment'})
df2.head()

Unnamed: 0,sentiment,review
0,negative,Jelek filmnya... apalagi si ernest gak mutu bg...
1,negative,Film king Arthur ini film paling jelek dari se...
2,negative,@beexkuanlin Sepanjang film gwa berkata kasar ...
3,negative,Ane ga suka fast and furious..menurutku kok je...
4,negative,"@baekhyun36 kan gua ga tau film nya, lu bilang..."


In [78]:
df3 = pd.read_csv('./dataset_penggunaan_selular.csv')
df3 = df3.drop(columns=['Id'])
df3 = df3.rename(columns={'Text Tweet': 'review', 'Sentiment': 'sentiment'})
df3.head()

Unnamed: 0,sentiment,review
0,positive,<USER_MENTION> #BOIKOT_<PROVIDER_NAME> Gunakan...
1,positive,"Saktinya balik lagi, alhamdulillah :v <PROVIDE..."
2,negative,Selamat pagi <PROVIDER_NAME> bisa bantu kenap...
3,negative,Dear <PROVIDER_NAME> akhir2 ini jaringan data ...
4,negative,Selamat malam PENDUSTA <PROVIDER_NAME>


In [79]:
df4 = pd.read_csv('./dataset_sentimen_pilkada.csv')
df4 = df4.drop(columns=['Id', 'Pasangan Calon'])
df4 = df4.rename(columns={'Text Tweet': 'review', 'Sentiment': 'sentiment'})
df4.head()

Unnamed: 0,sentiment,review
0,negative,Banyak akun kloning seolah2 pendukung #agussil...
1,negative,#agussilvy bicara apa kasihan yaa...lap itu ai...
2,negative,Kalau aku sih gak nunggu hasil akhir QC tp lag...
3,negative,Kasian oh kasian dengan peluru 1milyar untuk t...
4,negative,Maaf ya pendukung #AgusSilvy..hayo dukung #Ani...


In [80]:
df5 = pd.read_csv('./dataset_tayangan_tv.csv')
df5 = df5.drop(columns=['Id', 'Acara TV', 'Jumlah Retweet'])
df5 = df5.rename(columns={'Text Tweet': 'review', 'Sentiment': 'sentiment'})
df5.head()

Unnamed: 0,sentiment,review
0,positive,"Undang @N_ShaniJKT48 ke hitamputih, pemenang S..."
1,positive,Selamat berbuka puasa Semoga amal ibadah hari ...
2,positive,"Ada nih di trans7 hitam putih, dia dpt penghar..."
3,positive,selamat ya mas @adietaufan masuk hitamputih
4,positive,Asiknya nonton Hitam Putih Trans7


In [81]:
data = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
data.count()

sentiment    2200
review       2200
dtype: int64

In [82]:
data['sentiment'].value_counts()

sentiment
negative    1111
positive    1089
Name: count, dtype: int64

In [83]:
data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)
data.head()

Unnamed: 0,sentiment,review
0,0,<USERNAME> TOLOL!! Gak ada hubungan nya kegug...
1,0,Geblek lo tata...cowo bgt dibela2in balikan......
2,0,Kmrn termewek2 skr lengket lg duhhh kok labil ...
3,0,"Intinya kalau kesel dengan ATT nya, gausah ke ..."
4,0,"hadewwwww permpuan itu lg!!!!sakit jiwa,knp ha..."


In [84]:
data['sentiment'].value_counts()

sentiment
0    1111
1    1089
Name: count, dtype: int64

Splitting dataset into training data and testing data

In [85]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

print(train_data.shape)
print(test_data.shape)

(1760, 2)
(440, 2)


In [86]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)

In [87]:
print(X_train)
print(X_test)

[[   0    0    0 ...   20  815   15]
 [   0    0    0 ...  228    6 1670]
 [   0    0    0 ...   72  486  209]
 ...
 [   0    0    0 ...   71   40 2426]
 [   0    0    0 ...  585  419    9]
 [   0    0    0 ...  204    2    1]]
[[   0    0    0 ...   53   87   42]
 [   0    0    0 ...  223  790    9]
 [   0    0    0 ...  223   15  173]
 ...
 [   0    0    0 ... 1341 1342  814]
 [   0    0    0 ...  341   43   76]
 [   0    0    0 ...   25  175 1948]]


In [88]:
Y_train = train_data["sentiment"]
Y_test = test_data["sentiment"]

In [89]:
print(Y_train)

1656    1
752     0
892     1
1041    0
1179    1
       ..
1638    0
1095    1
1130    1
1294    0
860     0
Name: sentiment, Length: 1760, dtype: int64


In [90]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))



In [91]:
model.summary()

In [92]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [93]:
model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5


[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 266ms/step - accuracy: 0.5272 - loss: 0.6903 - val_accuracy: 0.6165 - val_loss: 0.6757
Epoch 2/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 252ms/step - accuracy: 0.7307 - loss: 0.6170 - val_accuracy: 0.7216 - val_loss: 0.5406
Epoch 3/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 229ms/step - accuracy: 0.8936 - loss: 0.3112 - val_accuracy: 0.7983 - val_loss: 0.4780
Epoch 4/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 244ms/step - accuracy: 0.9556 - loss: 0.1369 - val_accuracy: 0.7983 - val_loss: 0.5164
Epoch 5/5
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 221ms/step - accuracy: 0.9748 - loss: 0.0799 - val_accuracy: 0.7898 - val_loss: 0.5753


<keras.src.callbacks.history.History at 0x312837f70>

In [94]:
loss, accuracy = model.evaluate(X_test, Y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - accuracy: 0.7761 - loss: 0.6227
Test Loss: 0.5887420177459717
Test Accuracy: 0.7749999761581421


In [95]:
def predict_sentiment(review):
  sequence = tokenizer.texts_to_sequences([review])
  padded_sequence = pad_sequences(sequence, maxlen=200)
  prediction = model.predict(padded_sequence)
  sentiment = "positive" if prediction[0][0] > 0.5 else "negative"
  return sentiment

In [96]:
new_review = "Produk nya keren"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
The sentiment of the review is: positive
