In [1]:
import pandas as pd
import numpy as np

In [2]:
anger = pd.read_csv("/kaggle/input/main-dataset0-1/AngerData.csv", delimiter="\t", encoding="utf-8")
fear = pd.read_csv("/kaggle/input/main-dataset0-1/FearData.csv", delimiter="\t", encoding="utf-8")
joy = pd.read_csv("/kaggle/input/main-dataset0-1/JoyData.csv", delimiter="\t", encoding="utf-8")
love = pd.read_csv("/kaggle/input/main-dataset0-1/LoveData.csv", delimiter="\t", encoding="utf-8")
neutral = pd.read_csv("/kaggle/input/main-dataset0-1/NeutralData.csv", delimiter="\t", encoding="utf-8")
sad = pd.read_csv("/kaggle/input/main-dataset0-1/SadData.csv", delimiter="\t", encoding="utf-8")

In [3]:
print("Anger Dataset information")
anger.info()

print("Fear Dataset information")
fear.info()

print("Joy Dataset information")
joy.info()

print("Love Dataset information")
love.info()

print("Neutral Dataset information")
neutral.info()

print("Sad Dataset information")
sad.info()

Anger Dataset information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1130 entries, 0 to 1129
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Tweet   1130 non-null   object
 1   Label   1130 non-null   object
dtypes: object(2)
memory usage: 17.8+ KB
Fear Dataset information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Tweet   911 non-null    object
 1   Label   911 non-null    object
dtypes: object(2)
memory usage: 14.4+ KB
Joy Dataset information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1275 entries, 0 to 1274
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Tweet   1275 non-null   object
 1   Label   1275 non-null   object
dtypes: object(2)
memory usage: 20.0+ KB
Love Dataset information
<class 'pandas.core.frame.Dat

In [4]:
anger["Label"] = "Anger"
fear["Label"] = "Fear"
joy["Label"] = "Joy"
love["Label"] = "Love"
neutral["Label"] = "Neutral"
sad["Label"] = "Sad"

In [5]:
df = pd.concat([anger, fear, joy, love, neutral, sad], ignore_index=True)
print(df)

print(df['Label'].value_counts())
print(df.info())

                                                  Tweet  Label
0                           pagi2 udah di buat emosi :)  Anger
1     kok stabilitas negara, memange 10 thn negara t...  Anger
2                          dah lah emosi mulu liat emyu  Anger
3     aib? bodoh benar! sebelum kata aib itu muncul,...  Anger
4                               dih lu yg nyebelin bego  Anger
...                                                 ...    ...
7075  pagi saat seisi semesta sujud pada zat yg acap...    Sad
7076  meski engkau yg pergi, meski engkau yg meningg...    Sad
7077                     udah biasa kalah dan tersakiti    Sad
7078  apakabar ku?. apakah baik-baik saja,, tidak, a...    Sad
7079      this user sdg tidak.baik.baik saja sdg stress    Sad

[7080 rows x 2 columns]
Label
Neutral    2001
Joy        1275
Anger      1130
Sad        1003
Fear        911
Love        760
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7080 entries, 0 to 7079
Data columns (tot

PREPROCESS

In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# nltk.download('stopwords')
# nltk.download('punkt')

In [7]:
stop_words = set(stopwords.words('indonesian')) 
stemmer = PorterStemmer()

In [8]:
singkatan_dict = {
    "gpp": "gak apa apa",
    "yg": "yang",
    "abg": "abang",
    "bg": "bang",
    "bgt": "banget",
    "thn": "tahun",
    "klo": "kalo",
    "kl": "kalo",
    "tdk": "tidak",
    "bgt": "banget",
    "krn": "karena",
    "udh": "sudah",
    "dgn": "dengan",
    "trs": "terus",
    "blm": "belum",
    "sm": "sama",
    "aja": "saja"
}

In [9]:
def expand_singkatan(text):
    words = text.split()
    words = [singkatan_dict[word] if word in singkatan_dict else word for word in words]
    return " ".join(words)

df["Tweet"] = df["Tweet"].fillna("").apply(expand_singkatan)

In [10]:
def preprocess_text(text):
    text = expand_singkatan(text)  # Perbaiki singkatan dulu
    text = text.lower()  # Lowercasing
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Hapus angka & tanda baca
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word not in stop_words]  # Hapus stopwords
    tokens = [stemmer.stem(word) for word in tokens]  # Stemming
    return " ".join(tokens)

df["Clean_Tweet"] = df["Tweet"].apply(preprocess_text)
print(df.head()[["Tweet", "Clean_Tweet"]])

                                               Tweet  \
0                        pagi2 udah di buat emosi :)   
1  kok stabilitas negara, memange 10 tahun negara...   
2                       dah lah emosi mulu liat emyu   
3  aib? bodoh benar! sebelum kata aib itu muncul,...   
4                          dih lu yang nyebelin bego   

                                         Clean_Tweet  
0                                    pagi udah emosi  
1  stabilita negara memang negara aman sbi menyub...  
2                           dah emosi mulu liat emyu  
3  aib bodoh aib muncul tindakan kekejian kau sem...  
4                               dih lu nyebelin bego  


# **LSTM (GRU LATER)**

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Bidirectional, Dropout

In [12]:
# Encode label (kategori emosi)
label_encoder = LabelEncoder()
df["Label"] = label_encoder.fit_transform(df["Label"])
print(label_encoder.classes_)


['Anger' 'Fear' 'Joy' 'Love' 'Neutral' 'Sad']


In [13]:
#Data di split 70/30 ben ra sepaneng
train_texts, test_texts, train_labels, test_labels = train_test_split(df["Clean_Tweet"], df["Label"], test_size=0.2, random_state=42)

In [14]:
train_labels = train_labels.values
test_labels = test_labels.values

# Tokenisasi teks
tokenizer = Tokenizer(num_words=30000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

max_len = 128
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding="post", truncating="post")
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding="post", truncating="post")

In [15]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopper = EarlyStopping(monitor="val_loss", patience=12, restore_best_weights=True)

In [16]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

unique_classes = np.unique(df["Label"])

weights = compute_class_weight(
    class_weight="balanced",
    classes=unique_classes,
    y=df["Label"]
)

class_weights = {label: weight for label, weight in zip(unique_classes, weights)}

print(class_weights) 


{0: 1.0442477876106195, 1: 1.2952799121844127, 2: 0.9254901960784314, 3: 1.5526315789473684, 4: 0.5897051474262869, 5: 1.1764705882352942}


In [17]:
# model = Sequential([
#     Embedding(input_dim=10000, output_dim=256),  # Naikkan output_dim biar embedding lebih kaya
#     Bidirectional(LSTM(128, return_sequences=True)),  # Tambah units biar lebih kompleks
#     Dropout(0.3),  # Tambah dropout buat regularisasi
#     Bidirectional(GRU(64)),  # Tambah kapasitas GRU
#     Dropout(0.3),
#     Dense(64, activation="relu"),  # Tambah hidden layer tambahan
#     Dense(6, activation="softmax")
# ])

model = Sequential([
    tf.keras.layers.Embedding(input_dim=30000, output_dim=256),
    tf.keras.layers.Bidirectional(LSTM(32, recurrent_dropout=0.2)),
    tf.keras.layers.Dense(16, kernel_regularizer=tf.keras.regularizers.l2(0.006)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.LeakyReLU(),
    tf.keras.layers.Dense(6, activation='softmax')
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

print(df["Label"].value_counts(normalize=True))

history = model.fit(
    train_padded, 
    train_labels, 
    epochs=20, 
    batch_size=32, 
    validation_split=0.2, 
    class_weight=class_weights,
    callbacks = [early_stopper])

Label
4    0.282627
2    0.180085
0    0.159605
5    0.141667
1    0.128672
3    0.107345
Name: proportion, dtype: float64
Epoch 1/20
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 271ms/step - accuracy: 0.2514 - loss: 1.8263 - val_accuracy: 0.4881 - val_loss: 1.4416
Epoch 2/20
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 268ms/step - accuracy: 0.5878 - loss: 1.1142 - val_accuracy: 0.6055 - val_loss: 1.2049
Epoch 3/20
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 269ms/step - accuracy: 0.7770 - loss: 0.6745 - val_accuracy: 0.6328 - val_loss: 1.1823
Epoch 4/20
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 273ms/step - accuracy: 0.8786 - loss: 0.4731 - val_accuracy: 0.6408 - val_loss: 1.2763
Epoch 5/20
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 270ms/step - accuracy: 0.9270 - loss: 0.3409 - val_accuracy: 0.6214 - val_loss: 1.3831
Epoch 6/20
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━