## **Panggil data dengan label aktual sementara**

In [None]:
import pandas as pd
import numpy as np
!pip install gdown
import gdown

# Google Drive id of the CSV that holds the provisional ("aktual") labels.
file_id = "13oy1pH836mylOn5h6Itp3nGvFc51D64i"
output = "label_fix.csv"
url = "https://drive.google.com/uc?id=" + file_id

# Download next to the notebook, then read the file back from that same path.
gdown.download(url, output, quiet=False)
datalabelfix = pd.read_csv(output)
datalabelfix.head()

## **Label aktual dengan bobot dictionary**

In [None]:
# Domain-specific complaint phrases (English, lower-case) that count as a
# NEGATIVE signal.  local_complaint_score() tests each phrase with a plain
# substring check against the lower-cased review, so overlapping entries are
# counted separately: a review containing "drink prices are too expensive"
# also matches "too expensive" and therefore scores 2.
local_negative = [
    # Cleanliness & condition
    "dirty restroom",
    "toilet dirty",
    "lots of rubbish",
    "rubbish strewn",
    "beach is dirty",
    "smells of urine",
    "cloudy pool",
    "shaky inside",
    "slippery floor",
    "unsafe bus",

    # Price & value
    "not worth the ticket",
    "not worth it",
    "too expensive",
    "pricey",
    "overpriced",
    "expensive ticket",
    "drink prices are too expensive",
    "ticket is rigged",

    # Crowding & access
    "very crowded",
    "hard to take photos",
    "fight for photo spots",
    "queue was really bad",
    "long queue",
    "only 1 counter",

    # Poor / limited facilities
    "no janitors",
    "no standby staff",
    "no prayer room",
    "no paths for strollers",
    "not suitable for elderly",
    "not suitable for children",
    "limited gazebo",
    "no shelter",

    # Service
    "staff not friendly",
    "didn't care about passengers",
    "disproportionate driver",
    "discriminatory staff",
    "prioritize foreign tourists",
    "confusing system",
    "system is slow",

    # Disappointing experience
    "worst experience",
    "very disappointed",
    "regretted it",
    "boring place",
    "animals look unhealthy",
    "few animals",
    "empty cages",

    # Technical & management
    "confusing directions",
    "poor guidance",
    "not informative",
    "arenas were closed",
    "without notification",
    "online ticketing problem",
]
# Domain-specific phrases (English, lower-case) that count as a NEUTRAL / mild
# complaint signal.  Matching is a plain substring check on the lower-cased
# review, so the single word "limited" also fires on any review that contains
# the negative phrase "limited gazebo".
local_neutral = [
    # Ordinary / flat experience
    "quite tired",
    "a bit confusing",
    "standard cleanliness",
    "okay service",
    "not much different",
    "quite hot",
    "very busy",

    # Conditions that reduce comfort
    "need to come early",
    "prepare extra energy",
    "lots of stairs",
    "ups and downs",
    "long walk",
    "hard to park",

    # Price still tolerated, but complained about
    "quite expensive",
    "a bit expensive",
    "ticket includes",

    # Facilities exist but are less than optimal
    "limited",
    "not festive",
    "small area",
    "few shows",
    "animals are incomplete",

    # External circumstances
    "weather was bad",
    "rain and wind",
    "too late",
    "missed the show",
]


In [None]:
def local_complaint_score(text):
    """Count the local complaint phrases that occur in one review.

    Parameters
    ----------
    text : str or missing value
        Review text.  It is lower-cased before matching.

    Returns
    -------
    tuple of (int, int)
        How many ``local_negative`` phrases and how many ``local_neutral``
        phrases appear as substrings of the text.  Missing text gives
        ``(0, 0)``.
    """
    if pd.isna(text):
        return 0, 0

    lowered = text.lower()

    # Each phrase contributes at most once, however often it repeats.
    negative_hits = [phrase for phrase in local_negative if phrase in lowered]
    neutral_hits = [phrase for phrase in local_neutral if phrase in lowered]

    return len(negative_hits), len(neutral_hits)

In [None]:
def refine_label(row):
    """Adjust a review's provisional label using lexicon scores and local phrases.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``label_aktual``, ``vader_score``, ``afinn_score`` and
        ``ulasan_en``.

    Returns
    -------
    str
        ``"negative"`` or ``"neutral"`` when one of the rules below fires,
        otherwise the unchanged ``label_aktual``.
    """
    label = row['label_aktual']
    vader = row['vader_score']
    afinn = row['afinn_score']

    neg_score, neu_score = local_complaint_score(row['ulasan_en'])

    if label == "negative":
        # Rule 1: a strongly negative review is never changed.
        if afinn <= -2 or vader <= -0.3:
            return "negative"
    elif label == "neutral":
        # Rule 2: neutral plus at least one local complaint -> negative.
        if neg_score >= 1:
            return "negative"
    elif label == "positive":
        # Rule 3: weakly positive plus any local phrase -> neutral.
        weakly_positive = afinn <= 1 and vader < 0.4
        if weakly_positive and (neg_score + neu_score) >= 1:
            return "neutral"

    return label

In [None]:
datalabelfix['label_refined'] = datalabelfix.apply(refine_label, axis=1)

In [None]:
print(datalabelfix["label_refined"].value_counts())

import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the refined label counts, in a fixed class order.
ax = sns.countplot(
    data=datalabelfix,
    x="label_refined",
    order=["positive", "neutral", "negative"],
)
ax.set_title("Distribusi Sentimen Berdasarkan hybrid new")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Jumlah Ulasan")
plt.show()

## **NN MLP undersampling**

In [None]:
datalabelfix.head()

panggil data embedding multimodal

In [None]:
# Download the pre-computed multimodal embeddings from Google Drive and load
# them into memory.
file_id_mult = "1wmLmZR8xDed_gBHAy5FxYBtuBCb9MgN_"
url_mult = f"https://drive.google.com/uc?id={file_id_mult}"
gdown.download(url_mult, "multimodal_embeddings.npy", quiet=False)
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects from
# a file that was just downloaded, which can execute code on load.  If the
# file holds a plain numeric array this flag is presumably unnecessary —
# TODO confirm and drop it.
multimodal_embeddings = np.load("multimodal_embeddings.npy", allow_pickle=True)
# Print the array shape as a quick sanity check.
print("multimodal_embeddings:", np.array(multimodal_embeddings).shape)

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Binary task: discard every review whose refined label is neutral.
not_neutral = datalabelfix['label_refined'] != 'neutral'
df_bin = datalabelfix.loc[not_neutral].copy()

# Undersample the positive class to 200 reviews; keep every negative review.
positive_mask = df_bin['label_refined'] == 'positive'
negative_mask = df_bin['label_refined'] == 'negative'
df_pos = df_bin.loc[positive_mask].sample(n=200, random_state=42)
df_neg = df_bin.loc[negative_mask]

# reset_index(drop=True) renumbers the rows 0..n-1; the original row labels
# survive only on df_pos / df_neg.
df_final = pd.concat([df_pos, df_neg]).reset_index(drop=True)

# Integer-encode the two remaining classes.
label_to_id = {'negative': 0, 'positive': 1}
df_final['label'] = df_final['label_refined'].map(label_to_id)

print(df_final['label'].value_counts())

In [None]:
# Positions, in the ORIGINAL dataframe, of the rows that make up df_final.
#
# df_final was built as concat([df_pos, df_neg]).reset_index(drop=True), so its
# own index is just 0..n-1.  Indexing the embeddings with it would pick the
# first n embedding rows instead of the embeddings of the sampled reviews,
# silently pairing features with the wrong labels.  df_pos and df_neg still
# carry the original row labels, in the same order as the rows of df_final.
selected_idx = np.concatenate([
    df_pos.index.to_numpy(),
    df_neg.index.to_numpy(),
])

# assumes embedding row i belongs to row i of datalabelfix — TODO confirm
assert len(multimodal_embeddings) == len(datalabelfix), (
    "embeddings and label table have different lengths; "
    "row alignment cannot be trusted"
)
assert len(selected_idx) == len(df_final)

X = multimodal_embeddings[selected_idx]
y = df_final['label'].values

print(X.shape, y.shape)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [None]:
class EmbeddingDataset(Dataset):
    """In-memory dataset pairing embedding vectors with integer class labels.

    Both inputs are converted to tensors once, at construction time:
    features as ``float32`` and labels as ``long``.
    """

    def __init__(self, X, y):
        """Store ``X`` (features) and ``y`` (labels) as tensors."""
        self.y = torch.tensor(y, dtype=torch.long)
        self.X = torch.tensor(X, dtype=torch.float32)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [None]:
train_dataset = EmbeddingDataset(X_train, y_train)
val_dataset   = EmbeddingDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
class MultimodalNN(nn.Module):
    """Single-hidden-layer MLP that maps an embedding to two class logits.

    Layout: Linear(input_dim, 256) -> ReLU -> Dropout(0.3) -> Linear(256, 2).
    """

    def __init__(self, input_dim):
        """Build the network for embeddings of width ``input_dim``."""
        super().__init__()
        layers = [
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 2),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw logits for a batch ``x`` (no softmax applied)."""
        logits = self.net(x)
        return logits

In [None]:
# Seed PyTorch's global RNG before the model is created.  Weight
# initialisation, dropout masks and the shuffling done by the training
# DataLoader all draw from it, so without a seed every run trains a different
# model.  42 matches the random_state used for sampling and splitting above.
# (GPU kernels may still introduce small run-to-run differences.)
torch.manual_seed(42)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Input width is taken from the feature matrix; the model outputs 2 logits.
model = MultimodalNN(input_dim=X.shape[1]).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
epochs = 20

for epoch in range(epochs):
    # Training mode enables dropout.
    model.train()
    total_loss = 0

    for batch in train_loader:
        X_batch = batch[0].to(device)
        y_batch = batch[1].to(device)

        # Standard step: clear gradients, forward, backward, update.
        optimizer.zero_grad()
        loss = criterion(model(X_batch), y_batch)
        loss.backward()
        optimizer.step()

        # Accumulates the per-batch losses (a sum, not a mean, over the epoch).
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f}")

In [None]:
# Evaluation mode disables dropout; no_grad() skips gradient bookkeeping.
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for X_batch, y_batch in val_loader:
        logits = model(X_batch.to(device))
        # Predicted class = index of the larger of the two logits.
        preds = logits.argmax(dim=1)

        y_pred.extend(preds.cpu().numpy())
        y_true.extend(y_batch.numpy())

In [None]:
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", cm)

print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=["Negative", "Positive"]))

## **Topic Modeling (BERTopic)**

In [None]:
!pip install bertopic sentence-transformers hdbscan umap-learn

In [None]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

In [None]:
# Keep only rows that have both a review text and a refined label.
datalabelfix = datalabelfix.dropna(subset=["ulasan_en", "label_refined"])

# Split the review texts by refined sentiment.
refined = datalabelfix["label_refined"]
negative_reviews = datalabelfix.loc[refined == "negative", "ulasan_en"].tolist()
positive_reviews = datalabelfix.loc[refined == "positive", "ulasan_en"].tolist()

print("Jumlah negative:", len(negative_reviews))
print("Jumlah positive:", len(positive_reviews))

In [None]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN

# Dimensionality-reduction step: project the embeddings to 5 components using
# cosine distance.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42  # fixed seed ("KUNCI" = lock) so the projection is repeatable
)

# Clustering step: clusters need at least 10 members.
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True
)

In [None]:
custom_stopwords = [
    "cool", "good", "lots", "nice", "don",
    "unfortunately", "like", "come", "just", "bali"
]

In [None]:
# Neither CountVectorizer nor all_stopwords is imported/defined in any visible
# earlier cell, so on a fresh kernel this cell raised NameError.  Import the
# class here and build the stop-word list explicitly.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer

# scikit-learn's English stop words plus the domain words in custom_stopwords.
# Sorted so the list is identical on every run.
all_stopwords = sorted(ENGLISH_STOP_WORDS.union(custom_stopwords))

# Unigrams and bigrams that occur in at least 2 documents.
vectorizer_model = CountVectorizer(
    stop_words=all_stopwords,
    ngram_range=(1, 2),
    min_df=2
)

In [None]:
from copy import deepcopy

# Pass the seeded UMAP model and the configured HDBSCAN model.  They were
# defined earlier but never handed to BERTopic, so BERTopic built its own
# unseeded UMAP and the topics changed from run to run.
# Each topic model gets its own copy so that fitting one model cannot
# overwrite the fitted reducer/clusterer of another.
# With a custom hdbscan_model the cluster size comes from its
# min_cluster_size (10), which equals min_topic_size here.
topic_model_neg = BERTopic(
    embedding_model=embedding_model,
    umap_model=deepcopy(umap_model),
    hdbscan_model=deepcopy(hdbscan_model),
    vectorizer_model=vectorizer_model,
    language="english",
    min_topic_size=10
)
topics_neg, probs_neg = topic_model_neg.fit_transform(negative_reviews)

In [None]:
topic_info_neg = topic_model_neg.get_topic_info()
topic_info_neg

In [None]:
for topic_id in topic_info_neg.Topic:
    if topic_id != -1:
        print(f"\nTopic {topic_id}:")
        print(topic_model_neg.get_topic(topic_id))

In [None]:
topic_model_neg.get_topic(0)

Topic Positive

In [None]:
from copy import deepcopy

# Pass the seeded UMAP model and the configured HDBSCAN model.  They were
# defined earlier but never handed to BERTopic, so BERTopic built its own
# unseeded UMAP and the topics changed from run to run.
# Each topic model gets its own copy so that fitting one model cannot
# overwrite the fitted reducer/clusterer of another.
# With a custom hdbscan_model the cluster size comes from its
# min_cluster_size (10), which equals min_topic_size here.
topic_model_pos = BERTopic(
    embedding_model=embedding_model,
    umap_model=deepcopy(umap_model),
    hdbscan_model=deepcopy(hdbscan_model),
    vectorizer_model=vectorizer_model,
    language="english",
    min_topic_size=10
)
topics_pos, probs_pos = topic_model_pos.fit_transform(positive_reviews)

In [None]:
topic_info_pos = topic_model_pos.get_topic_info()
topic_info_pos

In [None]:
for topic_id in topic_info_pos.Topic:
    if topic_id != -1:
        print(f"\nTopic {topic_id}:")
        print(topic_model_pos.get_topic(topic_id))

visualisasi topik

In [None]:
topic_model_pos.visualize_barchart(top_n_topics=44)

In [None]:
topic_model_neg.visualize_barchart(top_n_topics=2)

## **Metrik Evaluasi hasil cluster topik**

Silhouette score

In [None]:
import numpy as np
from sklearn.metrics import silhouette_score

# Get the document embeddings for the negative reviews from the topic model.
# NOTE(review): _extract_embeddings is underscore-prefixed (private) and the
# notebook does not pin a BERTopic version, so this call may break on upgrade.
embeddings_neg = topic_model_neg._extract_embeddings(negative_reviews)

# Convert the embeddings to a numpy array
embeddings_neg = np.array(embeddings_neg)

# Convert the topic assignments to a numpy array
topics_neg = np.array(topics_neg)

In [None]:
mask_neg = topics_neg != -1

X_neg = embeddings_neg[mask_neg]
y_neg = topics_neg[mask_neg]

print("Jumlah dokumen valid (neg):", X_neg.shape[0])
print("Jumlah topik (neg):", len(set(y_neg)))

In [None]:
if len(set(y_neg)) > 1:
    sil_neg = silhouette_score(X_neg, y_neg, metric="cosine")
    print("Silhouette Score (Negative):", sil_neg)
else:
    print("Silhouette score tidak bisa dihitung (hanya 1 topik)")

In [None]:
# Get the document embeddings for the positive reviews from the topic model.
# NOTE(review): _extract_embeddings is underscore-prefixed (private) and the
# notebook does not pin a BERTopic version, so this call may break on upgrade.
embeddings_pos = topic_model_pos._extract_embeddings(positive_reviews)
embeddings_pos = np.array(embeddings_pos)
topics_pos = np.array(topics_pos)

# Drop outlier documents (topic -1) before scoring
mask_pos = topics_pos != -1
X_pos = embeddings_pos[mask_pos]
y_pos = topics_pos[mask_pos]

print("Jumlah dokumen valid (pos):", X_pos.shape[0])
print("Jumlah topik (pos):", len(set(y_pos)))

In [None]:
if len(set(y_pos)) > 1:
    sil_pos = silhouette_score(X_pos, y_pos, metric="cosine")
    print("Silhouette Score (Positive):", sil_pos)
else:
    print("Silhouette score tidak bisa dihitung (hanya 1 topik)")

Davies-Bouldin index

In [None]:
from sklearn.metrics import davies_bouldin_score
import numpy as np

In [None]:
# Get the document embeddings for the negative reviews
embeddings_neg = topic_model_neg._extract_embeddings(negative_reviews)
embeddings_neg = np.array(embeddings_neg)

# Topic labels
topics_neg = np.array(topics_neg)

# Drop outlier documents (topic -1)
mask_neg = topics_neg != -1
X_neg = embeddings_neg[mask_neg]
y_neg = topics_neg[mask_neg]

print("Jumlah dokumen valid (neg):", X_neg.shape[0])
print("Jumlah topik (neg):", len(set(y_neg)))

# The metric itself was imported but never computed in this section.
# Lower is better.  Unlike the silhouette cell, which passes metric="cosine",
# davies_bouldin_score takes no metric argument.  It needs at least 2 clusters.
if len(set(y_neg)) > 1:
    db_neg = davies_bouldin_score(X_neg, y_neg)
    print("Davies-Bouldin Index (Negative):", db_neg)
else:
    print("Davies-Bouldin index tidak bisa dihitung (kurang dari 2 topik)")

In [None]:
# Get the document embeddings for the positive reviews
embeddings_pos = topic_model_pos._extract_embeddings(positive_reviews)
embeddings_pos = np.array(embeddings_pos)

# Topic labels
topics_pos = np.array(topics_pos)

# Drop outlier documents (topic -1)
mask_pos = topics_pos != -1
X_pos = embeddings_pos[mask_pos]
y_pos = topics_pos[mask_pos]

print("Jumlah dokumen valid (pos):", X_pos.shape[0])
print("Jumlah topik (pos):", len(set(y_pos)))

# The metric itself was imported but never computed in this section.
# Lower is better.  Unlike the silhouette cell, which passes metric="cosine",
# davies_bouldin_score takes no metric argument.  It needs at least 2 clusters.
if len(set(y_pos)) > 1:
    db_pos = davies_bouldin_score(X_pos, y_pos)
    print("Davies-Bouldin Index (Positive):", db_pos)
else:
    print("Davies-Bouldin index tidak bisa dihitung (kurang dari 2 topik)")

**SIMPAN HASIL**

informasi topik

In [None]:
# Topic summary (negative)
topic_info_neg.to_csv(
    "/content/drive/MyDrive/Coolyeah/SMT 5/SML Project/topic_summary_neg.csv",
    index=False
)

In [None]:
# Topic summary (positive)
topic_info_pos.to_csv(
    "/content/drive/MyDrive/Coolyeah/SMT 5/SML Project/topic_summary_pos.csv",
    index=False
)

Kata-kata penting tiap topik

In [None]:
# One record per (topic, word) pair; the outlier topic -1 is left out.
topic_words_neg = [
    {"topic": topic_id, "word": word, "weight": weight}
    for topic_id in topic_info_neg.Topic
    if topic_id != -1
    for word, weight in topic_model_neg.get_topic(topic_id)
]

df_topic_words_neg = pd.DataFrame(topic_words_neg)

# Written to Google Drive; the target folder must already be reachable.
df_topic_words_neg.to_csv(
    "/content/drive/MyDrive/Coolyeah/SMT 5/SML Project/topic_words_negative.csv",
    index=False,
)

In [None]:
# One record per (topic, word) pair; the outlier topic -1 is left out.
topic_words_pos = [
    {"topic": topic_id, "word": word, "weight": weight}
    for topic_id in topic_info_pos.Topic
    if topic_id != -1
    for word, weight in topic_model_pos.get_topic(topic_id)
]

df_topic_words_pos = pd.DataFrame(topic_words_pos)

# Written to Google Drive; the target folder must already be reachable.
df_topic_words_pos.to_csv(
    "/content/drive/MyDrive/Coolyeah/SMT 5/SML Project/topic_words_positive.csv",
    index=False,
)

mapping dokumen

In [None]:
df_docs_neg = pd.DataFrame({
    "review": negative_reviews,
    "topic": topics_neg,
    "probability": probs_neg
})

df_docs_neg.to_csv(
    "/content/drive/MyDrive/Coolyeah/SMT 5/SML Project/document_topic_negative.csv",
    index=False
)

In [None]:
df_docs_pos = pd.DataFrame({
    "review": positive_reviews,
    "topic": topics_pos,
    "probability": probs_pos
})

df_docs_pos.to_csv(
    "/content/drive/MyDrive/Coolyeah/SMT 5/SML Project/document_topic_positive.csv",
    index=False
)