ASPECT EXTRACTION

In [1]:
import yaml

# Load the Gemini API key from the local YAML secrets file.
with open('../key/key.yml', 'r') as file:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so the lookup below fails with a clear message instead of an
    # AttributeError on None.
    keys = yaml.safe_load(file) or {}

GEMINI_API_KEY = keys.get('GEMINI_API_KEY')

# Fail early: without this check a missing key would still print the success
# message and only surface later as an opaque API error.
if not GEMINI_API_KEY:
    raise KeyError("GEMINI_API_KEY is missing or empty in '../key/key.yml'")

print("API key loaded successfully.")

API key loaded successfully.


In [2]:
import pandas as pd
import google.generativeai as genai
import time
import json
import os

In [3]:
# Configure the google.generativeai SDK with the API key loaded earlier.
# NOTE(review): genai.configure() presumably only sets module-level defaults and
# returns None, so `client` is likely None; it is not used by any cell visible
# here — confirm before relying on it.
client = genai.configure(api_key=GEMINI_API_KEY)

# Initialise the Gemini model used for the aspect-extraction requests
model = genai.GenerativeModel('gemini-2.5-flash')

In [4]:
# Read the labelled tweets that will be sent to the model
df = pd.read_csv('../data/output/labelled.csv')

# Make sure both result columns exist, creating them empty when absent
for result_col in ('Predicted_Aspects', 'Predicted_Keywords'):
    if result_col not in df.columns:
        df[result_col] = ""


In [5]:

# 2. Checkpoint: resume from the results persisted by an earlier run, if any
checkpoint_file = '../data/output/progress_checkpoint.json'
if not os.path.exists(checkpoint_file):
    progress = {}
else:
    with open(checkpoint_file, 'r') as f:
        progress = json.load(f)

# Settings for the zero-shot classification loop
counter = 0
save_interval = 10  # write a checkpoint every 10 tweets

In [6]:

# Pause (seconds) applied after every API attempt, successful or not.
# NOTE(review): presumably there to respect the API rate limit — confirm the quota.
REQUEST_DELAY_SECONDS = 2.0


def _build_prompt(tweet):
    """Return the zero-shot aspect-extraction prompt for a single tweet."""
    return f"""
    Identifikasi aspek-aspek berikut yang dibahas dalam ulasan Twitter tentang kedai kopi ini: Kualitas Kopi, Layanan Pelanggan, Harga/Nilai, Suasana/Tempat, Menu Lain, Promo. Untuk setiap aspek yang relevan, sebutkan kata-kata kunci terkait. Jika suatu aspek tidak disebutkan, abaikan.
    
    Ulasan: "{tweet}"

    Format:
    Aspek: [Aspek yang teridentifikasi]
    Kata Kunci: [Kata kunci dari ulasan]"""


def _parse_aspect_response(text):
    """Extract aspects and keywords from the model's reply.

    Looks for lines labelled "Aspek:" and "Kata Kunci:" (case-insensitive).
    Leading indentation, list bullets and markdown bold markers around the
    label are tolerated so that replies such as "- **Aspek:** Promo" are not
    silently dropped.

    Returns:
        (aspects_str, keywords_str): the values found, each joined with "; ".
        Both are empty strings when no labelled line is present.
    """
    aspects, keywords = [], []
    for line in text.splitlines():
        cleaned = line.strip().lstrip("-*• ").replace("**", "").strip()
        lowered = cleaned.lower()
        if lowered.startswith("aspek:"):
            aspects.append(cleaned.split(":", 1)[1].strip())
        elif lowered.startswith("kata kunci:"):
            keywords.append(cleaned.split(":", 1)[1].strip())
    return "; ".join(aspects), "; ".join(keywords)


def _save_progress():
    """Persist the checkpoint JSON and the Excel snapshot of `df`."""
    # Write to a temporary file first and swap it in, so an interrupted write
    # cannot leave a truncated checkpoint that breaks json.load on the next run.
    tmp_path = checkpoint_file + '.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(progress, f)
    os.replace(tmp_path, checkpoint_file)
    df.to_excel('tweet_with_predicted_aspect_test.xlsx', index=False)


try:
    for idx, row in df.iterrows():
        tweet = row['Text']
        key = str(idx)

        # Already processed in an earlier run: restore the stored result and skip
        if key in progress:
            df.at[idx, 'Predicted_Aspects'] = progress[key]['aspects']
            df.at[idx, 'Predicted_Keywords'] = progress[key]['keywords']
            continue

        # Nothing to classify: do not spend an API call on a missing/blank tweet
        if pd.isna(tweet) or not str(tweet).strip():
            df.at[idx, 'Predicted_Aspects'] = ''
            df.at[idx, 'Predicted_Keywords'] = ''
            continue

        try:
            resp = model.generate_content([_build_prompt(tweet)])
            aspects_str, keywords_str = _parse_aspect_response(resp.text.strip())
        except Exception as e:
            # Failed rows are marked in the frame but NOT stored in `progress`,
            # so they are retried on the next run.
            print(f"Error on idx {idx}: {e}")
            df.at[idx, 'Predicted_Aspects']  = 'ERROR'
            df.at[idx, 'Predicted_Keywords'] = ''
        else:
            df.at[idx, 'Predicted_Aspects']  = aspects_str
            df.at[idx, 'Predicted_Keywords'] = keywords_str
            progress[key] = {'aspects': aspects_str, 'keywords': keywords_str}
            counter += 1
            print(f"[{idx}] → Aspek: {aspects_str} | Keywords: {keywords_str}")

            # Periodic save; a failed save must not relabel a good row as ERROR
            if counter % save_interval == 0:
                try:
                    _save_progress()
                    print(f"— Saved checkpoint at tweet #{idx}")
                except Exception as e:
                    print(f"Warning: could not save checkpoint at tweet #{idx}: {e}")

        # Throttle after failures as well, so repeated errors do not hammer the API
        time.sleep(REQUEST_DELAY_SECONDS)
finally:
    # 4. Final save — also runs when the loop is interrupted (e.g. Ctrl-C), so
    # results gathered since the last periodic checkpoint are not lost.
    _save_progress()

print("✅ Done. All predictions saved.")

[60] → Aspek:  | Keywords: 
[61] → Aspek: Suasana/Tempat | Keywords: bikin, tegal, pekalongan, semarang, fore
[62] → Aspek:  | Keywords: 
[63] → Aspek: Kualitas Kopi | Keywords: matcha, enaaaak
[64] → Aspek: Kualitas Kopi; Promo | Keywords: manisnya diawal minum, mengeluh; diskonan
[65] → Aspek: Kualitas Kopi | Keywords: fore esbatu, beda, cair
[66] → Aspek: Kualitas Kopi | Keywords: fore, minum, enakkk
[67] → Aspek: Kualitas Kopi | Keywords: berasa
[68] → Aspek: Layanan Pelanggan | Keywords: fore, anjing males
[69] → Aspek: Kualitas Kopi | Keywords: enak, magic, pelet, rd nya gak maen2
— Saved checkpoint at tweet #69
[70] → Aspek:  | Keywords: 
[71] → Aspek: Kualitas Kopi | Keywords: seneng kalii, minum, fore
[72] → Aspek:  | Keywords: 
[73] → Aspek: Kualitas Kopi | Keywords: enak, bgtt, fore
[74] → Aspek: Kualitas Kopi | Keywords: fore gula
[75] → Aspek: Kualitas Kopi | Keywords: kopi2 fore, butterscotchbuttercream
[76] → Aspek: Kualitas Kopi; Menu Lain | Keywords: frappe, enak, kopi

HANDLING IMBALANCED DATA 

In [7]:
import pandas as pd
from sklearn.utils import resample

# Seed shared by the oversampling and the final shuffle so the output is reproducible
RANDOM_STATE = 42

# Load the labelled sample.
# NOTE(review): this reads 'tweet_with_predicted_aspect.xlsx' (sheet 'sample',
# column 'Predicted_Aspect'), not the '..._test.xlsx' file written by the
# extraction cell — presumably a manually curated copy; confirm it exists.
df = pd.read_excel('tweet_with_predicted_aspect.xlsx', sheet_name='sample')  # change the file name if yours differs

# Inspect the class distribution before balancing
aspect_counts = df['Predicted_Aspect'].value_counts()
print("Distribusi Awal Aspek:")
print(aspect_counts)

# Distinct aspects in order of first appearance. Missing labels are dropped:
# NaN never compares equal to itself, so its subset would be empty and there
# would be nothing to resample from.
aspects = df['Predicted_Aspect'].dropna().unique()

# Every class is oversampled up to the size of the largest one
max_count = aspect_counts.max()

# Random oversampling (with replacement) per aspect. The pieces are collected
# in a list and concatenated once instead of growing a DataFrame in the loop.
resampled_parts = []
for aspect in aspects:
    subset = df[df['Predicted_Aspect'] == aspect]
    resampled_subset = resample(subset,
                                replace=True,
                                n_samples=max_count,
                                random_state=RANDOM_STATE)
    resampled_parts.append(resampled_subset)

# pd.concat rejects an empty list; fall back to an empty frame with the same columns
balanced_df = pd.concat(resampled_parts) if resampled_parts else df.iloc[0:0].copy()

# Shuffle the rows and rebuild a clean 0..n-1 index
balanced_df = balanced_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Inspect the class distribution after balancing
print("\nDistribusi Setelah Balancing:")
print(balanced_df['Predicted_Aspect'].value_counts())

# Save the balanced dataset
balanced_df.to_excel('balanced_aspect_dataset.xlsx', index=False)
print("\nDataset balanced berhasil disimpan sebagai 'balanced_aspect_dataset.xlsx'")


Distribusi Awal Aspek:
Predicted_Aspect
Produk     255
Promosi     27
Layanan     18
Name: count, dtype: int64

Distribusi Setelah Balancing:
Predicted_Aspect
Layanan    255
Promosi    255
Produk     255
Name: count, dtype: int64

Dataset balanced berhasil disimpan sebagai 'balanced_aspect_dataset.xlsx'


EVALUATION

In [8]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

# Load the dataset that holds both the reference labels and the predictions.
# NOTE(review): this is the oversampled file written by the balancing cell, so
# duplicated rows are counted in every metric below — confirm that is intended.
df = pd.read_excel('balanced_aspect_dataset.xlsx', sheet_name='Sheet1')  # make sure this matches your file

# Classes to report, in display order (shared by every metric call below)
aspect_labels = ['Produk', 'Layanan', 'Promosi']

# Reference labels and model predictions, both compared as strings
y_true = df['Valid_Aspect'].astype(str)
y_pred = df['Predicted_Aspect'].astype(str)

# Overall accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Per-class precision, recall, F1 and support
precision, recall, f1, support = precision_recall_fscore_support(y_true, y_pred, labels=aspect_labels)

for aspect, class_precision, class_recall, class_f1, class_support in zip(
        aspect_labels, precision, recall, f1, support):
    print(f'\nAspect: {aspect}')
    print(f'Precision: {class_precision:.4f}')
    print(f'Recall: {class_recall:.4f}')
    print(f'F1-Score: {class_f1:.4f}')
    print(f'Support: {class_support}')

# Full classification report
print('\nClassification Report:')
print(classification_report(y_true, y_pred, labels=aspect_labels))

# Confusion matrix with rows = true class, columns = predicted class
cm = confusion_matrix(y_true, y_pred, labels=aspect_labels)
cm_table = pd.DataFrame(
    cm,
    index=[f'{label} (True)' for label in aspect_labels],
    columns=[f'{label} (Pred)' for label in aspect_labels],
)
print('\nConfusion Matrix:')
print(cm_table)


Accuracy: 0.6340

Aspect: Produk
Precision: 0.9176
Recall: 0.5120
F1-Score: 0.6573
Support: 457

Aspect: Layanan
Precision: 0.5333
Recall: 0.7727
F1-Score: 0.6311
Support: 176

Aspect: Promosi
Precision: 0.4510
Recall: 0.8712
F1-Score: 0.5943
Support: 132

Classification Report:
              precision    recall  f1-score   support

      Produk       0.92      0.51      0.66       457
     Layanan       0.53      0.77      0.63       176
     Promosi       0.45      0.87      0.59       132

    accuracy                           0.63       765
   macro avg       0.63      0.72      0.63       765
weighted avg       0.75      0.63      0.64       765


Confusion Matrix:
                Produk (Pred)  Layanan (Pred)  Promosi (Pred)
Produk (True)             234             103             120
Layanan (True)             20             136              20
Promosi (True)              1              16             115
