ASPECT EXTRACTION

In [2]:
GEMINI_API_KEY = 'AIzaSyC-KEAl0h05JIsSPfR7zPH3ykMO2ioMMWA'

In [1]:
import pandas as pd
import google.generativeai as genai
import time
import json
import os

In [3]:
# Ensure the client and model are correctly initialized
client = genai.configure(api_key=GEMINI_API_KEY)

# Inisialisasi model Gemini
model = genai.GenerativeModel('gemini-2.5-flash')

In [4]:
df = pd.read_excel('../data/data_valid/main_indobert_train.xlsx', sheet_name='data_test')
# Siapkan kolom hasil
if 'Predicted_Aspect' not in df.columns:
    df['Predicted_Aspect'] = ""
# Checkpoint file untuk progress
checkpoint_file = 'progress_checkpoint.json'

In [5]:

# Load progress jika ada
if os.path.exists(checkpoint_file):
    with open(checkpoint_file, 'r') as f:
        progress = json.load(f)
    print("Checkpoint loaded.")
else:
    progress = {}

# Looping untuk Zero-Shot Classification
save_interval = 10  # Simpan setiap 10 tweet
counter = 0

Checkpoint loaded.


In [6]:
df

Unnamed: 0,Date,Text,Text Normalization,Label_Bert,Predicted_Aspect
0,2024-01-01,Tim matcha green tea sm classic milo setiap be...,tim matcha green tea classic milo beli fore se...,positive,
1,2024-01-01,"Nder, mo nanya dong kalo minuman yang coklat-c...",minuman coklatcoklat fore enak yaa,negative,
2,2024-01-01,Fore aplikasi,fore aplikasi,positive,
3,2024-01-01,hari ini lagi coba menu baru(?)nya fore.,coba menu barunya fore,positive,
4,2024-01-01,LOH INI ADMINE TA tp aku cocok di caramel pral...,loh admine ta cocok caramel praline frappe for...,positive,
...,...,...,...,...,...
452,2024-12-01,"Eemang ga enak ih tomoro mah, pait.. mending fore",eemang enak ih tomoro mah pait fore,negative,
453,2024-12-01,Fore jelas. Beda harga kak,fore beda harga kak,negative,
454,2024-12-01,"Kakah gue sukanya jajan kopi di fore, jadi dia...",kakah sukanya jajan kopi fore jajan fore stike...,positive,
455,2024-12-01,OKKKKK MENU YANG AKAN KUBELI KALO KE FORE,okkkkk menu kubeli fore,neutral,


In [9]:

for idx, row in df.iterrows():
    tweet = row['Text']
    
    prompt = f"""
    Tweet: "{tweet}"
    Available Aspects: Produk, Layanan, Promosi.
    Question: Which aspect does the tweet talk about? Only answer one aspect: Produk, Layanan, or Promosi. Do not explain.
    """
    
    try:
        response = model.generate_content([prompt])
        aspect = response.text.strip()

        df.at[idx, 'Predicted_Aspect'] = aspect
        progress[str(idx)] = aspect  # Simpan progress
        counter += 1

        print(f"Processed tweet {idx+1}/{len(df)}: Aspect = {aspect}")

        # Simpan setiap N tweet
        if counter % save_interval == 0:
            with open(checkpoint_file, 'w') as f:
                json.dump(progress, f)
            df.to_excel('tweet_with_predicted_aspect_test.xlsx', index=False)
            print(f"Progress saved after {counter} tweets.")

        time.sleep(2.0)  # Hindari rate-limit

    except Exception as e:
        print(f"Error processing tweet {idx+1}: {e}")
        df.at[idx, 'Predicted_Aspect'] = 'ERROR'
        continue

# Simpan file akhir
with open(checkpoint_file, 'w') as f:
    json.dump(progress, f)
df.to_excel('tweet_with_predicted_aspect_test.xlsx', index=False)
print("Aspect prediction completed and final progress saved.")

Processed tweet 1/457: Aspect = Produk
Processed tweet 2/457: Aspect = Produk
Processed tweet 3/457: Aspect = Produk
Processed tweet 4/457: Aspect = Produk
Processed tweet 5/457: Aspect = Produk
Processed tweet 6/457: Aspect = Produk
Progress saved after 10 tweets.
Processed tweet 7/457: Aspect = Produk
Error processing tweet 8: Invalid operation: The `response.parts` quick accessor requires a single candidate, but but `response.candidates` is empty.
This appears to be caused by a blocked prompt, see `response.prompt_feedback`: block_reason: PROHIBITED_CONTENT

Processed tweet 9/457: Aspect = Produk
Processed tweet 10/457: Aspect = Produk
Processed tweet 11/457: Aspect = Produk
Processed tweet 12/457: Aspect = Produk
Processed tweet 13/457: Aspect = Layanan
Processed tweet 14/457: Aspect = Produk
Processed tweet 15/457: Aspect = Produk
Processed tweet 16/457: Aspect = Produk
Processed tweet 17/457: Aspect = Produk
Progress saved after 20 tweets.
Processed tweet 18/457: Aspect = Promosi

HANDLING IMBALANCED DATA 

In [16]:
import pandas as pd
from sklearn.utils import resample

# Load dataset
df = pd.read_excel('tweet_with_predicted_aspect.xlsx', sheet_name='sample')  # Ganti nama file jika berbeda

# Cek distribusi awal
print("Distribusi Awal Aspek:")
print(df['Predicted_Aspect'].value_counts())

# Tentukan aspek yang tersedia
aspects = df['Predicted_Aspect'].unique()

# Hitung jumlah sampel tertinggi
max_count = df['Predicted_Aspect'].value_counts().max()

# Lakukan random oversampling per aspek
balanced_df = pd.DataFrame()

for aspect in aspects:
    subset = df[df['Predicted_Aspect'] == aspect]
    resampled_subset = resample(subset,
                                replace=True,
                                n_samples=max_count,
                                random_state=42)
    balanced_df = pd.concat([balanced_df, resampled_subset])

# Reset index
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Cek distribusi setelah balancing
print("\nDistribusi Setelah Balancing:")
print(balanced_df['Predicted_Aspect'].value_counts())

# Simpan dataset balanced
balanced_df.to_excel('balanced_aspect_dataset.xlsx', index=False)
print("\nDataset balanced berhasil disimpan sebagai 'balanced_aspect_dataset.xlsx'")


Distribusi Awal Aspek:
Predicted_Aspect
Produk     255
Promosi     27
Layanan     18
Name: count, dtype: int64

Distribusi Setelah Balancing:
Predicted_Aspect
Layanan    255
Promosi    255
Produk     255
Name: count, dtype: int64

Dataset balanced berhasil disimpan sebagai 'balanced_aspect_dataset.xlsx'


EVALUATION

In [17]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

# Load dataset hasil labeling dan prediksi
df = pd.read_excel('balanced_aspect_dataset.xlsx', sheet_name='Sheet1')  # Pastikan file kamu sesuai

# Ambil kolom yang diperlukan
y_true = df['Valid_Aspect'].astype(str)  # Kolom yang berisi label sebenarnya
y_pred = df['Predicted_Aspect'].astype(str)

# Hitung akurasi keseluruhan
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Hitung precision, recall, f1-score per kelas
precision, recall, f1, support = precision_recall_fscore_support(y_true, y_pred, labels=['Produk', 'Layanan', 'Promosi'])

# Tampilkan hasil per kelas
aspect_labels = ['Produk', 'Layanan', 'Promosi']
for idx, aspect in enumerate(aspect_labels):
    print(f'\nAspect: {aspect}')
    print(f'Precision: {precision[idx]:.4f}')
    print(f'Recall: {recall[idx]:.4f}')
    print(f'F1-Score: {f1[idx]:.4f}')
    print(f'Support: {support[idx]}')

# Tampilkan classification report lengkap
print('\nClassification Report:')
print(classification_report(y_true, y_pred, labels=['Produk', 'Layanan', 'Promosi']))

# Tampilkan confusion matrix
cm = confusion_matrix(y_true, y_pred, labels=['Produk', 'Layanan', 'Promosi'])
print('\nConfusion Matrix:')
print(pd.DataFrame(cm, index=['Produk (True)', 'Layanan (True)', 'Promosi (True)'],
                   columns=['Produk (Pred)', 'Layanan (Pred)', 'Promosi (Pred)']))


Accuracy: 0.6340

Aspect: Produk
Precision: 0.9176
Recall: 0.5120
F1-Score: 0.6573
Support: 457

Aspect: Layanan
Precision: 0.5333
Recall: 0.7727
F1-Score: 0.6311
Support: 176

Aspect: Promosi
Precision: 0.4510
Recall: 0.8712
F1-Score: 0.5943
Support: 132

Classification Report:
              precision    recall  f1-score   support

      Produk       0.92      0.51      0.66       457
     Layanan       0.53      0.77      0.63       176
     Promosi       0.45      0.87      0.59       132

    accuracy                           0.63       765
   macro avg       0.63      0.72      0.63       765
weighted avg       0.75      0.63      0.64       765


Confusion Matrix:
                Produk (Pred)  Layanan (Pred)  Promosi (Pred)
Produk (True)             234             103             120
Layanan (True)             20             136              20
Promosi (True)              1              16             115
