# **Import Libraries and Prepare Dataset**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from wordcloud import WordCloud
from collections import Counter

import re
import nltk
nltk.download('punkt_tab')

from sklearn.model_selection import train_test_split

import stanza
import Sastrawi
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nlp_id.lemmatizer import Lemmatizer
from nlp_id.tokenizer import Tokenizer
from nlp_id.stopword import StopWord

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_colwidth", None)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/alicia.siahaya/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
url = (
    "https://docs.google.com/spreadsheets/d/e/2PACX-1vQx0P59eV_KGGFJe-l86dgsz9pZRehTRJoOP_tyVXbrQPtzmD97E1lZ0lgFJ-ATVTT9HkBRX5g1kRKB/pub?output=csv"
)
df = pd.read_csv(url)
print(df.shape)

(302, 7)


In [3]:
df_melt = df.melt(
    value_vars=[
        "Apa yang Anda sukai selama masa perkuliahan Anda?",
        "Apa yang Anda tidak sukai selama masa perkuliahan Anda?"
    ],
    var_name='sentiment',
    value_name='text'
)

sentiment_map = {
    "Apa yang Anda sukai selama masa perkuliahan Anda?": 'positive',
    "Apa yang Anda tidak sukai selama masa perkuliahan Anda?": 'negative'
}
df_melt['sentiment'] = df_melt['sentiment'].map(sentiment_map)

df_preprocess = df_melt[['text', 'sentiment']]

# **Data Cleansing and Preprocessing**

## Split Data

In [4]:
def split_data(df, validation_data=False):
    if validation_data:
        train_df, val_test_df = train_test_split(
            df, test_size=0.2, random_state=42, stratify=df['sentiment']
        )
        val_df, test_df = train_test_split(
            val_test_df, test_size=0.5, random_state=42, stratify=val_test_df['sentiment']
        )

        X_train = train_df['text']
        y_train = train_df['sentiment']
        X_val = val_df['text']
        y_val = val_df['sentiment']
        X_test = test_df['text']
        y_test = test_df['sentiment']

        print(f'Training shape: {X_train.shape}, {y_train.shape}')
        print(f'Validation shape: {X_val.shape}, {y_val.shape}')
        print(f'Test shape: {X_test.shape}, {y_test.shape}')
        return X_train, y_train, X_val, y_val, X_test, y_test

    # Only train/test split
    train_df, test_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['sentiment']
    )
    X_train = train_df['text']
    y_train = train_df['sentiment']
    X_test = test_df['text']
    y_test = test_df['sentiment']
    print(f'Training shape: {X_train.shape}, {y_train.shape}')
    print(f'Test shape: {X_test.shape}, {y_test.shape}')
    return X_train, y_train, X_test, y_test

## Data Cleansing

In [5]:
import sys
sys.path.append("../src/dictionary")
sys.path.append("../src/models")

In [6]:
from normalization_dictionary import norm_dict

def correct_typos(text):
    for typo, correction in norm_dict.items():
        text = re.sub(typo, correction, text, flags=re.IGNORECASE)
    return text

# Contoh: "Saya sukaaaa sekali dengan kampus saya" -> "Saya suka sekali dengan kampus saya"
def reduce_extra_characters(text):
    return re.sub(r'(.)\1{2,}', r'\1', text)

exception_words = ["tanya", 'punya', 'bertanya', 'hanya'] # Kata-kata yang berakhiran dengan -nya namun memiliki arti sendiri
def split_nya(word):
    if word in exception_words:
        return word
    return re.sub(r'(.*?)nya$', r'\1 nya', word)

def process_split_nya(review):
    words = review.split()
    processed_words = [split_nya(word.strip()) for word in words]
    return ' '.join(processed_words)

def text_cleansing(text):
    # Mengubah ke huruf kecil
    text = text.lower()

    # Menghapus simbol dan angka
    text = re.sub(r"[^a-zA-Z\s']", ' ', text)

    # Memisahkan kata dengan akhiran -nya
    text = process_split_nya(text)

    # Memperbaiki typo dan huruf yang berlebih
    text = correct_typos(text)
    text = reduce_extra_characters(text)

    # Menghapus spasi yang berlebih
    text = re.sub(r'\s+', ' ', text).strip()

    return text

## Data Preprocessing

In [7]:
from exclude_words import exclude_lemmatization, exclude_stopwords

lemmatizer = Lemmatizer()
tokenizer = Tokenizer()
stopword = StopWord()

# Menambahkan kata untuk stop words
stop_words = stopword.get_stopword()
custom_stopwords = ['nya', 'ya', 'nih']
stop_words.append(custom_stopwords)

def text_preprocessing(text):
    # Tokenisasi menggunakan Tokenizer dari nlp_id
    tokens = tokenizer.tokenize(text)

    # Lematisasi dan hapus stopwords
    lemmatized_tokens = [
        lemmatizer.lemmatize(word) if (word not in exclude_stopwords and word not in exclude_lemmatization) else word
        for word in tokens
        if word not in stop_words or word in exclude_stopwords
    ]

    # Gabungkan kembali menjadi teks dan hapus spasi berlebih
    processed_text = re.sub(r'\s+', ' ', ' '.join(lemmatized_tokens)).strip()

    return processed_text

In [8]:
from exclude_words import exclude_lemmatization, exclude_stopwords

stanza.download('id')
nlp = stanza.Pipeline(lang='id', processors='tokenize,mwt,pos,lemma')

factory = StopWordRemoverFactory()
sastrawi_stopwords = set(factory.get_stop_words())
custom_stopwords = {'nya', 'ya', 'nih'}
all_stopwords = sastrawi_stopwords.union(custom_stopwords)

def text_preprocessing2(text):
  doc = nlp(text)

  lemmatized_tokens = []
  for sentence in doc.sentences:
      for word in sentence.words:
          token = word.text.lower()
          lemma = word.lemma.lower()

          # Lakukan filtering stopwords dan pengecualian
          if token in exclude_stopwords or token not in all_stopwords:
              lemmatized_token = token if token in exclude_lemmatization else lemma
              lemmatized_tokens.append(lemmatized_token)

  # Hapus spasi berlebih dan gabungkan kembali ke teks
  cleaned_text = re.sub(r'\s+', ' ', ' '.join(lemmatized_tokens)).strip()

  return cleaned_text

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-10-07 13:11:15 INFO: Downloaded file to /Users/alicia.siahaya/stanza_resources/resources.json
2025-10-07 13:11:15 INFO: Downloading default packages for language: id (Indonesian) ...
2025-10-07 13:11:17 INFO: File exists: /Users/alicia.siahaya/stanza_resources/id/default.zip
2025-10-07 13:11:20 INFO: Finished downloading models and saved to /Users/alicia.siahaya/stanza_resources
2025-10-07 13:11:20 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-10-07 13:11:20 INFO: Downloaded file to /Users/alicia.siahaya/stanza_resources/resources.json
2025-10-07 13:11:21 INFO: Loading these models for language: id (Indonesian):
| Processor | Package      |
----------------------------
| tokenize  | gsd          |
| mwt       | gsd          |
| pos       | gsd_charlm   |
| lemma     | gsd_nocharlm |

2025-10-07 13:11:21 INFO: Using device: cpu
2025-10-07 13:11:21 INFO: Loading: tokenize
2025-10-07 13:11:22 INFO: Loading: mwt
2025-10-07 13:11:22 INFO: Loading: pos
2025-10-07 13:11:26 INFO: Loading: lemma
2025-10-07 13:11:27 INFO: Done loading processors!


## Data Preparation

In [9]:
def prepare_dataset(df, model_type, preprocess_type=None):
  df['cleaned_text'] = df['text'].apply(text_cleansing)

  if model_type == 'svm':
    if preprocess_type == 'nlp_id':
      df['preprocessed_text_nlp_id'] = df['cleaned_text'].apply(text_preprocessing)
      return df

    if preprocess_type == 'stanza':
      df['preprocessed_text_stanza'] = df['cleaned_text'].apply(text_preprocessing2)
      return df

  if model_type == 'indobert':
    return df

# **Modeling**

In [10]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import svm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix

from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

# **PIPELINE**

In [11]:
import pickle

with open('../src/models/svm_model/vectorizer1.pkl', 'rb') as f:
    vectorizer1 = pickle.load(f)

with open('../src/models/svm_model/model_svm1.pkl', 'rb') as f:
    model1 = pickle.load(f)

with open('../src/models/svm_model/vectorizer2.pkl', 'rb') as f:
    vectorizer2 = pickle.load(f)

with open('../src/models/svm_model/model_svm2.pkl', 'rb') as f:
    model2 = pickle.load(f)

model_indobert = AutoModelForSequenceClassification.from_pretrained("../src/models/indobert_model")
tokenizer_indobert = AutoTokenizer.from_pretrained("../src/models/indobert_tokenizer")

FileNotFoundError: [Errno 2] No such file or directory: '../src/models/svm_model/vectorizer1.pkl'

In [None]:
def run_svm_pipeline(df, model_type, vectorizer=None, model=None, preprocess_type=None):
  start_time = time.time()
  df_preprocessed = prepare_dataset(df, model_type=model_type, preprocess_type=preprocess_type)

  df_tfidf = vectorizer.transform(df_preprocessed[f'preprocessed_text_{preprocess_type}'])
  tfidf_test_df = pd.DataFrame(df_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

  y_pred = model.predict(tfidf_test_df)

  end_time = time.time()
  execution_time = end_time - start_time
  print(f"Execution time for {model_type} using {preprocess_type}: {execution_time} seconds")
  return y_pred

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

def run_indobert_pipeline(df, model, tokenizer):
  start_time = time.time()

  model.eval()

  df_preprocessed = prepare_dataset(df, model_type='indobert')
  texts = df_preprocessed['cleaned_text'].tolist()

  cleaned_texts = [text_cleansing(t) for t in texts]
  inputs = tokenizer(cleaned_texts, return_tensors='pt', padding=True, truncation=True)

  with torch.no_grad():
      outputs = model(**inputs)
      preds = torch.argmax(outputs.logits, dim=1)
      labels = [id2label[int(p)] for p in preds]

  end_time = time.time()
  execution_time = end_time - start_time
  print(f"Execution time for IndoBERT: {execution_time} seconds")
  return labels

## Inference

In [None]:
label_map = {'positive':1, 'negative':0}
id2label = {v: k for k, v in label_map.items()}
target_names = ['negative', 'positive']

### **SVM**

In [None]:
X_train, y_train, X_test, y_test = split_data(df_preprocess, validation_data=False)
y_train = y_train.map(label_map)
y_test = y_test.map(label_map)

df_test = pd.DataFrame({'text': X_test, 'sentiment': y_test})

y_pred_test_nlp_id = run_svm_pipeline(df_test, model_type='svm', vectorizer=vectorizer1, model=model1, preprocess_type='nlp_id')
y_pred_test_stanza = run_svm_pipeline(df_test, model_type='svm', vectorizer=vectorizer2, model=model2, preprocess_type='stanza')

Training shape: (481,), (481,)
Test shape: (121,), (121,)
Execution time for svm using nlp_id: 1.4417951107025146 seconds
Execution time for svm using stanza: 53.72056698799133 seconds


In [None]:
print("Classification Report for SVM with nlp_id:")
print(classification_report(y_test, y_pred_test_nlp_id, target_names=target_names))

Classification Report for SVM with nlp_id:
              precision    recall  f1-score   support

    negative       0.92      0.95      0.94        61
    positive       0.95      0.92      0.93        60

    accuracy                           0.93       121
   macro avg       0.93      0.93      0.93       121
weighted avg       0.93      0.93      0.93       121



In [None]:
print("Classification Report for SVM with stanza:")
print(classification_report(y_test, y_pred_test_stanza, target_names=target_names))

Classification Report for SVM with stanza:
              precision    recall  f1-score   support

    negative       0.97      0.97      0.97        61
    positive       0.97      0.97      0.97        60

    accuracy                           0.97       121
   macro avg       0.97      0.97      0.97       121
weighted avg       0.97      0.97      0.97       121



### **IndoBERT**

In [None]:
x_train, y_train, x_val, y_val, x_test, y_test = split_data(df_preprocess, validation_data=True)

df_test_indobert = pd.DataFrame({'text': x_test, 'sentiment': y_test})

y_pred_indobert = run_indobert_pipeline(df_test_indobert, model_indobert, tokenizer_indobert)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Training shape: (481,), (481,)
Validation shape: (60,), (60,)
Test shape: (61,), (61,)
Execution time for IndoBERT: 7.022518157958984 seconds


In [None]:
print("Classification Report for IndoBERT:")
print(classification_report(y_test, y_pred_indobert))

Classification Report for IndoBERT:
              precision    recall  f1-score   support

    negative       0.97      1.00      0.98        31
    positive       1.00      0.97      0.98        30

    accuracy                           0.98        61
   macro avg       0.98      0.98      0.98        61
weighted avg       0.98      0.98      0.98        61



# **Comparison**

In [None]:
# Model Performance and Execution Time Visualization
import matplotlib.pyplot as plt
import numpy as np

# Data from the experiments
models = ['SVM (nlp_id)', 'SVM (Stanza+Sastrawi)', 'IndoBERT']
accuracies = [0.93, 0.97, 0.98]  # Accuracy from classification reports
execution_times = [1.44, 53.72, 7.02]  # Execution times in seconds

# Create the plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Plot 1: Accuracy comparison
bars1 = ax1.bar(models, accuracies, color=['#2E8B57', '#FF6B6B', '#4ECDC4'], alpha=0.8, edgecolor='black')
ax1.set_ylabel('Accuracy', fontsize=12, fontweight='bold')
ax1.set_title('Model Accuracy Comparison', fontsize=14, fontweight='bold')
ax1.set_ylim(0.9, 1.0)
ax1.grid(True, alpha=0.3)

# Add value labels on bars
for bar, acc in zip(bars1, accuracies):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 0.005,
             f'{acc:.2f}', ha='center', va='bottom', fontweight='bold')

# Plot 2: Execution time comparison (log scale for better visualization)
bars2 = ax2.bar(models, execution_times, color=['#2E8B57', '#FF6B6B', '#4ECDC4'], alpha=0.8, edgecolor='black')
ax2.set_ylabel('Execution Time (seconds)', fontsize=12, fontweight='bold')
ax2.set_title('Model Execution Time Comparison', fontsize=14, fontweight='bold')
ax2.set_yscale('log')  # Log scale due to large difference in times
ax2.grid(True, alpha=0.3)

# Add value labels on bars
for bar, time in zip(bars2, execution_times):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height * 1.1,
             f'{time:.1f}s', ha='center', va='bottom', fontweight='bold')

# Rotate x-axis labels for better readability
for ax in [ax1, ax2]:
    ax.tick_params(axis='x', rotation=45)
    ax.set_xticklabels(models, fontsize=10)

plt.tight_layout()
plt.show()

# Create a combined scatter plot showing the trade-off
fig, ax = plt.subplots(1, 1, figsize=(10, 8))

# Create scatter plot
colors = ['#2E8B57', '#FF6B6B', '#4ECDC4']
sizes = [200, 200, 200]  # Point sizes

scatter = ax.scatter(execution_times, accuracies, c=colors, s=sizes, alpha=0.7, edgecolors='black', linewidth=2)

# Add labels for each point
for i, model in enumerate(models):
    ax.annotate(model, (execution_times[i], accuracies[i]), 
                xytext=(10, 10), textcoords='offset points',
                fontsize=10, fontweight='bold',
                bbox=dict(boxstyle='round,pad=0.3', facecolor=colors[i], alpha=0.7))

# Customize the plot
ax.set_xlabel('Execution Time (seconds)', fontsize=12, fontweight='bold')
ax.set_ylabel('Accuracy', fontsize=12, fontweight='bold')
ax.set_title('Model Performance vs Execution Time Trade-off', fontsize=14, fontweight='bold')
ax.set_xscale('log')  # Log scale for better visualization
ax.grid(True, alpha=0.3)

# Add quadrant lines to show the trade-off
ax.axhline(y=0.95, color='red', linestyle='--', alpha=0.5, label='High Accuracy Threshold (95%)')
ax.axvline(x=10, color='blue', linestyle='--', alpha=0.5, label='Fast Execution Threshold (10s)')

# Add legend
ax.legend()

plt.tight_layout()
plt.show()

# Create a summary table
print("=" * 80)
print("MODEL PERFORMANCE SUMMARY")
print("=" * 80)
print(f"{'Model':<25} {'Accuracy':<10} {'Execution Time':<15} {'Speed Category':<15}")
print("-" * 80)
print(f"{'SVM (nlp_id)':<25} {'93%':<10} {'1.44s':<15} {'Fast ⚡':<15}")
print(f"{'SVM (Stanza+Sastrawi)':<25} {'97%':<10} {'53.72s':<15} {'Slow 🐌':<15}")
print(f"{'IndoBERT':<25} {'98%':<10} {'7.02s':<15} {'Medium 🚀':<15}")
print("=" * 80)

# Performance vs Speed Analysis
print("\nPERFORMANCE vs SPEED ANALYSIS:")
print("-" * 40)
print("• SVM (nlp_id): Best speed-performance balance")
print("• SVM (Stanza): High accuracy but impractical speed")
print("• IndoBERT: Highest accuracy with reasonable speed")
print("\nRECOMMENDATIONS:")
print("-" * 40)
print("• Real-time applications: SVM (nlp_id)")
print("• High accuracy needs: IndoBERT")
print("• Research/analysis: SVM (Stanza) for preprocessing study")

# **Summary**

## **SVM**
SVM model used two different preprocessing methods and TF-IDF Vectorizer:
- nlp_id for lemmatization and stopwords removal -> achieved **93%** accuracy
- stanza for lemmatization and Sastrawi for stopwords removal -> achieved **97%** accuracy

While the accuracy is both decent and not that different, the time execution between both model showed a bigger gap in predicting the test set. SVM using nlp_id predicted in **under 1 second**, while SVM using stanza and Sastrawi predicted in **13 seconds**.

With a similar time execution, it is better to use SVM with nlp_id because it's faster than SVM with stanza and Sastrawi.

## **IndoBERT**
IndoBERT model achieved higher accuracy than both SVM models, with **98%** accuracy which is expected since it is a larger model. But the time execution and architecture is surely bigger than traditional machine learning models. With a time execution of **7 seconds**.

With a time difference of 3 seconds but with a 5% accuracy trade off, it's better to use IndoBERT because it achieved the best accuracy among 3 models.