# **Import Libraries and Prepare Dataset**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import time

import re
import nltk
nltk.download('punkt_tab')

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_colwidth", None)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/alicia.siahaya/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
import os
os.chdir("..")

In [3]:
url = (
    "https://docs.google.com/spreadsheets/d/e/2PACX-1vQx0P59eV_KGGFJe-l86dgsz9pZRehTRJoOP_tyVXbrQPtzmD97E1lZ0lgFJ-ATVTT9HkBRX5g1kRKB/pub?output=csv"
)
df = pd.read_csv(url)
print(df.shape)

(302, 7)


In [4]:
df_melt = df.melt(
    value_vars=[
        "Apa yang Anda sukai selama masa perkuliahan Anda?",
        "Apa yang Anda tidak sukai selama masa perkuliahan Anda?"
    ],
    var_name='sentiment',
    value_name='text'
)

sentiment_map = {
    "Apa yang Anda sukai selama masa perkuliahan Anda?": 'positive',
    "Apa yang Anda tidak sukai selama masa perkuliahan Anda?": 'negative'
}
df_melt['sentiment'] = df_melt['sentiment'].map(sentiment_map)

df_preprocess = df_melt[['text', 'sentiment']]

# **Data Cleansing and Preprocessing**

## Split Data

In [5]:
def split_data(df, validation_data=False):
    if validation_data:
        train_df, val_test_df = train_test_split(
            df, test_size=0.2, random_state=42, stratify=df['sentiment']
        )
        val_df, test_df = train_test_split(
            val_test_df, test_size=0.5, random_state=42, stratify=val_test_df['sentiment']
        )

        X_train = train_df['text']
        y_train = train_df['sentiment']
        X_val = val_df['text']
        y_val = val_df['sentiment']
        X_test = test_df['text']
        y_test = test_df['sentiment']

        print(f'Training shape: {X_train.shape}, {y_train.shape}')
        print(f'Validation shape: {X_val.shape}, {y_val.shape}')
        print(f'Test shape: {X_test.shape}, {y_test.shape}')
        return X_train, y_train, X_val, y_val, X_test, y_test

    # Only train/test split
    train_df, test_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['sentiment']
    )
    X_train = train_df['text']
    y_train = train_df['sentiment']
    X_test = test_df['text']
    y_test = test_df['sentiment']
    print(f'Training shape: {X_train.shape}, {y_train.shape}')
    print(f'Test shape: {X_test.shape}, {y_test.shape}')
    return X_train, y_train, X_test, y_test

## Data Cleansing

In [6]:
from src.dictionary.normalization_dictionary import norm_dict

class TextCleansing:
    def __init__(self, text, norm_dict=None):
        self.text = text
        self.norm_dict = norm_dict if norm_dict else {}

    def correct_typos(self, text):
        for typo, correction in self.norm_dict.items():
            clean_text = re.sub(
                rf'\b{typo}\b', correction, text, flags=re.IGNORECASE
            )
        return clean_text
    
    def reduce_extra_characters(self, text):
        """
        Contoh:
        "sukaaaaa" -> "suka"
        """
        return re.sub(r'(.)\1{2,}', r'\1', text)
    
    def split_nya(self, text, exception_words=None):
        if exception_words is None:
            exception_words = ["tanya", "punya", "bertanya", "hanya"]

        if text in exception_words:
            return text

        return re.sub(r'(.*?)nya$', r'\1 nya', text)

    def process_split_nya(self, text):
        words = text.split()
        processed_words = [
            self.split_nya(word.strip()) for word in words
        ]
        return ' '.join(processed_words)

    def clean(self):
        text = self.text.lower()
        text = re.sub(r"[^a-zA-Z\s']", ' ', text)
        text = self.process_split_nya(text)
        text = self.correct_typos(text)
        text = self.reduce_extra_characters(text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

In [7]:
def text_cleansing(text):
    tc = TextCleansing(text, norm_dict=norm_dict)
    return tc.clean()


## Data Preprocessing

In [8]:
from src.models.preprocess.stanza_preprocessor import StanzaPreprocessor
from src.models.preprocess.nlpid_preprocessor import NLPIdPreprocessor
from src.dictionary.exclude_words import exclude_stopwords, exclude_lemmatization

stanza_prep = StanzaPreprocessor(
    exclude_stopwords=exclude_stopwords,
    exclude_lemmatization=exclude_lemmatization
)

nlp_id_prep = NLPIdPreprocessor(
    exclude_stopwords=exclude_stopwords,
    exclude_lemmatization=exclude_lemmatization
)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-12-27 21:52:19 INFO: Downloaded file to /Users/alicia.siahaya/stanza_resources/resources.json
2025-12-27 21:52:19 INFO: Downloading default packages for language: id (Indonesian) ...
2025-12-27 21:52:20 INFO: File exists: /Users/alicia.siahaya/stanza_resources/id/default.zip
2025-12-27 21:52:24 INFO: Finished downloading models and saved to /Users/alicia.siahaya/stanza_resources
2025-12-27 21:52:24 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-12-27 21:52:24 INFO: Downloaded file to /Users/alicia.siahaya/stanza_resources/resources.json
2025-12-27 21:52:25 INFO: Loading these models for language: id (Indonesian):
| Processor | Package      |
----------------------------
| tokenize  | gsd          |
| mwt       | gsd          |
| pos       | gsd_charlm   |
| lemma     | gsd_nocharlm |

2025-12-27 21:52:25 INFO: Using device: cpu
2025-12-27 21:52:25 INFO: Loading: tokenize
2025-12-27 21:52:27 INFO: Loading: mwt
2025-12-27 21:52:27 INFO: Loading: pos
2025-12-27 21:52:33 INFO: Loading: lemma
2025-12-27 21:52:33 INFO: Done loading processors!


## Data Preparation

In [9]:
def prepare_dataset(df, model_type, preprocess_type=None):
  df['cleaned_text'] = df['text'].apply(text_cleansing)

  if model_type == 'svm':
    if preprocess_type == 'nlp_id':
      df['preprocessed_text_nlp_id'] = df['cleaned_text'].apply(nlp_id_prep.transform)
      return df

    if preprocess_type == 'stanza':
      df['preprocessed_text_stanza'] = df['cleaned_text'].apply(stanza_prep.transform)
      return df

  if model_type == 'indobert':
    return df

# **Modeling**

In [10]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import svm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix

from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

# **PIPELINE**

In [11]:
import pickle

with open('src/models/svm_model/vectorizer1.pkl', 'rb') as f:
    vectorizer1 = pickle.load(f)

with open('src/models/svm_model/model_svm1.pkl', 'rb') as f:
    model1 = pickle.load(f)

with open('src/models/svm_model/vectorizer2.pkl', 'rb') as f:
    vectorizer2 = pickle.load(f)

with open('src/models/svm_model/model_svm2.pkl', 'rb') as f:
    model2 = pickle.load(f)

model_indobert = AutoModelForSequenceClassification.from_pretrained("src/models/indobert_model")
tokenizer_indobert = AutoTokenizer.from_pretrained("src/models/indobert_tokenizer")

In [12]:
def run_svm_pipeline(df, model_type, vectorizer=None, model=None, preprocess_type=None):
  start_time = time.time()
  df_preprocessed = prepare_dataset(df, model_type=model_type, preprocess_type=preprocess_type)

  df_tfidf = vectorizer.transform(df_preprocessed[f'preprocessed_text_{preprocess_type}'])
  tfidf_test_df = pd.DataFrame(df_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

  y_pred = model.predict(tfidf_test_df)

  end_time = time.time()
  execution_time = end_time - start_time
  print(f"Execution time for {model_type} using {preprocess_type}: {execution_time} seconds")
  return y_pred

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

def run_indobert_pipeline(df, model, tokenizer):
  start_time = time.time()

  model.eval()

  df_preprocessed = prepare_dataset(df, model_type='indobert')
  texts = df_preprocessed['cleaned_text'].tolist()

  cleaned_texts = [text_cleansing(t) for t in texts]
  inputs = tokenizer(cleaned_texts, return_tensors='pt', padding=True, truncation=True)

  with torch.no_grad():
      outputs = model(**inputs)
      preds = torch.argmax(outputs.logits, dim=1)
      labels = [id2label[int(p)] for p in preds]

  end_time = time.time()
  execution_time = end_time - start_time
  print(f"Execution time for IndoBERT: {execution_time} seconds")
  return labels

## Inference

In [14]:
label_map = {'positive':1, 'negative':0}
id2label = {v: k for k, v in label_map.items()}
target_names = ['negative', 'positive']

### **SVM**

In [15]:
X_train, y_train, X_test, y_test = split_data(df_preprocess, validation_data=False)
y_train = y_train.map(label_map)
y_test = y_test.map(label_map)

df_test = pd.DataFrame({'text': X_test, 'sentiment': y_test})

y_pred_test_nlp_id = run_svm_pipeline(df_test, model_type='svm', vectorizer=vectorizer1, model=model1, preprocess_type='nlp_id')
y_pred_test_stanza = run_svm_pipeline(df_test, model_type='svm', vectorizer=vectorizer2, model=model2, preprocess_type='stanza')

Training shape: (483,), (483,)
Test shape: (121,), (121,)
Execution time for svm using nlp_id: 0.3294408321380615 seconds
Execution time for svm using stanza: 40.16508221626282 seconds


In [16]:
print("Classification Report for SVM with nlp_id:")
print(classification_report(y_test, y_pred_test_nlp_id, target_names=target_names))

Classification Report for SVM with nlp_id:
              precision    recall  f1-score   support

    negative       0.92      0.93      0.93        61
    positive       0.93      0.92      0.92        60

    accuracy                           0.93       121
   macro avg       0.93      0.93      0.93       121
weighted avg       0.93      0.93      0.93       121



In [17]:
print("Classification Report for SVM with stanza:")
print(classification_report(y_test, y_pred_test_stanza, target_names=target_names))

Classification Report for SVM with stanza:
              precision    recall  f1-score   support

    negative       0.97      0.97      0.97        61
    positive       0.97      0.97      0.97        60

    accuracy                           0.97       121
   macro avg       0.97      0.97      0.97       121
weighted avg       0.97      0.97      0.97       121



### **IndoBERT**

In [19]:
x_train, y_train, x_val, y_val, x_test, y_test = split_data(df_preprocess, validation_data=True)

df_test_indobert = pd.DataFrame({'text': x_test, 'sentiment': y_test})

y_pred_indobert = run_indobert_pipeline(df_test_indobert, model_indobert, tokenizer_indobert)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Training shape: (483,), (483,)
Validation shape: (60,), (60,)
Test shape: (61,), (61,)
Execution time for IndoBERT: 5.943676948547363 seconds


In [20]:
print("Classification Report for IndoBERT:")
print(classification_report(y_test, y_pred_indobert))

Classification Report for IndoBERT:
              precision    recall  f1-score   support

    negative       0.97      1.00      0.98        31
    positive       1.00      0.97      0.98        30

    accuracy                           0.98        61
   macro avg       0.98      0.98      0.98        61
weighted avg       0.98      0.98      0.98        61



# **Summary**

## **SVM**
SVM model was evaluated using two different text preprocessing methods in combination with a TF-IDF Vectorizer:
1.  nlp_id for lemmatization and stopwords removal -> achieved **93%** accuracy
2.  stanza for lemmatization and Sastrawi for stopwords removal -> achieved **97%** accuracy

Although the accuracy difference between the two SVM models were relatively small, we will continue to observe based on the time execution for inference. SVM model with nlp_id completed the inference in under 1 second, whereas the SVM model with Stanza and Sastrawi required 40 seconds for the same task.

Considering both accuracy and time efficiency, the SVM model with nlp_id is more suitable for practical and public use, as it offers significantly fast prediction results with only a 4% decrease in accuracy compared to the Stanza-Sastrawi based approach.

## **IndoBERT**
The IndoBERT model achieved highest performance among all evaluated model, with **98%** accuracy, which is expected given its transformer based architecture and larger model capacity. Despite its longer computational time compared to SVM model, IndoBERT showed efficient inference requiring only 5 seconds.

## **Conclusion**
Overall, IndoBERT provides the best trade-off between accuracy and inference time, excelling in performance while maintaining a fast computational time. Therefore, IndoBERT is selected as the final model for the sentiment analysis. 