# **Import Libraries and Prepare Dataset**

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import svm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix

from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# SVM
df_train = pd.read_csv('../src/data/df_train.csv')
df_test = pd.read_csv('../src/data/df_test.csv')

df_train2 = pd.read_csv('../src/data/df_train2.csv')
df_test2 = pd.read_csv('../src/data/df_test2.csv')

# IndoBERT
df_modeling = pd.read_csv('../src/data/df_modeling_BERT.csv')

In [7]:
label_map = {'positive':1, 'negative':0}

df_train['sentiment'] = df_train['sentiment'].map(label_map)
df_test['sentiment'] = df_test['sentiment'].map(label_map)

df_train2['sentiment'] = df_train2['sentiment'].map(label_map)
df_test2['sentiment'] = df_test2['sentiment'].map(label_map)

df_modeling['sentiment'] = df_modeling['sentiment'].map(label_map)

# **Functions**

In [8]:
def vectorize_text(vectorizer, x_train, x_test):
  train_tfidf_matrix = vectorizer.fit_transform(x_train)
  test_tfidf_matrix = vectorizer.transform(x_test)

  tfidf_train_df = pd.DataFrame(train_tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
  tfidf_test_df = pd.DataFrame(test_tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

  return tfidf_train_df, tfidf_test_df

def train_eval(vectorizer, model, column, df_train, df_test):
  x_train = df_train[column]
  x_test = df_test[column]
  y_train = df_train['sentiment']
  y_test = df_test['sentiment']
  tfidf_train, tfidf_test = vectorize_text(vectorizer, x_train, x_test)

  model.fit(tfidf_train, y_train)
  y_pred = model.predict(tfidf_test)

  scores = {}
  scores.update(evaluate(y_test, y_pred))

  target_names = ['negative', 'positive']
  print("Classification Report:")
  print(classification_report(y_test, y_pred, target_names=target_names))

  return scores, model

def evaluate(y, y_pred):
    metrics = classification_report(y, y_pred, output_dict=True, zero_division=1.0)

    return {
        "precision_pos": metrics['1']['precision'],
        "recall_pos": metrics['1']['recall'],
        "f1_pos": metrics['1']['f1-score'],
        "precision_neg": metrics['0']['precision'],
        "recall_neg": metrics['0']['recall'],
        "f1_neg": metrics['0']['f1-score'],
        "precision_avg": metrics['macro avg']['precision'],
        "recall_avg": metrics['macro avg']['recall'],
        "f1_avg": metrics['macro avg']['f1-score']
    }

# **Support Vector Machine**

In [9]:
vectorizer1 = TfidfVectorizer()
model1 = svm.LinearSVC()
scores, _ = train_eval(vectorizer1, model1, 'cleaned_text_nlpid', df_train, df_test)

Classification Report:
              precision    recall  f1-score   support

    negative       0.92      0.95      0.94        61
    positive       0.95      0.92      0.93        60

    accuracy                           0.93       121
   macro avg       0.93      0.93      0.93       121
weighted avg       0.93      0.93      0.93       121



In [10]:
vectorizer2 = TfidfVectorizer()
model2 = svm.LinearSVC()
scores, _ = train_eval(vectorizer2, model2, 'cleaned_text_stanza_sastrawi', df_train2, df_test2)

Classification Report:
              precision    recall  f1-score   support

    negative       0.97      0.97      0.97        61
    positive       0.97      0.97      0.97        60

    accuracy                           0.97       121
   macro avg       0.97      0.97      0.97       121
weighted avg       0.97      0.97      0.97       121



In [None]:
# import pickle
# import os

# os.makedirs('./src/models/svm_model', exist_ok=True)

# with open('./src/models/svm_model/vectorizer1.pkl', 'wb') as f:
#     pickle.dump(vectorizer1, f)

# with open('./src/models/svm_model/vectorizer2.pkl', 'wb') as f:
#     pickle.dump(vectorizer2, f)

# with open('./src/models/svm_model/model_svm1.pkl', 'wb') as f:
#     pickle.dump(model1, f)

# with open('./src/models/svm_model/model_svm2.pkl', 'wb') as f:
#     pickle.dump(model2, f)

# **IndoBERT**

In [11]:
import pandas as pd
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding

In [12]:
def split_data(df, validation_data=False):
    X = df['cleaned_text']
    y = df['sentiment']

    if validation_data:
        X_train, X_val_test, y_train, y_val_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        X_val, X_test, y_val, y_test = train_test_split(
            X_val_test, y_val_test, test_size=0.5, random_state=42, stratify=y_val_test
        )

        print(f"Training shape: {X_train.shape}")
        print(f"Validation shape: {X_val.shape}")
        print(f"Test shape: {X_test.shape}")

        return X_train, y_train, X_val, y_val, X_test, y_test

    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        print(f"Training shape: {X_train.shape}")
        print(f"Test shape: {X_test.shape}")

        return X_train, y_train, X_test, y_test


x_train, y_train, x_val, y_val, x_test, y_test = split_data(df_modeling, validation_data=True)

Training shape: (481,)
Validation shape: (60,)
Test shape: (61,)


In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    torch_dtype="auto",
    revision="refs/pr/3",      
    low_cpu_mem_usage=True,
)


# Custom Dataset class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = tokenizer(
            text,
            add_special_tokens=True,
            max_length=128,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        item = {key: val.squeeze() for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Create datasets
train_dataset = SentimentDataset(x_train.tolist(), y_train.tolist())
val_dataset = SentimentDataset(x_val.tolist(), y_val.tolist())
test_dataset = SentimentDataset(x_test.tolist(), y_test.tolist())

# Data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Step 5: Define Training Arguments and Trainer
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    learning_rate=2e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True,
    logging_steps=10,
    report_to='none'
)

# Updated compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    report = classification_report(labels, preds, output_dict=True)
    return {
        'f1': report['weighted avg']['f1-score'],
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall']
    }

# Initialize Trainer with EarlyStoppingCallback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Stop if no improvement after 2 epochs
)

In [9]:
#Step 6: Train and Evaluate the Model
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Precision,Recall
1,0.7636,0.707796,0.0192,0.882517,0.894286,0.883333
2,0.3769,0.337588,0.0192,0.949986,0.950501,0.95
3,0.1691,0.211981,0.0192,0.949986,0.950501,0.95
4,0.1023,0.189135,0.0192,0.949986,0.950501,0.95


TrainOutput(global_step=244, training_loss=0.4670765239928589, metrics={'train_runtime': 1740.1949, 'train_samples_per_second': 1.382, 'train_steps_per_second': 0.175, 'total_flos': 126559826525184.0, 'train_loss': 0.4670765239928589, 'epoch': 4.0})

In [10]:
 # Evaluate on test set
test_results = trainer.evaluate(test_dataset)
print("Test Set Evaluation:")
print(f"F1 Score: {test_results['eval_f1']:.4f}")
print(f"Precision: {test_results['eval_precision']:.4f}")
print(f"Recall: {test_results['eval_recall']:.4f}")

# Detailed classification report
predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(-1)
y_true = test_dataset.labels
print("Classification Report for Test Set:")
print(classification_report(y_true, y_pred, target_names=label_map.keys()))

each_class = compute_metrics(predictions)

Test Set Evaluation:
F1 Score: 0.9836
Precision: 0.9841
Recall: 0.9836
Classification Report for Test Set:
              precision    recall  f1-score   support

    positive       0.97      1.00      0.98        31
    negative       1.00      0.97      0.98        30

    accuracy                           0.98        61
   macro avg       0.98      0.98      0.98        61
weighted avg       0.98      0.98      0.98        61



In [11]:
# Create folder to store model
model_path = "./src/models/indobert_model"
tokenizer_path = "./src/models/indobert_tokenizer"

# Save the model
model.save_pretrained(model_path)

# Save the tokenizer
tokenizer.save_pretrained(tokenizer_path)

('./src/models/indobert_tokenizer/tokenizer_config.json',
 './src/models/indobert_tokenizer/special_tokens_map.json',
 './src/models/indobert_tokenizer/vocab.txt',
 './src/models/indobert_tokenizer/added_tokens.json',
 './src/models/indobert_tokenizer/tokenizer.json')

In [12]:
!rm -rf ./results

python(14329) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
