In [1]:
# Importing libraries for data preparation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.data import Dataset
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import copy

# Data Preparation

In [2]:
df_train = pd.read_csv("X_train_stemm_cleaned.csv")
df_valid = pd.read_csv("X_valid_stemm_cleaned.csv")
df_test = pd.read_csv("X_test_stemm_cleaned.csv")

X_train_clean = df_train['tweet_stemming']
X_valid_clean = df_valid['tweet_stemming']
X_test_clean = df_test['tweet_stemming']

y_train = pd.read_csv("y_train.csv")
y_valid = pd.read_csv("y_valid.csv")
y_test = pd.read_csv("y_test.csv")

In [3]:
# For SparseCategoricalCrossentropy
with open('y_train_encoded.pkl', 'rb') as file:
    y_train_encoded = pickle.load(file)

with open('y_valid_encoded.pkl', 'rb') as file:
    y_valid_encoded = pickle.load(file)

with open('y_test_encoded.pkl', 'rb') as file:
    y_test_encoded = pickle.load(file)

# Model

In [4]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1', do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

In [5]:
max_length = 144
def tokenize_sentences(X):
    input_ids = []
    attention_masks = []

    for sent in X:
        encoded_dict = tokenizer.encode_plus(
                            sent,
                            add_special_tokens = True,
                            max_length = max_length,
                            padding = 'max_length',
                            return_attention_mask = True,
                            return_tensors = 'pt',
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

In [6]:
# Tokenize training, validation, and test sets
X_train_ids, X_train_masks = tokenize_sentences(X_train_clean)
X_valid_ids, X_valid_masks = tokenize_sentences(X_valid_clean)
X_test_ids, X_test_masks = tokenize_sentences(X_test_clean)

In [7]:
# Split data into train, validation, and test sets
dataset_train = TensorDataset(X_train_ids, X_train_masks, torch.tensor(y_train_encoded).type(torch.LongTensor))
dataset_valid = TensorDataset(X_valid_ids, X_valid_masks, torch.tensor(y_valid_encoded).type(torch.LongTensor))
dataset_test = TensorDataset(X_test_ids, X_test_masks, torch.tensor(y_test_encoded).type(torch.LongTensor))

In [8]:
batch_size = 16
train_dataloader = DataLoader(dataset_train, sampler=torch.utils.data.RandomSampler(dataset_train), batch_size=batch_size)
val_dataloader = DataLoader(dataset_valid, sampler=torch.utils.data.SequentialSampler(dataset_valid), batch_size=batch_size)
test_dataloader = DataLoader(dataset_test, sampler=torch.utils.data.SequentialSampler(dataset_test), batch_size=batch_size)

In [11]:
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', num_labels=3)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 50
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Define device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print("device:",device)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device: cuda


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Training

In [12]:
# Training loop
model.train()
best_model_wts = copy.deepcopy(model.state_dict())
best_loss = float('inf')
patience = 10  # Number of epochs with no improvement after which training will be stopped
patience_counter = 0

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}

        model.zero_grad()

        outputs = model(**inputs)

        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Print loss every 10 steps
        if step % 10 == 0 and step != 0:
            print(f"  Step {step}/{len(train_dataloader)} - Loss: {loss.item():.4f}")

    avg_epoch_loss = total_loss / len(train_dataloader)
    print(f"Average loss for epoch {epoch + 1}: {avg_epoch_loss:.4f}")

    # Validation step (assuming you have a validation dataloader named val_dataloader)
    model.eval()
    val_loss = 0
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}

        with torch.no_grad():
            outputs = model(**inputs)
            loss = outputs[0]
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Average validation loss for epoch {epoch + 1}: {avg_val_loss:.4f}")

    model.train()

    # Check if we have a new best model
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        best_model_wts = copy.deepcopy(model.state_dict())
        patience_counter = 0
    else:
        patience_counter += 1
    print("patience_counter:",patience_counter)
    # Check if early stopping should be applied
    if patience_counter >= patience:
        print("Early stopping triggered")
        break

# Load best model weights
model.load_state_dict(best_model_wts)

# Save the model
torch.save(model.state_dict(), 'indobert_model.bin')

Epoch 1/50
  Step 10/80 - Loss: 0.9814
  Step 20/80 - Loss: 1.0772
  Step 30/80 - Loss: 1.0396
  Step 40/80 - Loss: 1.0566
  Step 50/80 - Loss: 0.9092
  Step 60/80 - Loss: 1.1607
  Step 70/80 - Loss: 1.0211
Average loss for epoch 1: 0.9277
Average validation loss for epoch 1: 0.7464
patience_counter: 0
Epoch 2/50
  Step 10/80 - Loss: 0.6939
  Step 20/80 - Loss: 0.4117
  Step 30/80 - Loss: 0.6217
  Step 40/80 - Loss: 0.7116
  Step 50/80 - Loss: 0.8535
  Step 60/80 - Loss: 0.6563
  Step 70/80 - Loss: 0.7078
Average loss for epoch 2: 0.6374
Average validation loss for epoch 2: 0.7360
patience_counter: 0
Epoch 3/50
  Step 10/80 - Loss: 0.4362
  Step 20/80 - Loss: 0.2858
  Step 30/80 - Loss: 0.1803
  Step 40/80 - Loss: 0.5811
  Step 50/80 - Loss: 0.6065
  Step 60/80 - Loss: 0.2401
  Step 70/80 - Loss: 0.6389
Average loss for epoch 3: 0.4086
Average validation loss for epoch 3: 0.8943
patience_counter: 1
Epoch 4/50
  Step 10/80 - Loss: 0.1825
  Step 20/80 - Loss: 0.1806
  Step 30/80 - Loss: 

# Evaluation

In [13]:
# Load the model state dictionary from the file
model_state_dict = torch.load('indobert_model.bin')

# Load the state dictionary into the model
model.load_state_dict(model_state_dict)

<All keys matched successfully>

In [14]:
# Evaluation loop
model.eval()

y_true = []
y_pred = []

for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}

        outputs = model(**inputs)

    logits = outputs[1]
    logits = logits.detach().cpu().numpy()
    label_ids = inputs['labels'].cpu().numpy()

    y_pred.extend(logits.argmax(axis=-1))
    y_true.extend(label_ids)

# # Convert predictions and true labels back to original labels
# y_pred = label_encoder.inverse_transform(y_pred)
# y_true = label_encoder.inverse_transform(y_true)

def mapping_sentiment(x):
    if x == 2:
        return "positif"
    elif x == 0:
        return "negatif"
    else:
        return "netral"

y_pred = [mapping_sentiment(x) for x in y_pred]
y_true = [mapping_sentiment(x) for x in y_true]

# Print classification report
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

     negatif       0.58      0.78      0.67        89
      netral       0.70      0.52      0.59        91
     positif       0.62      0.59      0.60        92

    accuracy                           0.62       272
   macro avg       0.64      0.63      0.62       272
weighted avg       0.64      0.62      0.62       272

