In [164]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [165]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [166]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [167]:
import pandas as pd
import numpy as np

data = pd.read_csv("/content/drive/My Drive/Tugas Akhir/dataset_with_spelling_check.csv")

data.head()

Unnamed: 0,komentar,label
0,tukang ngibul jangan bagaimana enggak mau semu...,1
1,seolah negara hanya milik pkb saja parah dari ...,0
2,ketua ok dpp partai gerindra juga mengatakan a...,1
3,cara pandang ucapannya kodari kacau rusak demo...,1
4,harus tolak enggak ada yang nama nya apalagi p...,0


In [168]:
sentences = data['komentar']
labels = data['label']

In [169]:
from transformers import BertTokenizer

print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p2')

Loading BERT tokenizer...


In [170]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    'indobenchmark/indobert-base-p2', # Use the 12-layer BERT model, with an cased vocab.
    num_labels = 2, 
    output_attentions = False, # return attentions weights
    output_hidden_states = False, # returns all hidden-states
    ignore_mismatched_sizes=True
)

# memberi tau Pytorch untuk menjalankan di GPU
model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [171]:
import statistics
sent_length = []

for sent in sentences:
    # Tokenize data dan tambahkan token `[CLS]` dan `[SEP]`.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    sent_length.append(len(input_ids))

print('Average length = ', sum(sent_length)/len(sent_length)) # menghitung average panjang kalimat
print('Median length = ', statistics.median(sent_length)) # menghitung nilai tengah

Average length =  18.16103927688379
Median length =  15


In [310]:
from sklearn.model_selection import train_test_split

# Split data
test_size = 0.2

# membagi data secara acak
train_text, val_text, train_labels, val_labels = train_test_split(data['komentar'], data['label'], random_state = 1109, test_size=test_size, stratify=data['label'])

valid_text, test_text, valid_labels, test_labels = train_test_split(val_text, val_labels, random_state = 42, test_size=test_size, stratify=val_labels)


print(' training samples',len(train_text))
print(' validation samples', len(valid_text))
print('test samples', len(test_text))

 training samples 19736
 validation samples 3948
test samples 987


In [311]:
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    add_special_tokens = True,
    max_length = 30,
    padding= 'max_length',
    truncation=True,
    return_attention_mask = True,   # membangun attn. masks.
    return_tensors = 'pt',     # kembalikan menjadi pytorch tensors.
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    valid_text.tolist(),
    add_special_tokens = True,
    max_length = 30,
    padding= 'max_length',
    truncation=True,
    return_attention_mask = True,   # membangun attn. masks.
    return_tensors = 'pt',     # kembalikan menjadi pytorch tensors.
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    test_text.tolist(),
    add_special_tokens = True,
    max_length = 30,
    padding= 'max_length',
    truncation=True,
    return_attention_mask = True,   # membangun attn. masks.
    return_tensors = 'pt',     # kembalikan menjadi pytorch tensors.
)

In [312]:
# for train set
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

# for validation set
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(valid_labels.tolist())

# for test set
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

  train_seq = torch.tensor(tokens_train['input_ids'])
  train_mask = torch.tensor(tokens_train['attention_mask'])
  val_seq = torch.tensor(tokens_val['input_ids'])
  val_mask = torch.tensor(tokens_val['attention_mask'])
  test_seq = torch.tensor(tokens_test['input_ids'])
  test_mask = torch.tensor(tokens_test['attention_mask'])


In [313]:
# input_ids = []
# attention_masks = []

# for sent in sentences:
#     encoded_dict = tokenizer.encode_plus(
#                         sent,                      # kalimat yang akan di encode
#                         add_special_tokens = True, # Menambahkan token '[CLS]' dan '[SEP]'
#                         max_length = 30,           # Pad & truncate semua kalimat.
#                         padding = "max_length",
#                         truncation=True,
#                         return_attention_mask = True,   # membangun attn. masks.
#                         return_tensors = 'pt',     # kembalikan menjadi pytorch tensors.
#                    )
    
#     input_ids.append(encoded_dict['input_ids'])
#     attention_masks.append(encoded_dict['attention_mask'])

# # gabungkan semua tensor
# input_ids = torch.cat(input_ids, dim=0)
# attention_masks = torch.cat(attention_masks, dim=0)
# labels = torch.tensor(labels.tolist())

# print('Original: ', sentences[0])
# print('Token IDs:', input_ids[0])

In [314]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

batch_size = 32

# membuat dataloader untuk training dan validation sets. 
train_data = TensorDataset(train_seq, train_mask, train_y)

val_data = TensorDataset(val_seq, val_mask, val_y)

train_dataloader = DataLoader(
            train_data,  # sampel data untuk melakukan training
            sampler = RandomSampler(train_data), # memilih batch secara acak
            batch_size = batch_size # melakukan training dengan batch size yang sudah ditentukan
        )

validation_dataloader = DataLoader(
            val_data, # sampel data untuk melakukan validasi
            sampler = SequentialSampler(val_data), # memilih batch secara acak
            batch_size = batch_size #  melakukan evaluasi dengan batch size yang sudah ditentukan
        )

In [315]:
# membuat optimizer
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8
                )

In [329]:
from transformers import get_linear_schedule_with_warmup

epochs = 12

# Total number of training steps is [number of batches] x [number of epochs]. 
print('Jumlah batch :', len(train_dataloader))
total_steps = len(train_dataloader) * epochs

# membuat learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

Jumlah batch : 617


In [330]:
# Fungsi untuk menghitung akurasi hasil prediksi dibandingkan dengan label asli
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten() #menjadikan bentuknya jadi satu dimensi
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [331]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
# set initial loss to infinite
best_valid_loss = float('inf')

training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset total loss setiap epoch.
    total_train_loss = 0
    total_train_accuracy = 0
    # training model
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress update setiap 50 batch
        if step % 50 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a backward pass
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # token_type_ids is same as the "segment ids", which differentiates 
        # sentence 1 and 2 in sentence-pair tasks
        loss, logits = model(b_input_ids, 
                             token_type_ids=None,
                             return_dict = False,
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. 
        total_train_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_train_accuracy += flat_accuracy(logits, label_ids)
        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    
    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    # print("  Accuracy: {0:.2f}".format(avg_train_accuracy))
    print("  Average training loss: {0:.5f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode (batchnorm, dropout disable)
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Deactivate autograd, it will reduce memory usage and speed up computations
        # but you won’t be able to backprop (which you don’t want in an eval script).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            (loss, logits) = model(b_input_ids, 
                                   token_type_ids=None, 
                                   return_dict = False,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.5f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.5f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    #save the best model
    if avg_val_loss < best_valid_loss:
        best_valid_loss = avg_val_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'Epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Training Accuracy': avg_train_accuracy,
            'Validation Loss': avg_val_loss,
            'Validation Accuracy': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    50  of    617.    Elapsed: 0:00:09.
  Batch   100  of    617.    Elapsed: 0:00:18.
  Batch   150  of    617.    Elapsed: 0:00:28.
  Batch   200  of    617.    Elapsed: 0:00:37.
  Batch   250  of    617.    Elapsed: 0:00:46.
  Batch   300  of    617.    Elapsed: 0:00:56.
  Batch   350  of    617.    Elapsed: 0:01:05.
  Batch   400  of    617.    Elapsed: 0:01:14.
  Batch   450  of    617.    Elapsed: 0:01:24.
  Batch   500  of    617.    Elapsed: 0:01:34.
  Batch   550  of    617.    Elapsed: 0:01:43.
  Batch   600  of    617.    Elapsed: 0:01:53.

  Average training loss: 0.10733
  Training epoch took: 0:01:56

Running Validation...
  Accuracy: 0.95741
  Validation Loss: 0.15879
  Validation took: 0:00:06

Training...
  Batch    50  of    617.    Elapsed: 0:00:10.
  Batch   100  of    617.    Elapsed: 0:00:19.
  Batch   150  of    617.    Elapsed: 0:00:29.
  Batch   200  of    617.    Elapsed: 0:00:39.
  Batch   250  of    617.    Elapsed: 0:00:48.
  Batch   300

In [None]:
# Display floats with two decimal places.
pd.set_option('precision', 5)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('Epoch')

# Display the table.
df_stats

In [None]:
import matplotlib.pyplot as plt

# summarize history for accuracy
plt.plot(df_stats['Training Accuracy'], 'b-o', label="Training")
plt.plot(df_stats['Validation Accuracy'], 'g-o', label="Validation")
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.show()

In [None]:
# summarize history for loss
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Validation Loss'], 'g-o', label="Validation")
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()

In [None]:
import os

model_indobert = '/content/drive/My Drive/Tugas Akhir/model/'

print("Saving model to %s" % model_indobert)

model_to_save = model.module if hasattr(model, 'module') else model 
model_to_save.save_pretrained(model_indobert)
tokenizer.save_pretrained(model_indobert)

In [None]:
#load weights of best model
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path))

In [None]:
# get predictions for test data
with torch.no_grad():
  outputs = model(test_seq.to(device), test_mask.to(device))
  preds = outputs[0]
  preds = preds.detach().cpu().numpy()

In [None]:
preds

In [None]:
# model's performance
preds = np.argmax(preds, axis = 1)
print(classification_report(test_y, preds))

In [None]:
# confusion matrix
pd.crosstab(test_y, preds)