In [1]:
# source https://www.youtube.com/watch?v=pEMe2d0MlTg
# install library
!pip install transformers



In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Jan 23 13:13:13 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
from psutil import virtual_memory

# Total memory reported by psutil, expressed in decimal gigabytes (bytes / 1e9).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
runtime_message = 'You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime'
print(runtime_message)

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


In [4]:
import torch
import pandas as pd

# Pick the compute device once; later cells move tensors to `device`.
if not torch.cuda.is_available():
  print("No GPU available, using the CPU instead")
  device = torch.device("cpu")
else:
  device = torch.device('cuda')
  print('there are %d GPU(s) available.' % torch.cuda.device_count())
  print('we will use the GPU: ', torch.cuda.get_device_name(0))

there are 1 GPU(s) available.
we will use the GPU:  Tesla T4


# Load dataset

In [5]:
# Read the labelled training tweets.
df = pd.read_csv("train.csv")

# Encode the `bully` column numerically: "no" -> 0, "yes" -> 1.
df['bully'] = df['bully'].replace({"no": 0, "yes": 1})
df

Unnamed: 0,bully,tweet,individual,group,gender,physical,race,religion
0,0,USER terimakasih Ustadz sudah bersuara tentang...,0,0,0,0,0,0
1,0,USER USER Maaf sebenarnya twiter pertama kali ...,0,0,0,0,0,0
2,1,USER Anjing tai goblok idiot bangsat monyet ba...,1,0,1,1,0,0
3,0,"Hadiri Lokakarya Kebudayaan Daerah, Bupati Rup...",0,0,0,0,0,0
4,1,USER USER USER yg kaya gini layak di tangkap.,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
10530,1,USER USER USER kyak dah lahir aja ngomong soeh...,0,1,0,0,0,0
10531,1,USER USER USER USER USER Pencitraan Lu...ah......,1,0,0,0,0,0
10532,1,USER USER USER USER USER cebong doang yg tukan...,0,1,0,0,0,0
10533,1,USER BIASA LAH PROF KAUM KAMBING BENGEK YA BEG...,0,1,0,0,0,0


In [6]:
# Raw tweet texts and their 0/1 bully labels as arrays, in the same row order.
sentences = df["tweet"].values
labels = df["bully"].values

# Load BERT Tokenizer

In [7]:
from transformers import BertTokenizer, AutoModel

# Tokenizer matching the IndoBERT checkpoint that is fine-tuned later in this notebook.
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-large-p2")
# No model is loaded here: the only model the notebook uses is the
# BertForSequenceClassification instance built further down, and loading a
# second copy of the large checkpoint would only cost RAM.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [8]:
# Write the tokenizer files to /content; the call returns the paths it wrote.
tokenizer.save_pretrained("/content")

('/content/tokenizer_config.json',
 '/content/special_tokens_map.json',
 '/content/vocab.txt',
 '/content/added_tokens.json')

In [9]:
# List the working directory to confirm the tokenizer files were written.
ls

[0m[01;34msample_data[0m/  special_tokens_map.json  tokenizer_config.json  train.csv  vocab.txt


In [10]:
# Show how the first tweet is split into word pieces and mapped to vocabulary ids.
first_tokens = tokenizer.tokenize(sentences[0])
print("Original: ", sentences[0])
print("Tokenized: ", first_tokens)
print("Token IDS: ", tokenizer.convert_tokens_to_ids(first_tokens))

Original:  USER terimakasih Ustadz sudah bersuara tentang Radikal radikal ini. Entah apa yang ada dalam pikiran rejim. Mesjid radikal...kampus radikal....dosen radikal....padahal tempat tersebut pijakan peradaban. Memangnya mau menghancurkan Indonesia ?
Tokenized:  ['user', 'terimakasih', 'ustadz', 'sudah', 'bersuara', 'tentang', 'radikal', 'radikal', 'ini', '.', 'entah', 'apa', 'yang', 'ada', 'dalam', 'pikiran', 'rej', '##im', '.', 'mesjid', 'radikal', '.', '.', '.', 'kampus', 'radikal', '.', '.', '.', '.', 'dosen', 'radikal', '.', '.', '.', '.', 'padahal', 'tempat', 'tersebut', 'pijakan', 'peradaban', '.', 'memangnya', 'mau', 'menghancurkan', 'indonesia', '?']
Token IDS:  [6273, 5196, 9884, 259, 19117, 416, 10640, 10640, 92, 30470, 4303, 387, 34, 176, 112, 2865, 27078, 95, 30470, 13192, 10640, 30470, 30470, 30470, 4281, 10640, 30470, 30470, 30470, 30470, 4362, 10640, 30470, 30470, 30470, 30470, 2234, 515, 256, 24972, 7156, 30470, 26907, 422, 6844, 300, 30477]


In [11]:
# Add CLS and SEP index
input_ids = []

for sent in sentences:
  encoded_sent = tokenizer.encode(
      sent,
      add_special_tokens = True
  )
  input_ids.append(encoded_sent)

print("Original: ", sentences[0])
print("Token IDs: ", input_ids[0])



Original:  USER terimakasih Ustadz sudah bersuara tentang Radikal radikal ini. Entah apa yang ada dalam pikiran rejim. Mesjid radikal...kampus radikal....dosen radikal....padahal tempat tersebut pijakan peradaban. Memangnya mau menghancurkan Indonesia ?
Token IDs:  [2, 6273, 5196, 9884, 259, 19117, 416, 10640, 10640, 92, 30470, 4303, 387, 34, 176, 112, 2865, 27078, 95, 30470, 13192, 10640, 30470, 30470, 30470, 4281, 10640, 30470, 30470, 30470, 30470, 4362, 10640, 30470, 30470, 30470, 30470, 2234, 515, 256, 24972, 7156, 30470, 26907, 422, 6844, 300, 30477, 3]


In [12]:
# Longest encoded tweet (in tokens); used to pick the padding length.
print("Max sentence length: ", max(map(len, input_ids)))

Max sentence length:  112


In [13]:
from keras.preprocessing.sequence import pad_sequences

MAX_LEN = 115 # pick a value above the max sentence length so that no information is lost

print("Padding/truncating all sentences to %d values" % MAX_LEN)
print('Padding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Pad (or cut) every id sequence at its end so all rows have exactly MAX_LEN entries;
# the filler value 0 is the id printed above for the padding token.
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype='long',
    value=0,
    padding='post',
    truncating='post',
)

print("Done")

Padding/truncating all sentences to 115 values
Padding token: "[PAD]", ID: 0
Done


In [14]:
# Inspect the first padded row: token ids followed by trailing 0 padding.
input_ids[0]

array([    2,  6273,  5196,  9884,   259, 19117,   416, 10640, 10640,
          92, 30470,  4303,   387,    34,   176,   112,  2865, 27078,
          95, 30470, 13192, 10640, 30470, 30470, 30470,  4281, 10640,
       30470, 30470, 30470, 30470,  4362, 10640, 30470, 30470, 30470,
       30470,  2234,   515,   256, 24972,  7156, 30470, 26907,   422,
        6844,   300, 30477,     3,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0])

In [15]:
# One mask row per sequence: 1 where the token id is positive (a real token), 0 for padding.
attention_mask = [
  [int(token_id > 0) for token_id in row]
  for row in input_ids
]

# Prepare the data

In [16]:
from sklearn.model_selection import train_test_split

# Split token ids, attention masks and labels in ONE call per stage so that the
# three arrays are guaranteed to be partitioned with the same row indices.
# (Previously masks were split in separate calls, one of which passed the masks
# themselves in place of the labels.)

# Stage 1: hold out 10% of all rows as the test set.
(train_input, test_input,
 train_mask, test_mask,
 train_labels, test_labels) = train_test_split(input_ids,
                                               attention_mask,
                                               labels,
                                               random_state=2017,
                                               test_size=0.1)

# Stage 2: carve 15% of the remaining rows out as the validation set.
(train_input, validation_input,
 train_mask, validation_mask,
 train_labels, validation_labels) = train_test_split(train_input,
                                                     train_mask,
                                                     train_labels,
                                                     random_state=2018,
                                                     test_size=0.15)

In [17]:
import numpy as np

# (heading, inputs, labels, mask) for each split; the masks are plain lists at
# this point, hence the np.array(...) wrapper before reading .shape.
split_report = [
  ("== Train ==", train_input, train_labels, train_mask),
  ("\n== Validation ==", validation_input, validation_labels, validation_mask),
  ("\n== Test ==", test_input, test_labels, test_mask),
]

for heading, split_input, split_labels, split_mask in split_report:
  print(heading)
  print("Input: ", split_input.shape)
  print("Label: ", split_labels.shape)
  print("Mask: ", np.array(split_mask).shape)

== Train ==
Input:  (8058, 115)
Label:  (8058,)
Mask:  (8058, 115)

== Validation ==
Input:  (1423, 115)
Label:  (1423,)
Mask:  (1423, 115)

== Test ==
Input:  (1054, 115)
Label:  (1054,)
Mask:  (1054, 115)


In [18]:
# Convert every split (inputs, labels, masks) into torch tensors, keeping the same names.
train_input, train_labels, train_mask = [
  torch.tensor(part) for part in (train_input, train_labels, train_mask)
]

validation_input, validation_labels, validation_mask = [
  torch.tensor(part) for part in (validation_input, validation_labels, validation_mask)
]

test_input, test_labels, test_mask = [
  torch.tensor(part) for part in (test_input, test_labels, test_mask)
]

In [19]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 20


def _make_loader(dataset, sampler):
  """Wrap `dataset` in a DataLoader that draws batches of `batch_size` through `sampler`."""
  return DataLoader(dataset, sampler=sampler, batch_size=batch_size)


# Training batches are drawn in random order.
train_data = TensorDataset(train_input, train_mask, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = _make_loader(train_data, train_sampler)

# Validation and test batches are read sequentially.
validation_data = TensorDataset(validation_input, validation_mask, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = _make_loader(validation_data, validation_sampler)

test_data = TensorDataset(test_input, test_mask, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = _make_loader(test_data, test_sampler)

# Prepare the pre-trained BERT model

In [20]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# IndoBERT encoder with a two-class classification head; attentions and hidden
# states are not returned, only loss/logits.
model = BertForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-large-p2",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False
)

# Move the model to the device selected earlier (GPU when available, CPU otherwise).
# model.cuda() would raise on a CPU-only runtime even though a CPU fallback was chosen.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-large-p2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,

In [21]:
params = list(model.named_parameters())

print("The BERT model has {:} different named parameters.".format(len(params)))


def _show_params(heading, named_params):
  """Print `heading`, then one aligned `name  shape` row per (name, tensor) pair."""
  print(heading)
  for name, tensor in named_params:
    print("{:<60} {:>12}".format(name, str(tuple(tensor.size()))))


_show_params("==== Embedding Layer ====", params[0:5])
_show_params("==== First Transformers ====", params[5:21])
_show_params("==== Output Layer ====", params[-4:])

The BERT model has 393 different named parameters.
==== Embedding Layer ====
bert.embeddings.word_embeddings.weight                       (30522, 1024)
bert.embeddings.position_embeddings.weight                    (512, 1024)
bert.embeddings.token_type_embeddings.weight                    (2, 1024)
bert.embeddings.LayerNorm.weight                                  (1024,)
bert.embeddings.LayerNorm.bias                                    (1024,)
==== First Transformers ====
bert.encoder.layer.0.attention.self.query.weight             (1024, 1024)
bert.encoder.layer.0.attention.self.query.bias                    (1024,)
bert.encoder.layer.0.attention.self.key.weight               (1024, 1024)
bert.encoder.layer.0.attention.self.key.bias                      (1024,)
bert.encoder.layer.0.attention.self.value.weight             (1024, 1024)
bert.encoder.layer.0.attention.self.value.bias                    (1024,)
bert.encoder.layer.0.attention.output.dense.weight           (1024, 1024)
bert.

In [22]:
# tuning parameter
# Use PyTorch's AdamW: the AdamW class exported by `transformers` is deprecated
# in favour of torch.optim.AdamW.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 2e-5, # default 2e-5 0.00002
    eps = 1e-8,
    weight_decay = 0.0 # torch defaults to 0.01; 0.0 keeps the no-decay behaviour used before
)



In [23]:
from transformers import get_linear_schedule_with_warmup

epochs = 1

# The scheduler is stepped once per training batch, so it spans
# (batches per epoch) * epochs steps in total.
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps,
)

In [24]:
import numpy as np

def flat_accuracy(preds, labels):
  """Return the fraction of rows whose highest-scoring class equals the label.

  preds:  2-D array of per-class scores (logits), one row per example.
  labels: array of integer class ids, one per example.
  """
  # Turn the logits into predicted labels by taking the class with the highest score.
  predicted = np.argmax(preds, axis=1).flatten()
  expected = labels.flatten()
  matches = predicted == expected
  return np.sum(matches) / len(expected)

In [25]:
import time
import datetime

def format_time(elapsed):
  """Format a duration given in seconds as an h:mm:ss string, rounded to whole seconds."""
  whole_seconds = int(round(elapsed))
  duration = datetime.timedelta(seconds=whole_seconds)
  return str(duration)

# Training BERT

In [26]:
# https://stackoverflow.com/questions/59129812/how-to-avoid-cuda-out-of-memory-in-pytorch

import random

# Seed every RNG in play (Python, NumPy, PyTorch CPU and CUDA) so runs are repeatable.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch.
loss_values = []

for epoch_i in range(0, epochs):

  # ===================================
  #              Training
  # ===================================

  print("======= Epoch {:} / {:} =======".format(epoch_i+1, epochs))
  print("Training...")

  t0 = time.time()

  total_loss = 0

  model.train()

  # For each batch of training data
  for step, batch in enumerate(train_dataloader):

    # Progress update every 40 batches
    if step % 40 == 0 and not step == 0:
      elapsed = format_time(time.time() - t0)

      print("Batch {:>5,} of {:>5,}.     Elapsed: {:}".format(step, len(train_dataloader), elapsed))

    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)


    ## update parameter start ##
    # Reset gradients to zero first: the update relies on gradients, so the ones
    # left over from the previous step must be cleared before new ones are computed.
    model.zero_grad()

    # Forward pass (labels are passed, so the model also returns the loss)
    outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    labels=b_labels)

    # Loss produced by the forward pass
    loss = outputs[0]

    total_loss += loss.item()

    # Backward pass: from the loss we obtain the gradient of every parameter that
    # contributed to it; those gradients are then used to update the parameters.
    loss.backward()

    # Clip the gradient norm to 1.0 so the gradients cannot explode
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # Update the parameters from the gradients computed above
    optimizer.step()

    # Advance the learning-rate schedule so the rate does not stay constant
    scheduler.step()

    ## update parameter end ##

  avg_train_loss = total_loss / len(train_dataloader)

  loss_values.append(avg_train_loss)

  print("   Average training loss: {0:.2f}".format(avg_train_loss))
  print("   Training epoch took: {:}".format(format_time(time.time() - t0)))

  # ===================================
  #             Validation
  # ===================================

  print("Running Validation...")

  t0 = time.time()

  model.eval()

  # Count correct predictions and examples seen, so the reported accuracy is
  # per example. Averaging per-batch accuracies would over-weight a short last batch.
  eval_correct, nb_eval_examples = 0, 0

  for batch in validation_dataloader:

    # Move the validation batch to the device
    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask, b_labels = batch

    # No gradient tracking: it is not used for validation, and skipping it is faster
    with torch.no_grad():
      outputs = model(b_input_ids,
                      token_type_ids=None,
                      attention_mask=b_input_mask)

    # Per-class scores; an example is classified into the class with the larger score
    logits = outputs[0]

    # Bring the logits back to the CPU as a numpy array
    logits = logits.detach().cpu().numpy()

    # Bring the labels back to the CPU as a numpy array
    label_ids = b_labels.to('cpu').numpy()

    # Accumulate the number of correct predictions in this batch
    batch_predictions = np.argmax(logits, axis=1).flatten()
    eval_correct += int(np.sum(batch_predictions == label_ids.flatten()))

    nb_eval_examples += len(label_ids)

  # max(..., 1) avoids a division by zero if the validation loader is empty
  eval_accuracy = eval_correct / max(nb_eval_examples, 1)

  print("   Accuracy: {0:.2f}".format(eval_accuracy))
  print("   Validation took: {:}".format(format_time(time.time() - t0)))

print("Training complete!")

Training...
Batch    40 of   403.     Elapsed: 0:00:50
Batch    80 of   403.     Elapsed: 0:01:43
Batch   120 of   403.     Elapsed: 0:02:35
Batch   160 of   403.     Elapsed: 0:03:28
Batch   200 of   403.     Elapsed: 0:04:20
Batch   240 of   403.     Elapsed: 0:05:13
Batch   280 of   403.     Elapsed: 0:06:06
Batch   320 of   403.     Elapsed: 0:06:58
Batch   360 of   403.     Elapsed: 0:07:50
Batch   400 of   403.     Elapsed: 0:08:43
   Average training loss: 0.33
   Training epoch took: 0:08:47
Running Validation...
   Accuracy: 0.89
   Validation took: 0:00:34
Training complete!


In [27]:
# Serialize the whole `model` object (architecture and weights) to disk.
# NOTE(review): only `model` is passed to torch.save, so optimizer and scheduler
# state are NOT part of this file.
torch.save(model, '/content/model.pth')

# Predict and evaluate

In [29]:
print("Predicting labels for {:,} test sentences".format(len(test_input)))

model.eval()

# Per-batch logits and per-batch true labels, in dataloader order.
prediction, true_labels = [], []

for batch in test_dataloader:
  batch = tuple(t.to(device) for t in batch)

  b_input_ids, b_input_mask, b_labels = batch

  # Inference only: no gradient tracking needed
  with torch.no_grad():
    outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask)

  logits = outputs[0]

  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  prediction.append(logits)
  true_labels.append(label_ids)

# Save prediction
# Write the predicted class of EVERY test row. The previous code wrote
# `label_ids`, i.e. the true labels of the last batch only, and overwrote `df`.
predicted_labels = np.argmax([row for batch_logits in prediction for row in batch_logits], axis=1)
pred_df = pd.DataFrame({'label': predicted_labels}).reset_index()
pred_df.to_csv('pred.txt', index=False)

print(" DONE.")

Predicting labels for 1,054 test sentences
 DONE.


In [30]:
from sklearn.metrics import matthews_corrcoef

# Flatten the per-batch logits into one row per example, then take the
# highest-scoring class as the predicted label.
flat_prediction = [row for batch_logits in prediction for row in batch_logits]
flat_prediction = np.argmax(flat_prediction, axis=1).flatten()

# Flatten the per-batch true labels the same way.
flat_true_labels = [label for batch_labels in true_labels for label in batch_labels]

# Matthews correlation coefficient between true and predicted labels.
mcc = matthews_corrcoef(flat_true_labels, flat_prediction)

print("MCC: %.3f" % mcc)

MCC: 0.787


In [31]:
from sklearn.metrics import accuracy_score

# Share of test examples whose predicted label equals the true label.
acc = accuracy_score(flat_true_labels, flat_prediction)

print("ACC: %.3f" % acc)


ACC: 0.895


# Predict New Dataset

In [32]:
# convert test dataset to input_ids
df_test = pd.read_csv("test.csv")

sentences_test = df_test.tweet.values
id_test = df_test.id.values

# Encode every tweet with the CLS and SEP ids added.
input_ids_test = [
  tokenizer.encode(sent, add_special_tokens = True)
  for sent in sentences_test
]

# Pad/cut each sequence at its end to the MAX_LEN used for the training data.
input_ids_test = pad_sequences(
    input_ids_test,
    maxlen=MAX_LEN,
    dtype='long',
    value=0,
    padding='post',
    truncating='post',
)


# create attention mask: 1 for positive (real) token ids, 0 for padding
attention_mask_test = [
  [int(token_id > 0) for token_id in row]
  for row in input_ids_test
]

In [33]:
# Rebind test_input / test_mask to tensors built from the test.csv encodings.
test_input, test_mask = torch.tensor(input_ids_test), torch.tensor(attention_mask_test)

In [34]:
# Dataset of (input ids, attention mask) pairs only; no label tensor is included.
test_data = TensorDataset(test_input, test_mask)
test_sampler = SequentialSampler(test_data)
# Sequential order, so predictions line up with the rows of the input.
test_dataloader = DataLoader(
    test_data,
    sampler=test_sampler,
    batch_size=batch_size,
)

In [37]:
# Make sure dropout etc. are disabled regardless of what ran before this cell.
model.eval()

# Batches of this dataloader hold only input ids and attention masks, so there
# are no true labels to collect. `true_labels` is reset to an empty list and kept
# only so that code referencing the name still works. The previous version read a
# stale `b_labels` left over from an earlier loop.
prediction, true_labels = [], []

for batch in test_dataloader:
  batch = tuple(t.to(device) for t in batch)

  b_input_ids, b_input_mask = batch

  # Inference only: no gradient tracking needed
  with torch.no_grad():
    outputs = model(b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask)

  logits = outputs[0]

  logits = logits.detach().cpu().numpy()

  prediction.append(logits)

In [38]:
from sklearn.metrics import matthews_corrcoef

# Flatten the per-batch logits into one row per example and take the
# highest-scoring class as the predicted label.
flat_prediction = [row for batch_logits in prediction for row in batch_logits]
flat_prediction = np.argmax(flat_prediction, axis=1).flatten()

# NOTE(review): the batches predicted here contain no label tensor, so whatever
# `true_labels` holds does not describe these predictions.
flat_true_labels = [label for batch_labels in true_labels for label in batch_labels]
flat_prediction

array([0, 0, 0, ..., 1, 0, 0])

In [39]:
# Number of predicted labels (one per encoded input row).
len(flat_prediction)

2634