In [None]:
# !pip install transformers
# !pip install sentencepiece
import sentencepiece
import torch.nn.functional as F
import os
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import torch
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, BertTokenizerFast, BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import get_linear_schedule_with_warmup
import time
import matplotlib.pyplot as plt

In [None]:
# Input files for this experiment; paths are relative to the notebook's working directory.
train_csv_path = 'data/data_training_90.csv'
augmented_xlsx_path = 'data/data_baru/Augmented-Dataset/New/Cyberbullying_90_32_GPT2_augmented-Augmented-JAC-New-Text-Final.xlsx'
test_csv_path = 'data/data_testing_10.csv'

# Base training frame, the augmented samples that get merged into it in the next cell,
# and the held-out frame that is only scored after training has finished.
df = pd.read_csv(train_csv_path)
aug_df = pd.read_excel(augmented_xlsx_path)
test_df = pd.read_csv(test_csv_path)

In [None]:
# Map the augmented sheet's headers onto the 'Text' / 'Sentiment' column names that the
# tokenization cell reads, so both frames line up when stacked.
column_aliases = {'text': 'Text', 'label': 'Sentiment'}
aug_df = aug_df.rename(columns=column_aliases)

# Append the augmented rows below the original training rows and renumber the index.
# NOTE: `df` is rebuilt from itself, so running this cell twice without reloading the
# data appends the augmented rows a second time.
df = pd.concat([df, aug_df], ignore_index=True)

In [None]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint; the
# classification head is configured for two output labels.
checkpoint_name = 'Wikidepia/albert-bahasa-uncased-squad'
tokenizer = BertTokenizerFast.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)

In [None]:
# Record the desired dropout probability on the model's configuration object.
# NOTE(review): the model has already been instantiated in the previous cell, so these
# assignments only change values stored on `model.config`; the dropout layers that were
# built at load time presumably keep their original probability. If 0.5 dropout is really
# intended, pass these options to `from_pretrained` instead -- TODO confirm.
dropout_rate = 0.5
for option in ('hidden_dropout_prob', 'attention_probs_dropout_prob'):
    setattr(model.config, option, dropout_rate)

In [None]:
# Split the combined frame into training and validation parts (80 / 20, fixed seed).
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)


def _tokenize_texts(frame):
    """Tokenize the ``Text`` column of ``frame``, truncating at 512 tokens and padding."""
    return tokenizer(frame['Text'].tolist(), truncation=True, padding=True, max_length=512)


def _build_tensor_dataset(encodings, frame):
    """Bundle input ids, attention masks and ``Sentiment`` labels into a TensorDataset."""
    columns = (
        encodings['input_ids'],
        encodings['attention_mask'],
        frame['Sentiment'].tolist(),
    )
    return torch.utils.data.TensorDataset(*(torch.tensor(column) for column in columns))


train_encodings = _tokenize_texts(train_df)
val_encodings = _tokenize_texts(val_df)

# Convert the encoded data into tensor datasets.
train_dataset = _build_tensor_dataset(train_encodings, train_df)

val_dataset = _build_tensor_dataset(val_encodings, val_df)

In [None]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


In [None]:
# set training parameters
NUM_EPOCHS = 10   # full passes over the training set
LOG_EVERY = 500   # number of batches between running-loss reports

optimizer = Adam(model.parameters(), lr=2e-5, eps=1e-8)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
# The validation loader is identical for every epoch and batch order does not affect the
# metrics, so it is built once here, without shuffling.
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False)

# The linear decay has to span every optimizer step that will actually be taken.
# Sizing it for fewer epochs than the loop runs drives the learning rate to zero early
# and turns the remaining epochs into no-ops.
total_steps = len(train_loader) * NUM_EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Not used by the loop below (the model returns its own loss when given labels);
# kept so the name stays available to other cells.
loss_fn = torch.nn.CrossEntropyLoss()

# Per-epoch history: one entry is appended to each list per epoch.
# Losses are stored as the mean per-batch loss so that the training and validation
# curves are on the same scale.
loss_training = []
acurracy_training = []   # (sic) spelling kept so other cells referencing it keep working
f1_score_training = []

loss_testing = []
accuracy_testing = []
f1_score_testing = []

start_time = time.time()

for epoch in range(NUM_EPOCHS):
    print('\nEpoch:', epoch+1)
    print('Training...')
    model.train()

    epoch_loss = 0.0    # summed over the whole epoch, never reset mid-epoch
    window_loss = 0.0   # summed since the last progress report
    train_predictions = []
    train_labels = []

    for step, batch in enumerate(train_loader, start=1):
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()

        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss
        window_loss += batch_loss

        # Training-time predictions (made in train mode) feed the per-epoch training metrics.
        train_predictions.extend(outputs.logits.detach().argmax(dim=1).cpu().tolist())
        train_labels.extend(labels.cpu().tolist())

        if step % LOG_EVERY == 0:
            avg_train_loss = window_loss / LOG_EVERY
            print('Batch', step, 'of', len(train_loader), '| Average Training Loss:', avg_train_loss)
            window_loss = 0.0

    loss_training.append(epoch_loss / max(len(train_loader), 1))
    acurracy_training.append(accuracy_score(train_labels, train_predictions))
    f1_score_training.append(f1_score(train_labels, train_predictions))
    print('loss:', loss_training)


    print('Testing...')
    model.eval()

    predictions = []
    true_labels = []
    total_loss = 0.0

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (t.to(device) for t in batch)

            # Passing the labels makes the model return the loss for *this* batch;
            # reusing the `loss` left over from training would record a stale value.
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()

            # Softmax is monotonic, so the argmax of the logits is the predicted class.
            preds = torch.argmax(outputs.logits, dim=1)

            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())

    loss_testing.append(total_loss / max(len(val_loader), 1))

    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)


    print('\nAccuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 Score:', f1)

    accuracy_testing.append(accuracy)
    f1_score_testing.append(f1)

    # Cumulative wall-clock time since training started.
    end_time = time.time()
    runtime = end_time - start_time

    print(f"Waktu runtime: {runtime} detik")

# Mean validation accuracy across all epochs.
average_accuracy = np.mean(accuracy_testing)

In [None]:
# Score the trained model on the held-out test frame.
test_encodings = tokenizer(test_df['Text'].tolist(), truncation=True, padding=True, max_length=512)

test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']),
                                              torch.tensor(test_encodings['attention_mask']),
                                              torch.tensor(test_df['Sentiment'].tolist()))

# Batch order does not affect the metrics, so evaluation runs without shuffling.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

# Put the model in evaluation mode explicitly instead of relying on the state
# left behind by the training cell.
model.eval()

predictions = []
true_labels = []
total_loss = 0.0

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        # Labels are passed so the model returns the loss for this batch; the `loss`
        # variable left over from training must not be reused here.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        total_loss += outputs.loss.item()

        # Softmax is monotonic, so the argmax of the logits is the predicted class.
        preds = torch.argmax(logits, dim=1)

        predictions.extend(preds.cpu().tolist())
        true_labels.extend(labels.cpu().tolist())

# Mean per-batch loss on the test set. It is kept in its own variable rather than
# appended to `loss_testing`, which holds one validation entry per training epoch and
# would otherwise end up longer than the accuracy/F1 histories (and grow on every re-run).
test_loss = total_loss / max(len(test_loader), 1)

accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)


# `average_accuracy` comes from the training cell: the mean validation accuracy over epochs.
print("Average accuracy during cross-validation:", average_accuracy)
print('Test Loss:', test_loss)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)