In [1]:
import os

import pandas as pd
import torch
import wandb
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments,EarlyStoppingCallback

### Check GPU

In [2]:
# Check for a GPU: use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


### Load Dataset

In [3]:
# Load one dataset split from CSV
def load_data(file_path):
    """Read a sentiment CSV and return its comments and integer-encoded labels.

    The file must contain a ``comment`` column (the text) and a ``label``
    column whose values are exactly ``"Positive"``, ``"Negative"`` or
    ``"Neutral"``.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists: ``texts`` holds
        ``str`` comments and ``labels`` holds ``int`` class ids
        (Positive=0, Negative=1, Neutral=2).

    Raises:
        ValueError: If the ``label`` column contains a value outside the
            label map (including a missing label).
    """
    label_map = {"Positive": 0, "Negative": 1, "Neutral": 2}
    df = pd.read_csv(file_path)

    # Encode labels. Series.map yields NaN for anything not in label_map; fail
    # loudly here instead of letting NaN/float labels reach the loss function.
    encoded = df['label'].map(label_map)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = df.loc[unmapped, 'label'].unique().tolist()
        raise ValueError(
            f"Unknown label value(s) in {file_path!r}: {unknown}; "
            f"expected one of {list(label_map)}"
        )

    # The tokenizer only accepts strings: empty CSV cells are read as NaN and
    # numeric-looking cells as numbers, so normalise everything to str.
    texts = df['comment'].fillna("").astype(str).tolist()
    labels = encoded.astype(int).tolist()
    return texts, labels

In [4]:
# Read the three splits from the Kaggle input dataset.
# NOTE: the "processsed" spelling in the val/test paths is kept exactly as-is;
# these are file names on disk, not prose.
train_texts, train_labels = load_data(
    file_path="/kaggle/input/data-sentiment/train_processed.csv"
)
val_texts, val_labels = load_data(
    file_path="/kaggle/input/data-sentiment/val_processsed.csv"
)
test_texts, test_labels = load_data(
    file_path="/kaggle/input/data-sentiment/test_processsed.csv"
)

### Tokenizer

In [5]:
# Tokenizer for the multilingual cased BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "google-bert/bert-base-multilingual-cased"
)

def encode_data(texts, labels, tokenizer, max_length=512, padding="max_length"):
    """Tokenize ``texts`` and pack ``labels`` into an int64 tensor.

    Args:
        texts: List of strings to tokenize.
        labels: Class ids aligned with ``texts`` (a list, or an existing
            tensor when the calling cell is re-run).
        tokenizer: Callable invoked as
            ``tokenizer(texts, max_length=..., padding=..., truncation=True,
            return_tensors="pt")`` that returns a mapping with ``input_ids``
            and ``attention_mask`` entries.
        max_length: Truncation length (and pad length when
            ``padding="max_length"``).
        padding: Padding strategy forwarded to the tokenizer. The default,
            ``"max_length"``, preserves the original behaviour; ``"longest"``
            pads only to the longest sequence in ``texts``.

    Returns:
        ``(input_ids, attention_mask, labels)`` where ``labels`` is a
        ``torch.long`` tensor.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, "
            f"got {len(texts)} and {len(labels)}"
        )

    inputs = tokenizer(
        texts,
        max_length=max_length,
        padding=padding,
        truncation=True,
        return_tensors="pt"
    )

    # Force int64 instead of relying on dtype inference: an empty or float-coded
    # label list would otherwise produce a float tensor. as_tensor also accepts
    # an existing tensor without the copy-construct warning torch.tensor emits
    # when the cell is re-run.
    label_tensor = torch.as_tensor(labels, dtype=torch.long)
    return inputs['input_ids'], inputs['attention_mask'], label_tensor

# Tokenize every split. Note that the *_labels names are rebound here from
# Python lists to tensors.
train_inputs, train_masks, train_labels = encode_data(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer
)
val_inputs, val_masks, val_labels = encode_data(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer
)
test_inputs, test_masks, test_labels = encode_data(
    texts=test_texts, labels=test_labels, tokenizer=tokenizer
)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

### Create Sentiment Dataset

In [6]:
# Dataset wrapper around the pre-tokenized inputs
class SentimentDataset(Dataset):
    """Index-aligned view over token ids, attention masks and labels.

    Each item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``labels``, taken from position ``idx`` of the three containers passed to
    the constructor.
    """

    # Item keys, in the order they appear in every returned dict; each key is
    # also the name of the attribute it is read from.
    _FIELDS = ('input_ids', 'attention_mask', 'labels')

    def __init__(self, input_ids, attention_mask, labels):
        """Store the three parallel containers (no copy is made)."""
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the length of ``labels``."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of its three fields."""
        return {field: getattr(self, field)[idx] for field in self._FIELDS}

# Wrap each encoded split in a SentimentDataset.
train_dataset = SentimentDataset(
    input_ids=train_inputs, attention_mask=train_masks, labels=train_labels
)
val_dataset = SentimentDataset(
    input_ids=val_inputs, attention_mask=val_masks, labels=val_labels
)
test_dataset = SentimentDataset(
    input_ids=test_inputs, attention_mask=test_masks, labels=test_labels
)

### Load BERT Model

In [7]:
# Load multilingual BERT with a sequence-classification head;
# num_labels=3 matches the three classes in load_data's label_map.
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-cased",
    num_labels=3,
)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Compute Metrics

In [8]:
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and
        ``f1-score``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(axis=-1)

    # zero_division=0 keeps the score sklearn already falls back to (0) when a
    # class is never predicted or never present, but without emitting an
    # UndefinedMetricWarning on every evaluation while the model is still
    # collapsing onto a subset of the classes.
    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted', zero_division=0)
    recall = recall_score(labels, preds, average='weighted', zero_division=0)
    f1 = f1_score(labels, preds, average='weighted', zero_division=0)

    # Return the metrics as a dictionary
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1-score": f1
    }

### Training Configuration

In [9]:
# Configure W&B logging, early stopping and the TrainingArguments.
#
# SECURITY: the W&B API key must never be hardcoded in the notebook. It is read
# from the WANDB_API_KEY environment variable; when that is unset, key=None lets
# wandb fall back to its own credential lookup (e.g. an existing login).
# NOTE(review): the key that used to be written in this cell has been exposed
# and should be revoked and rotated.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Stop once the monitored metric has failed to improve by more than 0.001 for
# 2 consecutive evaluations.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.001,
)

training_args = TrainingArguments(
    # NOTE(review): directory name says "question_answer" but this notebook
    # trains a sentiment classifier; kept unchanged so existing checkpoints and
    # run names still match.
    output_dir="./bert_question_answer",
    # Evaluate, checkpoint and log once per epoch (evaluation and save
    # strategies are kept identical alongside load_best_model_at_end).
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    # 34 samples x 8 accumulation steps = 272 samples per optimizer step
    # (per device).
    per_device_train_batch_size=34,
    per_device_eval_batch_size=34,
    gradient_accumulation_steps=8,
    num_train_epochs=15,
    disable_tqdm=False,
    weight_decay=0.25,
    save_total_limit=3,
    optim="adamw_hf",
    fp16=True,
    max_grad_norm=0.6,
    warmup_ratio=0.2,
    # NOTE(review): the inputs in this notebook are all padded to the same
    # max_length, so length grouping presumably has no effect here - confirm.
    group_by_length=True,
    report_to="wandb",
    load_best_model_at_end=True,
    label_names=['labels'],
    lr_scheduler_type="linear"
)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


### Fine-Tuning

In [10]:
# Build the Trainer: fine-tune on the training split, evaluate on the
# validation split with compute_metrics, and allow early stopping.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    callbacks=[early_stopping_callback],
    compute_metrics=compute_metrics,
)

# Train the model (the returned TrainOutput is shown as the cell output)
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mhdang1696[0m ([33mhdang1696-no-work-experience[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241211_083151-nusqi7gg[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m./bert_question_answer[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/hdang1696-no-work-experience/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/hdang1696-no-work-experience/huggingface/runs/nusqi7gg[0m
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1-score
0,1.0685,0.943052,0.52518,0.275814,0.52518,0.36168
1,0.9006,0.762429,0.696043,0.66226,0.696043,0.661263
2,0.6839,0.59438,0.785971,0.710636,0.785971,0.744199
3,0.5232,0.50596,0.811151,0.830471,0.811151,0.76744
4,0.5046,0.478614,0.826439,0.818483,0.826439,0.797692
5,0.4538,0.479539,0.817446,0.792096,0.817446,0.791766
6,0.4212,0.474275,0.82554,0.81002,0.82554,0.798948
7,0.3643,0.487426,0.819245,0.80758,0.819245,0.812239
8,0.3564,0.48263,0.823741,0.810299,0.823741,0.815093


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  _warn_prf(average, modifier, msg_start, len(result))
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  _warn_prf(average, modifier, msg_start, len(result))
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  _warn_prf(average, modifier, msg_start, len(result))
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  with torch.cuda

TrainOutput(global_step=128, training_loss=0.5840489119291306, metrics={'train_runtime': 3910.3962, 'train_samples_per_second': 29.867, 'train_steps_per_second': 0.054, 'total_flos': 1.8356896928047104e+16, 'train_loss': 0.5840489119291306, 'epoch': 8.97391304347826})

### Evaluation

In [11]:
# Evaluate on the held-out test split.
test_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Test Results: {test_results}")

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Test Results: {'eval_loss': 0.43345212936401367, 'eval_accuracy': 0.8462230215827338, 'eval_precision': 0.829242436606982, 'eval_recall': 0.8462230215827338, 'eval_f1-score': 0.817655362451093, 'eval_runtime': 43.7089, 'eval_samples_per_second': 50.882, 'eval_steps_per_second': 0.755, 'epoch': 8.97391304347826}


### Save Model

In [12]:
# Save the model weights and the tokenizer files side by side in one directory.
export_dir = "bert_sentiment_model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('bert_sentiment_model/tokenizer_config.json',
 'bert_sentiment_model/special_tokens_map.json',
 'bert_sentiment_model/vocab.txt',
 'bert_sentiment_model/added_tokens.json',
 'bert_sentiment_model/tokenizer.json')