In [1]:
import os, sys
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ============ 1. 基本參數 ============
RANDOM_SEED = 42
TEST_SIZE   = 0.2
BATCH_SIZE  = 32
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Embedding Device ➜ {DEVICE}")

Embedding Device ➜ cuda


In [3]:
# ============ 2. 載入資料 ============
train_path = Path("review_data.csv")
test_path  = Path("X_test.csv")

# review_data.csv: id | review | helpfulness
df_train = pd.read_csv(train_path, header=0, names=["id", "review", "helpfulness"])
X_text, y = df_train["review"].tolist(), df_train["helpfulness"].values

# 測試集只有 id、review
df_test = pd.read_csv(test_path, header=0, names=["id", "review"])
X_test_text = df_test["review"].tolist()

In [4]:
# ============ 3. Split ============
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_text,
    y,
    test_size     = TEST_SIZE,
    random_state  = RANDOM_SEED,
    stratify      = y,
)

In [7]:
# ============ 4. 載入模型 ============
model_name = "microsoft/mpnet-base"
tokenizer  = AutoTokenizer.from_pretrained(model_name)#.to(DEVICE)
model      = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(DEVICE)

def tok(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)

train_ds = Dataset.from_dict({"text": X_train_text, "label": y_train}).map(tok, batched=True)
val_ds   = Dataset.from_dict({"text": X_val_text,   "label": y_val  }).map(tok, batched=True)

# ============ 5. TrainingArguments ============
args = TrainingArguments(
    output_dir                  = "mpnet-helpfulness",
    fp16                        = torch.cuda.is_available(),
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size  = BATCH_SIZE,
    evaluation_strategy         = "epoch",
    save_strategy               = "epoch",
    save_total_limit            = 1,
    num_train_epochs            = 5,
    learning_rate               = 2e-5,
    weight_decay                = 0.01,
    seed                        = RANDOM_SEED,
    load_best_model_at_end      = True,
    metric_for_best_model       = "eval_loss",
    greater_is_better           = False,
    logging_strategy            = "epoch",
)

trainer = Trainer(model, args, train_dataset=train_ds, eval_dataset=val_ds)
trainer.train()

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 2118/2118 [00:00<00:00, 8038.81 examples/s]
Map: 100%|██████████| 530/530 [00:00<00:00, 6914.81 examples/s]


[2025-05-14 08:25:35,424] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/buffett/miniconda3/envs/py310/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../../../x86_64-conda-linux-gnu/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/buffett/miniconda3/envs/py310/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../../../x86_64-conda-linux-gnu/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlvsym'
/home/buffett/miniconda3/envs/py310/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../../../x86_64-conda-linux-gnu/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'
/home/buffett/miniconda3/envs/py310/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../../../x86_64-conda-linux-gnu/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'
/home/buffett/miniconda3/envs/py310/bin/../lib/gcc/x86_64-conda-linux-gnu/11.2.0/../../../../x86_64-conda-linux-gnu/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'
/home/buffett/mini

Epoch,Training Loss,Validation Loss
1,0.5622,0.508502
2,0.418,0.479742
3,0.3467,0.428628
4,0.2914,0.455634
5,0.2451,0.450391


TrainOutput(global_step=335, training_loss=0.3726846040184818, metrics={'train_runtime': 69.8001, 'train_samples_per_second': 151.719, 'train_steps_per_second': 4.799, 'total_flos': 1393173038131200.0, 'train_loss': 0.3726846040184818, 'epoch': 5.0})

In [8]:
# ============ 4. 推論並存 CSV ============
# 4-1. 準備 test dataset
test_ds = Dataset.from_dict({"text": X_test_text}).map(tok, batched=True)

# 4-2. 預測；trainer.predict 會回傳 logits
pred_logits = trainer.predict(test_ds).predictions
pred_labels = np.argmax(pred_logits, axis=1)

# 4-3. 輸出
df_submit = pd.DataFrame({
    "Id": df_test["id"],
    "helpfulness": pred_labels
})
df_submit.to_csv("mpnet_finetune_predictions.csv", index=False, encoding="utf-8-sig")
print("✔ 已輸出 mpnet_finetune_predictions.csv")

Map:   0%|          | 0/662 [00:00<?, ? examples/s]

Map: 100%|██████████| 662/662 [00:00<00:00, 5148.80 examples/s]


✔ 已輸出 mpnet_finetune_predictions.csv
