B09208038

In [1]:
import os, sys
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ============ 1. 基本參數 ============
TEST_SIZE   = 0.2
RANDOM_SEED = 42
BATCH_SIZE  = 32                     # MiniLM 預設批次大小
# MODEL_NAME  = "sentence-transformers/all-MiniLM-L6-v2"
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

device = "cuda:3" if torch.cuda.is_available() else "cpu"
print(f"Embedding Device ➜ {device}")

Embedding Device ➜ cuda:3


In [3]:
# ============ 2. 載入資料 ============
train_path = Path("review_data.csv")
test_path  = Path("X_test.csv")

# review_data.csv: id | review | helpfulness
df_train = pd.read_csv(train_path, header=0, names=["id", "review", "helpfulness"])
X_text, y = df_train["review"].tolist(), df_train["helpfulness"].values

# 測試集只有 id、review
df_test = pd.read_csv(test_path, header=0, names=["id", "review"])
X_test_text = df_test["review"].tolist()

In [4]:
# ============ 3. Split ============
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_text,
    y,
    test_size     = TEST_SIZE,
    random_state  = RANDOM_SEED,
    stratify      = y,
)

In [5]:
# ============ 4. 句向量 ============
embedder = SentenceTransformer(MODEL_NAME, device=device)

def encode_corpus(text_list, batch_size=BATCH_SIZE):
    """一次性批量編碼以提高效率"""
    return embedder.encode(
        text_list,
        batch_size         = batch_size,
        show_progress_bar  = True,
        convert_to_numpy   = True,
        normalize_embeddings = False     # 先不歸一化，交給 StandardScaler
    )

print("▶ Encoding training / validation / test sentences ...")
X_train_vec = encode_corpus(X_train_text)
X_val_vec   = encode_corpus(X_val_text)
X_test_vec  = encode_corpus(X_test_text)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


▶ Encoding training / validation / test sentences ...


Batches: 100%|██████████| 67/67 [00:06<00:00, 10.17it/s]
Batches: 100%|██████████| 17/17 [00:01<00:00, 10.19it/s]
Batches: 100%|██████████| 21/21 [00:02<00:00,  9.77it/s]


In [6]:
# ============ 5. 建立 Pipeline ============
pipe = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=False)),   # 稠密向量建議標準化；with_mean=False 省記憶體
    ("clf", LogisticRegression(
        max_iter   = 2000,
        solver     = "saga",          # 支援 L1/L2
        class_weight="balanced",
        n_jobs     = -1,
    )),
])

param_grid = {
    "clf__C"       : [0.01, 0.1, 1, 3, 10],
    "clf__penalty" : ["l2", "l1"],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

print("▶ Grid search ...")
grid = GridSearchCV(
    estimator   = pipe,
    param_grid  = param_grid,
    cv          = cv,
    scoring     = "accuracy",
    n_jobs      = -1,
    verbose     = 2,
    refit       = True,
)

grid.fit(X_train_vec, y_train)

print("\nBest params:", grid.best_params_)
print("Best CV accuracy:", grid.best_score_)


▶ Grid search ...
Fitting 5 folds for each of 10 candidates, totalling 50 fits


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[CV] END .......................clf__C=0.01, clf__penalty=l2; total time=   2.1s
[CV] END .......................clf__C=0.01, clf__penalty=l1; total time=   3.5s
[CV] END .......................clf__C=0.01, clf__penalty=l1; total time=   3.5s
[CV] END .......................clf__C=0.01, clf__penalty=l1; total time=   3.9s
[CV] END .......................clf__C=0.01, clf__penalty=l1; total time=   6.7s
[CV] END ........................clf__C=0.1, clf__penalty=l1; total time=   7.7s
[CV] END ........................clf__C=0.1, clf__penalty=l2; total time=   8.0s
[CV] END ........................clf__C=0.1, clf__penalty=l2; total time=   8.5s
[CV] END ........................clf__C=0.1, clf__penalty=l2; total time=   8.4s
[CV] END ........................clf__C=0.1, clf__penalty=l2; total time=   8.6s
[CV] END ........................clf__C=0.1, clf__penalty=l2; total time=   9.2s
[CV] END .......................clf__C=0.01, clf__penalty=l1; total time=  17.6s
[CV] END ...................



[CV] END ........................clf__C=0.1, clf__penalty=l1; total time= 1.3min




[CV] END ........................clf__C=0.1, clf__penalty=l1; total time= 1.3min




[CV] END .........................clf__C=10, clf__penalty=l2; total time=  52.2s




[CV] END .........................clf__C=10, clf__penalty=l2; total time=  44.4s




[CV] END ..........................clf__C=3, clf__penalty=l1; total time=  59.6s




[CV] END ..........................clf__C=3, clf__penalty=l1; total time= 1.2min




[CV] END ..........................clf__C=3, clf__penalty=l1; total time=  57.2s




[CV] END .........................clf__C=10, clf__penalty=l2; total time=  56.3s
[CV] END .........................clf__C=10, clf__penalty=l2; total time=  56.4s




[CV] END .........................clf__C=10, clf__penalty=l2; total time=  57.9s




[CV] END ..........................clf__C=3, clf__penalty=l1; total time= 1.0min




[CV] END ..........................clf__C=3, clf__penalty=l1; total time= 1.0min




[CV] END .........................clf__C=10, clf__penalty=l1; total time=  41.3s




[CV] END .........................clf__C=10, clf__penalty=l1; total time=  37.6s




[CV] END .........................clf__C=10, clf__penalty=l1; total time= 1.0min




[CV] END .........................clf__C=10, clf__penalty=l1; total time=  53.6s




[CV] END .........................clf__C=10, clf__penalty=l1; total time=  47.1s

Best params: {'clf__C': 0.01, 'clf__penalty': 'l2'}
Best CV accuracy: 0.8050002230251128


In [7]:
# ============ 6. 驗證集評估 ============
val_pred = grid.predict(X_val_vec)
val_acc  = accuracy_score(y_val, val_pred)
print(f"Validation accuracy: {val_acc:.4f}")
print(classification_report(y_val, val_pred, digits=4))

Validation accuracy: 0.8019
              precision    recall  f1-score   support

           0     0.7376    0.7762    0.7564       210
           1     0.8479    0.8187    0.8331       320

    accuracy                         0.8019       530
   macro avg     0.7927    0.7975    0.7947       530
weighted avg     0.8042    0.8019    0.8027       530



In [8]:
# ============ 7. 測試集預測 ============
test_pred = grid.predict(X_test_vec)

df_submit = pd.DataFrame({
    "Id"         : df_test["id"],
    "helpfulness": test_pred
})
out_name = "MPNet_LogReg_submission.csv"
df_submit.to_csv(out_name, index=False, encoding="utf-8-sig")
print(f"Saved → {out_name}")

Saved → MPNet_LogReg_submission.csv
