In [None]:
# Import the required libraries
import json
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader, TensorDataset
import time
from sklearn.metrics import f1_score, classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm  # 新增 tqdm 用於顯示進度條

# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the raw data. The encoding is passed explicitly: JSON text is UTF-8,
# whereas open() without `encoding` falls back to the platform's locale
# encoding, which can fail (or mis-decode) on non-ASCII characters.
with open('arxiv_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Build a DataFrame with one column per field read from the JSON object:
# 'titles' -> title, 'summaries' -> abstract, 'terms' -> labels.
df = pd.DataFrame({
    'title': data['titles'],
    'abstract': data['summaries'],
    'labels': data['terms']
})

# Binarise the labels into a multi-hot indicator matrix (one column per class).
# NOTE(review): assumes every entry of df['labels'] is already an iterable of
# label strings; if they are stringified lists they must be parsed first -- confirm.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['labels'])

# First hold out 15% of all samples as the test set ...
train_texts, test_texts, y_train, y_test = train_test_split(
    df['abstract'],
    y,
    test_size=0.15,
    random_state=42,
)
# ... then take 17.65% of the remaining 85% (about 15% of the total) as the
# validation set, giving roughly a 70/15/15 train/val/test split.
train_texts, val_texts, y_train, y_val = train_test_split(
    train_texts,
    y_train,
    test_size=0.1765,
    random_state=42,
)

# Load the pretrained 'bert-base-uncased' tokenizer and encoder, then place
# the encoder on the selected device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = bert_model.to(device)

def encode_texts(texts, max_length=512):
    """Tokenize texts for BERT and return padded id/mask tensors on ``device``.

    Args:
        texts: Iterable of strings (for example a pandas Series of abstracts).
        max_length: Length every sequence is truncated or padded to.
            Defaults to 512, the value that was previously hard-coded.

    Returns:
        A tuple ``(input_ids, attention_masks)``. Each tensor is the
        row-wise concatenation of the per-text encodings and is moved to
        the module-level ``device``. For empty input, two empty integer
        tensors of shape ``(0, max_length)`` are returned.
    """
    input_ids, attention_masks = [], []
    for text in tqdm(texts, desc="Encoding texts"):  # progress bar over texts
        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which accepts the same arguments.
        encoded = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    if not input_ids:
        # torch.cat raises on an empty list, so return well-formed empty
        # tensors instead of crashing on an empty split.
        empty_shape = (0, max_length)
        return (
            torch.empty(empty_shape, dtype=torch.long, device=device),
            torch.empty(empty_shape, dtype=torch.long, device=device),
        )

    return torch.cat(input_ids, dim=0).to(device), torch.cat(attention_masks, dim=0).to(device)

# Encode each split and wrap it in a DataLoader. `shuffle` is not passed, so
# the loaders rely on DataLoader's default ordering; later code pairs the
# embeddings with y_train / y_val / y_test by position.
batch_size = 8

# --- training split ---
train_inputs, train_masks = encode_texts(train_texts)
train_data = TensorDataset(
    train_inputs,
    train_masks,
    torch.tensor(y_train).to(device),
)
train_dataloader = DataLoader(train_data, batch_size=batch_size)

# --- validation split ---
val_inputs, val_masks = encode_texts(val_texts)
val_data = TensorDataset(
    val_inputs,
    val_masks,
    torch.tensor(y_val).to(device),
)
val_dataloader = DataLoader(val_data, batch_size=batch_size)

# --- test split ---
test_inputs, test_masks = encode_texts(test_texts)
test_data = TensorDataset(
    test_inputs,
    test_masks,
    torch.tensor(y_test).to(device),
)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

def get_bert_embeddings(dataloader):
    """Run BERT in inference mode over a DataLoader and collect [CLS] vectors.

    Args:
        dataloader: Iterable of batches whose first two elements are the
            input-id tensor and the attention-mask tensor. Any further
            elements (such as labels) are ignored.

    Returns:
        A CPU tensor holding the first-token ([CLS]) vector of the last
        hidden state for every sample, in the order the dataloader yields
        them. An empty dataloader gives an empty ``(0, hidden_size)`` tensor.
    """
    bert_model.eval()
    embeddings = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating BERT embeddings"):  # progress bar over batches
            # .to(device) does nothing for tensors already on the device and
            # lets the function also accept CPU-resident batches.
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] token embedding
            # Offload each batch to host memory right away so the collected
            # embeddings do not pile up on the GPU for the whole pass.
            embeddings.append(cls_embeddings.cpu())

    if not embeddings:
        # torch.cat raises on an empty list; return a well-formed empty result.
        return torch.empty((0, bert_model.config.hidden_size))

    return torch.cat(embeddings, dim=0)

# For each split: compute the BERT embeddings, then copy them to host memory
# as a NumPy array so scikit-learn can consume them.

# Training split
train_embeddings = get_bert_embeddings(train_dataloader)
X_train = train_embeddings.cpu().numpy()

# Validation split
val_embeddings = get_bert_embeddings(val_dataloader)
X_val = val_embeddings.cpu().numpy()

# Test split
test_embeddings = get_bert_embeddings(test_dataloader)
X_test = test_embeddings.cpu().numpy()

# Define the classifier: MultiOutputClassifier fits one LogisticRegression per
# label column; n_jobs=-1 asks it to use all available processors.
model = MultiOutputClassifier(LogisticRegression(max_iter=1000), n_jobs=-1)

# Train the model and time the fit. time.perf_counter() is used for the
# intervals because it is a monotonic, high-resolution clock; time.time() is
# wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()
print("Training the model...")
model.fit(X_train, y_train)
train_time = time.perf_counter() - start_time
print("Model training completed.")

# Predict on the validation set; this prediction is what "inference time" measures.
start_time = time.perf_counter()
y_val_pred = model.predict(X_val)
inference_time = time.perf_counter() - start_time

# Compute the micro-averaged F1 score and the per-class report on validation.
val_f1_score = f1_score(y_val, y_val_pred, average='micro')
val_report = classification_report(y_val, y_val_pred, zero_division=0)
print(f"Validation F1 Score: {val_f1_score:.4f}")
print(f"Validation Classification Report:\n{val_report}")

# Evaluate on the held-out test set.
y_test_pred = model.predict(X_test)
test_report = classification_report(y_test, y_test_pred, zero_division=0)
print(f"\nTest Classification Report:\n{test_report}")

# Report training and inference times.
print(f"\nTraining time: {train_time:.4f} seconds")
print(f"Inference time: {inference_time:.4f} seconds")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Encoding texts: 100%|███████████████████████████████████████████████████████████| 36239/36239 [03:05<00:00, 195.24it/s]
Encoding texts: 100%|█████████████████████████████████████████████████████████████| 7768/7768 [00:39<00:00, 197.15it/s]
Encoding texts: 100%|█████████████████████████████████████████████████████████████| 7767/7767 [00:39<00:00, 197.50it/s]
  attn_output = torch.nn.functional.scaled_dot_product_attention(
Generating BERT embeddings:  16%|███████▊                                         | 722/4530 [31:16<3:56:11,  3.72s/it]