In [2]:
import json
import pandas as pd
import re
import time
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, classification_report
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform default encoding (a legacy
# code page on many Windows installations), which can fail or silently
# corrupt non-ASCII characters in titles and abstracts.
with open('arxiv_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Build the DataFrame from the three top-level keys the file is expected to
# provide. Presumably each key holds a list with one entry per paper --
# TODO confirm against the data file.
df = pd.DataFrame({
    'title': data['titles'],
    'abstract': data['summaries'],
    'labels': data['terms']
})

# Text preprocessing setup: make sure the NLTK stop-word corpus is available
# locally, then keep the English list as a set so membership tests are O(1).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    r"""Return a cleaned, lower-cased, stop-word-free version of *text*.

    Every character matched by the regex ``\W`` (i.e. anything that is not a
    word character: letter, digit or underscore) is replaced by a space.
    The result is lower-cased and split on whitespace, and tokens present in
    the module-level ``stop_words`` set are discarded. The remaining tokens
    are joined back together with single spaces.
    """
    spaced = re.sub(r'\W', ' ', text)
    tokens = spaced.lower().split()
    kept = filter(lambda token: token not in stop_words, tokens)
    return ' '.join(kept)

# Clean every abstract in place with preprocess_text.
df['abstract'] = df['abstract'].apply(preprocess_text)

# Encode each row's label collection as one row of a binary indicator matrix
# (one column per distinct label seen in the data).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['labels'])

# Two-stage split with a fixed seed: first hold out 15% of the data as the
# test set, then carve 17.65% of the remainder (about 15% of the full data)
# off as the validation set, leaving roughly 70% for training.
rest_texts, test_texts, y_rest, y_test = train_test_split(
    df['abstract'], y, test_size=0.15, random_state=42
)
train_texts, val_texts, y_train, y_val = train_test_split(
    rest_texts, y_rest, test_size=0.1765, random_state=42
)

# TF-IDF features, capped at 500 terms to keep training fast. The vocabulary
# is fitted on the training texts only; validation and test texts are merely
# transformed. Everything is densified with toarray() before being handed to
# the model.
tfidf = TfidfVectorizer(max_features=500)
X_train = tfidf.fit_transform(train_texts).toarray()
X_val, X_test = (
    tfidf.transform(texts).toarray() for texts in (val_texts, test_texts)
)

# Hyper-parameters for the Hist Gradient Boosting base estimator (described
# by the original author as the best ones found; the search itself is not
# part of this cell).
best_params = dict(
    max_iter=291,
    learning_rate=0.0235,
    max_leaf_nodes=96,
    min_samples_leaf=9,
)
hgb_model = HistGradientBoostingClassifier(**best_params, random_state=42)

# Wrap the base estimator so that one classifier is fitted per label column;
# n_jobs=-1 asks for the per-label fits to run in parallel on all cores.
multi_target_model = MultiOutputClassifier(hgb_model, n_jobs=-1)

# Measure training time.
# time.perf_counter() is used for all durations below: unlike time.time() it
# is monotonic (unaffected by system clock adjustments) and uses the
# highest-resolution timer available, which matters for the short inference
# timings, especially on Windows.
start_time = time.perf_counter()
multi_target_model.fit(X_train, y_train)
train_time = time.perf_counter() - start_time

# Validation set: timed inference, then evaluation.
start_time = time.perf_counter()
y_val_pred = multi_target_model.predict(X_val)
inference_time_val = time.perf_counter() - start_time

# Micro-averaged F1 pools the decisions of all labels before computing the
# score; the per-label breakdown comes from the classification report.
# zero_division=0 reports 0 (instead of warning) for labels that have no
# predicted or no true samples.
val_f1_score = f1_score(y_val, y_val_pred, average='micro')
val_report = classification_report(y_val, y_val_pred, zero_division=0)
print(f"Validation F1 Score: {val_f1_score:.4f}")
print("Validation Classification Report:\n", val_report)

# Test set: timed inference, then the same evaluation as for validation.
start_time = time.perf_counter()
y_test_pred = multi_target_model.predict(X_test)
inference_time_test = time.perf_counter() - start_time

test_f1_score = f1_score(y_test, y_test_pred, average='micro')
test_report = classification_report(y_test, y_test_pred, zero_division=0)
print(f"\nTest F1 Score: {test_f1_score:.4f}")
print("Test Classification Report:\n", test_report)

# Show training and inference times.
print(f"\nTraining time: {train_time:.4f} seconds")
print(f"Validation inference time: {inference_time_val:.4f} seconds")
print(f"Test inference time: {inference_time_test:.4f} seconds")

# Persist both classification reports to Results.txt (the file is
# overwritten on every run): validation report first, then the test report,
# each preceded by its heading.
output_path = "Results.txt"
with open(output_path, "w") as f:
    f.writelines([
        "Validation Classification Report:\n",
        val_report,
        "\n\nTest Classification Report:\n",
        test_report,
    ])

print(f"Results saved to {output_path}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Validation F1 Score: 0.7824
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00        11
           2       0.22      0.40      0.29        10
           3       1.00      0.33      0.50         6
           4       0.79      0.28      0.41      1150
           5       0.00      0.00      0.00         5
           6       0.11      0.25      0.15         4
           7       0.00      0.00      0.00         9
           8       1.00      0.53      0.69        19
           9       0.89      0.55      0.68       238
          10       0.86      0.30      0.45       103
          11       0.94      0.93      0.94      4526
          12       1.00      0.20      0.33        30
          13       0.73      0.35      0.47        23
          14       1.00      0.18      0.31        49
          15       0.00      0.00      0.00         3
          16      