In [4]:
import json
import re
import time

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# Initialisation: make sure the NLTK stop-word corpus is available locally, then
# keep the English stop words in a set for fast membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Read the file as UTF-8 explicitly instead of relying on the locale's default
# encoding, which is frequently not UTF-8 on Windows.
with open('arxiv_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)


# Build the DataFrame; its columns come from the 'titles', 'summaries' and
# 'terms' entries of the JSON payload.
df = pd.DataFrame({
    'title': data['titles'],
    'abstract': data['summaries'],
    'labels': data['terms']
})

# Inspect the DataFrame structure.
print(df.head())
# Hold out 15% of the rows as the test set; the other 85% stay in train_df.
train_df, test_df = train_test_split(df, test_size=0.15, random_state=42)

# Split a validation set off the remaining rows.  0.1765 of the remaining 85%
# is about 15% of the original data, giving roughly a 70/15/15 split.
train_df, val_df = train_test_split(train_df, test_size=0.1765, random_state=42)

# Report the size of each split.
print("Training set size:", len(train_df))
print("Validation set size:", len(val_df))
print("Testing set size:", len(test_df))

# Analyse and visualise the label distribution.
# Flatten the per-paper label lists and count how often each label occurs.
all_labels = [label for labels in df['labels'] for label in labels]
label_counts = pd.Series(all_labels).value_counts()

# Draw the label distribution as a bar chart.
plt.figure(figsize=(25, 8))
label_counts.plot(kind='bar')
plt.title('Label Distribution in the Dataset')
plt.xlabel('Labels')
plt.ylabel('Frequency')
plt.xticks(rotation=50)  # rotate the x tick labels so they stay readable
plt.tight_layout()       # adjust the layout so labels are not cut off
plt.show()
def preprocess_text(text, stop_word_set=None):
    """Normalise raw text before feature extraction.

    Every non-word character (anything other than a letter, digit or
    underscore) is replaced by a space, the text is lower-cased, and tokens
    found in the stop-word collection are dropped.

    Args:
        text: The raw string to clean.
        stop_word_set: Optional collection of words to drop. Tokens are
            compared after lower-casing, so entries should be lower-case.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stop_word_set is None:
        # Fall back to the module-level set so existing one-argument callers
        # (e.g. ``Series.apply(preprocess_text)``) behave exactly as before.
        stop_word_set = stop_words
    lowered = re.sub(r'\W', ' ', text).lower()
    return ' '.join(
        token for token in lowered.split() if token not in stop_word_set
    )

# Clean every abstract in place with preprocess_text.
df['abstract'] = df['abstract'].apply(preprocess_text)

# Encode each paper's label list as one row of a binary indicator matrix.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['labels'])

# Two-stage split with a fixed seed: first hold out the test set, then carve
# the validation set out of what remains.
remaining_texts, test_texts, y_remaining, y_test = train_test_split(
    df['abstract'], y, test_size=0.15, random_state=42
)
train_texts, val_texts, y_train, y_val = train_test_split(
    remaining_texts, y_remaining, test_size=0.1765, random_state=42
)

# TF-IDF text features: the vocabulary and IDF weights are learned from the
# training texts only, then applied unchanged to the other two splits.
tfidf = TfidfVectorizer(max_features=500)
X_train = tfidf.fit_transform(train_texts).toarray()
X_val, X_test = (
    tfidf.transform(texts).toarray() for texts in (val_texts, test_texts)
)

# Candidate base estimators, keyed by display name.  Each one is wrapped in a
# MultiOutputClassifier below so it can be trained on the multi-label target.
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Hist Gradient Boosting': HistGradientBoostingClassifier(max_iter=100),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=3),
    'Ridge Classifier': RidgeClassifier()
}

# Per-model bookkeeping, keyed by the same display names as `models`.
train_times = {}
inference_times = {}
val_scores = {}
reports = {}

# The best fitted wrapper seen so far is kept so it can be evaluated on the
# test set directly.  Refitting after selection would repeat the most
# expensive training run and is not guaranteed to reproduce the exact model
# that was scored on the validation set.
best_model_name = None
best_model = None

# NOTE(review): labels are binarized on the full dataset before splitting, so
# a rare label may have no positive example in y_train; some estimators may
# refuse to fit such a constant column -- confirm against the real data.

# Train every candidate and evaluate it on the validation set.
total_models = len(models)
for idx, (name, model) in enumerate(models.items(), start=1):
    multi_target_model = MultiOutputClassifier(model, n_jobs=-1)

    # Measure training time.  perf_counter() is monotonic, so the interval is
    # not affected by system clock adjustments.
    start_time = time.perf_counter()
    multi_target_model.fit(X_train, y_train)
    train_times[name] = time.perf_counter() - start_time

    # Measure inference time on the validation set.
    start_time = time.perf_counter()
    y_val_pred = multi_target_model.predict(X_val)
    inference_times[name] = time.perf_counter() - start_time

    # Micro-averaged F1 is the model-selection metric.
    score = f1_score(y_val, y_val_pred, average='micro')
    val_scores[name] = score
    print(f"{name} Validation F1 Score: {score:.4f}")

    # Strict ">" keeps the earliest model on ties, which is the same choice
    # max(val_scores, key=val_scores.get) would make.
    if best_model is None or score > val_scores[best_model_name]:
        best_model_name = name
        best_model = multi_target_model

    # Per-label classification report on the validation set.
    reports[name] = classification_report(y_val, y_val_pred, zero_division=0)
    print(f"{name} Validation Classification Report:\n{reports[name]}")

    # Show progress as a percentage of models processed.
    progress = (idx / total_models) * 100
    print(f"Progress: {progress:.2f}% complete\n")

# Evaluate the selected (already fitted) model on the held-out test set.
y_test_pred = best_model.predict(X_test)

# Classification report for the test set.
test_report = classification_report(y_test, y_test_pred, zero_division=0)
print(f"\nTest Classification Report ({best_model_name}):\n{test_report}")

# Print the training and inference time recorded for every classifier.
print("\nTraining and Inference Times:")
for name in models:
    print(f"{name} - Training time: {train_times[name]:.4f} seconds, Inference time: {inference_times[name]:.4f} seconds")

# Write the best model's validation report and the test report to results.txt.
# The encoding is explicit so the file does not depend on the platform default.
output_path = "results.txt"
with open(output_path, "w", encoding="utf-8") as f:
    f.write("Validation Classification Report:\n")
    f.write(reports[best_model_name])
    f.write("\n\nTest Classification Report:\n")
    f.write(test_report)

print(f"Results saved to {output_path}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                               title  \
0  Survey on Semantic Stereo Matching / Semantic ...   
1  FUTURE-AI: Guiding Principles and Consensus Re...   
2  Enforcing Mutual Consistency of Hard Regions f...   
3  Parameter Decoupling Strategy for Semi-supervi...   
4  Background-Foreground Segmentation for Interio...   

                                            abstract                 labels  
0  Stereo matching is one of the widely used tech...         [cs.CV, cs.LG]  
1  The recent advancements in artificial intellig...  [cs.CV, cs.AI, cs.LG]  
2  In this paper, we proposed a novel mutual cons...         [cs.CV, cs.AI]  
3  Consistency training has proven to be an advan...                [cs.CV]  
4  To ensure safety in automated driving, the cor...         [cs.CV, cs.LG]  


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000001988AC61550>>
Traceback (most recent call last):
  File "C:\Users\USER\AppData\Roaming\Python\Python38\site-packages\ipykernel\ipkernel.py", line 790, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
KeyboardInterrupt: 

KeyboardInterrupt

