In [1]:
# 安裝必要套件
!pip install transformers datasets scikit-learn torch

# 掛載 Google Drive（如果需要保存模型或數據）
#from google.colab import drive
#drive.mount('/content')

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [3]:
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 加载训练数据
with open('train_data.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# 加载测试数据
with open('test_data.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# 将数据转换为DataFrame
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)

# 提取商品描述并将每个描述的子列表合并为一个字符串
train_descriptions = [' '.join([str(word) for word in item]) for item in train_df['description_wseg_list']]
test_descriptions = [' '.join([str(word) for word in item]) for item in test_df['description_wseg_list']]

# 提取目标分类标签
y_train = train_df['catid']
y_test = test_df['catid']

# 创建CountVectorizer对象
vectorizer = CountVectorizer()

# 将商品描述转换为词袋特征
X_train = vectorizer.fit_transform(train_descriptions)
X_test = vectorizer.transform(test_descriptions)

# 查看特征矩阵的维度
print(f"训练集特征维度: {X_train.shape}")
print(f"测试集特征维度: {X_test.shape}")

# 创建BaggingClassifier
base_model = DecisionTreeClassifier()
bagging_model = BaggingClassifier(base_model, n_estimators=50, random_state=42)

# 训练模型
bagging_model.fit(X_train, y_train)

# 评估训练集上的准确度
train_accuracy = bagging_model.score(X_train, y_train)
print(f"训练集准确度: {train_accuracy:.4f}")

# 使用测试集进行预测
y_pred = bagging_model.predict(X_test)

# 输出预测的准确度
test_accuracy = bagging_model.score(X_test, y_test)
print(f"Test_Accuracy: {test_accuracy* 100:.2f}%")

# 显示部分预测结果
test_df['predicted_catid'] = y_pred
print(test_df[['itemid', 'catid', 'predicted_catid']].head())

# 输出分类报告
print(classification_report(y_test, y_pred))

# 输出混淆矩阵
print(confusion_matrix(y_test, y_pred))


训练集特征维度: (8541, 53520)
测试集特征维度: (949, 53520)
训练集准确度: 0.9970
Test_Accuracy: 70.92%
     itemid  catid  predicted_catid
0  27842036     68               68
1   6922161     67               67
2  11696377   1837             1837
3  75867903  10076            10076
4  20497245     69               69
              precision    recall  f1-score   support

          62       0.67      0.54      0.60        52
          63       0.84      0.74      0.79        50
          64       0.74      0.82      0.78        45
          65       0.61      0.70      0.65        43
          66       0.78      0.81      0.79        47
          67       0.60      0.54      0.57        59
          68       0.67      0.72      0.69        50
          69       0.86      0.69      0.77        52
          70       0.78      0.80      0.79        45
          73       0.49      0.67      0.57        49
          74       0.91      0.91      0.91        44
          75       0.61      0.69      0.64        54