In [1]:
import pandas as pd
import jieba
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# 1. 合并训练数据
categories = ['政治', '经济', '军事', '社会', '文化', '科技', '体育']
dfs = []
for cat in categories:
    df = pd.read_csv(f'{cat}train.csv')
    dfs.append(df)
full_data = pd.concat(dfs, ignore_index=True)

In [3]:
# 2. 中文文本预处理
def chinese_text_processing(text):
    text = re.sub(r'[^\u4e00-\u9fa5]', '', str(text))  # 去除非汉字字符
    words = jieba.cut(text)                           # 结巴分词
    return ' '.join([w for w in words if w.strip()])   # 去除空格

full_data['processed'] = full_data['Content'].apply(chinese_text_processing)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/kk/ylyfvmrj6zv853wrvp3s_0180000gn/T/jieba.cache
Loading model cost 0.330 seconds.
Prefix dict has been built successfully.


In [4]:
# 3. 特征提取
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X = tfidf.fit_transform(full_data['processed'])
y = full_data['Area']

In [5]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import numpy as np

In [6]:
# 定义评估指标
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'precision_macro': 'precision_macro',
    'recall_macro': 'recall_macro',
    'roc_auc_ovr': make_scorer(roc_auc_score, needs_proba=True, multi_class='ovr')
}

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [9]:
# 定义分类模型
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVM': SVC(probability=True),
    'RandomForest': RandomForestClassifier(),
    'NaiveBayes': MultinomialNB(),
}

In [10]:
# 交叉验证评估
results = []
for name, model in models.items():
    cv_results = cross_validate(model, X, y, cv=5, scoring=scoring)
    metrics = {
        'Model': name,
        'Accuracy': np.mean(cv_results['test_accuracy']),
        'F1': np.mean(cv_results['test_f1_macro']),
        'Precision': np.mean(cv_results['test_precision_macro']),
        'Recall': np.mean(cv_results['test_recall_macro']),
        'ROC_AUC': np.mean(cv_results['test_roc_auc_ovr'])
    }
    results.append(metrics)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Traceback (most recent call last):
  File "/Users/canoe/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 194, in wrapper
    params = func_sig.bind(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/inspect.py", line 3212, in bind
    return self._bind(args, kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/inspect.py", line 3201, in _bind
    raise TypeError(
TypeError: g

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Traceback (most recent call last):
  File "/Users/canoe/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 194, in wrapper
    params = func_sig.bind(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/inspect.py", line 3212, in bind
    return self._bind(args, kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/inspect.py", line 3201, in _bind
    raise TypeError(
TypeError: g

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Traceback (most recent call last):
  File "/Users/canoe/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 194, in wrapper
    params = func_sig.bind(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/inspect.py", line 3212, in bind
    return self._bind(args, kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/inspect.py", line 3201, in _bind
    raise TypeError(
TypeError: g

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Traceback (most recent call last):
  File "/Users/canoe/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 140, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 388, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 194, in wrapper
    params = func_sig.bind(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/inspect.py", line 3212, in bind
    return self._bind(args, kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/canoe/anaconda3/lib/python3.11/inspect.py", line 3201, in _bind
    raise TypeError(
TypeError: g

In [11]:
# 输出评估报告
pd.DataFrame(results).to_csv('model_evaluation.csv', index=False)