In [8]:
pip install numpy scipy scikit-learn matplotlib



In [9]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [10]:
# Restrict 20 Newsgroups to four topics and load the predefined train/test splits.
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

# Both splits use the same category filter and the same fixed shuffle seed (42).
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, categories=categories, shuffle=True, random_state=42)
    for split in ('train', 'test')
)

# `.data` feeds the vectorizers below; `.target` holds the matching class labels.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

In [11]:
# Feature vectorization: CountVectorizer.
# The vocabulary is learned from the training texts only; the test texts are
# projected onto that already-fitted vocabulary.
count_vect = CountVectorizer()
X_train_counts, X_test_counts = (
    count_vect.fit_transform(X_train),
    count_vect.transform(X_test),
)

In [12]:
# Feature vectorization: TfidfVectorizer.
# Fitted on the training texts only; the test texts reuse the fitted vectorizer.
tfidf_vect = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    tfidf_vect.fit_transform(X_train),
    tfidf_vect.transform(X_test),
)

In [13]:
# Initialize the classifiers, keyed by the display name used in the result reports.
# LinearSVC's solver uses a pseudo-random number generator while fitting, so its
# random_state is pinned (same seed as the dataset shuffle) to keep re-runs of the
# notebook repeatable.
# NOTE(review): max_iter=1000 is presumably set to avoid convergence warnings on
# these high-dimensional text features - confirm.
classifiers = {
    'LinearSVC': LinearSVC(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000)
}

In [14]:
# Train and evaluate every classifier / feature-representation pairing.
# Each entry is (label, training matrix, test matrix); built once, outside the loops.
feature_sets = [
    ('CountVec', X_train_counts, X_test_counts),
    ('TfidfVec', X_train_tfidf, X_test_tfidf),
]

results = {}
for name, clf in classifiers.items():
    for vect_name, X_train_vec, X_test_vec in feature_sets:
        # Fit the estimator on this representation, then predict the test split.
        y_pred = clf.fit(X_train_vec, y_train).predict(X_test_vec)
        # Per-class precision / recall / F1 as a formatted text table.
        report = classification_report(
            y_test,
            y_pred,
            target_names=newsgroups_train.target_names,
        )
        # Store the report under "<classifier>_<vectorizer>" and echo it.
        results[f'{name}_{vect_name}'] = report
        print(f"Results for {name} with {vect_name}:", report, sep="\n")

Results for LinearSVC with CountVec:
                        precision    recall  f1-score   support

           alt.atheism       0.91      0.81      0.85       319
         comp.graphics       0.89      0.94      0.91       389
               sci.med       0.92      0.84      0.87       396
soc.religion.christian       0.86      0.97      0.91       398

              accuracy                           0.89      1502
             macro avg       0.89      0.89      0.89      1502
          weighted avg       0.89      0.89      0.89      1502

Results for LinearSVC with TfidfVec:
                        precision    recall  f1-score   support

           alt.atheism       0.96      0.83      0.89       319
         comp.graphics       0.90      0.97      0.93       389
               sci.med       0.95      0.91      0.93       396
soc.religion.christian       0.90      0.96      0.93       398

              accuracy                           0.92      1502
             macro avg   