In [1]:
import pandas as pd

# Load the mushroom table from the CSV file in the working directory.
df = pd.read_csv('mushrooms.csv')
df_encoded = pd.get_dummies(df) # convert the columns to a one-hot encoding
# Preview the first rows of the raw (not yet encoded) frame.
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [2]:
# Preview the first rows of the one-hot encoded frame.
df_encoded.head()


Unnamed: 0,class_e,class_p,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [3]:
# Select by column name instead of by position: positional slicing silently
# breaks (label leakage or wrong target) as soon as the encoded frame gains
# or reorders a leading column.
# One-hot encoding turned `class` into the two columns class_e / class_p.
X = df_encoded.drop(columns=['class_e', 'class_p']) # features: everything except the label columns
y = df_encoded['class_p'] # one of the two label columns is enough: 1 = 'p', 0 = 'e'

## 建立pipeline


In [4]:
from sklearn.svm import SVC # support vector classifier
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

# Dimensionality reduction with PCA (principal component analysis): keep 15
# whitened components; the fixed seed keeps the decomposition repeatable.
pca = PCA(
    n_components=15,
    whiten=True,
    random_state=42,
)
# Support vector classifier with a linear kernel and balanced class weights.
svc = SVC(
    kernel='linear',
    class_weight='balanced',
)
# Chain the two steps: the reduced features are the input of the SVM.
model = make_pipeline(pca, svc)

## 将数据分为训练数据和测试数据

In [5]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for testing. No test_size is passed, so
# scikit-learn's default split applies; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=41)


## 通过交叉验证寻找最佳超参数C (C越大，越不允许松弛)

In [7]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the SVM penalty C. The "svc__" prefix routes the
# parameter to the pipeline step named "svc".
param_grid = {
    'svc__C': [1, 5, 10, 50],
}
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid.fit(X_train, y_train)
# Show the best parameter combination found by cross-validation.
print(grid.best_params_)

{'svc__C': 10}


## 使用最佳模型进行预测


In [8]:
# Note: this rebinds `model` from the unfitted pipeline to the fitted winner.
model = grid.best_estimator_ # take the best model found by the grid search
# Predicted labels for the held-out test features.
y_fit = model.predict(X_test)

## 生成性能报告


In [9]:
from sklearn.metrics import classification_report

# target_names are matched to the labels in sorted order, so 0 -> 'e' and
# 1 -> 'p' (assumes y is the class_p indicator column -- confirm against the
# cell that builds y).
print(classification_report(y_test, y_fit, target_names=['e', 'p']))

              precision    recall  f1-score   support

           e       0.97      0.96      0.96      1047
           p       0.96      0.97      0.96       984

    accuracy                           0.96      2031
   macro avg       0.96      0.96      0.96      2031
weighted avg       0.96      0.96      0.96      2031



## 使用带核函数的SVM


In [12]:
# One dict per model family; inside a dict every hyper-parameter maps to the
# list of candidate values to try.
tuned_params = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]
# Metrics to optimise; each one gets its own independent grid search.
scores = ['precision', 'recall']

for score in scores:
    print('----- Tuning hyper-parameters for {} -----'.format(score))
    # 5-fold cross-validation, ranking candidates by the macro-averaged metric.
    clf = GridSearchCV(SVC(), tuned_params, cv=5, scoring='{}_macro'.format(score))
    clf.fit(X_train, y_train)

    print('Best parameters set found on development set:')
    print(clf.best_params_)
    print('Grid scores on development set:')
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        # Fix: the closing parenthesis after the standard deviation was missing.
        print('{:.3f} (+/-{:.3f}) for {}'.format(mean, std, params))
    print('Performance on full evaluation set:')
    y_pred = clf.best_estimator_.predict(X_test)
    # Name the classes ('e', 'p') as the report for the linear pipeline does,
    # instead of printing the bare 0/1 labels.
    print(classification_report(y_test, y_pred, target_names=['e', 'p']))


----- Tuning hyper-parameters for precision -----
Best parameters set found on development set:
{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
Grid scores on development set:
0.975 (+/-0.004 for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.913 (+/-0.004 for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.997 (+/-0.003 for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.975 (+/-0.003 for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
1.000 (+/-0.001 for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.997 (+/-0.003 for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
1.000 (+/-0.000 for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
1.000 (+/-0.001 for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
1.000 (+/-0.000 for {'C': 1, 'kernel': 'linear'}
1.000 (+/-0.000 for {'C': 10, 'kernel': 'linear'}
1.000 (+/-0.000 for {'C': 100, 'kernel': 'linear'}
1.000 (+/-0.000 for {'C': 1000, 'kernel': 'linear'}
Performance on full evaluation set:
              precision    recall  f1-score   support

           0       1.0