In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
part1 = pd.read_table('./data/horseColicTraining.txt',header=None)
part2 = pd.read_table('./data/horseColicTest.txt',header=None)

In [3]:
# 先把所有数据级联到一起
samples = pd.concat((part1,part2))
samples.shape

(366, 22)

In [4]:
train = samples.values[:,:-1]
target = samples.values[:,-1]

In [5]:
train.shape,target.shape

((366, 21), (366,))

In [6]:
# 先对数据进行映射，然后再进行特征预处理
train[0]

array([ 2. ,  1. , 38.5, 66. , 28. ,  3. ,  3. ,  0. ,  2. ,  5. ,  4. ,
        4. ,  0. ,  0. ,  0. ,  3. ,  5. , 45. ,  8.4,  0. ,  0. ])

In [7]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler,Normalizer

train1 = StandardScaler().fit_transform(train)
train2 = MinMaxScaler().fit_transform(train)
train3 = Normalizer().fit_transform(train)

trains = [train1,train2,train3]
feature_project_names = ['StandardScaler','MinMaxScaler','Normalizer']

In [8]:
from sklearn.model_selection import train_test_split


# 预测不同的特征处理，对算法的影响
# 使用此函数找到一个好的特征处理方案
def score_with_model(model,trains,target,feature_project_names):
    for i,train in enumerate(trains):
        X_train,X_test,y_train,y_test = train_test_split(train,target,random_state=1)
        score = model.fit(X_train,y_train).score(X_test,y_test)
        print("{} 特征处理{} 得分:{}".format(model.__class__.__name__,feature_project_names[i],score))

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [10]:
score_with_model(KNeighborsClassifier(),trains,target,feature_project_names)

KNeighborsClassifier 特征处理StandardScaler 得分:0.6739130434782609
KNeighborsClassifier 特征处理MinMaxScaler 得分:0.6847826086956522
KNeighborsClassifier 特征处理Normalizer 得分:0.7065217391304348


In [11]:
score_with_model(LogisticRegression(),trains,target,feature_project_names)

LogisticRegression 特征处理StandardScaler 得分:0.6521739130434783
LogisticRegression 特征处理MinMaxScaler 得分:0.6630434782608695
LogisticRegression 特征处理Normalizer 得分:0.6304347826086957


In [12]:
score_with_model(DecisionTreeClassifier(),trains,target,feature_project_names)

DecisionTreeClassifier 特征处理StandardScaler 得分:0.6630434782608695
DecisionTreeClassifier 特征处理MinMaxScaler 得分:0.6304347826086957
DecisionTreeClassifier 特征处理Normalizer 得分:0.6195652173913043


In [13]:
score_with_model(SVC(),trains,target,feature_project_names)

SVC 特征处理StandardScaler 得分:0.7282608695652174
SVC 特征处理MinMaxScaler 得分:0.6847826086956522
SVC 特征处理Normalizer 得分:0.5869565217391305


In [14]:
# 交叉验证，获得好的算法
# 先使用MinMaxScaler处理
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

def select_best_model(model,train,target):
    kfold = KFold(n_splits=10)
    results = cross_val_score(model,train,target,cv=kfold)
    print("{}算法 平均值是{}，方差是{}，最大值{},最小值{}".format(model.__class__.__name__,results.mean(),results.std(),results.max(),results.min()))
    print(results)

In [15]:
select_best_model(KNeighborsClassifier(),train2,target)

KNeighborsClassifier算法 平均值是0.6554804804804805，方差是0.06615852287004709，最大值0.7837837837837838,最小值0.5555555555555556
[0.78378378 0.67567568 0.7027027  0.59459459 0.64864865 0.62162162
 0.58333333 0.55555556 0.66666667 0.72222222]


In [16]:
select_best_model(LogisticRegression(),train2,target)

LogisticRegression算法 平均值是0.6940690690690691，方差是0.06473652809907691，最大值0.8611111111111112,最小值0.6111111111111112
[0.7027027  0.64864865 0.7027027  0.7027027  0.72972973 0.64864865
 0.66666667 0.66666667 0.86111111 0.61111111]


In [17]:
select_best_model(DecisionTreeClassifier(),train2,target)

DecisionTreeClassifier算法 平均值是0.6721471471471472，方差是0.06484248619913455，最大值0.8055555555555556,最小值0.5675675675675675
[0.56756757 0.67567568 0.7027027  0.67567568 0.72972973 0.67567568
 0.63888889 0.80555556 0.66666667 0.58333333]


In [18]:
select_best_model(SVC(),train2,target)

SVC算法 平均值是0.7183933933933934，方差是0.059556305165923036，最大值0.8108108108108109,最小值0.6388888888888888
[0.67567568 0.72972973 0.81081081 0.75675676 0.75675676 0.64864865
 0.66666667 0.63888889 0.80555556 0.69444444]


In [19]:
# 拆分样本集
X_train,X_test,y_train,y_test = train_test_split(train2,target,test_size=0.1)

In [20]:
# 综合考虑，选择SVC算法，使用标准化对数据特征进行预处理
# 算法调参，使用GridSearchCV
from sklearn.model_selection import GridSearchCV

svc = SVC()
param_dic = {
    'kernel':['linear','rbf','poly'],
    'C':[0.001,0.01,0.1,1,10,100],
    'gamma':np.arange(0,100,10)
}

gridCV = GridSearchCV(svc,param_grid=param_dic)

gridCV.fit(X_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'kernel': ['linear', 'rbf', 'poly'], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': array([ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
best_svc = gridCV.best_estimator_

In [22]:
best_svc

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [23]:
best_svc.score(X_test,y_test)

0.7567567567567568

In [24]:
X_train,X_test,y_train,y_test = train_test_split(train,target,test_size=0.1)
SVC(C=0.001,kernel='poly',gamma=10).fit(X_train,y_train).score(X_test,y_test)

0.7027027027027027

In [32]:
# 特征没选好，原始数据比特征处理过的数据表现要好
# 特征选择
# 降维

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [33]:
# 因为要使用lda进行有监督的降维处理，所以，先使用pca查看大概有多少个特征起主导作用
train4 = PCA(n_components=0.97).fit_transform(train)

In [34]:
train4.shape

(366, 5)

In [35]:
X_train,X_test,y_train,y_test = train_test_split(train4,target,test_size=0.1)
SVC(C=0.001,kernel='rbf',gamma=10).fit(X_train,y_train).score(X_test,y_test)

0.5135135135135135

In [36]:
X_train,X_test,y_train,y_test = train_test_split(train4,target,test_size=0.1)
SVC().fit(X_train,y_train).score(X_test,y_test)

0.7837837837837838