In [1]:
# coding:utf-8
# 1. 读取原训练数据转换为pd格式
# 2. 对数据进行处理
# 3. 可视化一波，清洗数据
# 4. 使用不同分类方法，选择最佳参数
# 5. 使用stacking训练模型
# 6. 读取待分割图像，预测，分割

import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import numpy as np 
def get_train_data(train_data_path, _base=1):
    # 获取数据转换为矩阵
    train_data, labels = [], []
    with open(train_data_path) as f:
        for line in f.readlines():
            line_data = line.strip().split("\t")
            _data = []
            for d in line_data[_base:-1]:
                _data.append(float(d))
            train_data.append(_data)
            labels.append(int(line_data[-1]))
    return np.mat(train_data), np.array(labels).T


def pre_process(data, alpha=0.99, is_total=False):
    """离差标准化(0, 1)"""
    m, n = np.shape(data)
    ret = np.zeros((m, n))
    for i in range(m):
        total = np.sum(data[i, :])
        max_value = np.max(data[i, :])
        for j in range(n):
            if is_total:
                ret[i, j] = data[i, j] / total * alpha
            else:
                ret[i, j] = [data[i, j], 1][data[i, j] == 0] / max_value * alpha
    return ret


In [3]:
# 训练数据路径
train_data_path = r"C:\Study\github\Lookoops\tool\bone_segementation_use_stacking\data.txt"
train_raw, _labels = get_train_data(train_data_path, 2)
train_raw = pre_process(train_raw)
train_dataset = pd.DataFrame(train_raw)
labels = pd.DataFrame(_labels)
train_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,0.0006,0.0032,0.001055,0.001727,0.003509,0.011182,0.016764,0.021691,0.0218,0.030291,...,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05
1,0.000203,0.419711,0.00099,0.004577,0.011115,0.013969,0.018935,0.037278,0.057617,0.077153,...,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06
2,0.000827,1.6e-05,1.6e-05,1.6e-05,0.000203,0.001545,0.005897,0.016553,0.03451,0.064886,...,0.005632,0.006241,0.005507,0.005975,0.005258,0.00429,0.00429,0.003245,0.002091,0.001498
3,0.00039,0.019799,8e-06,8e-06,8e-06,8e-06,8e-06,0.000284,0.053961,0.456731,...,0.001056,0.000975,0.001081,0.001535,0.004387,0.002177,8e-06,8e-06,8e-06,8e-06
4,0.001272,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,...,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05


In [4]:
labels.head()

Unnamed: 0,0
0,25
1,21
2,27
3,14
4,95


In [21]:
_train = np.hstack((train_dataset,labels))
train = pd.DataFrame(_train)
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,247,248,249,250,251,252,253,254,255,256
0,0.0006,0.0032,0.001055,0.001727,0.003509,0.011182,0.016764,0.021691,0.0218,0.030291,...,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,1.8e-05,25.0
1,0.000203,0.419711,0.00099,0.004577,0.011115,0.013969,0.018935,0.037278,0.057617,0.077153,...,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,9e-06,21.0
2,0.000827,1.6e-05,1.6e-05,1.6e-05,0.000203,0.001545,0.005897,0.016553,0.03451,0.064886,...,0.006241,0.005507,0.005975,0.005258,0.00429,0.00429,0.003245,0.002091,0.001498,27.0
3,0.00039,0.019799,8e-06,8e-06,8e-06,8e-06,8e-06,0.000284,0.053961,0.456731,...,0.000975,0.001081,0.001535,0.004387,0.002177,8e-06,8e-06,8e-06,8e-06,14.0
4,0.001272,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,...,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,3.9e-05,95.0


In [22]:
_y = train[256]
_y.head()

0    25.0
1    21.0
2    27.0
3    14.0
4    95.0
Name: 256, dtype: float64

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.metrics import mean_squared_error

In [24]:
# 获得均差
def cv_rmse(model, X, y):
    # cross_val_score函数用法：https://www.cnblogs.com/lzhc/p/9175707.html
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5))
    return rmse

In [25]:
models = [
    KNeighborsClassifier(),# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
    SVC(), # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
    DecisionTreeClassifier(), # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
    RandomForestClassifier(), # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
    ExtraTreesClassifier(), # 
    AdaBoostClassifier(), # 
    GradientBoostingClassifier (), # 
#     VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier()), ('gnb', GaussianNB())],
#                      voting='soft', flatten_transform=True), # 
    BaggingClassifier(), # 
    GaussianNB(), # 
    LogisticRegression(), # 
    XGBClassifier(), # 
]

In [26]:
# 查看各个模型对数据的分类能力， 差的不要
names = ["KNeighbors", "SVC", "DecisionTree", "RandomForest", "ExtraTrees", "AdaBoost", "GradientBoosting", "Voting","Bagging","GaussianNB","LogisticRegression","XGB"]
for name, model in zip(names, models):
    score = cv_rmse(model, train_dataset, _y)
    print("{}: {:.6f}, {:.4f}".format(name,score.mean(),score.std()))

KNeighbors: 10.771922, 10.2362
SVC: 12.710739, 8.7934
DecisionTree: 8.075998, 6.4435
RandomForest: 7.749291, 5.3838
ExtraTrees: 7.563148, 4.9892
AdaBoost: 12.359919, 8.5791
GradientBoosting: 9.394859, 7.1262
Voting: 9.216266, 9.1847
Bagging: 9.651912, 6.9889
GaussianNB: 5.884041, 1.8734


  if diff:
  if diff:
  if diff:
  if diff:


LogisticRegression: 8.750351, 7.3658


  if diff:


In [27]:
# 网格搜索
class grid():
    def __init__(self, model):
        self.model = model
        
    def grid_train(self, X, y, train_para):
        grid_search = GridSearchCV(self.model, train_para, cv=5, scoring="neg_mean_squared_error")
        grid_search.fit(X, y)
        print(grid_search.best_params_, np.sqrt(-grid_search.best_score_)) # 打印最好的结果
        grid_search.cv_results_['mean_test_score'] = np.sqrt(-grid_search.cv_results_['mean_test_score'])
        print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])

In [28]:
# 调参
grid(SVC()).grid_train(train_dataset, _y, {'kernel': ['rbf', 'poly', 'sigmoid'], 'gamma':[0.05, 0.1, 0.5, 1, 5, 10, 15], 'degree':[2, 3]})

{'degree': 2, 'gamma': 5, 'kernel': 'poly'} 8.339176933078123
                                               params  mean_test_score  \
0       {'degree': 2, 'gamma': 0.05, 'kernel': 'rbf'}        18.184536   
1      {'degree': 2, 'gamma': 0.05, 'kernel': 'poly'}        16.925292   
2   {'degree': 2, 'gamma': 0.05, 'kernel': 'sigmoid'}        19.963637   
3        {'degree': 2, 'gamma': 0.1, 'kernel': 'rbf'}        16.239434   
4       {'degree': 2, 'gamma': 0.1, 'kernel': 'poly'}        15.095753   
5    {'degree': 2, 'gamma': 0.1, 'kernel': 'sigmoid'}        19.208475   
6        {'degree': 2, 'gamma': 0.5, 'kernel': 'rbf'}        17.496939   
7       {'degree': 2, 'gamma': 0.5, 'kernel': 'poly'}        12.167246   
8    {'degree': 2, 'gamma': 0.5, 'kernel': 'sigmoid'}        17.341115   
9          {'degree': 2, 'gamma': 1, 'kernel': 'rbf'}        17.449011   
10        {'degree': 2, 'gamma': 1, 'kernel': 'poly'}        11.974110   
11     {'degree': 2, 'gamma': 1, 'kernel': 'sigmoi

In [29]:
grid(DecisionTreeClassifier()).grid_train(train_dataset, _y, {})

{} 10.927549132777262
  params  mean_test_score  std_test_score
0     {}        10.927549      125.618193


In [12]:
# stacking集成学习
class stacking(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, model, fusion_model):
        self.model = model
        self.fusion_model = fusion_model
        self.kf = KFold(n_splits=14, random_state=50, shuffle=True)
        
    def fit(self, X, y):
        self.model_saved = [list() for i in self.model] 
        train_pred = np.zeros((X.shape[0], len(self.model))) # 存储每个数据被预测的结果， 其结果使用融合模型进行训练
        
        for i,mod in enumerate(self.model):
            for train_index, value_index in self.kf.split(X, y):
                tmp_model = clone(mod)
                tmp_model.fit(X[train_index], y[train_index])
                self.model_saved[i].append(tmp_model)
                train_pred[value_index, i] = tmp_model.predict(X[value_index])
        self.fusion_model.fit(train_pred, y) # 将训练数据预测结果作为融合模型的输入训练数据
        
        return self
    
    def predict(self, X):
        test_mean = np.column_stack([np.column_stack(mod.predict(X) for mod in tmp_model).mean(axis=1) for tmp_model in self.model_saved]) # 对每个test数据进行预测并取平局值
        return self.fusion_model.predict(test_mean)


In [13]:
# 根据上面调参得到的参数填入下面模型中作为最优模型
train_models = [
    KNeighborsClassifier(),
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier (),
    VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier()), ('gnb', GaussianNB())],
                     voting='soft', flatten_transform=True),
    BaggingClassifier(),
    GaussianNB(),
    LogisticRegression(),
    XGBClassifier(),
]

In [14]:
stack_model = stacking(train_models, LogisticRegression()) # 选择一个简单并且rms较低的模型作为融合模型

In [15]:
stack_model.fit(train_dataset, _y) # 训练模型

KeyError: '[256 257 258 259 260 261 262 263 265 266 267 268 269 270 271 272 273 274\n 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292\n 293 295 296 297 298 299 300 301 303 304 305 306 307 308 309 311 312 313\n 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331\n 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349\n 350 351 352 353 354 356 357 358 359 360 361 362 363 364 365 366 367 369\n 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387\n 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405] not in index'

In [None]:
import pickle 
fp = open("pickle_prediction_model.pkl", "wb")
pickle.dump(stack_model, fp)
fp.close()