In [1]:
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

  from numpy.core.umath_tests import inner1d


In [2]:
import pickle
def getData(dataFilePath):
    """读取数据"""
    data = pickle.load(open(dataFilePath, 'rb'))

    return data 

In [3]:
import numpy as np
def cv_rmse(model, X, y):
    # cross_val_score函数用法：https://www.cnblogs.com/lzhc/p/9175707.html
    rmse = np.sqrt(cv_mse(model, X, y))
    return rmse

def cv_mse(model, X, y):
    rmse = -cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)
    return rmse

In [4]:
dataFilePath = r"C:\Study\github\Lookoops\tool\毕设代码\data\samples-data.data"
labelsFilePath = r"C:\Study\github\Lookoops\tool\毕设代码\data\samples-data-labels.data"
X = getData(dataFilePath)
y = getData(labelsFilePath)
Y = y.reshape(1, len(y))[0]
print(X)
print("*"*20)
print(Y)

[[  42.18 -142.08 -130.84  233.82  124.2 ]
 [ -60.    168.3  -121.16  -56.34  -77.  ]
 [ 285.96 -285.3   -98.88 -228.72   69.96]
 [ 144.66 -222.36 -160.4   290.82  -10.96]
 [ -61.56 -117.66 -111.24  153.24  -84.12]
 [  37.92  -66.42   27.56 -154.2   110.8 ]
 [-169.74  243.78   16.8  -269.46 -116.92]
 [  12.96   20.46 -194.68  175.86  181.8 ]
 [ 263.04  281.76  -30.56   -3.6  -105.56]
 [ 195.42 -167.76 -216.36  242.16   15.8 ]
 [ -82.92   46.44 -260.96  200.1   144.08]
 [ 231.18 -240.72   69.2   -21.6   173.08]
 [-126.12  114.06   49.44 -126.54   59.92]
 [ 125.82  -81.54  -17.92 -112.38 -134.52]
 [-267.48   -7.14   -3.48   91.8  -132.68]
 [  96.6   -47.22 -237.88  134.16  195.88]
 [ -26.88 -128.88  -36.6  -252.84  -46.92]
 [ 219.18  239.7  -286.04   15.66 -174.8 ]
 [ 259.86 -216.6  -171.24 -197.46  -61.72]
 [-208.86  274.26 -224.88  -64.14  -34.32]
 [-235.32  193.14  -60.64   52.32 -167.36]
 [-111.42 -276.54   94.44  -91.5    36.84]
 [-150.72  210.72   36.08 -211.32 -189.4 ]
 [-189.    

In [5]:
from xgboost import XGBRegressor
models = [LinearRegression(),
          Ridge(), # http://www.cnblogs.com/pinard/p/6023000.html
          Lasso(alpha=0.01,max_iter=10000), # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html
          RandomForestRegressor(), # https://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestRegressor.html
          GradientBoostingRegressor(), # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
          SVR(), # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR
          LinearSVR(), # https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVR.html
          ElasticNet(alpha=0.001,max_iter=10000), # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
          SGDRegressor(max_iter=10000,tol=1e-3), # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html
          BayesianRidge(), # 
          KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5), # https://scikit-learn.org/stable/modules/generated/sklearn.kernel_ridge.KernelRidge.html
         ExtraTreesRegressor(), # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html
          XGBRegressor(), 
          AdaBoostRegressor(n_estimators=50), # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html
          BaggingRegressor(), # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html
          DecisionTreeRegressor(), #https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
          KNeighborsRegressor()] # https://scikit-learn.org/0.18/modules/generated/sklearn.neighbors.KNeighborsRegressor.html

In [6]:
names = ["LR", "Ridge", "Lasso", "RF", "GBR", "SVR", "LinSVR", "Ela","SGD","Bay","Ker","Extra","xgbr", "AdaBoost", "Bagging", "DT", "KN"]
for name, model in zip(names, models):
    score = cv_rmse(model, X, Y)
    print("{}: {:.6f}, {:.4f}".format(name,score.mean(), score.std()))

LR: 668.966995, 132.1050
Ridge: 668.965818, 132.1036
Lasso: 668.966895, 132.1049
RF: 650.058998, 82.3724
GBR: 833.616662, 64.4024
SVR: 614.468927, 128.1587
LinSVR: 28559.197786, 6796.9907
Ela: 668.966976, 132.1050
SGD: 250350937549766.687500, 89981549581914.5781
Bay: 578.561397, 119.6475
Ker: 12667.172637, 7050.5246
Extra: 738.173771, 94.6411
xgbr: 828.493549, 53.0106
AdaBoost: 760.629009, 113.3231
Bagging: 720.868170, 43.6084
DT: 924.218702, 56.1583
KN: 599.128679, 117.7451


In [7]:
import pandas as pd
class grid():
    def __init__(self, model):
        self.model = model
        
    def grid_train(self, X, y, train_para):
        grid_search = GridSearchCV(self.model, train_para, cv=5, scoring="neg_mean_squared_error", return_train_score=True)
        grid_search.fit(X, y)
        print(grid_search.best_params_, np.sqrt(-grid_search.best_score_)) # 打印最好的结果
        grid_search.cv_results_['mean_test_score'] = np.sqrt(-grid_search.cv_results_['mean_test_score'])
#         print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])

In [8]:
grid(Lasso()).grid_train(X,Y,{'alpha': [0.001, 0.002, 0.0003, 0.00035, 0.0004,0.0005,0.0007,0.0006,0.0009,0.0008], 'max_iter':[10000]})

{'alpha': 0.002, 'max_iter': 10000} 681.8860239277785


In [9]:
grid(Ridge()).grid_train(X,Y,{'alpha':[i for i in range(10, 20)]})

{'alpha': 19} 681.8589080937944


In [10]:
grid(RandomForestRegressor()).grid_train(X, Y,{'max_depth':[i for i in range(1, 10)]+[None], 'n_estimators':[i for i in range(10, 120, 10)]})

{'max_depth': 1, 'n_estimators': 60} 633.4422509530473


In [11]:
grid(GradientBoostingRegressor()).grid_train(X,Y,{'learning_rate':[float(i/10) for i in range(1, 10)], 'n_estimators':[i for i in range(50, 150, 10)]})

{'learning_rate': 0.6, 'n_estimators': 80} 798.3916762816986


In [12]:
grid(SVR()).grid_train(X,Y,
                       {
                           'kernel':['rbf'], 
                           'gamma':[0.0001, 0.0005, 0.001,0.005, 0.01, 0.05, 0.1, 0.5],
                           'epsilon':[0.0005, 0.001,0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 20, 50, 100, 500, 1000, 5000]
                       })

{'epsilon': 1000, 'gamma': 0.005, 'kernel': 'rbf'} 583.9436418446987


In [13]:
grid(LinearSVR()).grid_train(X,Y,{'epsilon':[0.0005, 0.001,0.005, 0.01, 0.05, 0.1, 0.5], 'loss':['epsilon_insensitive', 'squared_epsilon_insensitive']})

{'epsilon': 0.001, 'loss': 'epsilon_insensitive'} 29354.641398160365


In [14]:
grid(GradientBoostingRegressor()).grid_train(X,Y,{'learning_rate':[float(i/10) for i in range(1, 10)]})

{'learning_rate': 0.2} 815.7000128990016


In [15]:
grid(ElasticNet()).grid_train(X,Y,{'alpha':[0.0005, 0.001,0.005, 0.01, 0.05, 0.1, 0.5],'l1_ratio':[0.08,0.1,0.3,0.5,0.7],'max_iter':[10000]})

{'alpha': 0.5, 'l1_ratio': 0.08, 'max_iter': 10000} 681.8697631819316


In [16]:
# 报错
# grid(SGDRegressor()).grid_train(X,Y,{'alpha':[0.005, 0.01, 0.05, 0.1,0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 5],'l1_ratio':[0.08,0.1,0.3,0.5,0.7, 0.8, 0.9, 1]})

In [17]:
grid(BayesianRidge()).grid_train(X,Y, {'alpha_1':[1e-6, 5e-6, 1e-5], 'alpha_2':[5e-7, 1e-6], 'tol':[0.0001, 0.0005, 0.001, 0.002, 0.005, 0.01]})

{'alpha_1': 1e-06, 'alpha_2': 1e-06, 'tol': 0.0001} 590.7314031155472


In [18]:
grid(KernelRidge()).grid_train(X,Y,{'alpha':[0.05, 0.1, 0.3,0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 5], 'kernel':['polynomial'], 'coef0':[1, 1.2, 1.5, 1.6, 1.8, 1.9, 2, 2.2, 2.5, 3]})

{'alpha': 5, 'coef0': 3, 'kernel': 'polynomial'} 189772.72357015283


In [19]:
grid(ExtraTreesRegressor()).grid_train(X, Y,{'min_samples_split':[i/10 for i in range(1, 10)], 'min_samples_leaf':[i/10 for i in range(1, 5)]})

{'min_samples_leaf': 0.4, 'min_samples_split': 0.9} 590.7041906622155


In [20]:
grid(AdaBoostRegressor()).grid_train(X, Y, {'n_estimators':[i for i in range(10, 100, 10)], 'learning_rate':[i/10 for i in range(1, 10)]})

{'learning_rate': 0.4, 'n_estimators': 40} 676.6760069325853


In [21]:
grid(BaggingRegressor()).grid_train(X, Y, {'n_estimators':[i for i in range(1, 50)]})

{'n_estimators': 30} 650.7333432071651


In [22]:
grid(DecisionTreeRegressor()).grid_train(X, Y, {})

{} 919.0267071623101


In [23]:
grid(KNeighborsRegressor()).grid_train(X, Y, {'n_neighbors':[i for i in range(1, 20)]})

{'n_neighbors': 19} 594.9841725357646
