# 自动机器学习工具
该notebook将比较市面上的多个AutoAI工具，分别采用两组数据集进行比较，分别是加利福尼亚住房数据集（回归）和森林植被类型（多分类）

# optuna一种超参数优化框架
https://github.com/optuna/optuna

### 波士顿房价预测任务（回归）

In [7]:
import numpy as np
import pandas as pd
import time
import gc

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb  # 使用lgb模型

In [2]:
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()
X, y = data['data'], data['target']

In [3]:
X = pd.DataFrame(X,columns=data.feature_names)
X.head(2)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22


In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [5]:
# 切分训练和测试集
train_valid_x, test_x, train_valid_y, test_y = train_test_split(X, y,random_state=42,test_size=0.1)

# 切分训练和验证集
train_x, valid_x, train_y, valid_y = train_test_split(train_valid_x, train_valid_y,random_state=42)
print('训练集：',train_x.shape)
print('验证集：',valid_x.shape)
print('测试集：', test_x.shape)

训练集： (13932, 8)
验证集： (4644, 8)
测试集： (2064, 8)


### 使用LGB作为模型，不使用optuna调参

In [12]:
params = {'boosting_type': 'gbdt',
          'objective': 'regression',
          "metric": 'rmse'}
dtrain = lgb.Dataset(train_x, label=train_y)
dvalid = lgb.Dataset(valid_x, label=valid_y)
model = lgb.train(params=params, train_set=dtrain,valid_sets=[dvalid],
                  verbose_eval=50,
                  early_stopping_rounds=20,
                  num_boost_round=5000)
predict = model.predict(test_x)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 13932, number of used features: 8
[LightGBM] [Info] Start training from score 2.072422
Training until validation scores don't improve for 20 rounds
[50]	valid_0's rmse: 0.492877
[100]	valid_0's rmse: 0.471803
[150]	valid_0's rmse: 0.46445
[200]	valid_0's rmse: 0.459716
[250]	valid_0's rmse: 0.456658
[300]	valid_0's rmse: 0.454223
[350]	valid_0's rmse: 0.452644
Early stopping, best iteration is:
[342]	valid_0's rmse: 0.452522


In [14]:
# 评估指标rmse，越小越好
np.sqrt(mean_squared_error(test_y,predict))

0.4531666044672748

### 使用LGB作为模型，使用optuna调参

In [41]:
pip install optuna

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting optuna
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/2a/b4/061c8721b5da1614794d1b66fcb212eee156efd5284f66854d02f295b0be/optuna-2.9.1-py3-none-any.whl (302 kB)
Collecting cmaes>=0.8.2
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/01/1f/43b01223a0366171f474320c6e966c39a11587287f098a5f09809b45e05f/cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting cliff
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/18/f7/2a98b032a43b2925ea32bc13a8feb6cf9416e7d2b2c0f6d2ce14636a03b1/cliff-3.9.0-py3-none-any.whl (80 kB)
Collecting alembic
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/7a/5a/be479a2c379e6b3c57dc56ea3b139ad4d46c2d244a0035ac4d7475116076/alembic-1.7.1-py3-none-any.whl (208 kB)
Collecting colorlog
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/2d/93/4b0bb101e54206e92feb3c986c274902212b2ed8c55423e6e7f6d8b693ca/colorlog-6.4.1-py2.py3-none-any.whl (11 kB)
Collecting importlib-res

In [15]:
import optuna

In [18]:
def objective(trial,train_x, valid_x, train_y, valid_y):
    dtrain = lgb.Dataset(train_x, label=train_y)
    dvalid = lgb.Dataset(valid_x, label=valid_y)

    param = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        'random_state':42,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    # Add a callback for pruning.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "rmse")
    gbm = lgb.train(
        param, dtrain, valid_sets=[dvalid], verbose_eval=False, callbacks=[pruning_callback]
    )

    preds = gbm.predict(valid_x)
    pred_labels = np.rint(preds)
    rmse = np.sqrt(mean_squared_error(valid_y,pred_labels))
    return rmse

In [19]:
if __name__ == "__main__":
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), direction="minimize"
    )
    study.optimize(lambda trial: objective(trial, train_x, valid_x, train_y, valid_y), n_trials=100)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2021-09-07 09:26:30,952][0m A new study created in memory with name: no-name-f2147511-069d-495f-90ec-5990dc3c3716[0m
[32m[I 2021-09-07 09:26:31,284][0m Trial 0 finished with value: 0.5579165649228477 and parameters: {'lambda_l1': 1.6814939560853405e-06, 'lambda_l2': 8.772634980486007, 'num_leaves': 79, 'feature_fraction': 0.4415746779386105, 'bagging_fraction': 0.5306839081914155, 'bagging_freq': 5, 'min_child_samples': 13}. Best is trial 0 with value: 0.5579165649228477.[0m
[32m[I 2021-09-07 09:26:31,523][0m Trial 1 finished with value: 0.5577111827415846 and parameters: {'lambda_l1': 0.00013684423280171766, 'lambda_l2': 0.010116712675880523, 'num_leaves': 42, 'feature_fraction': 0.4758659908396936, 'bagging_fraction': 0.8916447942940564, 'bagging_freq': 7, 'min_child_samples': 32}. Best is trial 1 with value: 0.5577111827415846.[0m
[32m[I 2021-09-07 09:26:31,986][0m Trial 2 finished with value: 0.5382832732103667 and parameters: {'lambda_l1': 1.849141022226266, 'lam

[32m[I 2021-09-07 09:26:46,126][0m Trial 26 finished with value: 0.5310399189972692 and parameters: {'lambda_l1': 5.65218762871428e-06, 'lambda_l2': 1.6944446205708956, 'num_leaves': 256, 'feature_fraction': 0.6693624417062835, 'bagging_fraction': 0.8606541857435407, 'bagging_freq': 4, 'min_child_samples': 25}. Best is trial 20 with value: 0.5262550702976.[0m
[32m[I 2021-09-07 09:26:47,541][0m Trial 27 finished with value: 0.5289141422774571 and parameters: {'lambda_l1': 3.372780137131952e-07, 'lambda_l2': 0.004231573716537988, 'num_leaves': 222, 'feature_fraction': 0.5834642111209242, 'bagging_fraction': 0.9283224808721449, 'bagging_freq': 3, 'min_child_samples': 33}. Best is trial 20 with value: 0.5262550702976.[0m
[32m[I 2021-09-07 09:26:47,734][0m Trial 28 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-09-07 09:26:47,883][0m Trial 29 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-09-07 09:26:48,032][0m Trial 30 pruned. Trial was pruned at iteration

[32m[I 2021-09-07 09:27:09,210][0m Trial 68 finished with value: 0.524942396458539 and parameters: {'lambda_l1': 0.0011590093830295628, 'lambda_l2': 0.06353809127053359, 'num_leaves': 198, 'feature_fraction': 0.7106295012909083, 'bagging_fraction': 0.973655849802224, 'bagging_freq': 5, 'min_child_samples': 7}. Best is trial 41 with value: 0.5233364787469251.[0m
[32m[I 2021-09-07 09:27:09,572][0m Trial 69 pruned. Trial was pruned at iteration 29.[0m
[32m[I 2021-09-07 09:27:10,511][0m Trial 70 finished with value: 0.5227354154865925 and parameters: {'lambda_l1': 0.0019204609823774911, 'lambda_l2': 0.006957053438847566, 'num_leaves': 200, 'feature_fraction': 0.7174135338967843, 'bagging_fraction': 0.9747628414441452, 'bagging_freq': 5, 'min_child_samples': 8}. Best is trial 70 with value: 0.5227354154865925.[0m
[32m[I 2021-09-07 09:27:11,445][0m Trial 71 pruned. Trial was pruned at iteration 72.[0m
[32m[I 2021-09-07 09:27:12,707][0m Trial 72 finished with value: 0.5273794666

Number of finished trials: 100
Best trial:
  Value: 0.5196727190561812
  Params: 
    lambda_l1: 0.0025915189209523223
    lambda_l2: 0.37941398551753863
    num_leaves: 159
    feature_fraction: 0.6942886423659413
    bagging_fraction: 0.9888819332176355
    bagging_freq: 6
    min_child_samples: 5


In [24]:
params = {'boosting_type': 'gbdt',
          'objective': 'regression',
          "metric": 'rmse'}
for key, value in trial.params.items():
    params[key]=value
print(params)

{'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'rmse', 'lambda_l1': 0.0025915189209523223, 'lambda_l2': 0.37941398551753863, 'num_leaves': 159, 'feature_fraction': 0.6942886423659413, 'bagging_fraction': 0.9888819332176355, 'bagging_freq': 6, 'min_child_samples': 5}


In [25]:
dtrain = lgb.Dataset(train_x, label=train_y)
dvalid = lgb.Dataset(valid_x, label=valid_y)
model = lgb.train(params=params, train_set=dtrain,valid_sets=[dvalid],
                  verbose_eval=50,
                  early_stopping_rounds=20,
                  num_boost_round=5000)
predict = model.predict(test_x)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 13932, number of used features: 8
[LightGBM] [Info] Start training from score 2.072422
Training until validation scores don't improve for 20 rounds
[50]	valid_0's rmse: 0.459286
[100]	valid_0's rmse: 0.449051
[150]	valid_0's rmse: 0.448356
Early stopping, best iteration is:
[166]	valid_0's rmse: 0.447872


In [26]:
# 评估指标rmse，越小越好
np.sqrt(mean_squared_error(test_y,predict))

0.44403838770137805

### 回归任务的结论
不使用optuna的分数是0.4531666044672748，使用的分数是0.44403838770137805，提升了0.00912821676589675。