# xgb模型

In [1]:
import pandas as pd
import numpy as np
import math
import logging
from datetime import datetime

from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, auc

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [2]:
logger = logging.getLogger('ai')
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s')

In [3]:
IS_PRED = False

## 模型

### 数据预处理

In [4]:
dataset = pd.read_csv('../features/dataset.csv')

dataset_beta = dataset[dataset.date_block_num < 33]
dataset_alpha = dataset[dataset.date_block_num == 33]

In [5]:
dataset.date_block_num.unique()

array([25, 26, 27, 28, 29, 30, 31, 32, 33])

In [5]:
# dataset_pred = pd.read_csv('../features/dataset_pred.csv')

In [6]:
if IS_PRED:
    dataset_beta = pd.concat([dataset_alpha, dataset_beta])
    dataset_alpha = dataset_pred

In [7]:
continous = [
    'shop_id', 'item_id', 'date_block_num',
       'item_avg_price', 'item_category_id', 'sis1', 'sis2',
       'sis3', 'sis4', 'sis5', 'sis6', 'sis7', 'sis8', 'sis9', 'sis10',
       'sis11', 'sis12', 'sis13', 'sis14', 'sis15', 'sis16', 'sis17',
       'sis18', 'sis19', 'sis20', 'sis21', 'sis22', 'sis23', 'sis24',
       'sis25', 'sis26', 'sis27', 'sis28', 'sis29', 'sis30', 'sis31',
       'sis32', 'sis33', 'sis34', 'sis35', 'sis36', 'sis37', 'sis38',
       'sis39', 'sis40', 'sis41', 'sis42', 'sis43', 'sis44', 'sis45',
       'sis46', 'sis47', 'sis48', 'sis49', 'sis50', 'sis51', 'sis52',
       'sis53', 'sis54', 'sis55', 'sis56', 'sis57', 'sis58', 'sis59',
       'sis60', 'sis61', 'is1', 'is2', 'is3', 'is4', 'is5', 'is6', 'is7',
       'is8', 'is9', 'is10', 'is11', 'is12', 'is13', 'is14', 'is15',
       'is16', 'is17', 'is18', 'is19', 'is20', 'is21', 'is22', 'is23',
       'is24', 'is25', 'is26', 'is27', 'is28', 'is29', 'is30', 'is31',
       'is32', 'is33', 'is34', 'is35', 'is36', 'is37', 'is38', 'is39',
       'is40', 'is41', 'is42', 'is43', 'is44', 'is45', 'is46', 'is47',
       'is48', 'is49', 'is50', 'is51', 'is52', 'is53', 'is54', 'is55',
       'is56', 'is57', 'is58', 'is59', 'is60', 'is61', 's1', 's2', 's3',
       's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13',
       's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21', 's22',
       's23', 's24', 's25', 's26', 's27', 's28', 's29', 's30', 's31',
       's32', 's33', 's34', 's35', 's36', 's37', 's38', 's39', 's40',
       's41', 's42', 's43', 's44', 's45', 's46', 's47', 's48', 's49',
       's50', 's51', 's52', 's53', 's54', 's55', 's56', 's57', 's58',
       's59', 's60', 's61', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7',
       'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'c15', 'c16', 'c17',
       'c18', 'c19', 'c20', 'c21', 'c22', 'c23', 'c24', 'c25', 'c26',
       'c27', 'c28', 'c29', 'c30', 'c31', 'c32', 'c33', 'c34', 'c35',
       'c36', 'c37', 'c38', 'c39', 'c40', 'c41', 'c42', 'c43', 'c44',
       'c45', 'c46', 'c47', 'c48', 'c49', 'c50', 'c51', 'c52', 'c53',
       'c54', 'c55', 'c56', 'c57', 'c58', 'c59', 'c60', 'c61', 'ic1',
       'ic2', 'ic3', 'ic4', 'ic5', 'ic6', 'ic7', 'ic8', 'ic9', 'ic10',
       'ic11', 'ic12', 'ic13', 'ic14', 'ic15', 'ic16', 'ic17', 'ic18',
       'ic19', 'ic20', 'ic21', 'ic22', 'ic23', 'ic24', 'ic25', 'ic26',
       'ic27', 'ic28', 'ic29', 'ic30', 'ic31', 'ic32', 'ic33', 'ic34',
       'ic35', 'ic36', 'ic37', 'ic38', 'ic39', 'ic40', 'ic41', 'ic42',
       'ic43', 'ic44', 'ic45', 'ic46', 'ic47', 'ic48', 'ic49', 'ic50',
       'ic51', 'ic52', 'ic53', 'ic54', 'ic55', 'ic56', 'ic57', 'ic58',
       'ic59', 'ic60', 'ic61', 't1', 't2', 't3', 't4', 't5', 't6', 't7',
       't8', 't9', 't10', 't11', 't12', 't13', 't14', 't15', 't16', 't17',
       't18', 't19', 't20', 't21', 't22', 't23', 't24', 't25', 't26',
       't27', 't28', 't29', 't30', 't31', 't32', 't33', 't34', 't35',
       't36', 't37', 't38', 't39', 't40', 't41', 't42', 't43', 't44',
       't45', 't46', 't47', 't48', 't49', 't50', 't51', 't52', 't53',
       't54', 't55', 't56', 't57', 't58', 't59', 't60', 't61', 'st1',
       'st2', 'st3', 'st4', 'st5', 'st6', 'st7', 'st8', 'st9', 'st10',
       'st11', 'st12', 'st13', 'st14', 'st15', 'st16', 'st17', 'st18',
       'st19', 'st20', 'st21', 'st22', 'st23', 'st24', 'st25', 'st26',
       'st27', 'st28', 'st29', 'st30', 'st31', 'st32', 'st33', 'st34',
       'st35', 'st36', 'st37', 'st38', 'st39', 'st40', 'st41', 'st42',
       'st43', 'st44', 'st45', 'st46', 'st47', 'st48', 'st49', 'st50',
       'st51', 'st52', 'st53', 'st54', 'st55', 'st56', 'st57', 'st58',
       'st59', 'st60', 'st61'
]

fields = [
    'city', 'type', 'subtype'
]

label = ['item_cnt_month']

In [8]:
features_pipeline = Pipeline([
    ('features', FeatureUnion([
        ('continuous', Pipeline([
            ('extract', ColumnSelector(continous)),
            ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
#             ('normalize', Normalizer())
        ])),
        ('fields', Pipeline([
            ('extract', ColumnSelector(fields)),
            ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
            ('one_hot', OneHotEncoder(categories='auto', handle_unknown='ignore')),
            ('to_dense', DenseTransformer())
        ])),
    ])),
])

features_pipeline.fit(dataset_beta, dataset_beta[label].values.ravel())

train_dataset_x = features_pipeline.transform(dataset_beta)
train_dataset_y = dataset_beta[label].values.ravel()

valid_dataset_x = features_pipeline.transform(dataset_alpha)

if not IS_PRED:
    valid_dataset_y = dataset_alpha[label].values.ravel()

In [None]:
selector_model = xgb.sklearn.XGBRegressor(max_depth=3, n_estimators=100, random_state=0)
selector_model.fit(train_dataset_x, train_dataset_y)

In [None]:
thresh = 0.0
selection = SelectFromModel(selector_model, threshold=thresh, prefit=True)

train_dataset_x = selection.transform(train_dataset_x)
valid_dataset_x = selection.transform(valid_dataset_x)

In [None]:
selector_model.feature_importances_

In [None]:
feature_selector = []
for index, value in enumerate(selector_model.feature_importances_):
    if value > 0 and index < len(continous):
        feature_selector.append((continous[index], value))

feature_selector

### 模型训练

#### 调参数

In [None]:
parameters = {
    'xgb__learn_rate': [0.01, ],
    'xgb__max_depth': [6],
    'xgb__min_child_weight': [1],
    'xgb__subsample': [0.7,],
    'xgb__colsample_bytree': [0.7,],
    'xgb__colsample_bylevel': [0.7,],
    'xgb__objective': ['rank:pairwise'],
    'xgb__n_estimators': range(100, 401, 100), # 使用1-3都可以被接受
    'xgb__gamma': [0.1,],
    'xgb__reg_alpha': [1,],
    'xgb__reg_lambda': [1,],
    'xgb__max_delta_step': [0,],
    'xgb__scale_pos_weight': [1,],
    'xgb__silent': [True],
    'xgb__eval_metric': ['auc']
}

cv = GridSearchCV(model_pipeline, parameters, scoring = 'roc_auc', n_jobs= 4)
cv.fit(train_dataset_x, train_dataset_y)

In [None]:
# 查看每组评估的具体数据
cv.cv_results_['param_xgb__n_estimators'].data

# 结果训练
cv.cv_results_['mean_train_score']

#### 最优参数训练

In [None]:
model_pipeline = Pipeline([
    ('xgb', xgb.sklearn.XGBRegressor())
])

model_pipeline.set_params(
    xgb__learn_rate=0.1,
    xgb__max_depth=12,
    xgb__min_child_weight=100,
    xgb__subsample=0.8,
    xgb__colsample_bytree=0.8,
    xgb__colsample_bylevel=0.8,
    xgb__n_estimators=300,
#     xgb__gamma=0,
#     xgb__reg_alpha=0,
#     xgb__reg_lambda=10,
#     xgb__objective='reg:linear',
    xgb__silent=True,
    xgb__eval_metric='rmse'
).fit(train_dataset_x, train_dataset_y)

In [None]:
from sklearn.metrics import mean_squared_error

class Evaluator():
    def __init__(self, df, pipe):
        self.df = df
        self.pipe = pipe
        
    def cal(self, dataset):
        pred = self.pipe.predict(dataset)
        logging.info(pred)
        
        return mean_squared_error(self.df[label], pred)
    
    def predict(self, dataset):
        pred = self.pipe.predict(dataset)
        
        self.df['pred'] = pred
        result = self.df[['ID', 'item_cnt_month']]
        return result

In [None]:
evaluator = Evaluator(dataset_alpha, model_pipeline)

if IS_PRED:
    final_result_df = evaluator.predict(valid_dataset_x)
    final_result_df.to_csv('/Users/leewind/Desktop/submission_20190208.csv', index=False, header=False)
    final_result_df.describe()
else:
    logger.info(evaluator.cal(valid_dataset_x))

In [None]:
evaluator = Evaluator(dataset_beta, model_pipeline)
evaluator.cal(train_dataset_x)

In [34]:
from xgboost import XGBRegressor

model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    reg_lambda=10,
    eta=0.3,    
    seed=0)

model.fit(
    train_dataset_x, 
    train_dataset_y, 
    eval_metric="rmse", 
    eval_set=[(train_dataset_x, train_dataset_y), (valid_dataset_x, valid_dataset_y)], 
    verbose=True, 
    early_stopping_rounds = 10)

[0]	validation_0-rmse:17.9849	validation_1-rmse:7.26879
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:17.3652	validation_1-rmse:7.26995
[2]	validation_0-rmse:16.806	validation_1-rmse:7.20502
[3]	validation_0-rmse:16.2861	validation_1-rmse:7.83759
[4]	validation_0-rmse:15.7423	validation_1-rmse:7.81087
[5]	validation_0-rmse:15.2689	validation_1-rmse:8.00764
[6]	validation_0-rmse:14.9207	validation_1-rmse:7.98743
[7]	validation_0-rmse:14.5083	validation_1-rmse:7.99062
[8]	validation_0-rmse:14.1061	validation_1-rmse:8.94814
[9]	validation_0-rmse:13.7201	validation_1-rmse:9.19358
[10]	validation_0-rmse:13.4098	validation_1-rmse:9.42763
[11]	validation_0-rmse:13.1206	validation_1-rmse:9.45643
[12]	validation_0-rmse:12.8756	validation_1-rmse:9.66713
Stopping. Best iteration:
[2]	validation_0-rmse:16.806	validation_1-rmse:7.20502



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, eta=0.3, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=8, min_child_weight=1.1, missing=None,
       n_estimators=1000, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=0, reg_lambda=10, scale_pos_weight=1,
       seed=0, silent=True, subsample=0.8)