In [17]:
import numpy as np

import pandas as pd

from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error

import xgboost as xgb

import math
import feature_engineering as fe

In [18]:
# Directory prefixes (relative to the working directory) used when reading the
# input CSVs and when writing the timestamped submission file.
INPUT_PATH = '../input/'
OUTPUT_PATH = '../output/'


def load_data():
    """Read the training and submission CSVs from ``INPUT_PATH``.

    The ``target`` column of the training frame is replaced by its
    ``data_log`` transform before returning.

    Returns:
        tuple: ``(train, submit)`` as pandas DataFrames.
    """
    train_csv = INPUT_PATH + 'trainv1.csv'
    submit_csv = INPUT_PATH + 'submitv1.csv'

    train = pd.read_csv(train_csv)
    # Element-wise transform of the label; data_log works on scalars.
    train['target'] = train['target'].map(data_log)
    submit = pd.read_csv(submit_csv)
    return train, submit


def get_xgb_imp(xgb, feat_names):
    """Return per-feature importance scores normalised to sum to 1.

    Args:
        xgb: Trained model object exposing ``get_fscore()``, which must return
            a mapping whose keys are looked up as ``'f<index>'``. Note that
            inside this function the parameter shadows the module-level
            ``xgb`` import.
        feat_names: Sequence of feature names; position ``i`` is matched to
            the score stored under ``'f' + str(i)``.

    Returns:
        dict: ``{feature_name: share}``. Features missing from the score
        mapping get 0.0. If every score is zero (or ``feat_names`` is empty)
        the un-normalised zeros are returned instead of dividing by zero.
    """
    imp_vals = xgb.get_fscore()
    imp_dict = {
        name: float(imp_vals.get('f' + str(idx), 0.))
        for idx, name in enumerate(feat_names)
    }
    # Built-in sum works on a dict view under both Python 2 and 3; wrapping
    # the view in numpy.array yields a 0-d object array on Python 3.
    total = sum(imp_dict.values())
    if total == 0:
        return imp_dict
    return {name: score / total for name, score in imp_dict.items()}


def merge_dic(dicts):
    """Combine several dicts into one, adding values that share a key.

    Args:
        dicts: Iterable of mappings whose values support ``+``.

    Returns:
        dict: For each key, the left-to-right ``+`` of all values seen for it.
    """
    merged = {}
    for mapping in dicts:
        for key in mapping:
            value = mapping[key]
            if key in merged:
                merged[key] = merged[key] + value
            else:
                # First occurrence: store the value as-is.
                merged[key] = value
    return merged

def data_log(x):
    """Map a scalar to ``log5(x + 1)``, clamping non-positive inputs to 0.

    Args:
        x: A numeric scalar.

    Returns:
        ``0`` when ``x <= 0``; otherwise the base-5 logarithm of ``x + 1``
        as a float.
    """
    if x <= 0:
        return 0
    # Use the standard-library math module directly: ``np.math`` was only an
    # undocumented alias of it and is no longer present in recent NumPy.
    return math.log(x + 1, 5)

In [19]:
print('begin to load data')
# Read both frames, then pass them through the feature_engineering helper
# (presumably adds the cate_* columns -- confirm in feature_engineering.py).
train, submit = fe.add_cate_features(*load_data())

# Shuffled K-fold splitter with a fixed seed so the fold assignment is repeatable.
kfold = 10
skf = KFold(random_state=42, shuffle=True, n_splits=kfold)

begin to load data


In [20]:
# Booster parameters handed to xgb.train() in the cross-validation loop.
# NOTE(review): 'reg:linear' and 'silent' look like older xgboost spellings --
# confirm they are still accepted by the installed xgboost version.
params = {
        'objective': 'reg:linear',
        'max_depth': 6,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'eta': 0.025,
        'gamma': 1,
        'reg_alpha': 0.5,
        'reg_lambda': 0.8,
        'eval_metric': 'rmse',  # metric shown in the train/valid log and used for early stopping
        'min_child_weight': 2,
        'silent': 1,
        'nthread': 6
    }



In [21]:
# Columns kept out of the model inputs: identifiers, the label, the raw date,
# plus two hand-picked groups of engineered columns.
cum = ['comsume_counts_sum', 'comsume_amounts_sum', 'click_counts_sum']
actives = ['dev_median_loan_amount_cum', 'median_loan_amount_cum', 'dev_median_loan_amount', 'median_loan_amount']
non_feature_cols = ['uid', 'target', 'active_date'] + cum + actives
all_features = [col for col in train.columns if col not in non_feature_cols]

# Training matrix and label vector as plain numpy arrays (the CV loop indexes
# them positionally).
X = train[all_features].values
y = train.target.values

sub_id = submit.uid.values
# Select by name rather than dropping columns so the submission matrix has
# exactly the training features in exactly the training column order; a
# missing feature raises KeyError instead of silently misaligning columns.
to_submit = submit[all_features]

# Submission frame; 'target' accumulates the fold-averaged predictions, so it
# must start as float zeros (zeros_like(sub_id) would inherit the uid dtype).
sub = pd.DataFrame()
sub['uid'] = sub_id
sub['target'] = np.zeros(len(sub_id), dtype=float)

# Per-fold validation RMSE values and per-fold feature-importance dicts.
scores = []
importances = []

In [24]:
# Reset the accumulators here so that re-running this cell on its own neither
# appends to stale scores/importances nor double-counts the averaged
# submission predictions.
scores = []
importances = []
sub['target'] = np.zeros(len(sub), dtype=float)

# The submission matrix is the same for every fold, so build it once.
d_submit = xgb.DMatrix(to_submit.values)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    # Convert our data into XGBoost format
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # Up to 4000 rounds; training stops once the validation metric has not
    # improved for 70 rounds, logging every 50 rounds.
    mdl = xgb.train(params, d_train, 4000, watchlist, early_stopping_rounds=70, verbose_eval=50)

    # Normalised importance of each feature for this fold.
    f_importance = get_xgb_imp(mdl, all_features)
    print(f_importance)
    importances.append(f_importance)

    # Validation RMSE using only the trees up to the best iteration.
    valid_pred = mdl.predict(d_valid, ntree_limit=mdl.best_ntree_limit)
    score_this = np.sqrt(mean_squared_error(y_valid, valid_pred))
    print(score_this)
    scores.append(score_this)

    print('[Fold %d/%d Prediciton:]' % (i + 1, kfold))
    # Predict on our test data
    p_test = mdl.predict(d_submit, ntree_limit=mdl.best_ntree_limit)

    # Each fold contributes 1/kfold of the final averaged prediction.
    sub['target'] += p_test / kfold

[0]	train-rmse:2.45788	valid-rmse:2.47392
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 70 rounds.
[50]	train-rmse:1.87218	valid-rmse:1.87603
[100]	train-rmse:1.80416	valid-rmse:1.81191
[150]	train-rmse:1.79075	valid-rmse:1.80411
[200]	train-rmse:1.78395	valid-rmse:1.80261
[250]	train-rmse:1.77893	valid-rmse:1.80229
[300]	train-rmse:1.77495	valid-rmse:1.8022
[350]	train-rmse:1.77122	valid-rmse:1.80198
[400]	train-rmse:1.76727	valid-rmse:1.80191
Stopping. Best iteration:
[366]	train-rmse:1.76977	valid-rmse:1.80184

{'avg_click_count_cum': 0.013630361630532011, 'plannum': 0.041998551774076756, 'sex': 0.00085189760190825068, 'limit_get_promoted_ever': 0.004429867529922903, 'loan_amount_cum': 0.06171998125825276, 'consume_amount': 0.033522170635089665, 'dev_loan_amount': 0.049239681390296886, 'avg_loan_amount_cum': 0.0074115091366017804, 'active_day_of_week': 0.014056310431486136, 'median_click_count': 

1.80079705381
[Fold 4/10 Prediciton:]
[0]	train-rmse:2.46072	valid-rmse:2.44898
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 70 rounds.
[50]	train-rmse:1.87223	valid-rmse:1.87428
[100]	train-rmse:1.80357	valid-rmse:1.81436
[150]	train-rmse:1.79015	valid-rmse:1.80772
[200]	train-rmse:1.78344	valid-rmse:1.80669
[250]	train-rmse:1.77868	valid-rmse:1.80619
[300]	train-rmse:1.77462	valid-rmse:1.80604
[350]	train-rmse:1.77066	valid-rmse:1.80597
Stopping. Best iteration:
[318]	train-rmse:1.7732	valid-rmse:1.80595

{'avg_click_count_cum': 0.016209651048722379, 'plannum': 0.042929882748633623, 'sex': 0.00098098752744429397, 'limit_get_promoted_ever': 0.0042976596440416686, 'loan_amount_cum': 0.060354089783715606, 'consume_amount': 0.032793011631709251, 'dev_loan_amount': 0.051198206194235533, 'avg_loan_amount_cum': 0.0075209043770729199, 'active_day_of_week': 0.013126547391040314, 'median_click_count': 0.00

1.79623196413
[Fold 8/10 Prediciton:]
[0]	train-rmse:2.45989	valid-rmse:2.45656
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 70 rounds.
[50]	train-rmse:1.87324	valid-rmse:1.86774
[100]	train-rmse:1.80506	valid-rmse:1.8025
[150]	train-rmse:1.79203	valid-rmse:1.79465
[200]	train-rmse:1.78533	valid-rmse:1.79324
[250]	train-rmse:1.78059	valid-rmse:1.79267
[300]	train-rmse:1.77632	valid-rmse:1.79254
[350]	train-rmse:1.77242	valid-rmse:1.79243
[400]	train-rmse:1.76872	valid-rmse:1.79242
[450]	train-rmse:1.76457	valid-rmse:1.79235
Stopping. Best iteration:
[426]	train-rmse:1.76641	valid-rmse:1.7923

{'avg_click_count_cum': 0.014119570467803664, 'plannum': 0.039683424367406087, 'sex': 0.0008174488165570542, 'limit_get_promoted_ever': 0.0042358711403410994, 'loan_amount_cum': 0.057592984802883364, 'consume_amount': 0.037825586147958237, 'dev_loan_amount': 0.047820755768587672, 'avg_loan_amount_cum': 0.00743

In [26]:
# Add up the per-fold importance dicts and list features from least to most
# important (ascending by summed score).
imp_sum = merge_dic(importances)
sort_rec = sorted(imp_sum.items(), key=lambda name_score: name_score[1])
print(sort_rec)

# Mean of the per-fold validation RMSE values.
print('cv avg scores %s' % (np.mean(scores),))

[('sex', 0.0084581996772920415), ('active_year', 0.011654067268094998), ('median_click_count', 0.027977040671209458), ('limit_get_promoted', 0.044293798182236124), ('limit_get_promoted_ever', 0.044998007766228081), ('age', 0.060857474989361865), ('avg_loan_amount_cum', 0.077582458131881749), ('avg_click_count', 0.086284643685520773), ('avg_loan_amount', 0.11669631825714674), ('median_click_count_cum', 0.11807027767589492), ('active_day_of_week', 0.14146183104795995), ('actived_months', 0.1428543675546412), ('avg_click_count_cum', 0.14759871456864396), ('median_consume_amount_cum', 0.16213569982804679), ('avg_consume_amount_cum', 0.200189558632452), ('comsume_count', 0.20110658216735725), ('dev_median_click_count', 0.20473498166298737), ('limit', 0.20810849005493753), ('dev_median_click_count_cum', 0.21696581278744714), ('active_month', 0.21868140177930884), ('dev_median_consume_amount_cum', 0.23226516368546357), ('dev_consume_amount_cum', 0.25152592159656278), ('click_count', 0.2522612

In [17]:
scores

[1.8018676541459515,
 1.8107008002628047,
 1.8194558554226632,
 1.7998679808241393,
 1.8057300392566591,
 1.8020730212166454,
 1.8220356340339823,
 1.7960921106684471,
 1.7921954324533003,
 1.8050646933697356]

In [27]:
from datetime import datetime

# Clamp negative predictions to zero before writing.
# NOTE(review): predictions are written on the data_log scale used for the
# training target, with no inverse transform -- confirm this is the expected
# submission format.
negative_rows = sub.target < 0
sub.loc[negative_rows, 'target'] = 0

# One file per run, named with the current timestamp.
stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
sub.to_csv(OUTPUT_PATH + "sub{}.csv".format(stamp), index=False)

In [326]:
# Inputs for the grid search: every column except identifiers, label and date.
# NOTE(review): this rebinds X / y to pandas objects; the CV loop indexes X with
# positional integer arrays, so it likely needs the numpy versions rebuilt
# before it is re-run -- confirm.
excluded_cols = ['uid', 'target', 'active_date']
all_features = [col for col in train.columns if col not in excluded_cols]

X = train.drop(excluded_cols, axis=1)
y = train.target

In [348]:
# Parameter grid for GridSearchCV: each value is a list of candidates.
# Only gamma, reg_alpha and reg_lambda have two candidates each, giving
# 2 * 2 * 2 = 8 combinations; every other entry is held fixed.
xgb_params = {
        'objective': ['reg:linear'],
        'learning_rate': [0.025],
        'max_depth': [6],
        'min_child_weight': [2],
        'silent': [1],
        'subsample': [ 0.8],
        'colsample_bytree': [0.8],
        'n_estimators': [400],
        'gamma': [1,3],
        'reg_alpha': [0.5,0.8],
        'reg_lambda': [0.8,1.3] 
    }

In [349]:

from sklearn.model_selection import GridSearchCV

import xgboost as xgb


In [350]:
modelXgboost = xgb.XGBRegressor()

# Run the grid search: 5-fold shuffled CV over xgb_params, refitting the best
# combination on all of X / y at the end.
# The scorer is 'neg_mean_squared_error' (higher is better, i.e. negated MSE);
# the bare 'mean_squared_error' name is deprecated and emits a warning on
# every fit before being rejected outright by later scikit-learn releases.
grid_obj_xgb = GridSearchCV(
    modelXgboost,
    xgb_params,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=2,
    verbose=2,
    refit=True,
    scoring='neg_mean_squared_error',
)
grid_obj_xgb = grid_obj_xgb.fit(X, y)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.4min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.4min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.4min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.6min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.5min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.4min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.4min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.4min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.3min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.3min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.4min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.5min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.3min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.3min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.3min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.3min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.4min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.4min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.6min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.6min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.6min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.2min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.2min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.2min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.3min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 28.4min


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.4min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.2min
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.2min


[Parallel(n_jobs=2)]: Done  40 out of  40 | elapsed: 29.7min finished


In [351]:
# Keep the estimator that the grid search refit with the best parameter
# combination, then display that combination.
clf = grid_obj_xgb.best_estimator_
grid_obj_xgb.best_params_

{'colsample_bytree': 0.8,
 'gamma': 1,
 'learning_rate': 0.025,
 'max_depth': 6,
 'min_child_weight': 2,
 'n_estimators': 400,
 'objective': 'reg:linear',
 'reg_alpha': 0.5,
 'reg_lambda': 0.8,
 'silent': 1,
 'subsample': 0.8}

In [22]:
all_features

['age',
 'sex',
 'limit',
 'active_month',
 'active_year',
 'active_day_of_week',
 'comsume_count',
 'consume_amount',
 'loan_amount',
 'loan_count',
 'plannum',
 'click_count',
 'limit_get_promoted',
 'limit_get_promoted_ever',
 'comsume_count_cum',
 'consume_amount_cum',
 'loan_amount_cum',
 'loan_count_cum',
 'click_count_cum',
 'plannum_cum',
 'actived_months',
 'avg_consume_amount_cum',
 'median_consume_amount_cum',
 'dev_consume_amount_cum',
 'dev_median_consume_amount_cum',
 'avg_loan_amount_cum',
 'dev_loan_amount_cum',
 'avg_loan_amount',
 'dev_loan_amount',
 'avg_click_count_cum',
 'median_click_count_cum',
 'dev_click_count_cum',
 'dev_median_click_count_cum',
 'avg_click_count',
 'median_click_count',
 'dev_click_count',
 'dev_median_click_count',
 'cate_0',
 'cate_1',
 'cate_2']

In [6]:
list(train)

['uid',
 'age',
 'sex',
 'active_date',
 'limit',
 'active_month',
 'active_year',
 'comsume_count',
 'consume_amount',
 'loan_amount',
 'loan_count',
 'plannum',
 'click_count',
 'limit_get_promoted',
 'limit_get_promoted_ever',
 'comsume_count_cum',
 'consume_amount_cum',
 'loan_amount_cum',
 'loan_count_cum',
 'click_count_cum',
 'plannum_cum',
 'actived_months',
 'comsume_counts_sum',
 'comsume_amounts_sum',
 'click_counts_sum',
 'target',
 'avg_consume_amount_cum',
 'median_consume_amount_cum',
 'dev_consume_amount_cum',
 'dev_median_consume_amount_cum',
 'avg_loan_amount_cum',
 'median_loan_amount_cum',
 'dev_loan_amount_cum',
 'dev_median_loan_amount_cum',
 'avg_loan_amount',
 'median_loan_amount',
 'dev_loan_amount',
 'dev_median_loan_amount',
 'avg_click_count_cum',
 'median_click_count_cum',
 'dev_click_count_cum',
 'dev_median_click_count_cum',
 'avg_click_count',
 'median_click_count',
 'dev_click_count',
 'dev_median_click_count',
 'cate_0',
 'cate_1',
 'cate_2']

In [29]:
    # Scratch cell: read the category-id CSVs for the train and submit sets and
    # set the count / name template used below to build 'cate_<i>' column names.
    train_cat = pd.read_csv(INPUT_PATH + "train_cate_id.csv")
    submit_cat = pd.read_csv(INPUT_PATH + "submit_cate_id.csv")
    col_num = 3
    name_basic = 'cate_{}'

In [30]:
# Build the column names as a real list: a bare map() is a lazy iterator on
# Python 3, which can be neither displayed as a list nor concatenated with `+`.
cols_svd_name = [name_basic.format(idx) for idx in range(col_num)]

In [31]:
cols_svd_name

['cate_0', 'cate_1', 'cate_2']

In [32]:
    # list(...) makes the concatenation work whether cols_svd_name is already
    # a list or a lazy iterable such as a Python 3 map object.
    all_cols = list(cols_svd_name) + ['cat_counts']


In [33]:
all_cols

['cate_0', 'cate_1', 'cate_2', 'cat_counts']

In [23]:
train['active_day_of_week']

0         1
1         6
2         1
3         5
4         1
5         3
6         5
7         3
8         3
9         1
10        4
11        2
12        3
13        2
14        1
15        0
16        3
17        3
18        1
19        3
20        1
21        4
22        4
23        4
24        2
25        5
26        5
27        1
28        3
29        6
         ..
272949    2
272950    1
272951    0
272952    2
272953    2
272954    2
272955    1
272956    1
272957    6
272958    6
272959    3
272960    1
272961    1
272962    1
272963    2
272964    2
272965    6
272966    0
272967    1
272968    1
272969    0
272970    5
272971    2
272972    4
272973    2
272974    6
272975    6
272976    2
272977    0
272978    3
Name: active_day_of_week, Length: 272979, dtype: int64