In [164]:
import numpy as np

import pandas as pd

from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error

import xgboost as xgb

import math

In [165]:
# Directory the CSV inputs are read from. File names are appended by plain
# string concatenation, so the trailing slash is required.
INPUT_PATH = '../input/'
# Directory the timestamped submission file is written to (same
# trailing-slash convention).
OUTPUT_PATH = '../output/'


def load_data():
    """Read the training and submission tables from ``INPUT_PATH``.

    The ``target`` column of the training table is replaced by ``data_log``
    applied to each of its values before the frames are returned.

    Returns:
        tuple: ``(train, submit)`` as pandas DataFrames.
    """
    train_df = pd.read_csv(INPUT_PATH + 'trainv1.csv')
    submit_df = pd.read_csv(INPUT_PATH + 'submitv1.csv')
    # Only the training table carries a target, so only it is transformed.
    train_df['target'] = train_df['target'].map(data_log)
    return train_df, submit_df


def get_xgb_imp(xgb, feat_names):
    """Return per-feature split counts normalised so that they sum to 1.

    Args:
        xgb: Trained booster exposing ``get_fscore()``; its result is read as
            a mapping from ``'f<index>'`` keys to counts. (Inside this
            function the parameter name hides the ``xgboost`` module alias;
            it is kept so existing callers keep working.)
        feat_names: Sequence of feature names. Position ``i`` is paired with
            the score stored under ``'f' + str(i)``.

    Returns:
        dict: ``{feature_name: share}``. Features missing from the score
        mapping get ``0.0``. When no feature has a score at all, every share
        is ``0.0`` instead of dividing by zero.
    """
    imp_vals = xgb.get_fscore()
    imp_dict = {name: float(imp_vals.get('f' + str(i), 0.))
                for i, name in enumerate(feat_names)}
    # The built-in sum() accepts a dict view. Wrapping ``values()`` in
    # ``numpy.array`` only works on Python 2; on Python 3 it produces a 0-d
    # object array and the division below raises TypeError.
    total = sum(imp_dict.values())
    if not total:
        return dict.fromkeys(imp_dict, 0.0)
    return {k: v / total for k, v in imp_dict.items()}


def merge_dic(dicts):
    """Combine several dictionaries, adding together values that share a key.

    Args:
        dicts: Iterable of dictionaries whose values support ``+``.

    Returns:
        dict: A new dictionary holding, for every key seen, the sum of its
        values across ``dicts`` in iteration order. The inputs are not
        modified.
    """
    merged = {}
    for mapping in dicts:
        for key, val in mapping.items():
            if key in merged:
                # Non-augmented addition so mutable values are never changed
                # in place.
                merged[key] = merged[key] + val
            else:
                merged[key] = val
    return merged

def data_log(x):
    """Map an amount onto the scale ``log5(x + 1)``.

    Args:
        x: Numeric scalar.

    Returns:
        float: ``0.0`` when ``x <= 0``, otherwise the base-5 logarithm of
        ``x + 1``.
    """
    if x <= 0:
        return 0.0
    # Use the standard-library ``math`` module directly. The ``np.math``
    # alias was deprecated in NumPy 1.25 and removed in NumPy 2.0.
    return math.log(x + 1, 5)

In [166]:
print('begin to load data')
# load_data() returns the training frame with 'target' already passed through
# data_log, plus the submission frame.
train, submit = load_data()

# Number of cross-validation folds. The CV loop also divides each fold's test
# prediction by this value to average across folds.
kfold = 10
# Shuffled K-fold splitter with a fixed seed so the fold assignment repeats.
skf = KFold(n_splits=kfold,shuffle=True, random_state=42)

begin to load data


In [167]:
# Booster settings passed to xgb.train() for every cross-validation fold.
# NOTE(review): 'reg:linear' and 'silent' are spellings from older XGBoost
# releases (later ones use 'reg:squarederror' and 'verbosity') — confirm
# against the installed version before upgrading.
params = {
        'objective': 'reg:linear',
        'max_depth': 6,
        'colsample_bytree': 0.8,
        'subsample': 0.8,
        'eta': 0.025,
        'gamma': 1,
        'reg_alpha': 0.5,
        'reg_lambda': 0.8,
        # RMSE is what the training log reports and what early stopping watches.
        'eval_metric': 'rmse',
        'min_child_weight': 2,
        'silent': 1,
        'nthread': 6
    }



In [168]:
# Cumulative "sum" columns that are left out of the feature set.
# cum = get_column_by_cum(train)
cum = ['comsume_counts_sum', 'comsume_amounts_sum', 'click_counts_sum']
# Further optional exclusions that were tried before (kept for reference):
# actives = ['limit_get_promoted', 'limit_get_promoted_ever']
# actives = ['dev_comsume_count', 'dev_consume_amount', 'dev_loan_amount', 'dev_loan_count', 'dev_plannum', 'dev_click_count']
# cum = []
actives = []

# One list of non-feature columns, so that the feature names, the training
# matrix and the submission matrix cannot drift apart.
non_features = ['uid', 'target', 'active_date'] + cum + actives
all_features = [x for x in train.columns if x not in non_features]

X = train[all_features].values
y = train.target.values

sub_id = submit.uid.values
# Select the submission columns by name and in training order. The model is
# fed bare arrays, so a different column order in `submit` would otherwise
# pair values with the wrong features without any error.
to_submit = submit[all_features]

sub = pd.DataFrame()
sub['uid'] = sub_id
# Float accumulator for the fold-averaged predictions. zeros_like(sub_id)
# would inherit the dtype of the uid column.
sub['target'] = np.zeros(len(sub_id), dtype=float)

# Per-fold validation scores and feature-importance dictionaries, filled in
# by the cross-validation loop.
scores = []

importances = []

In [169]:
# Start from a clean slate so that re-running this cell does not add a second
# round of predictions to sub['target'] or duplicate entries in the lists.
scores = []
importances = []
sub['target'] = 0.0

# The submission matrix is the same for every fold, so build it once.
d_submit = xgb.DMatrix(to_submit.values)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Row subsets for this fold.
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    # Convert our data into XGBoost format
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    # At most 4000 boosting rounds; training stops once the validation metric
    # has not improved for 70 rounds. Progress is logged every 50 rounds.
    mdl = xgb.train(params, d_train, 4000, watchlist, early_stopping_rounds = 70, verbose_eval = 50)

    # Normalised split counts for this fold, keyed by feature name.
    f_importance = get_xgb_imp(mdl,all_features)
    print(f_importance)
    importances.append(f_importance)

    # Validation RMSE using only the trees up to the best iteration.
    score_this = np.sqrt(mean_squared_error(mdl.predict(d_valid, ntree_limit = mdl.best_ntree_limit), y_valid))
    print(score_this)
    scores.append(score_this)

    print('[Fold %d/%d Prediciton:]' % (i + 1, kfold))
    # Predict on our test data
    p_test = mdl.predict(d_submit, ntree_limit=mdl.best_ntree_limit)

    # Each fold contributes 1/kfold of the final prediction, i.e. a plain
    # average over the fold models.
    sub['target'] += p_test / kfold

[0]	train-rmse:2.45869	valid-rmse:2.47464
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 70 rounds.
[50]	train-rmse:1.91	valid-rmse:1.90872
[100]	train-rmse:1.84815	valid-rmse:1.84705
[150]	train-rmse:1.83603	valid-rmse:1.83912
[200]	train-rmse:1.82996	valid-rmse:1.83739
[250]	train-rmse:1.82559	valid-rmse:1.83668
[300]	train-rmse:1.82178	valid-rmse:1.83662
[350]	train-rmse:1.81817	valid-rmse:1.8364
[400]	train-rmse:1.81455	valid-rmse:1.83613
[450]	train-rmse:1.81116	valid-rmse:1.83602
[500]	train-rmse:1.80784	valid-rmse:1.83592
[550]	train-rmse:1.80408	valid-rmse:1.83585
[600]	train-rmse:1.80076	valid-rmse:1.83582
[650]	train-rmse:1.79734	valid-rmse:1.83564
[700]	train-rmse:1.79422	valid-rmse:1.83554
[750]	train-rmse:1.79083	valid-rmse:1.83579
Stopping. Best iteration:
[691]	train-rmse:1.79484	valid-rmse:1.83553

{'plannum': 0.0417042832781623, 'sex': 0.0035359614805898286, 'limit_get_promoted_ever'

1.85422122877
[Fold 5/10 Prediciton:]
[0]	train-rmse:2.46133	valid-rmse:2.45159
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 70 rounds.
[50]	train-rmse:1.91005	valid-rmse:1.90472
[100]	train-rmse:1.84761	valid-rmse:1.84822
[150]	train-rmse:1.83539	valid-rmse:1.84166
[200]	train-rmse:1.82953	valid-rmse:1.84062
[250]	train-rmse:1.82462	valid-rmse:1.84008
[300]	train-rmse:1.82085	valid-rmse:1.83992
[350]	train-rmse:1.81715	valid-rmse:1.83987
[400]	train-rmse:1.81362	valid-rmse:1.83982
Stopping. Best iteration:
[371]	train-rmse:1.81561	valid-rmse:1.83969

{'plannum': 0.04625046791165828, 'sex': 0.0036601089714261947, 'limit_get_promoted_ever': 0.0043671754772698912, 'loan_amount_cum': 0.064592604916191829, 'consume_amount': 0.060433390175934784, 'dev_loan_amount': 0.056814873351911162, 'limit_get_promoted': 0.0080688765960986574, 'loan_count': 0.039138210705818743, 'limit': 0.026078276421411639, 'consu

1.84946371427
[Fold 10/10 Prediciton:]


In [171]:
# Add up the per-fold importance dictionaries. Each fold's shares are already
# normalised by get_xgb_imp, so the values here are sums over folds rather
# than fractions of 1.
imp_sum = merge_dic(importances)
# Ascending order: least-used features first.
sort_rec = sorted(imp_sum.items(), key=lambda x:x[1])
print(sort_rec)

# Mean of the per-fold validation RMSE values.
print('cv avg scores %s' % np.mean(scores))

[('active_year', 0.016076646707923907), ('sex', 0.034687922009097809), ('limit_get_promoted_ever', 0.035631269529135375), ('limit_get_promoted', 0.071514691682693435), ('age', 0.18920050947561112), ('limit', 0.24179535029140614), ('active_month', 0.28740772094539868), ('actived_months', 0.29367900080515097), ('loan_count_cum', 0.3532558145383019), ('loan_count', 0.38010421252378551), ('comsume_count', 0.41493403492173886), ('plannum_cum', 0.43711623600454835), ('plannum', 0.43768602242738369), ('comsume_count_cum', 0.54747505883505976), ('dev_loan_amount', 0.55382561986006051), ('loan_amount', 0.56252414028325315), ('consume_amount_cum', 0.56777921693482258), ('dev_consume_amount_cum', 0.57302750374747435), ('click_count', 0.60674682846323114), ('click_count_cum', 0.61838306452427549), ('consume_amount', 0.6301455345808864), ('loan_amount_cum', 0.6563083326773238), ('dev_click_count_cum', 0.70863383015419534), ('dev_loan_amount_cum', 0.7820614380772416)]
cv avg scores 1.8477754098


In [153]:
# Per-fold validation RMSE values collected by the CV loop.
# NOTE(review): this cell's execution count is lower than the CV loop's, so
# the saved output may come from an earlier run — re-run to refresh.
scores

[1.8390787554421544,
 1.8475016540169513,
 1.8236955621642075,
 1.8263216332859971,
 1.8438928088074051,
 1.8237335786571609,
 1.8005292591168336,
 1.8154897544551583,
 1.7843487756625094,
 1.8217104675132709]

In [154]:
from datetime import datetime

# Negative predictions are replaced by zero before the file is written.
negative_rows = sub['target'] < 0
sub.loc[negative_rows, 'target'] = 0

# A timestamp in the file name keeps successive submissions from overwriting
# each other.
stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
sub.to_csv(OUTPUT_PATH + "sub{}.csv".format(stamp), index=False)

In [326]:
# Identifier, label and date columns are excluded; everything else is a
# feature. This rebinds all_features, X and y from the earlier CV setup, and
# here X and y stay pandas objects rather than numpy arrays.
_non_feature_cols = ['uid', 'target', 'active_date']
all_features = [col for col in train.columns if col not in _non_feature_cols]

X = train.drop(_non_feature_cols, axis=1)
y = train['target']

In [348]:
# Parameter grid for GridSearchCV: every value is a list of candidates.
# Only gamma, reg_alpha and reg_lambda have two candidates each, so the grid
# expands to 2 * 2 * 2 = 8 combinations; the other entries are held fixed.
xgb_params = {
        'objective': ['reg:linear'],
        'learning_rate': [0.025],
        'max_depth': [6],
        'min_child_weight': [2],
        'silent': [1],
        'subsample': [ 0.8],
        'colsample_bytree': [0.8],
        'n_estimators': [400],
        'gamma': [1,3],
        'reg_alpha': [0.5,0.8],
        'reg_lambda': [0.8,1.3] 
    }

In [349]:

from sklearn.model_selection import GridSearchCV

import xgboost as xgb


In [350]:
modelXgboost = xgb.XGBRegressor()

# Run the grid search over xgb_params with a seeded, shuffled 5-fold split.
# 'neg_mean_squared_error' is the supported name of the MSE scorer: the old
# 'mean_squared_error' spelling was deprecated in scikit-learn 0.18 and
# removed in 0.20. Scores are negated MSE, so larger is better.
# refit=True retrains the best configuration on all of X, y afterwards.
grid_obj_xgb = GridSearchCV(
    modelXgboost,
    xgb_params,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=2,
    verbose=2,
    refit=True,
    scoring='neg_mean_squared_error',
)
grid_obj_xgb = grid_obj_xgb.fit(X, y)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.4min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.4min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.4min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.6min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.5min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.4min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.4min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.4min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.3min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.3min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=1, total= 1.4min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.5min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.3min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.3min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.3min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=1, total= 1.3min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.4min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.4min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.6min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 
[CV]  reg_alpha=0.5, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.6min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.6min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.2min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3 
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.2min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=0.8, objective=reg:linear, max_depth=6, gamma=3, total= 1.2min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.3min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.5min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 28.4min


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.4min
[CV] reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3 


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.2min
[CV]  reg_alpha=0.8, colsample_bytree=0.8, silent=1, learning_rate=0.025, min_child_weight=2, n_estimators=400, subsample=0.8, reg_lambda=1.3, objective=reg:linear, max_depth=6, gamma=3, total= 1.2min


[Parallel(n_jobs=2)]: Done  40 out of  40 | elapsed: 29.7min finished


In [351]:
# Keep the estimator refitted with the winning parameter combination, then
# display that combination as the cell output.
clf = grid_obj_xgb.best_estimator_
grid_obj_xgb.best_params_

{'colsample_bytree': 0.8,
 'gamma': 1,
 'learning_rate': 0.025,
 'max_depth': 6,
 'min_child_weight': 2,
 'n_estimators': 400,
 'objective': 'reg:linear',
 'reg_alpha': 0.5,
 'reg_lambda': 0.8,
 'silent': 1,
 'subsample': 0.8}

In [59]:

# Raw tables, read from INPUT_PATH in the order: users, orders, loans,
# loan sums, clicks.
users, orders, loans, loans_sum, clicks = [
    pd.read_csv(INPUT_PATH + csv_name)
    for csv_name in (
        't_user.csv',
        't_order.csv',
        't_loan.csv',
        't_loan_sum.csv',
        't_click.csv',
    )
]

In [64]:
# Raise 5 to the power of each monetary column — presumably the raw files
# store these amounts on a base-5 log scale; confirm against the data
# description.
# NOTE(review): loan_sum, discount and limit subtract 1 afterwards (the exact
# inverse of log5(x + 1), the form used by data_log), while loan_amount and
# price do not — confirm whether the missing "- 1" is intentional.
# NOTE(review): the columns are overwritten in place, so running this cell a
# second time exponentiates already-converted values; reload the CSVs first.
loans.loan_amount = 5**loans.loan_amount
loans_sum.loan_sum = 5 ** loans_sum.loan_sum - 1
orders.price = 5 ** orders.price
orders.discount = 5 ** orders.discount - 1
users.limit = 5 ** users.limit -1


In [65]:
# Amount per order row: unit price times quantity, minus the discount.
orders["consume_amount"] = orders["price"] * orders["qty"] - orders["discount"]

In [68]:
import matplotlib.pyplot as plt

In [69]:
# Sorted consume_amount values against their rank, to make extreme values
# easy to spot.
fig, ax = plt.subplots(figsize=(8, 6))
sorted_amounts = np.sort(orders.consume_amount.values)
ax.scatter(range(orders.shape[0]), sorted_amounts)
ax.set_xlabel('index', fontsize=12)
ax.set_ylabel('consume_amount', fontsize=12)
plt.show()

  if self._edgecolors == 'face':


In [91]:
# Upper cut-off: the 99.9th percentile of consume_amount.
ulimit = np.percentile(orders.consume_amount.values, 99.9)
# Lower cut-off: amounts must be strictly positive to be kept.
llimit = 0


In [81]:
# True if any consume_amount value is missing.
orders.consume_amount.isnull().values.any()

False

In [94]:
# Keep only rows whose amount lies strictly between the two cut-offs.
amount = orders.consume_amount
orders = orders[(amount > llimit) & (amount < ulimit)]

In [80]:
# Replace missing consume_amount values with 0.
orders.consume_amount = orders.consume_amount.fillna(0)

In [79]:
# Summary statistics of price for the orders whose cate_id is 33.
orders[orders.cate_id==33].price.describe()

count    395972.000000
mean        999.471488
std        1802.303636
min           1.000000
25%          30.000000
50%          89.000000
75%        1100.000000
max       69888.999996
Name: price, dtype: float64

In [92]:
# Display the upper cut-off (99.9th percentile of consume_amount).
ulimit

44036.000002161658

In [93]:
# Display the lower cut-off.
llimit

0

In [162]:
# Display the feature names currently bound to all_features. Two different
# cells assign this name, so the result depends on which one ran last.
all_features

['age',
 'sex',
 'limit',
 'active_month',
 'active_year',
 'comsume_count',
 'consume_amount',
 'loan_amount',
 'loan_count',
 'plannum',
 'click_count',
 'limit_get_promoted',
 'limit_get_promoted_ever',
 'comsume_count_cum',
 'consume_amount_cum',
 'loan_amount_cum',
 'loan_count_cum',
 'click_count_cum',
 'plannum_cum',
 'actived_months',
 'dev_consume_amount_cum',
 'dev_loan_amount_cum',
 'dev_loan_amount',
 'dev_click_count_cum']

In [43]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,uid,age,sex,active_date,limit,active_month,active_year,comsume_count,consume_amount,loan_amount,...,consume_amount_cum,loan_amount_cum,loan_count_cum,click_count_cum,plannum_cum,actived_months,comsume_counts_sum,comsume_amounts_sum,click_counts_sum,target
0,26308,30,1,2016-02-16,15000.000001,2,1,1.0,700.0,2500.0,...,0.0,0.0,0.0,0.0,0.0,6,3.0,1283.0,213.0,5.070593
1,78209,40,1,2016-02-21,5000.0,2,1,38.0,8429.1,0.0,...,0.0,0.0,0.0,0.0,0.0,6,86.0,12510.6,0.0,0.0
2,51930,35,1,2016-04-19,24999.999999,4,1,2.0,790.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4,3.0,830.9,35.0,6.405333
3,10113,25,1,2016-03-12,24999.999999,3,1,23.0,2251.8,0.0,...,0.0,0.0,0.0,0.0,0.0,5,34.0,2775.0,154.0,0.0
4,17067,35,1,2016-02-16,15000.000001,2,1,51.0,19244.299999,0.0,...,0.0,0.0,0.0,0.0,0.0,6,148.0,33788.489999,62.0,0.0


In [59]:
# Iterating a DataFrame yields its column labels, so this lists the columns.
list(train)

['uid',
 'age',
 'sex',
 'active_date',
 'limit',
 'active_month',
 'active_year',
 'comsume_count',
 'consume_amount',
 'loan_amount',
 'loan_count',
 'plannum',
 'click_count',
 'comsume_count_cum',
 'consume_amount_cum',
 'loan_amount_cum',
 'loan_count_cum',
 'click_count_cum',
 'plannum_cum',
 'actived_months',
 'comsume_counts_sum',
 'comsume_amounts_sum',
 'click_counts_sum',
 'target']

In [163]:
# (rows, columns) of the training frame.
train.shape

(181986, 30)