In [34]:
import pandas as pd
import numpy as np
import os
import scipy as sp
import sklearn
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import copy
from sklearn.externals import joblib

In [2]:
def logloss(act, preds):
    epsilon = 1e-15
    preds = sp.maximum(epsilon, preds)
    preds = sp.minimum(1 - epsilon, preds)
    ll = sum(act * sp.log(preds) + sp.subtract(1, act) * sp.log(sp.subtract(1, preds)))
    ll = ll * -1.0 / len(act)
    return ll

In [38]:
def commission(model,test,test_file):
    '''
    预测测试集效果，并且提交
    '''
    df = copy.deepcopy(test)
    proba = model.predict(df) #预测转换率
    result_v1 = pd.DataFrame(df.index)
    result_v1["predicted_score"] = proba
    result_v1.to_csv(test_file,encoding="utf-8",sep=" ",index=False)
    print("提交文件保存在 {0}".format(test_file)) 

In [3]:
#删除没有用的字段
drop_cols = ['hourClick_min',
             'hourClick_median',
             'category_0_sum_count',
             'is_train',
             'item_price_level_16',
             'item_price_level_17',
             'item_pv_level_1',
             'shop_review_num_level_0',
             'shop_review_num_level_1',
             'shop_star_level_4999',
             'category_0_7908382889764677758']
train = pd.read_pickle("../数据/merge_data/_5_merge_data_train.pickle") #读取训练集
train = train.set_index("instance_id")
val = pd.read_pickle("../数据/merge_data/_5_merge_data_val.pickle") #读取验证集
val = val.set_index("instance_id") 
test = pd.read_pickle("../数据/merge_data/_5_merge_data_test.pickle") #读取测试集
test = test.set_index("instance_id") #将instance_id 当做是索引键
train = train.drop(drop_cols,axis=1)
test = test.drop(drop_cols,axis=1)
val = val.drop(drop_cols,axis=1)

In [4]:
train.shape

(203093, 374)

In [5]:
X_tr = train.drop(["is_trade"],axis=1)
y_tr = train["is_trade"]
X_val = val.drop(["is_trade"],axis=1)
y_val = val["is_trade"]

del train
del val

#### 用lightgbm 训练模型

In [18]:
# model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=29, max_depth=6, learning_rate=0.01, n_estimators=10000,
#                            max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0,
#                            min_child_weight=5, min_child_samples=10, subsample=1, subsample_freq=1,
#                            colsample_bytree=1, reg_alpha=3, reg_lambda=5, seed=1000, nthread=-1, silent=True)
# model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=6, min_data_in_leaf = 500,max_depth=3, learning_rate=0.01, n_estimators=10000,
#                            max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0,
#                            min_child_weight=5, min_child_samples=10, subsample=1, subsample_freq=1,
#                            colsample_bytree=1, reg_alpha=3, reg_lambda=5, seed=1000, nthread=-1, silent=True
#                             ,is_unbalance =True,bagging_fraction = 0.7)
model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=15, max_depth=6, learning_rate=0.01, n_estimators=10000,
                           max_bin=425, subsample_for_bin=50000, objective='binary', min_split_gain=0,
                           min_child_weight=5, min_child_samples=10, subsample=1, subsample_freq=1,
                           min_data_in_leaf = 2000,bagging_fraction =0.7,bagging_freq = 1,
                           colsample_bytree=1, reg_alpha=3, reg_lambda=5, seed=1000, nthread=-1, silent=True)

In [19]:
model.fit(X_tr, y_tr, eval_metric='logloss',eval_set=[(X_tr, y_tr), (X_val, y_val)],early_stopping_rounds=100)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


[1]	training's binary_logloss: 0.683909	valid_1's binary_logloss: 0.683889
Training until validation scores don't improve for 100 rounds.
[2]	training's binary_logloss: 0.674862	valid_1's binary_logloss: 0.674819
[3]	training's binary_logloss: 0.665986	valid_1's binary_logloss: 0.665925
[4]	training's binary_logloss: 0.657285	valid_1's binary_logloss: 0.657204
[5]	training's binary_logloss: 0.648756	valid_1's binary_logloss: 0.648657
[6]	training's binary_logloss: 0.640388	valid_1's binary_logloss: 0.640272
[7]	training's binary_logloss: 0.632177	valid_1's binary_logloss: 0.632041
[8]	training's binary_logloss: 0.624124	valid_1's binary_logloss: 0.623968
[9]	training's binary_logloss: 0.61622	valid_1's binary_logloss: 0.616047
[10]	training's binary_logloss: 0.608462	valid_1's binary_logloss: 0.608272
[11]	training's binary_logloss: 0.600844	valid_1's binary_logloss: 0.600623
[12]	training's binary_logloss: 0.593363	valid_1's binary_logloss: 0.593123
[13]	training's binary_logloss: 0.5



LGBMClassifier(bagging_fraction=0.7, bagging_freq=1, boosting_type='gbdt',
        colsample_bytree=1, learning_rate=0.01, max_bin=425, max_depth=6,
        min_child_samples=10, min_child_weight=5, min_data_in_leaf=2000,
        min_split_gain=0, n_estimators=10000, n_jobs=-1, nthread=-1,
        num_leaves=15, objective='binary', random_state=None, reg_alpha=3,
        reg_lambda=5, seed=1000, silent=True, subsample=1,
        subsample_for_bin=50000, subsample_freq=1)

In [24]:
%%time
param = model.get_params() #获取模型的参数
nrounds = model.best_iteration_ #获取模型最佳的迭代次数
train_data = lgb.Dataset(X_tr,y_tr) #
estimators = lgb.train(param,train_data,num_boost_round=nrounds) #按照最佳的迭代次数训练

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


CPU times: user 8min 5s, sys: 2.44 s, total: 8min 8s
Wall time: 2min 10s


### 测评

In [28]:
pred_val = estimators.predict(X_val)

In [33]:
logloss(y_val,pred_val) #测评为0.07938 holly shit!!!绝逼过拟合

0.0793827692739475

### 提交

In [40]:
commission(estimators,test,"../数据/model/_6_model_20_22_201804112316_v2.txt")

提交文件保存在 ../数据/model/_6_model_20_22_201804112316_v2.txt
