In [1]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.model_selection import train_test_split

In [2]:
# Evaluation metric: mean per-coupon AUC
def myauc(test):
    """Return the unweighted mean of the ROC AUC computed per ``coupon_id``.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain the columns ``coupon_id``, ``label`` and ``pred``.
        ``label`` value 1 is treated as the positive class.

    Returns
    -------
    float
        Average of the per-coupon AUC values. Coupons whose ``label`` column
        does not hold exactly two distinct values are skipped; if every
        coupon is skipped (or ``test`` is empty) ``nan`` is returned.
    """
    aucs = []
    # Grouping by the scalar key yields the same groups as the list form;
    # only the (unused) group key differs.
    for _, coupon_rows in test.groupby('coupon_id'):
        # ROC AUC is undefined unless both classes are present in the group.
        if len(coupon_rows['label'].unique()) != 2:
            continue
        fpr, tpr, _ = roc_curve(coupon_rows['label'], coupon_rows['pred'], pos_label=1)
        aucs.append(auc(fpr, tpr))
    if not aucs:
        # Averaging an empty list would emit a RuntimeWarning; return nan explicitly.
        return np.nan
    return np.average(aucs)

In [4]:
# Data preparation
# NOTE(review): feature_path and pred_path are absolute, machine-specific
# locations while model_path is relative; adjust them for your environment.
feature_path = '/Users/Bin/repos/ml-datasets/O2O-Coupon-Usage-Forecast/features/'
model_path = '../../ml-datasets/O2O-Coupon-Usage-Forecast/xgbmodel'
pred_path = '/Users/Bin/repos/ml-datasets/O2O-Coupon-Usage-Forecast/'

# Load the two labelled feature sets and remap label -1 to 0.
# The replacement is assigned back to the column explicitly: calling
# `.replace(..., inplace=True)` on `df.label` is chained assignment and does
# not update the frame when pandas copy-on-write is active.
dataset1 = pd.read_csv(feature_path + 'ProcessDataSet1.csv')
dataset1['label'] = dataset1['label'].replace(-1, 0)
dataset2 = pd.read_csv(feature_path + 'ProcessDataSet2.csv')
dataset2['label'] = dataset2['label'].replace(-1, 0)
# The third feature set is the one predictions are produced for.
dataset3 = pd.read_csv(feature_path + 'ProcessDataSet3.csv')

# Training data: both labelled sets stacked row-wise after removing duplicate rows.
dataset1.drop_duplicates(inplace=True)
dataset2.drop_duplicates(inplace=True)
dataset12 = pd.concat([dataset1, dataset2], axis=0)
dataset12_y = dataset12['label']
# Identifier columns, the target and the day_gap_* columns are excluded from the features.
dataset12_x = dataset12.drop(['user_id', 'label', 'day_gap_before', 'coupon_id', 'day_gap_after'], axis=1)

dataset3.drop_duplicates(inplace=True)
# Independent copy: later cells add a prediction column to this frame, which
# must not write into (or warn about) a slice of dataset3.
dataset3_preds = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset3_x = dataset3.drop(['user_id', 'coupon_id', 'date_received', 'day_gap_before', 'day_gap_after'], axis=1)

# XGBoost matrices: labelled training data and unlabelled prediction data.
dataTrain = xgb.DMatrix(dataset12_x, label=dataset12_y)
dataTest = xgb.DMatrix(dataset3_x)

In [None]:
# Train the first model
params = {
    'booster': 'gbtree',
    'objective': 'rank:pairwise',
    'eval_metric': 'auc',
    'gamma': 0.1,
    'min_child_weight': 1.1,
    'max_depth': 5,
    'lambda': 10,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    'eta': 0.01,
    'tree_method': 'exact',
    'seed': 0,
    'nthread': 12,
}
# Only the training set is monitored, so the logged AUC is a training AUC.
watchlist = [(dataTrain, 'train')]
model = xgb.train(params, dataTrain, num_boost_round=3500, evals=watchlist, verbose_eval=50)

# Persist the booster and reload it from disk.
model.save_model(model_path)
model = xgb.Booster(params)
model.load_model(model_path)

# Predict the test set.
# Work on a copy: the frame is sorted in place below, and predictions are
# assigned positionally, so mutating the shared dataset3_preds would misalign
# rows and scores whenever this cell is re-run or the frame is reused.
dataset3_preds1 = dataset3_preds.copy()
dataset3_preds1['label'] = model.predict(dataTest)

# Rescale the scores to [0, 1]. MinMaxScaler requires a 2-D input, and pandas
# Series has no `reshape`, so reshape the underlying array and flatten the
# (n, 1) result back to one dimension before storing it in the column.
scaled_scores = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(
    dataset3_preds1['label'].values.reshape(-1, 1)
)
dataset3_preds1['label'] = scaled_scores.ravel()
dataset3_preds1.sort_values(by=['coupon_id', 'label'], inplace=True)
# Written without index and without a header row.
dataset3_preds1.to_csv(pred_path + 'xgb_preds.csv', index=False, header=False)
print(dataset3_preds1.describe())

[0]	train-auc:0.828312
[50]	train-auc:0.855485
[100]	train-auc:0.857107
[150]	train-auc:0.858487
[200]	train-auc:0.860476
[250]	train-auc:0.86219
[300]	train-auc:0.863925
[350]	train-auc:0.865646
[400]	train-auc:0.867399
[450]	train-auc:0.869238
[500]	train-auc:0.870713
[550]	train-auc:0.87226
[600]	train-auc:0.873593
[650]	train-auc:0.874776
[700]	train-auc:0.875748
[750]	train-auc:0.876727
[800]	train-auc:0.877551
[850]	train-auc:0.878451
[900]	train-auc:0.879236
[950]	train-auc:0.879946
[1000]	train-auc:0.880634
[1050]	train-auc:0.881365
[1100]	train-auc:0.881962
[1150]	train-auc:0.88253
[1200]	train-auc:0.883045
[1250]	train-auc:0.883551
[1300]	train-auc:0.884024
[1350]	train-auc:0.884504
[1400]	train-auc:0.884966
[1450]	train-auc:0.885393
[1500]	train-auc:0.885819
[1550]	train-auc:0.88626
[1600]	train-auc:0.886684
[1650]	train-auc:0.887081
[1700]	train-auc:0.887452
[1750]	train-auc:0.887833
[1800]	train-auc:0.888166
[1850]	train-auc:0.888513
[1900]	train-auc:0.888884
[1950]	train-

In [None]:
# Reload the persisted booster and report the mean per-coupon AUC on the
# training data (dataset12), using the same [0, 1] rescaling as the submission.
model = xgb.Booster()
model.load_model(model_path)

train_matrix = xgb.DMatrix(dataset12_x)
temp = dataset12[['coupon_id', 'label']].copy()
temp['pred'] = model.predict(train_matrix)

# MinMaxScaler expects a 2-D array, hence the reshape to a single column.
pred_scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
temp['pred'] = pred_scaler.fit_transform(temp['pred'].values.reshape(-1, 1))

print(myauc(temp))