In [None]:
import sklearn
import numpy as np
import pandas as pd
from pprint import pprint
import timeit
import random
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
import plotly as py
import plotly.graph_objs as go
from matplotlib.pylab import style
style.use('ggplot')    
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False  
warnings.simplefilter('ignore')

In [None]:
train_data = pd.read_csv('sample_train20000_80.csv')
X = train_data.drop('target',axis=1)
y = train_data['target']
del train_data

In [None]:
from scipy.stats import ks_2samp
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import time
from pprint import pprint
class XGBPipeline():
    def __init__(self,X_train,X_test,y_train,y_test):
        self.X_train = X_train
        self.y_train = y_train       
        self.X_test = X_test    
        self.y_test = y_test
    
    def get_ks(self,y_true,y_pred):
        return ks_2samp(y_pred[y_true==1], y_pred[y_true!=1]).statistic
    
    def timer (func):
        def wrapper(*args,**kwargs): 
            start = time.time()
            result = func(*args,**kwargs)
            end = time.time()
            print(func.__name__+'运行时间：','{:.2f}'.format(end-start))
            return result
        return wrapper
    
    @timer
    def trainall(self,params):
      model = XGBClassifier(**params,verbosity=2)
      model.fit(pd.concat([self.X_train,self.X_test]),pd.concat([self.y_train,self.y_test]),verbose=True,eval_set=[(self.X_train,self.y_train)],eval_metric='auc')
      return model
    
    @timer
    def fit_baseline(self):
        params={'booster':'gbtree',
                'objective': 'binary:logistic',
                'max_depth':5,
                'subsample':0.8,
                'colsample_bytree':0.8,
                'min_child_weight':8,
                'learning_rate ': 0.05,
                'nthread':-1,
                'n_estimators':100,
                'updater':'grow_gpu'}
        model = XGBClassifier(**params,verbosity=2)
        model.fit(self.X_train,self.y_train,verbose=True,eval_set=[(self.X_train,self.y_train)],eval_metric='auc')
        self.model_evl(model)
        
    @timer
    def train_model(self,params):
        model = XGBClassifier(**params,verbosity=2)
        model.fit(self.X_train,self.y_train,verbose=2)
        self.model_evl(model)
        return model
        
    @timer
    def gridsearch_para(self):
        params={'booster':'gbtree',
                'objective': 'binary:logistic',
                'max_depth':5,
                'subsample':0.8,
                'colsample_bytree':0.8,
                'min_child_weight':8,
                'learning_rate ': 0.005,
                'nthread':-1,
                'n_estimators':100,
                'updater':'grow_gpu'}
        param_test1 = {'n_estimators':range(40,160,20)}
        xgb = XGBClassifier(
            **params,
            scale_pos_weight=float(len(self.y_train.values)-np.sum(self.y_train.values))/float(np.sum(self.y_train.values)),
            seed=2018,
            silent=False)
        gsearch1 = GridSearchCV(estimator = xgb, param_grid = param_test1, scoring='roc_auc',cv=5,n_jobs=-1)
        gsearch1.fit(self.X_train,self.y_train)
        params.update(gsearch1.best_params_)
        param_test2 = {'max_depth':range(3,7,1)}
        xgb = XGBClassifier(
            **params,
            scale_pos_weight=float(len(self.y_train.values)-np.sum(self.y_train.values))/float(np.sum(self.y_train.values)),
            seed=2018,
            silent=False)
        gsearch2 = GridSearchCV(estimator = xgb, param_grid = param_test2, scoring='roc_auc',cv=5,n_jobs=-1)
        gsearch2.fit(self.X_train,self.y_train)
        params.update(gsearch2.best_params_)
        param_test3 = {'min_child_weight':range(3,9,1)}
        xgb = XGBClassifier(
            **params,
            scale_pos_weight=float(len(self.y_train.values)-np.sum(self.y_train.values))/float(np.sum(self.y_train.values)),
            seed=2018,
            silent=False)
        gsearch3 = GridSearchCV(estimator = xgb, param_grid = param_test3, scoring='roc_auc',cv=5,n_jobs=-1)
        gsearch3.fit(self.X_train,self.y_train)
        params.update(gsearch3.best_params_)
        param_test4 = {
         'subsample':[i/10.0 for i in range(7,10,1)],
        #  'colsample_bytree':[i/10.0 for i in range(6,10,1)]
        }
        xgb = XGBClassifier(
            **params,
            scale_pos_weight=float(len(self.y_train.values)-np.sum(self.y_train.values))/float(np.sum(self.y_train.values)),
            seed=2018,
            silent=False)
        gsearch4 = GridSearchCV(estimator = xgb, param_grid = param_test4, scoring='roc_auc',cv=5,n_jobs=-1)
        gsearch4.fit(self.X_train,self.y_train)
        params.update(gsearch4.best_params_)
        param_test5 = {
          'colsample_bytree':[i/10.0 for i in range(7,10,1)]
        }
        xgb = XGBClassifier(
            **params,
            scale_pos_weight=float(len(self.y_train.values)-np.sum(self.y_train.values))/float(np.sum(self.y_train.values)),
            seed=2018,
            silent=False)
        gsearch5 = GridSearchCV(estimator = xgb, param_grid = param_test5, scoring='roc_auc',cv=5,n_jobs=-1)
        gsearch5.fit(self.X_train,self.y_train)
        params.update(gsearch5.best_params_)
        params.update({'scale_pos_weight':float(len(self.y_train.values)-np.sum(self.y_train.values))/float(np.sum(self.y_train.values)),
            'seed':2018,
            'silent':False})
        return self.train_model(params),params
        
    def model_evl(self,model):
        pred = model.predict_proba(self.X_test)
        pred_y = pred[:,1]
        ypred = (pred_y>=0.5)*1 
        

In [54]:

from sklearn.model_selection import train_test_split
from sklearn import metrics
# 论文中两方法的复现
# E&N method
# 强假设：某个样本为正的概率[P（y = 1 | x）]等于一个样本被标记的概率[P（s = 1 | x）]除以一个正样本被标记的概率[P（s = 1 | y = 1）]
# 方法有一定问题，特别是c的估计上，有偏很严重
# 方法本质上是对模型选择阈值的一个调节，在实际操作上没有必要这么麻烦，可以直接基于训练集和测试集的划分选出最优的阈值
class PUAdjProba(object):
  def __init__(self,X_train,X_test,y_train,y_test):
    self.X_test = X_test
    self.y_test = y_test
    self.X_train,self.X_val,self.y_train,self.y_val = train_test_split(X_train,y_train,test_size=0.15,random_state=2020)
    self.e1 = 0
    self.e2 = 0
    self.adj_model = None
    self.weight_model = None
  
  def get_ks(self,y_true,y_pred):
    return ks_2samp(y_pred[y_true==1], y_pred[y_true!=1]).statistic
  
  def timer(func):
    def wrapper(*args,**kwargs): 
      start = time.time()
      result = func(*args,**kwargs)
      end = time.time()
      print(func.__name__+' 运行时间：','{:.2f}min'.format((end-start)/60))
      return result
    return wrapper

  @timer
  def adj_train(self):
    xgbp = XGBPipeline(self.X_train,self.X_test,self.y_train,self.y_test)
    model,_ = xgbp.gridsearch_para()
    e1 = (sum(model.predict_proba(self.X_val)[self.y_val==1])[1] / sum(self.y_val))
    self.adj_model = model
    self.e1 = e1
    self.adj_model_evl(self.X_test,self.y_test,1)
    return model,e1
  
  def adj_predict_proba(self,x,para):
    if para == 1:
      proba = self.adj_model.predict_proba(x) / self.e1
    else:
      proba = self.adj_model.predict_proba(x) / self.e2
    return proba

  def adj_predict(self):
    if para == 1:
      proba = self.adj_model.predict_proba(x) / self.e1
    else:
      proba = self.adj_model.predict_proba(x) / self.e2
    ypred = 1*(proba>0.5)
    return ypred

  '''@timer  
  def adj_train_all(self):
    X_t,X_v,y_t,y_v = train_test_split(pd.concat([self.X_train,self.X_val,self.X_test]),pd.concat([self.y_train,self.y_val,self.y_test]),test_size=0.15,random_state=2020)
    xgbp = XGBPipeline(X_t,X_v,y_t,y_v)
    model,_ = xgbp.gridsearch_para()
    sumpv = sum(model.predict_proba(X_v)[:,1])
    val_size = len(X_v)
    e1 = sumpv / val_size
    return model,e1'''

  def adj_model_evl(self,x,true,para):
    print('Method1-PROBA_ADJ模型评价：')
    pred = self.adj_predict_proba(x,para)
    pred_y = pred[:,1]
    print ('AUC: %.4f' % metrics.roc_auc_score(true,pred_y))
    ypred = (pred_y>=0.5)*1 
    print ('ACC: %.4f' % metrics.accuracy_score(true,ypred))
    print ('Recall: %.4f' % metrics.recall_score(true,ypred))
    print ('Precesion: %.4f' %metrics.precision_score(true,ypred))
    print ('F1-score: %.4f' %metrics.f1_score(true,ypred))
    print ('KS: %.4f' %self.get_ks(true,ypred))
    print('\n')
    print(metrics.confusion_matrix(true,ypred))  

In [55]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2020)

In [56]:
padj = PUAdjProba(X_train,X_test,y_train,y_test)

In [57]:
model_t,e_t = padj.adj_train()

train_model运行时间： 3.38
gridsearch_para运行时间： 270.88
Method1-PROBA_ADJ模型评价：
AUC: 0.7966
ACC: 0.4771
Recall: 0.8986
Precesion: 0.0582
F1-score: 0.1092
KS: 0.3601


[[1846 2154]
 [  15  133]]
adj_train 运行时间： 4.52min


In [89]:
print(e_t)
u_proba = model_t.predict_proba(X_train.loc[y_train==0])
u_weight = (1-e_t)/(e_t)*(u_proba)/(1-u_proba)
u_weight = u_weight[:,1]

0.5418538675679789


In [122]:
# weight方法不调参，基于adj-proba给出参数
params={
                'objective': 'binary:logistic',
                'metric':'auc',
                'max_depth':3,
                'subsample':0.7,
                'colsample_bytree':0.7,
                'min_child_weight':8,
                'learning_rate ': 0.005,
                'nthread':-1,
                'num_round':70,
                'updater':'grow_gpu'}

In [144]:
from scipy.stats import ks_2samp
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import time
from pprint import pprint
class PUweight():
    def __init__(self,X_train,X_test,y_train,y_test,weight,para):
        self.X_train = X_train
        self.y_train = y_train       
        self.X_test = X_test    
        self.y_test = y_test
        self.weight = weight
        self.para = para
        self.dtrain = None
        self.dtest = xgb.DMatrix(self.X_test,self.y_test)

    def get_ks(self,y_true,y_pred):
        return ks_2samp(y_pred[y_true==1], y_pred[y_true!=1]).statistic
    
    def timer (func):
      def wrapper(*args,**kwargs): 
          start = time.time()
          result = func(*args,**kwargs)
          end = time.time()
          print(func.__name__+'运行时间：','{:.2f}'.format(end-start))
          return result
      return wrapper

   
    @timer
    def train_model(self):
        X2train = pd.concat([X_train.loc[y_train==1],X_train.loc[y_train==0],X_train.loc[y_train==0]])
        y2train = np.hstack([np.ones((1,len(X_train)))[0],np.zeros((1,len(X_train.loc[y_train==0])))[0]])
        w2train = np.hstack([np.ones((1,len(X_train.loc[y_train==1])))[0],self.weight,(1-self.weight)])
        self.dtrain = xgb.DMatrix(X2train,y2train,weight=w2train)
        model = xgb.train(self.para,self.dtrain)
        self.adj_model_evl(model)
        return model

    def adj_model_evl(self,model):
      print('Method1-Weight_ADJ模型评价：')
      pred = model.predict(self.dtest)
      true = self.y_test
      print(pred)
      pred_y = pred
      print ('AUC: %.4f' % metrics.roc_auc_score(true,pred_y))
      ypred = (pred_y>=0.5)*1 
      print ('ACC: %.4f' % metrics.accuracy_score(true,ypred))
      print ('Recall: %.4f' % metrics.recall_score(true,ypred))
      print ('Precesion: %.4f' %metrics.precision_score(true,ypred))
      print ('F1-score: %.4f' %metrics.f1_score(true,ypred))
      print ('KS: %.4f' %self.get_ks(true,ypred))
      print('\n')
      print(metrics.confusion_matrix(true,ypred))      
    

In [145]:
puw = PUweight(X_train,X_test,y_train,y_test,u_weight,params)

In [147]:
modelw = puw.train_model()

Method1-Weight_ADJ模型评价：
[0.24033266 0.65861255 0.30434465 ... 0.275582   0.43676144 0.29959998]
AUC: 0.7360
ACC: 0.6061
Recall: 0.7230
Precesion: 0.0629
F1-score: 0.1158
KS: 0.3247


[[2407 1593]
 [  41  107]]
train_model运行时间： 6.02
