In [250]:
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn import svm 
import matplotlib.pyplot as plt
import numpy as np

### 加载数据

In [251]:
# samples = pd.read_csv('./data/samples_all_fillna_nor.csv',index_col=0)
samples = pd.read_csv('./data/samples_all_fillna.csv',index_col=0)
# samples = pd.read_csv('./data/samples_pca.csv',index_col=0)

test = samples.iloc[-200:,:]
test_ma = xgb.DMatrix(test.iloc[:,:-1],label=test.iloc[:,-1])
train = samples.iloc[:-200,:]
samples.shape

(1391, 63)

In [252]:
def pro_optimization(y_pro,val_Y):    
    res_list =[]
    for pro in xrange(1,10):
        result ={}
        result['pro_thr'] = pro*1.0/10
        y_pre_label = [1 if x>pro*1.0/10 else 0 for x in y_pro]
        result['accuracy'] = accuracy_score(val_Y,y_pre_label)
        result['precision'] =precision_score(val_Y,y_pre_label)
        result['recall'] = recall_score(val_Y,y_pre_label)
        result['f1_score'] =f1_score(val_Y,y_pre_label)
        res_df = pd.DataFrame.from_dict([result])
        res_list.append(result)
    df =pd.DataFrame.from_dict(res_list)
    return df

In [253]:
#ks
def ks_plot(y_true,y_pre,model_name):
    plt.figure()
    fpr,tpr,t = roc_curve(y_true,y_pre)
    plt.plot(fpr,label='FPR')
    plt.plot(tpr,label='TPR')

    gap = tpr-fpr
    ks_index = np.where(gap == np.max(gap))
    y = [fpr[ks_index], tpr[ks_index]]
    x = [int(ks_index[0]), int(ks_index[0])]
    ks_value = np.max(gap)
    plt.plot(x,y,label='KS=%.2f'%ks_value)
    
    plt.title("%s KS graph" % (model_name))
    plt.legend(loc="mid right")
    plt.savefig('./figure/%s_f2.png'%model_name)
    plt.show()

### GBDT模型

In [254]:
def model_xgb(train):
    #数据准备
    X= train.iloc[:,:-1]
    Y = train.iloc[:,-1]
    train_X,val_X,train_Y,val_Y =train_test_split(X,Y,test_size=0.2)
    dtrain = xgb.DMatrix(train_X,label=train_Y)
    dval = xgb.DMatrix(val_X,label=val_Y)
    #log
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
#         'scale_pos_weight': 484130 / 7183,
        'eval_metric': 'auc',
        'gamma': 0.01,
        'silent': 1,
        'max_depth': 5,
        'lambda': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.2,
        'min_child_weight': 0.1,
        'eta': 0.016,
        'nthread': 5
    }

    evals_result = {}
    watchlist = [(dtrain,'train'),(dval,'val')]
    xgb_model = xgb.train(params,dtrain,num_boost_round=1000,early_stopping_rounds=200,verbose_eval=50,
                          evals=watchlist,evals_result=evals_result)
     
    result = max(evals_result['val']['auc'])
    return xgb_model,result

In [255]:
model_xgb,auc = model_xgb(train)
y_pro = model_xgb.predict(test_ma)
# df = pro_optimization(y_pro,test.iloc[:,-1])


[0]	train-auc:0.798896	val-auc:0.654663
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 200 rounds.
[50]	train-auc:0.950667	val-auc:0.813415
[100]	train-auc:0.970359	val-auc:0.817288
[150]	train-auc:0.984412	val-auc:0.81901
[200]	train-auc:0.989991	val-auc:0.834362
[250]	train-auc:0.992877	val-auc:0.832927
[300]	train-auc:0.995316	val-auc:0.832281
[350]	train-auc:0.996626	val-auc:0.831564
[400]	train-auc:0.99749	val-auc:0.834003
[450]	train-auc:0.99813	val-auc:0.833286
[500]	train-auc:0.99877	val-auc:0.834146
[550]	train-auc:0.999045	val-auc:0.837159
[600]	train-auc:0.999279	val-auc:0.83429
[650]	train-auc:0.999441	val-auc:0.83429
Stopping. Best iteration:
[475]	train-auc:0.998537	val-auc:0.837303



In [256]:
ks_plot(test.iloc[:,-1],y_pro,'GBDT')

In [257]:
#特征权重
dic = model.get_fscore()
fea_rank = pd.DataFrame.from_dict([dic])
# fea_rank.T.sort_values(by=0,ascending= False).to_csv('./data/features_rank_model_nor.csv')
# fea_rank.T.sort_values(by=0,ascending= False)

### LR模型

In [258]:
# samples = pd.read_csv('./data/samples_all_fillna.csv',index_col=0)
# samples = pd.read_csv('./data/samples_pca.csv',index_col=0)
# samples =samples.fillna(0)
#数据准备
X= samples.iloc[:,:-1]
Y = samples.iloc[:,-1]
train_X,val_X,train_Y,val_Y =train_test_split(X,Y,test_size=0.2)

In [259]:
#class_weight = {1:0.9,0:0.1}
lg = LogisticRegressionCV(class_weight='balanced')
lg.fit(train_X,train_Y)  

LogisticRegressionCV(Cs=10, class_weight='balanced', cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [260]:
y_pre = lg.predict(val_X)
print 'accuracy:',accuracy_score(val_Y,y_pre)
print 'precision:',precision_score(val_Y,y_pre)
print 'recall:',recall_score(val_Y,y_pre)
print 'f1_score:',f1_score(val_Y,y_pre)

accuracy: 0.709677419355
precision: 0.19512195122
recall: 0.516129032258
f1_score: 0.283185840708


In [261]:
y_pro_lr = lg.predict_proba(val_X)
y_pro_lr = [x[1] for x in y_pro_lr]
ks_plot(val_Y,y_pro_lr,'LR')

In [262]:
# print 'accuracy:',accuracy_score(val_Y,y_pre_label)
# print 'precision:',precision_score(val_Y,y_pre_label)
# print 'recall:',recall_score(val_Y,y_pre_label)
# print 'f1_score:',f1_score(val_Y,y_pre_label)

### SVM 模型

In [263]:
model_svm = svm.SVC(class_weight='balanced',probability=True)
model_svm.fit(train_X,train_Y)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [264]:
y_pro_svm = model_svm.predict_proba(val_X)
y_pro_svm = [x[1] for x in y_pro_svm]
ks_plot(val_Y,y_pro_svm,'SVM')