数据默认放在 notebook 同目录下的 data 文件夹中


In [62]:
import pandas as pd, numpy as np, matplotlib.pylab as plt

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of `y_pred` against `y_true`.

    Parameters
    ----------
    y_true, y_pred
        True and predicted labels, passed unchanged to
        `sklearn.metrics.confusion_matrix`.
    classes
        Pair of tick-label collections: `classes[0]` labels the x axis
        (predicted label) and `classes[1]` the y axis (true label). Each
        collection is iterated element by element, so a string such as
        'TF' yields the two labels 'T' and 'F'.
    normalize : bool
        When true, each row is divided by its sum, so cells show the share
        of that true class. Rows without any sample are shown as zeros.
    title : str, optional
        Plot title; a default depending on `normalize` is used when falsy.
    cmap
        Colormap handed to `Axes.imshow`.

    Returns
    -------
    The matplotlib Axes the matrix was drawn on.
    """
    # Imported locally so the helper works even though the notebook has no
    # top-level scikit-learn import.
    from sklearn.metrics import confusion_matrix

    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')

    # Rows are true labels, columns are predicted labels.
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A label that only occurs in y_pred has an all-zero row; leave it at
        # zero instead of dividing by zero, which would put NaN into the plot
        # and into the colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots(figsize=(10, 10))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # One tick per matrix cell along each axis.
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           title=title,
           ylabel='True label',
           xlabel='Predicted label')
    ax.set_xticklabels(list(classes[0]), fontsize=20)
    ax.set_yticklabels(list(classes[1]), fontsize=20)
    # Rotate the x tick labels so that long names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write every cell value on top of the image; switch to white text on
    # the dark (high-value) cells to keep it readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for (i, j), value in np.ndenumerate(cm):
        ax.text(j, i, format(value, fmt), fontsize=30,
                ha="center", va="center",
                color="white" if value > thresh else "black")
    fig.tight_layout()
    return ax


In [63]:
# Load the training data from the data/ folder next to the notebook
df = pd.read_csv(filepath_or_buffer='data/train.csv')



In [64]:
# Peek at the first five rows
df.head(n=5)

# The ID_code column is not needed
df = df.drop(columns='ID_code')

In [71]:
# Split the data into a training set and a held-out test set

# To change the train/test ratio, edit `fraction` here
fraction = 0.7

# Fixed seed: without it a different set of rows is held out on every run,
# so scores from two runs are not comparable.
split_seed = 27

df_train = df.sample(frac=fraction, random_state=split_seed)
# Every row whose index label was not drawn for training becomes test data
# (assumes the index labels of df are unique, as with read_csv's default index).
df_test = df.drop(df_train.index, axis=0)

In [72]:
#采样处理函数区


#为方便之后测试，请务必使用函数形式

def under_sample(df_train, random_state=None):
    '''
    Balance the classes by down-sampling the rows whose target is 0.

    input:
        df_train: training dataframe including the 'target' column
        random_state: optional seed forwarded to DataFrame.sample so the
            draw can be reproduced (default None = a new draw each call)
    output: dataframe holding a random subset of the target == 0 rows, as
        many as there are target == 1 rows, followed by all target == 1
        rows. When class 0 is not the larger class all of its rows are kept.
    '''
    negatives = df_train[df_train['target'] == 0]
    positives = df_train[df_train['target'] == 1]

    # Request an exact row count rather than a float fraction: this avoids
    # dividing by zero when class 0 is empty, and never asks for more rows
    # than class 0 holds (sampling without replacement would raise).
    n_keep = min(len(negatives), len(positives))
    kept = negatives.sample(n=n_keep, random_state=random_state)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported way to stack the two parts.
    return pd.concat([kept, positives])

def over_sample(df_train, random_state=None):
    '''
    Balance the classes by re-drawing the target == 1 rows with replacement.

    input:
        df_train: training dataframe including the 'target' column
        random_state: optional seed forwarded to DataFrame.sample so the
            draw can be reproduced (default None = a new draw each call)
    output: over sampled dataframe: as many target == 1 rows, drawn with
        replacement, as there are target == 0 rows, followed by all
        target == 0 rows. Because the draw is with replacement, individual
        target == 1 rows may appear several times or not at all.
    raises: ValueError if there are target == 0 rows but no target == 1 row
        to draw from.
    '''
    negatives = df_train[df_train['target'] == 0]
    positives = df_train[df_train['target'] == 1]

    if positives.empty and not negatives.empty:
        raise ValueError("cannot over-sample: no rows with target == 1")

    # An exact row count expresses the intent directly and avoids the float
    # round trip of frac=len(t0)/len(t1).
    resampled = positives.sample(n=len(negatives), replace=True,
                                 random_state=random_state)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported way to stack the two parts.
    return pd.concat([resampled, negatives])
    

In [76]:
# Resample the training set (normally no need to change this cell)
df_train_sampled = over_sample(df_train)

# Report the total row count and the number of rows per label
sampled_target = df_train_sampled['target']
print('数据总共有%s个， 其中0标签%s个， 1标签%s个' % (
    len(df_train_sampled),
    (sampled_target == 0).sum(),
    (sampled_target == 1).sum(),
))

数据总共有252122个， 其中0标签126061个， 1标签126061个


In [77]:
# Separate the features from the label column (normally no need to change this)
x_train = df_train_sampled.drop(columns='target')
t_train = df_train_sampled['target']

x_test = df_test.drop(columns='target')
t_test = df_test['target']

In [75]:
#变量处理



In [36]:
# Feature selection (变量筛选)



12683

In [None]:
#初步建模
%timeit
#xgb
from xgboost import XGBClassifier
model = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=9,
 min_child_weight=5,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 scale_pos_weight=1,
 seed=27)

model.fit(x_train,t_train)



In [None]:
# Model evaluation
from sklearn.metrics import roc_auc_score, roc_curve
from xgboost import plot_importance

t_pred = model.predict(x_test)
# Probability of the positive class, computed once and reused for both the
# AUC score and the ROC curve.
t_proba = model.predict_proba(x_test)[:, 1]

# Plot normalized confusion matrix.
# Rows and columns follow the sorted label order (0 first, then 1), so the
# ticks are labelled with the label values themselves.
plot_confusion_matrix(t_test, t_pred, classes=[['0', '1'], ['0', '1']], normalize=True, title='Normalized XGboost confusion matrix')
plt.show()

# Feature importance (top 20 features)
fig, ax = plt.subplots(figsize=(10, 15))
plot_importance(model, height=0.5, max_num_features=20, ax=ax)
# Size the tick labels after plotting: plot_importance writes the feature
# names itself, so labels assigned to the empty axes beforehand would not
# match the plotted bars.
ax.tick_params(axis='y', labelsize=15)
plt.show()

# ROC AUC
t_score = roc_auc_score(t_test, t_proba)
print('AUC score: %s'%(t_score))

fpr, tpr, thresholds = roc_curve(t_test, t_proba)
plt.plot(fpr, tpr, marker='o')
plt.title('ROC')
plt.show()

