# 将整个文件分为四个大块，分别是载入数据部分、处理特征部分、训练及测试模型部分、输出结果部分

# 载入数据

In [49]:
from __future__ import print_function

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE

In [2]:
# Directory that holds the competition CSV files; the prediction file is written here too.
data_dir = './Competition_170330/'

In [3]:
# Locations of the training and test CSV files inside data_dir.
train_path, test_path = (
    os.path.join(data_dir, csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# Load both CSV files into DataFrames.
train_raw_data, test_raw_data = pd.read_csv(train_path), pd.read_csv(test_path)

In [5]:
# Preview the first rows of the training data.
train_raw_data.head()

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,Feature_10,...,Feature_47_flag,Feature_48_flag,Feature_49_flag,Feature_50_flag,Feature_51_flag,Feature_52_flag,Feature_53_flag,Feature_54_flag,Feature_55_flag,Target
0,1.216284,0.676507,0.792257,0.216828,0.002462,-0.009971,-0.109648,0.664535,0.782265,-0.010646,...,0,1,0,1,0,0,0,0,1,1
1,-0.232558,-0.26944,0.080897,0.046138,-0.108725,0.216557,-0.109648,1.624962,0.009902,-0.009909,...,0,0,0,0,0,0,0,1,1,0
2,0.208814,-0.00888,0.418653,-0.054095,-0.524811,-0.017276,-0.109648,-0.323712,-1.235586,-0.010646,...,0,1,0,0,0,0,0,1,1,0
3,0.743907,-0.849146,-0.733933,-0.164306,-0.388252,-0.015602,-0.109648,-1.436586,,0.000796,...,1,0,0,0,0,1,0,0,0,0
4,0.322668,0.085196,-0.044906,-0.048319,,-0.013313,-0.109648,-1.535841,,-0.010596,...,1,1,0,0,1,1,0,1,0,0


In [6]:
# Preview the first rows of the test data.
test_raw_data.head()

Unnamed: 0,ID,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,Feature_46_flag,Feature_47_flag,Feature_48_flag,Feature_49_flag,Feature_50_flag,Feature_51_flag,Feature_52_flag,Feature_53_flag,Feature_54_flag,Feature_55_flag
0,34163,0.294855,-0.096278,-0.17084,-0.178637,0.267999,-0.017179,-0.109648,0.086023,,...,0,0,0,0,0,0,0,0,1,1
1,21507,-0.216876,-0.430887,-0.138086,-0.013533,,-0.02397,-0.109648,-1.671952,,...,0,1,1,0,1,1,0,1,1,1
2,1360,-0.256933,-0.485702,0.185486,0.038258,-0.295331,-0.025732,-0.109648,-1.653569,,...,0,1,1,0,0,1,1,0,0,1
3,53945,0.117295,-0.184288,-0.435384,-0.167559,,-0.024499,-0.109648,-1.666576,,...,0,1,1,0,1,1,0,1,1,1
4,22547,0.181915,0.328964,0.215342,-0.126309,-0.195803,-0.021019,-0.109648,-0.691598,,...,0,1,0,0,0,0,1,0,0,1


In [7]:
# Count the missing values of Feature_9 in the test data.
test_raw_data.Feature_9.isnull().sum()

16558

# 特征处理

In [8]:
# Models other than XGBoost need data without NaN values, so every missing entry is
# replaced with the median of its own column (NaNs are ignored when taking the median).
# The filled column is assigned back to the frame instead of calling
# `fillna(inplace=True)` on a column selection: that chained form may operate on a
# temporary object and is not guaranteed to update the frame itself.
# NOTE(review): the test set is imputed with its own medians, not the training
# medians -- confirm this is intended.
for frame in (train_raw_data, test_raw_data):
    for col_index in frame.columns:
        median = np.nanmedian(frame[col_index].values)
        frame[col_index] = frame[col_index].fillna(median)

In [9]:
# Split the training matrix: every column except the last one is a feature,
# the last column is the label.
train_matrix = train_raw_data.values
train_X, train_y = train_matrix[:, :-1], train_matrix[:, -1]

In [10]:
# Feature selection for the data set

def variance_threshold(X, test_X=None, threshold=0):
    """
    Filter features with a variance threshold.

    A VarianceThreshold selector is fitted on ``X`` only; the fitted selector is
    then applied to ``X`` and, when given, to ``test_X``.

    Args:
        X: Feature matrix used to fit the selector.
        test_X: Optional second matrix transformed with the same selector.
        threshold: Value handed to VarianceThreshold.

    Returns:
        The transformed ``X`` alone when ``test_X`` is None, otherwise the
        tuple ``(transformed X, transformed test_X)``.
    """
    selector = VarianceThreshold(threshold)
    selector.fit(X)
    reduced_X = selector.transform(X)
    if test_X is not None:
        return reduced_X, selector.transform(test_X)
    return reduced_X

In [11]:
# Compare the class distribution of the last fifth of the data set with that of the
# whole data set: financial data is time-dependent, so the samples closest to the
# prediction period deserve extra emphasis.
def check_class_distribution(ratio=0.2, labels=None):
    """
    Print the class counts of the trailing segment of the labels and of all labels.

    Args:
        ratio: Fraction of the samples, counted from the end, that forms the
            "last segment". The resulting sample count is truncated to an integer.
        labels: 1-D sequence of non-negative class labels. When omitted, the
            module-level ``train_y`` is used.
    """
    if labels is None:
        labels = train_y
    all_data = np.array(labels, dtype=np.int32)

    ratio_num = int(all_data.shape[0] * ratio)
    if ratio_num > 0:
        ratio_data = all_data[-ratio_num:]
    else:
        # `all_data[-0:]` would select the whole array, so an empty segment has
        # to be built explicitly.
        ratio_data = all_data[:0]

    ratio_distribution = np.bincount(ratio_data)
    all_distribution = np.bincount(all_data)

    print('Last segament data distribution: ', ratio_distribution)
    print('All data distribution: ', all_distribution)
    
def data_augmentation(ratio=0.2):
    """
    Append a copy of the trailing part of the training set to the training set.

    Duplicating the last samples "emphasizes" them. The module-level ``train_X``
    and ``train_y`` are rebound to the enlarged arrays, so every call appends again.

    Args:
        ratio: Fraction of the current samples, counted from the end, to
            duplicate. The sample count is truncated to an integer; when it is
            not positive the data is left untouched.
    """
    global train_X, train_y
    data_num = int(train_y.shape[0] * ratio)
    if data_num <= 0:
        # Nothing to duplicate. Slicing with `[-0:]` would select *every* row
        # and silently double the whole data set.
        return
    data_X = train_X[-data_num:]
    data_y = train_y[-data_num:]
    train_X = np.concatenate((train_X, data_X), axis=0)
    train_y = np.concatenate((train_y, data_y), axis=0)

In [None]:
# 进行特征重要性的筛检


In [11]:
# Print the label counts of the last 20% (default ratio) and of the full training labels.
check_class_distribution()

Last segament data distribution:  [15140 15045]
All data distribution:  [75680 75247]


In [12]:
# Append a copy of the last 20% (default ratio) of the training samples to train_X / train_y.
data_augmentation()

In [12]:
# Split the test matrix: the first column carries the sample IDs written to the
# prediction file, the remaining columns are the features.
test_matrix = test_raw_data.values
test_ids, test_X = test_matrix[:, 0], test_matrix[:, 1:]

In [13]:
# Drop features by variance (threshold 0.01); the selector is fitted on the training matrix.
train_X, test_X = variance_threshold(train_X, test_X=test_X, threshold=0.01)

In [14]:
# Hold out 30% of the training data for local evaluation (fixed seed for repeatability).
# NOTE(review): rows duplicated by data_augmentation() can land on both sides of
# this split -- confirm the hold-out log loss is not optimistic because of that.
X_train, X_test, y_train, y_test = train_test_split(
    train_X,
    train_y,
    test_size=0.3,
    random_state=1
)

# 训练以及测试模型

In [15]:
# Settings handed to xgb.train by the XGBoost helpers of this notebook.
# NOTE(review): newer XGBoost releases use `verbosity` in place of `silent` --
# confirm against the installed version.
params = dict(
    max_depth=3,
    eta=0.1126,
    silent=1,
    objective='binary:logistic'
)
# Number of boosting rounds passed to xgb.train.
num_rounds = 100

# Train a model with XGBoost and evaluate it
def train_xgboost_model(train_data, train_label, test_data=None, test_label=None):
    """
    Fit XGBoost on the training data and print the log loss on the evaluation data.

    Training and prediction are delegated to ``use_xgboost_model`` so both
    helpers always share the same model settings.

    Args:
        train_data: Feature matrix used for fitting.
        train_label: Labels belonging to ``train_data``.
        test_data: Feature matrix to evaluate on (required despite the default).
        test_label: True labels belonging to ``test_data`` (required despite the default).

    Raises:
        ValueError: If ``test_data`` or ``test_label`` is not supplied.
    """
    # The None defaults cannot be evaluated; fail with a clear message instead
    # of an obscure error from inside the libraries.
    if test_data is None or test_label is None:
        raise ValueError('test_data and test_label are required to compute the log loss')

    preds = use_xgboost_model(train_data, train_label, test_data)

    loss = log_loss(test_label, preds)
    print('Log loss : %f' % loss)
    
def use_xgboost_model(train_data, train_label, test_data):
    """
    Fit XGBoost on the training data and return its predictions for ``test_data``.

    The module-level ``params`` and ``num_rounds`` configure the training run.

    Args:
        train_data: Feature matrix used for fitting.
        train_label: Labels belonging to ``train_data``.
        test_data: Feature matrix to predict.

    Returns:
        The output of ``Booster.predict`` for ``test_data``.
    """
    fit_matrix = xgb.DMatrix(train_data, label=train_label)
    predict_matrix = xgb.DMatrix(test_data)
    booster = xgb.train(params, fit_matrix, num_rounds)
    return booster.predict(predict_matrix)

In [16]:
# Evaluate XGBoost on the local hold-out split.
train_xgboost_model(X_train, y_train, X_test, y_test)

Log loss : 0.692712


In [32]:
# Log loss : 0.692892  10    2
# Log loss : 0.692811  20    2
# Log loss : 0.692685  100   2
# Log loss : 0.692735  150   2
# Log loss : 0.692882  150   3

In [17]:
# Fit XGBoost on the full training data and predict the test set.
# `preds` is reassigned by the Random Forest and blending cells as well; the
# file written at the end contains whichever assignment ran last.
preds = use_xgboost_model(train_X, train_y, test_X)

In [42]:
# Random Forest settings read by the Random Forest and blending helpers.
n_estimators, max_depth = 3000, 2
n_jobs, verbose = -1, 1

# Train a model with Random Forest and evaluate it
def train_random_forest(train_data, train_label, test_data=None, test_label=None, random_state=None):
    """
    Fit a Random Forest on the training data and print the log loss on the evaluation data.

    Training and prediction are delegated to ``use_random_forest`` so both
    helpers always share the same model settings.

    Args:
        train_data: Feature matrix used for fitting.
        train_label: Labels belonging to ``train_data``.
        test_data: Feature matrix to evaluate on (required despite the default).
        test_label: True labels belonging to ``test_data`` (required despite the default).
        random_state: Seed forwarded to the classifier.

    Raises:
        ValueError: If ``test_data`` or ``test_label`` is not supplied.
    """
    # The None defaults cannot be evaluated; fail with a clear message instead
    # of an obscure error from inside the libraries.
    if test_data is None or test_label is None:
        raise ValueError('test_data and test_label are required to compute the log loss')

    preds = use_random_forest(train_data, train_label, test_data, random_state=random_state)

    loss = log_loss(test_label, preds)
    print('Log loss : %f' % loss)

def use_random_forest(train_data, train_label, test_data, random_state=None):
    """
    Fit a Random Forest on the training data and return its predictions for ``test_data``.

    The module-level ``n_estimators``, ``max_depth``, ``n_jobs`` and ``verbose``
    configure the classifier.

    Args:
        train_data: Feature matrix used for fitting.
        train_label: Labels belonging to ``train_data``.
        test_data: Feature matrix to predict.
        random_state: Seed forwarded to the classifier.

    Returns:
        Column 1 of ``predict_proba(test_data)``; for 0/1 labels that is
        presumably the probability of class 1 -- verify against ``classes_``.
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        n_jobs=n_jobs,
        verbose=verbose,
        random_state=random_state
    )
    forest.fit(train_data, train_label)
    probabilities = forest.predict_proba(test_data)
    return probabilities[:, 1]

In [None]:
# Evaluate the Random Forest on the local hold-out split (unseeded, so runs differ).
train_random_forest(X_train, y_train, X_test, y_test, random_state=None)

In [108]:
# Log loss : 0.692715

# Log loss : 0.776200  None 6000

# Log loss : 0.693096  1    1000
# Log loss : 0.693095  1    2000
# Log loss : 0.693093  1    3000     1
# Log loss : 0.693094  1    3000
# Log loss : 0.693089  1    3000
# Log loss : 0.693092  1    4000
# Log loss : 0.693092  1    6000
# Log loss : 0.693014  2    3000
# Log loss : 0.693016  2    6000
# Log loss : 0.692947  3    3000
# Log loss : 0.692939  3    6000
# Log loss : 0.692943  3    6000
# Log loss : 0.692871  4    3000
# Log loss : 0.692804  5    3000
# Log loss : 0.692739  6    3000
# Log loss : 0.692753  6    6000

In [43]:
# Fit the Random Forest on the full training data and predict the test set.
# This reassigns `preds`; the file written at the end contains whichever
# assignment of `preds` ran last.
preds = use_random_forest(train_X, train_y, test_X)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   48.0s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:  2.8min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    2.4s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:    3.4s
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:    4.6s
[Parallel(n_jobs=4)]: Done 3000 out of 3000 | elapsed:    5.

In [31]:
# Linear blending of the Random Forest and XGBoost predictions
def blending_model(train_data, train_label, test_data, test_label, xgb_weight=0.3, rf_weight=0.7):
    """
    Fit both models, blend their predictions linearly and print the log loss.

    The individual predictions come from ``use_xgboost_model`` and
    ``use_random_forest`` so the blend always uses the same model settings as
    the stand-alone helpers.

    Args:
        train_data: Feature matrix used for fitting.
        train_label: Labels belonging to ``train_data``.
        test_data: Feature matrix to evaluate on.
        test_label: True labels belonging to ``test_data``.
        xgb_weight: Weight of the XGBoost predictions in the blend.
        rf_weight: Weight of the Random Forest predictions in the blend.
    """
    preds_xgb = use_xgboost_model(train_data, train_label, test_data)
    preds_rf = use_random_forest(train_data, train_label, test_data)

    preds = preds_xgb * xgb_weight + preds_rf * rf_weight

    loss = log_loss(test_label, preds)
    print('Log loss : %f' % loss)
    
def use_blending_model(train_data, train_label, test_data, xgb_weight=0.3, rf_weight=0.7):
    """
    Fit both models and return the linear blend of their predictions for ``test_data``.

    The individual predictions come from ``use_xgboost_model`` and
    ``use_random_forest`` so the blend always uses the same model settings as
    the stand-alone helpers.

    Args:
        train_data: Feature matrix used for fitting.
        train_label: Labels belonging to ``train_data``.
        test_data: Feature matrix to predict.
        xgb_weight: Weight of the XGBoost predictions in the blend.
        rf_weight: Weight of the Random Forest predictions in the blend.

    Returns:
        ``xgb_weight * XGBoost predictions + rf_weight * Random Forest predictions``.
    """
    preds_xgb = use_xgboost_model(train_data, train_label, test_data)
    preds_rf = use_random_forest(train_data, train_label, test_data)

    return preds_xgb * xgb_weight + preds_rf * rf_weight

In [32]:
# Evaluate the blended model on the local hold-out split.
blending_model(X_train, y_train, X_test, y_test)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:   31.8s
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:   45.8s
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:  1.3min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:    3.3s
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:    4.5s
[Parallel(n_jobs=4)]: Done 3000 out of 3000 | elapsed:    5.

Log loss : 0.692724


In [None]:
# Log loss : 0.692715
# Log loss : 0.692939
# Log loss : 0.692650 

In [33]:
# Fit the blended model on the full training data and predict the test set.
# This reassigns `preds`; the file written at the end contains whichever
# assignment of `preds` ran last.
preds = use_blending_model(train_X, train_y, test_X)

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:   40.5s
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:   58.5s
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:  1.6min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    2.2s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:    4.6s
[Parallel(n_jobs=4)]: Done 3000 out of 3000 | elapsed:    5.

# 输出结果

In [44]:
# The prediction file is written next to the input data.
result_path = os.path.join(data_dir, 'prediction.csv')

In [45]:
# Write the prediction file: a header row followed by one "ID,probability" line
# per test sample. The `with` block closes the file even when a write fails
# part-way, which separate open()/close() cells cannot guarantee.
head = '"ID","Target"\n'
with open(result_path, 'w') as result_file:
    result_file.write(head)
    for s_id, s_pro in zip(test_ids, preds):
        # IDs come out of a numeric matrix, so they are cast back to int before writing.
        line = str(int(s_id)) + ',' + str(s_pro) + '\n'
        result_file.write(line)