# Import Packages

In [1]:
import gc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
import xgboost as xgb
import catboost as cat
from sklearn.metrics import roc_auc_score, roc_curve, auc
# LOCAL_QUICK = True
LOCAL_QUICK = False
sample_percent = 0.1
MORE_FE = False
# MORE_FE = True
FE_V1 = False if MORE_FE else True

In [2]:
%%time
# Read data
# User behaviors
user_log = pd.read_csv('user_log_format1.csv', dtype={'time_stamp': 'str'})

user_info = pd.read_csv('user_info_format1.csv')

train_data1 = pd.read_csv('train_format1.csv')

sub_data = pd.read_csv('test_format1.csv')

data_train = pd.read_csv('train_format2.csv')

CPU times: user 27.8 s, sys: 3.53 s, total: 31.3 s
Wall time: 31.4 s


In [3]:
%%time
# Sample test
if LOCAL_QUICK:
    print('Local quick test: {}, rate is {}'.format(LOCAL_QUICK,
                                                    sample_percent))
    data = user_log.sample(int(len(user_log) * sample_percent))
    data1 = user_info.sample(int(len(user_info) * sample_percent))
    data2 = train_data1.sample(int(len(train_data1) * sample_percent))
    submission = sub_data.copy()

else:
    print('All sample train')
    data = user_log.copy()
    data1 = user_info.copy()
    data2 = train_data1.copy()
    submission = sub_data.copy()
print('---data shape---')
for df in [data, data1, data2, submission, data_train]:
    print(df.shape)

All sample train
---data shape---
(54925330, 7)
(424170, 3)
(260864, 3)
(261477, 3)
(7030723, 6)
CPU times: user 961 ms, sys: 687 ms, total: 1.65 s
Wall time: 1.65 s


In [4]:
data2['origin'] = 'train'
submission['origin'] = 'test'
matrix = pd.concat([data2, submission], ignore_index=True, sort=False)
matrix.drop(['prob'], axis=1, inplace=True)
# Merge user_info on user_id
matrix = matrix.merge(data1, on='user_id', how='left')
# Using merchant_id（Raw column name is seller_id）
data.rename(columns={'seller_id': 'merchant_id'}, inplace=True)

In [5]:
%%time
# format
data['user_id'] = data['user_id'].astype('int32')
data['merchant_id'] = data['merchant_id'].astype('int32')
data['item_id'] = data['item_id'].astype('int32')
data['cat_id'] = data['cat_id'].astype('int32')
data['brand_id'].fillna(0, inplace=True)
data['brand_id'] = data['brand_id'].astype('int32')
data['time_stamp'] = pd.to_datetime(data['time_stamp'], format='%m%d')
# fill na
matrix['age_range'].fillna(0, inplace=True)
matrix['gender'].fillna(2, inplace=True)

# # age : median
# matrix['age_range'].fillna(matrix['age_range'].median(),inplace=True)

matrix['age_range'] = matrix['age_range'].astype('int8')
matrix['gender'] = matrix['gender'].astype('int8')
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')

CPU times: user 8.07 s, sys: 5.66 s, total: 13.7 s
Wall time: 15.4 s


# Feature Engineering

## User

In [6]:
%%time
groups = data.groupby(['user_id'])

CPU times: user 130 µs, sys: 0 ns, total: 130 µs
Wall time: 133 µs


In [7]:
# Interactive behaviors of users : u1
temp = groups.size().reset_index().rename(columns={0: 'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')

In [8]:
# Unique value of item_id, cat_id, merchant_id, brand_id for users
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')

In [9]:
# Time interval : u6 (days/hours)
temp = groups['time_stamp'].agg([('F_time', 'min'),
                                 ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.days
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')

In [10]:
# count action type 0, 1, 2, 3
temp = groups['action_type'].value_counts().unstack().reset_index().rename(
    columns={
        0: 'u7',
        1: 'u8',
        2: 'u9',
        3: 'u10'
    })
matrix = matrix.merge(temp, on='user_id', how='left')

In [11]:
# Unique value of timestamp
temp = groups['time_stamp'].agg([('u11', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')

## Merchant

In [205]:
%%time
groups = data.groupby(['merchant_id'])

CPU times: user 445 µs, sys: 3.9 ms, total: 4.35 ms
Wall time: 10.4 ms


In [13]:
# Interactive behavior of merchant : m1
temp = groups.size().reset_index().rename(columns={0: 'm1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

In [14]:
# Unique value of user_id, item_id, cat_id, brand_id for merchants
temp = groups['user_id', 'item_id', 'cat_id',
              'brand_id'].nunique().reset_index().rename(columns={
                  'user_id': 'm2',
                  'item_id': 'm3',
                  'cat_id': 'm4',
                  'brand_id': 'm5'
              })
matrix = matrix.merge(temp, on='merchant_id', how='left')

  temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={


In [15]:
# Unique value of action_type for merchants
temp = groups['action_type'].value_counts().unstack().reset_index().rename(
    columns={
        0: 'm6',
        1: 'm7',
        2: 'm8',
        3: 'm9'
    })
matrix = matrix.merge(temp, on='merchant_id', how='left')

In [16]:
# 按照merchant_id 统计随机负采样的个数
temp = data_train[data_train['label'] == -1].groupby(
    ['merchant_id']).size().reset_index().rename(columns={0: 'm10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')

In [206]:
temp = groups['time_stamp'].agg([('frist', 'min'),
                                 ('last', 'max')]).reset_index()
temp['m11'] = (temp['last'] - temp['frist']).dt.days
temp.drop(['frist', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['merchant_id'], how='left')

In [31]:
data[data['action_type'] == 2].groupby(['user_id', 'merchant_id',
                                        'cat_id']).size()

user_id  merchant_id  cat_id
1        925          1023      1
         1019         992       4
         4026         1252      1
2        420          602       2
                      1213      1
                               ..
424169   3760         662       1
         3898         351       3
                      812       1
         4731         351       1
424170   4268         656       1
Length: 2610210, dtype: int64

## User + Merchant

In [32]:
%%time
groups = data.groupby(['user_id', 'merchant_id'])

CPU times: user 187 µs, sys: 1e+03 ns, total: 188 µs
Wall time: 192 µs


In [None]:
# count interactive behaviors
temp = groups.size().reset_index().rename(columns={0: 'um1'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

In [None]:
# Unique value for item_id, cat_id, brand_id
temp = groups['item_id', 'cat_id',
              'brand_id'].nunique().reset_index().rename(columns={
                  'item_id': 'um2',
                  'cat_id': 'um3',
                  'brand_id': 'um4'
              })
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

In [None]:
# Count each action_type
temp = groups['action_type'].value_counts().unstack().reset_index().rename(
    columns={
        0: 'um5',
        1: 'um6',
        2: 'um7',
        3: 'um8'
    })
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

In [None]:
# Time interval (days/hours)
temp = groups['time_stamp'].agg([('frist', 'min'),
                                 ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['frist']).dt.days
temp.drop(['frist', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

In [33]:
groups = data[data['action_type'] == 2].groupby(['user_id', 'merchant_id'])
temp = groups['item_id'].nunique().reset_index().rename(
    columns={'item_id': 'um10'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

  temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={


In [None]:
# User purchase/click
matrix['r1'] = matrix['u9'] / matrix['u7']

# Merchant purchase/click
matrix['r2'] = matrix['m8'] / matrix['m6']

# U+M purchase/click
matrix['r3'] = matrix['um7'] / matrix['um5']

In [None]:
# U+M daily click
matrix['r4'] = matrix['u7'] / (matrix['u6'])

# U+M daily purchase
matrix['r5'] = matrix['u9'] / (matrix['u6'])

In [None]:
# U+M daily click
matrix['r6'] = matrix['um5'] / (matrix['um9'] + 1)

# U+M daily purchase
matrix['r7'] = matrix['um7'] / (matrix['um9'] + 1)

In [None]:
# U+M daily click
matrix['r8'] = matrix['um2'] / (matrix['um7'])

# U+M daily purchase
matrix['r9'] = matrix['um3'] / (matrix['um7'])

In [None]:
# U+M daily click
matrix['r10'] = matrix['um2'] / (matrix['um9'] + 1)

# U+M daily purchase
matrix['r11'] = matrix['um3'] / (matrix['um9'] + 1)

In [35]:
# Fillna 0
matrix.fillna(0, inplace=True)

In [None]:
%%time
# Make dummy variables of age_range
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)

# Make dummy variables of gender
temp = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)

In [222]:
%%time
# train, test data
train_data = matrix[matrix['origin'] == 'train'].drop(['origin'], axis=1)
test_data = matrix[matrix['origin'] == 'test'].drop(['label', 'origin'],
                                                    axis=1)

if not LOCAL_QUICK:
    if FE_V1:
        train_data.to_csv('train_data.csv')
        test_data.to_csv('test_data.csv')
    if MORE_FE:
        train_data.to_csv('train_data_moreFE.csv')
        test_data.to_csv('test_data_moreFE.csv')

CPU times: user 55 s, sys: 1.84 s, total: 56.9 s
Wall time: 57.7 s


In [67]:
# # matrix['u6']=np.log(matrix['u6_x'])
# matrix['um9']=np.log(matrix['um9_y']+1)
# matrix.drop(['um9_x','um9_y'], axis=1, inplace=True)

In [88]:
matrix = pd.read_pickle('matrix_2.pkl')
# matrix.drop('Unnamed: 0',inplace=True,axis=1)

In [21]:
matrix.drop(['um9', 'u6'], axis=1, inplace=True)
groups = data.groupby(['user_id'])
temp = groups['time_stamp'].agg([('F_time', 'min'),
                                 ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.days
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')

groups = data.groupby(['merchant_id'])
temp = groups['time_stamp'].agg([('frist', 'min'),
                                 ('last', 'max')]).reset_index()
temp['m11'] = (temp['last'] - temp['frist']).dt.days
temp.drop(['frist', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['merchant_id'], how='left')

groups = data.groupby(['user_id', 'merchant_id'])
temp = groups['time_stamp'].agg([('frist', 'min'),
                                 ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['frist']).dt.days
temp.drop(['frist', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

In [22]:
matrix['u12'] = matrix['u6'] + matrix['u7'] + matrix['u8'] + matrix['u9']

matrix['r4'] = matrix['u8'] / matrix['u6']
matrix['r5'] = matrix['u8'] / matrix['u7']
matrix['r6'] = matrix['u8'] / matrix['u9']
matrix['r7'] = matrix['u9'] / matrix['u6']
matrix['r8'] = matrix['u7'] / matrix['u6']
matrix['r9'] = matrix['u2'] / matrix['u3']
matrix['r10'] = matrix['u2'] / matrix['u4']
matrix['r11'] = matrix['u4'] / matrix['u2']
matrix['r12'] = matrix['m4'] / matrix['m3']
matrix['r13'] = matrix['m4'] / matrix['m5']
matrix['r14'] = matrix['m9'] / matrix['m6']
matrix['r15'] = matrix['m7'] / matrix['m6']
matrix['r16'] = matrix['m8'] / matrix['m7']
matrix['r17'] = matrix['um3'] / matrix['um2']
matrix['r18'] = matrix['um4'] / matrix['um2']
matrix['r19'] = matrix['um6'] / matrix['um5']
matrix['r20'] = matrix['um8'] / matrix['um5']
matrix['r21'] = matrix['u12'] / matrix['u10']
matrix['r22'] = matrix['u6'] / matrix['u10']
matrix['r23'] = matrix['u7'] / matrix['u10']
matrix['r24'] = matrix['u8'] / matrix['u10']
matrix['r25'] = matrix['u9'] / matrix['u10']

matrix['ri1'] = matrix['i2_y'] / matrix['i0_y']
matrix['ri2'] = matrix['i1_y'] / matrix['i0_y']
matrix['ri3'] = matrix['i3_y'] / matrix['i0_y']

# split train and test
matrix.fillna(0, inplace=True)
matrix.replace(np.inf, 0, inplace=True)

In [23]:
# U+M
matrix['r26'] = matrix['um2'] / (matrix['um7'])
matrix['r27'] = matrix['um3'] / (matrix['um7'])
matrix['r28'] = matrix['um2'] / (matrix['um9'] + 1)
matrix['r29'] = matrix['um3'] / (matrix['um9'] + 1)

matrix['r30'] = matrix['um7'] / matrix['um1']
matrix['r31'] = matrix['um2'] / (matrix['um1'])
matrix['r32'] = matrix['um3'] / (matrix['um1'])
matrix['r33'] = matrix['um1'] / (matrix['um9'] + 1)
matrix['r34'] = matrix['um4'] / (matrix['um9'] + 1)
matrix['r35'] = matrix['um10'] / (matrix['um9'] + 1)

In [220]:
matrix.drop(['age_0', 'age_7'], axis=1, inplace=True)

In [84]:
matrix.to_pickle('matrix_2.pkl')

In [221]:
matrix

Unnamed: 0,user_id,merchant_id,label,origin,u1,u2,u3,u4,u5,u7,...,r27,r28,r29,r30,r31,r32,r33,r34,um10,r35
0,34176,3906,0.0,train,451,256,109,45,108,0.0,...,6.000000,0.869565,0.26087,0.025641,0.512821,0.153846,1.695652,0.043478,1,0.043478
1,34176,121,0.0,train,451,256,109,45,108,0.0,...,1.000000,0.250000,0.25000,0.071429,0.071429,0.071429,3.500000,0.250000,1,0.250000
2,34176,4356,1.0,train,451,256,109,45,108,0.0,...,0.166667,1.000000,0.50000,0.333333,0.111111,0.055556,9.000000,0.500000,2,1.000000
3,34176,2217,0.0,train,451,256,109,45,108,0.0,...,1.000000,1.000000,1.00000,0.500000,0.500000,0.500000,2.000000,1.000000,1,1.000000
4,230784,4818,0.0,train,54,31,20,17,19,0.0,...,1.000000,0.250000,0.25000,0.125000,0.125000,0.125000,2.000000,0.250000,1,0.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522336,228479,3111,0.0,test,2004,1173,278,71,282,0.0,...,1.000000,1.000000,0.50000,0.200000,0.400000,0.200000,2.500000,0.500000,1,0.500000
522337,97919,2341,0.0,test,55,29,17,14,17,0.0,...,1.000000,1.000000,1.00000,0.500000,0.500000,0.500000,2.000000,1.000000,1,1.000000
522338,97919,3971,0.0,test,55,29,17,14,17,0.0,...,0.500000,0.500000,0.20000,0.250000,0.312500,0.125000,1.600000,0.100000,3,0.300000
522339,32639,3536,0.0,test,72,46,33,24,35,1.0,...,1.000000,2.000000,1.00000,0.333333,0.666667,0.333333,3.000000,1.000000,1,1.000000


# Models

## Read feature data

In [94]:
# Read data
if not LOCAL_QUICK:
    if FE_V1:
        train_data = pd.read_csv('train_data.csv')
        test_data = pd.read_csv('test_data.csv')
    if MORE_FE:
        train_data = pd.read_csv('train_data_moreFE.csv')
        test_data = pd.read_csv('test_data_moreFE.csv')

# FeatureSelect_QUICK = True # Feature Select
FeatureSelect_QUICK = False
if FeatureSelect_QUICK:  # Use part of samples to select features quickly
    train_data = train_data.sample(int(len(train_data) * sample_percent))

# train_data = train_data[train_col]
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

# train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(
    train_X, train_y, test_size=.2, random_state=42)  # test_size=.3

## Try

### XGBoost

In [95]:
# Read data
train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']
X_train, X_valid, y_train, y_valid = train_test_split(
    train_X, train_y, test_size=.2, random_state=42)  # test_size=.3

In [96]:
%%time
def xgb_train(X_train, y_train, X_valid, y_valid, verbose=True):
    model_xgb = xgb.XGBClassifier(
        max_depth=6,  # raw8
        n_estimators=1000,
        min_child_weight=300,
        colsample_bytree=0.8,
        subsample=0.8,
        eta=0.3,
        seed=42)

    model_xgb.fit(X_train,
                  y_train,
                  eval_metric='auc',
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  verbose=verbose,
                  early_stopping_rounds=10)
    print(model_xgb.best_score)
    return model_xgb

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.15 µs


In [97]:
model_xgb = xgb_train(X_train, y_train, X_valid, y_valid, verbose=False)



0.686327


In [35]:
%%time
prob = model_xgb.predict_proba(test_data)
submission['prob'] = pd.Series(prob[:, 1])
submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_xgb.csv', index=False)

CPU times: user 3.45 s, sys: 1.75 s, total: 5.2 s
Wall time: 782 ms


### Cat Boost

In [36]:
def cat_train(X_train, y_train, X_valid, y_valid, verbose=True):
    model_cat = cat.CatBoostClassifier(learning_rate=0.02,
                                       iterations=5000,
                                       eval_metric='AUC',
                                       od_wait=50,
                                       od_type='Iter',
                                       random_state=10,
                                       thread_count=8,
                                       l2_leaf_reg=1,
                                       verbose=verbose)
    model_cat.fit(X_train,
                  y_train,
                  eval_set=[(X_valid, y_valid)],
                  early_stopping_rounds=50,
                  use_best_model=True)

    print(model_cat.best_score_['validation']['AUC'])
    return model_cat

In [37]:
model_cat = cat_train(X_train, y_train, X_valid, y_valid, verbose=False)

0.6837380737296375


In [38]:
%%time
prob = model_cat.predict_proba(test_data)
submission['prob'] = pd.Series(prob[:, 1])
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_cat.csv', index=False)

CPU times: user 1.16 s, sys: 69.2 ms, total: 1.23 s
Wall time: 638 ms


## StratifiedKFold

In [98]:
# Make Train set and Test set
def get_train_testDF(train_df, label_df):
    skv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    trainX = []
    trainY = []
    testX = []
    testY = []
    for train_index, test_index in skv.split(X=train_df, y=label_df):
        train_x, train_y, test_x, test_y = train_df.iloc[train_index, :], label_df.iloc[train_index], \
                                            train_df.iloc[test_index, :], label_df.iloc[test_index]

        trainX.append(train_x)
        trainY.append(train_y)
        testX.append(test_x)
        testY.append(test_y)
    return trainX, testX, trainY, testY

## Final Model

### LightGBM

In [None]:
# Read data
if not LOCAL_QUICK:
    if FE_V1:
        train_data = pd.read_csv('train_data.csv')
        test_data = pd.read_csv('test_data.csv')
    if MORE_FE:
        train_data = pd.read_csv('train_data_moreFE.csv')
        test_data = pd.read_csv('test_data_moreFE.csv')

train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

# Split Train&Valid Data
X_train, X_valid, y_train, y_valid = get_train_testDF(train_X, train_y)

In [242]:
# 将训练数据集划分分别训练5个lgbm,xgboost和catboost 模型
# lightgbm模型

pred_lgbms = []
for i in range(5):
    print(
        '\n============================LGB training use Data {}/5============================\n'
        .format(i + 1))
    model_lgb = lgb.LGBMClassifier(
        boosting_type='GOSS',
        max_depth=12,  # 8
        num_leaves=22,
        n_estimators=5000,
        min_child_weight=200,
        class_weight={
            1: 0.9,
            0: 0.1
        },
        colsample_bytree=0.78,
        subsample=0.8,
        learning_rate=0.007,
        seed=42,
        n_jobs=8)

    model_lgb.fit(X_train[i],
                  y_train[i],
                  eval_metric='auc',
                  eval_set=[(X_train[i], y_train[i]),
                            (X_valid[i], y_valid[i])],
                  verbose=False,
                  early_stopping_rounds=300)

    print(model_lgb.best_score_['valid_1']['auc'])

    pred = model_lgb.predict_proba(test_data)
    pred = pd.DataFrame(pred[:, 1])
    pred_lgbms.append(pred)
pred_lgbms = pd.concat(pred_lgbms, axis=1)
print(pred_lgbms)

submission['prob'] = pred_lgbms.mean(axis=1)
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_KFold_lgb.csv', index=False)

####0.6959



0.6950528930867641


0.6951859185743989


0.6889699925071766


0.6926655985690021


0.6928138338579753
               0         0         0         0         0
0       0.429448  0.362807  0.413791  0.430574  0.429927
1       0.507550  0.483903  0.506083  0.558445  0.501370
2       0.426781  0.368000  0.486253  0.418441  0.488253
3       0.271171  0.255585  0.281039  0.248218  0.230383
4       0.402136  0.372530  0.338543  0.342509  0.318103
...          ...       ...       ...       ...       ...
261472  0.458283  0.419896  0.435266  0.481487  0.360560
261473  0.139406  0.171347  0.197168  0.138879  0.147234
261474  0.517556  0.545648  0.498689  0.475901  0.484735
261475  0.319972  0.281433  0.278488  0.279710  0.363208
261476  0.422237  0.418251  0.423366  0.403213  0.442515

[261477 rows x 5 columns]


### Catgbm

In [110]:
# get data
if not LOCAL_QUICK:
    if FE_V1:
        train_data = pd.read_csv('train_data.csv')
        test_data = pd.read_csv('test_data.csv')
    if MORE_FE:
        train_data = pd.read_csv('train_data_moreFE.csv')
        test_data = pd.read_csv('test_data_moreFE.csv')

train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

# Split Train&Valid Data
X_train, X_valid, y_train, y_valid = get_train_testDF(train_X, train_y)

In [112]:
# Divide train set into 5 catboost models
# catgbm
pred_cats = []
for i in range(5):
    print(
        '\n============================CAT training use Data {}/5============================\n'
        .format(i + 1))
    model_cat = cat.CatBoostClassifier(
        learning_rate=0.01,
        iterations=5000,
        eval_metric='AUC',
        depth=7,
        od_wait=300,
        od_type='Iter',
        #                                 early_stopping_rounds=500,
        random_state=10,
        thread_count=8,
        l2_leaf_reg=12,
        verbose=100)
    model_cat.fit(X_train[i],
                  y_train[i],
                  eval_set=[(X_valid[i], y_valid[i])],
                  early_stopping_rounds=300,
                  use_best_model=True)
    print(model_cat.best_score_['validation']['AUC'])

    pred = model_cat.predict_proba(test_data)
    pred = pd.DataFrame(pred[:, 1])
    pred_cats.append(pred)
pred_cats = pd.concat(pred_cats, axis=1)

submission['prob'] = pred_cats.mean(axis=1)



0:	test: 0.5620029	best: 0.5620029 (0)	total: 46.2ms	remaining: 3m 51s
100:	test: 0.6471123	best: 0.6472451 (99)	total: 4.33s	remaining: 3m 30s
200:	test: 0.6540673	best: 0.6540673 (200)	total: 8.52s	remaining: 3m 23s
300:	test: 0.6615788	best: 0.6615788 (300)	total: 12.6s	remaining: 3m 16s
400:	test: 0.6678866	best: 0.6678866 (400)	total: 16.4s	remaining: 3m 8s
500:	test: 0.6727801	best: 0.6727801 (500)	total: 20.3s	remaining: 3m 2s
600:	test: 0.6759959	best: 0.6759959 (600)	total: 24.2s	remaining: 2m 56s
700:	test: 0.6785302	best: 0.6785302 (700)	total: 28s	remaining: 2m 51s
800:	test: 0.6804415	best: 0.6804415 (800)	total: 31.8s	remaining: 2m 46s
900:	test: 0.6821157	best: 0.6821157 (900)	total: 35.8s	remaining: 2m 42s
1000:	test: 0.6835887	best: 0.6835887 (1000)	total: 39.7s	remaining: 2m 38s
1100:	test: 0.6847683	best: 0.6847756 (1099)	total: 43.5s	remaining: 2m 34s
1200:	test: 0.6856614	best: 0.6856753 (1199)	total: 47.4s	remaining: 2m 30s
1300:	test: 0.6863572	best: 0.6863572 

1100:	test: 0.6803461	best: 0.6803461 (1100)	total: 42s	remaining: 2m 28s
1200:	test: 0.6811224	best: 0.6811445 (1196)	total: 45.8s	remaining: 2m 24s
1300:	test: 0.6814279	best: 0.6814338 (1299)	total: 49.7s	remaining: 2m 21s
1400:	test: 0.6817680	best: 0.6817772 (1397)	total: 53.9s	remaining: 2m 18s
1500:	test: 0.6820149	best: 0.6820149 (1500)	total: 57.9s	remaining: 2m 15s
1600:	test: 0.6822641	best: 0.6822820 (1577)	total: 1m 1s	remaining: 2m 11s
1700:	test: 0.6824839	best: 0.6824839 (1700)	total: 1m 5s	remaining: 2m 7s
1800:	test: 0.6827303	best: 0.6827303 (1800)	total: 1m 9s	remaining: 2m 3s
1900:	test: 0.6829928	best: 0.6830211 (1889)	total: 1m 13s	remaining: 1m 59s
2000:	test: 0.6832122	best: 0.6832148 (1999)	total: 1m 17s	remaining: 1m 55s
2100:	test: 0.6834069	best: 0.6834093 (2099)	total: 1m 21s	remaining: 1m 51s
2200:	test: 0.6836113	best: 0.6836113 (2200)	total: 1m 24s	remaining: 1m 47s
2300:	test: 0.6837453	best: 0.6837453 (2300)	total: 1m 28s	remaining: 1m 43s
2400:	test:

2500:	test: 0.6890476	best: 0.6890660 (2486)	total: 1m 34s	remaining: 1m 34s
2600:	test: 0.6892505	best: 0.6892721 (2584)	total: 1m 38s	remaining: 1m 30s
2700:	test: 0.6894712	best: 0.6894712 (2700)	total: 1m 42s	remaining: 1m 27s
2800:	test: 0.6896795	best: 0.6897008 (2796)	total: 1m 46s	remaining: 1m 23s
2900:	test: 0.6898034	best: 0.6898069 (2895)	total: 1m 49s	remaining: 1m 19s
3000:	test: 0.6900550	best: 0.6900550 (3000)	total: 1m 53s	remaining: 1m 15s
3100:	test: 0.6900579	best: 0.6900854 (3043)	total: 1m 57s	remaining: 1m 11s
3200:	test: 0.6901429	best: 0.6901470 (3196)	total: 2m 1s	remaining: 1m 8s
3300:	test: 0.6902805	best: 0.6902892 (3294)	total: 2m 4s	remaining: 1m 4s
3400:	test: 0.6905239	best: 0.6905239 (3400)	total: 2m 8s	remaining: 1m
3500:	test: 0.6906787	best: 0.6906893 (3488)	total: 2m 12s	remaining: 56.7s
3600:	test: 0.6908599	best: 0.6908599 (3600)	total: 2m 16s	remaining: 52.9s
3700:	test: 0.6910290	best: 0.6910393 (3697)	total: 2m 20s	remaining: 49.1s
3800:	test:

In [113]:
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_KFold_cat.csv', index=False)

### XGboost

In [114]:
# Read data
if not LOCAL_QUICK:
    if FE_V1:
        train_data = pd.read_csv('train_data.csv')
        test_data = pd.read_csv('test_data.csv')
    if MORE_FE:
        train_data = pd.read_csv('train_data_moreFE.csv')
        test_data = pd.read_csv('test_data_moreFE.csv')

train_X, train_y = train_data.drop(['label'], axis=1), train_data['label']

# Split Train&Valid Data
X_train, X_valid, y_train, y_valid = get_train_testDF(train_X, train_y)

In [120]:
# Divided train set into 5 XGBoost models
# XGBoost model
pred_xgbs = []
for i in range(5):
    print(
        '\n============================XGB training use Data {}/5============================\n'
        .format(i + 1))
    model_xgb = xgb.XGBClassifier(
        max_depth=8,  # raw8
        n_estimators=3000,
        min_child_weight=100,
        colsample_bytree=0.8,
        subsample=0.8,
        eta=0.2,
        seed=42,
        nthread=8)

    model_xgb.fit(
        X_train[i],
        y_train[i],
        eval_metric='auc',
        eval_set=[(X_train[i], y_train[i]), (X_valid[i], y_valid[i])],
        verbose=False,
        early_stopping_rounds=100  # 早停法，如果auc在10epoch没有进步就stop
    )

    print(model_xgb.best_score)

    pred = model_xgb.predict_proba(test_data)
    pred = pd.DataFrame(pred[:, 1])
    pred_xgbs.append(pred)
pred_xgbs = pd.concat(pred_xgbs, axis=1)

# make submission
submission['prob'] = pred_xgbs.mean(axis=1)
# submission.drop(['origin'], axis=1, inplace=True)
submission.to_csv('submission_KFold_xgb.csv', index=False)







0.689537


0.68872


0.682097


0.686997


0.687532


## Blending

In [179]:
xgb_1 = pd.read_csv('submission_xgb.csv')
lgb_KFold = pd.read_csv("submission_KFold_lgb.csv")
xgb_KFold = pd.read_csv("submission_KFold_xgb.csv")
cat_KFold = pd.read_csv("submission_KFold_cat.csv")

In [180]:
# correlation coefficients
df = np.array([xgb_1.prob, lgb_KFold.prob, xgb_KFold.prob, cat_KFold.prob])
np.corrcoef(df)

array([[1.        , 0.89183943, 0.87952985, 0.8750504 ],
       [0.89183943, 1.        , 0.98269074, 0.96366744],
       [0.87952985, 0.98269074, 1.        , 0.96568474],
       [0.8750504 , 0.96366744, 0.96568474, 1.        ]])

In [181]:
sub = xgb_1.copy()
# sub.drop(['origin'], axis=1, inplace=True)

sub.prob = 0.7 * lgb_KFold.prob + 0.2 * xgb_KFold.prob + 0.1 * cat_KFold.prob  # Online test score:0.6837613
sub.to_csv('sbms_xgb_cat_xK_cK.csv', index=False)