In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

pd.set_option('display.max_columns', 100)

In [24]:
debug = False
if debug:
    NROWS = 10000
else:
    NROWS = None
    
trainset = pd.read_csv('train.csv', nrows=NROWS)
testset = pd.read_csv('test.csv', nrows=NROWS)

### Metadata 생성하기

In [3]:
#아래 코드에선 굳이 쓸 필요 없을듯!
data = []
data = []
for feature in trainset.columns:
    # Defining the role
    if feature == 'target':
        use = 'target'
    elif feature == 'id':
        use = 'id'
    else:
        use = 'input'
         
    # Defining the type
    if 'bin' in feature or feature == 'target':
        type = 'binary'
    elif 'cat' in feature or feature == 'id':
        type = 'categorical'
    elif trainset[feature].dtype == float or trainset[feature].dtype == 'float64' or isinstance(trainset[feature].dtype, float):
        type = 'real'
    elif trainset[feature].dtype == 'int64' or trainset[feature].dtype == int:
        type = 'integer'
        
    # Initialize preserve to True for all variables except for id
    preserve = True
    if feature == 'id':
        preserve = False
    
    # Defining the data type 
    dtype = trainset[feature].dtype
    
    category = 'none'
    # Defining the category
    if 'ind' in feature:
        category = 'individual'
    elif 'reg' in feature:
        category = 'registration'
    elif 'car' in feature:
        category = 'car'
    elif 'calc' in feature:
        category = 'calculated'
    
    
    # Creating a Dict that contains all the metadata for the variable
    feature_dictionary = {
        'varname': feature,
        'use': use,
        'type': type,
        'preserve': preserve,
        'dtype': dtype,
        'category' : category
    }
    data.append(feature_dictionary)
    
metadata = pd.DataFrame(data, columns=['varname', 'use', 'type', 'preserve', 'dtype', 'category'])
metadata.set_index('varname', inplace=True)
metadata

Unnamed: 0_level_0,use,type,preserve,dtype,category
varname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
id,id,categorical,False,int64,none
target,target,binary,True,int64,none
ps_ind_01,input,integer,True,int64,individual
ps_ind_02_cat,input,categorical,True,int64,individual
ps_ind_03,input,integer,True,int64,individual
ps_ind_04_cat,input,categorical,True,int64,individual
ps_ind_05_cat,input,categorical,True,int64,individual
ps_ind_06_bin,input,binary,True,int64,individual
ps_ind_07_bin,input,binary,True,int64,individual
ps_ind_08_bin,input,binary,True,int64,individual


### 전체 데이터 중 1 비율 3.64%, 아주 불균형한 분포

In [21]:
trainset['target'].value_counts()

0    573518
1     21694
Name: target, dtype: int64

### calc 피처 삭제

In [25]:
col_to_drop = trainset.columns[trainset.columns.str.startswith('ps_calc_')]

In [26]:
col_to_drop

Index(['ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05',
       'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10',
       'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14',
       'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin',
       'ps_calc_19_bin', 'ps_calc_20_bin'],
      dtype='object')

In [27]:
trainset.drop(col_to_drop, axis=1, inplace=True)
testset.drop(col_to_drop, axis=1, inplace=True)

### null값 많은 피처 삭제

In [28]:
# Droping the variables with too many missing values
vars_to_drop = ['ps_car_03_cat', 'ps_car_05_cat']
trainset.drop(vars_to_drop, axis=1, inplace=True)
testset.drop(vars_to_drop, axis=1, inplace=True)
# Updating the meta
metadata.loc[(vars_to_drop), 'preserve'] = False

In [8]:
# Script by https://www.kaggle.com/ogrellier
# Code: https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [11]:
"""for f in f_cats:
    train_encoded, test_encoded = target_encode(trainset[f],
                                            testset[f],
                                           target=trainset.target,
                                           min_samples_leaf=100,
                                           smoothing=10,
                                           noise_level=0.01)
    trainset[f+'te']=train_encoded
    trainset.drop(f,axis=1,inplace=True)
    metadata.loc[f,'preserve']=False
    testset[f+'te']=test_encoded
    testset.drop(f,axis=1,inplace=True)
    """

In [10]:
train_encoded, test_encoded = target_encode(trainset['ps_cal_11_cat'],
                                            testset['ps_cal_11_cat'],
                                           target=trainset.target,
                                           min_samples_leaf=100,
                                           smoothing=10,
                                           noise_level=0.01)

trainset['ps_car_11_cat_te'] = train_encoded
trainset.drop('ps_car_11_cat', axis=1, inplace=True)
# Updating the metadata
metadata.loc['ps_car_11_cat', 'preserve'] = False 
testset['ps_car_11_cat_te'] = test_encoded
testset.drop('ps_car_11_cat', axis=1, inplace=True)

### 다운 샘플링

In [29]:
# Get the indices per target value
idx_0 = trainset[trainset.target == 0].index
idx_1 = trainset[trainset.target == 1].index

# Get original number of recoreds per target value
nb_0 = len(trainset.loc[idx_0])
nb_1 = len(trainset.loc[idx_1])

In [30]:
desired_apriori=0.10

# Calculate the undersampling rate and resulting number of records with target = 0
undersampling_rate = ((1-desired_apriori)*nb_1)/(nb_0*desired_apriori)
undersampled_nb_0 = int(undersampling_rate*nb_0)

In [31]:
# Randomly select records with target=0 to get at the desired a priori
undersampled_idx = shuffle(idx_0, random_state=314, n_samples=undersampled_nb_0)

# Construnct list with remaining indices
idx_list = list(undersampled_idx) + list(idx_1)

# Return undersample data frame
trainset = trainset.loc[idx_list]

In [15]:
trainset

Unnamed: 0,id,target,ps_ind_01,ps_ind_03,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_ind_02_catte,ps_ind_04_catte,ps_ind_05_catte,ps_car_01_catte,ps_car_02_catte,ps_car_04_catte,ps_car_06_catte,ps_car_07_catte,ps_car_08_catte,ps_car_09_catte,ps_car_10_catte,ps_car_11_catte
128718,321914,0,1,4,0,1,0,0,0,1,0,0,1,0,0,0,1,0.9,0.2,0.436606,3,0.316228,0.662140,0.301662,3.162278,0.036978,0.037944,0.034508,0.028768,0.033833,0.034019,0.034442,0.034562,0.034631,0.036740,0.036152,0.028852
377242,942842,0,6,4,0,0,0,1,0,0,0,0,0,8,1,0,0,0.7,0.1,0.520817,2,0.400000,0.842750,0.401373,3.316625,0.035423,0.034902,0.033890,0.029616,0.034363,0.034060,0.033723,0.034938,0.034230,0.033172,0.036255,0.027317
398422,995457,0,5,9,0,0,1,0,0,0,0,0,0,10,0,0,0,0.8,1.3,1.462660,3,0.400000,0.716655,0.330908,2.645751,0.035853,0.038274,0.033914,0.042023,0.034005,0.033301,0.033099,0.051542,0.034797,0.036268,0.036313,0.031076
80791,202081,0,0,9,0,1,0,0,0,0,0,0,0,10,0,0,0,0.3,0.0,-1.000000,2,0.447214,1.108756,0.325576,3.464102,0.035610,0.038701,0.033826,0.029146,0.049834,0.032820,0.051033,0.033966,0.035700,0.036454,0.036543,0.041726
518632,1295934,0,0,10,0,0,0,1,0,0,0,0,0,10,0,0,0,0.7,0.2,0.692369,2,0.500000,0.928075,0.383275,2.828427,0.035524,0.034889,0.033397,0.041928,0.049057,0.049380,0.033637,0.034242,0.035191,0.057875,0.036080,0.045335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595006,1487473,1,5,7,0,0,1,0,0,0,0,0,0,8,0,1,0,0.6,0.4,1.537652,1,0.424264,1.269111,0.384708,3.162278,0.035473,0.034966,0.042978,0.040799,0.048738,0.033363,0.066730,0.035422,0.034758,0.036889,0.035873,0.044719
595046,1487566,1,1,5,0,0,1,0,0,0,0,0,0,4,0,0,1,0.3,0.4,-1.000000,2,0.424264,0.756979,0.400000,2.000000,0.035516,0.034502,0.033961,0.041611,0.049477,0.033624,0.051117,0.034940,0.034506,0.036899,0.036682,0.041674
595099,1487716,1,0,6,0,1,0,0,0,0,0,0,0,9,0,1,0,0.9,0.3,0.711952,3,0.400000,0.970654,0.372424,3.464102,0.036142,0.038375,0.033910,0.041463,0.033972,0.033329,0.038882,0.035085,0.035105,0.057443,0.036040,0.045096
595113,1487748,1,0,2,1,0,0,0,0,0,0,0,0,2,0,0,1,0.6,0.1,0.577170,2,0.316228,0.876295,0.320780,3.741657,0.038422,0.034330,0.033948,0.028619,0.033299,0.033685,0.032085,0.033939,0.044669,0.036361,0.037046,0.044515


In [32]:
trainset.reset_index(drop=True)

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_04_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15
0,321914,0,1,2,4,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0.9,0.2,0.436606,7,1,0,1,1,1,2,1,64,3,0.316228,0.662140,0.301662,3.162278
1,942842,0,6,1,4,0,0,0,0,0,1,0,0,0,0,0,8,1,0,0,0.7,0.1,0.520817,6,1,0,0,1,1,0,1,95,2,0.400000,0.842750,0.401373,3.316625
2,995457,0,5,1,9,1,0,0,0,1,0,0,0,0,0,0,10,0,0,0,0.8,1.3,1.462660,11,1,0,0,0,1,2,1,98,3,0.400000,0.716655,0.330908,2.645751
3,202081,0,0,1,9,1,0,0,1,0,0,0,0,0,0,0,10,0,0,0,0.3,0.0,-1.000000,7,0,0,15,1,1,2,1,5,2,0.447214,1.108756,0.325576,3.464102
4,1295934,0,0,1,10,0,0,0,0,0,1,0,0,0,0,0,10,0,0,0,0.7,0.2,0.692369,11,0,1,0,1,1,1,1,104,2,0.500000,0.928075,0.383275,2.828427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216935,1487473,1,5,1,7,0,3,0,0,1,0,0,0,0,0,0,8,0,1,0,0.6,0.4,1.537652,11,0,0,5,1,1,2,1,104,1,0.424264,1.269111,0.384708,3.162278
216936,1487566,1,1,1,5,0,0,0,0,1,0,0,0,0,0,0,4,0,0,1,0.3,0.4,-1.000000,11,0,0,15,1,1,2,1,5,2,0.424264,0.756979,0.400000,2.000000
216937,1487716,1,0,1,6,1,0,0,1,0,0,0,0,0,0,0,9,0,1,0,0.9,0.3,0.711952,11,1,0,7,1,1,1,1,104,3,0.400000,0.970654,0.372424,3.464102
216938,1487748,1,0,2,2,0,0,1,0,0,0,0,0,0,0,0,2,0,0,1,0.6,0.1,0.577170,7,1,0,11,1,0,2,1,104,2,0.316228,0.876295,0.320780,3.741657


In [33]:
trainset = trainset.replace(-1, np.nan)
testset = testset.replace(-1, np.nan)

### 카테고리 피처 더미 변수로

In [34]:
cat_features = [col for col in trainset.columns if col.endswith('cat')]

for column in cat_features:
    temp = pd.get_dummies(pd.Series(trainset[column]))
    trainset = pd.concat([trainset, temp], axis=1)
    trainset = trainset.drop([column], axis=1)
    
for column in cat_features:
    temp = pd.get_dummies(pd.Series(testset[column]))
    testset = pd.concat([testset, temp], axis=1)
    testset = testset.drop([column], axis=1)

In [35]:
id_test = testset['id'].values
target_train = trainset['target'].values

trainset.drop(['target', 'id'], axis=1, inplace=True)
testset.drop(['id'], axis=1, inplace=True)

In [36]:
trainset.head()

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,1.0,2.0,3.0,4.0,0.0,1.0.1,0.0.1,1.0.2,2.0.1,3.0.1,4.0.1,5.0,6.0,0.0.2,1.0.3,2.0.2,3.0.2,4.0.2,5.0.1,6.0.1,7.0,8.0,9.0,10.0,11.0,0.0.3,1.0.4,...,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104
128718,1,4,0,1,0,0,0,1,0,0,1,0,0,0,1,0.9,0.2,0.436606,3.0,0.316228,0.66214,0.301662,3.162278,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
377242,6,4,0,0,0,1,0,0,0,0,0,8,1,0,0,0.7,0.1,0.520817,2.0,0.4,0.84275,0.401373,3.316625,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
398422,5,9,0,0,1,0,0,0,0,0,0,10,0,0,0,0.8,1.3,1.46266,3.0,0.4,0.716655,0.330908,2.645751,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
80791,0,9,0,1,0,0,0,0,0,0,0,10,0,0,0,0.3,0.0,,2.0,0.447214,1.108756,0.325576,3.464102,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
518632,0,10,0,0,0,1,0,0,0,0,0,10,0,0,0,0.7,0.2,0.692369,2.0,0.5,0.928075,0.383275,2.828427,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [37]:
class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=314).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]


                print ("Base model %d: fit %s model | fold %d" % (i+1, str(clf).split('(')[0], j+1))
                clf.fit(X_train, y_train)
                cross_score = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
                print("cross_score [roc-auc]: %.5f [gini]: %.5f" % (cross_score.mean(), 2*cross_score.mean()-1))
                y_pred = clf.predict_proba(X_holdout)[:,1]                

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict_proba(T)[:,1]
            S_test[:, i] = S_test_i.mean(axis=1)

        results = cross_val_score(self.stacker, S_train, y, cv=3, scoring='roc_auc')
        # Calculate gini factor as 2 * AUC - 1
        print("Stacker score [gini]: %.5f" % (2 * results.mean() - 1))

        self.stacker.fit(S_train, y)
        result = self.stacker.predict_proba(S_test)[:,1]
        return result

In [38]:
# LightGBM params
# lgb_1
lgb_params1 = {}
lgb_params1['learning_rate'] = 0.02
lgb_params1['n_estimators'] = 650
lgb_params1['max_bin'] = 10
lgb_params1['subsample'] = 0.8
lgb_params1['subsample_freq'] = 10
lgb_params1['colsample_bytree'] = 0.8   
lgb_params1['min_child_samples'] = 500
lgb_params1['seed'] = 99
lgb_params1['num_threads'] = 4

# lgb2
lgb_params2 = {}
lgb_params2['n_estimators'] = 1090
lgb_params2['learning_rate'] = 0.02
lgb_params2['colsample_bytree'] = 0.3   
lgb_params2['subsample'] = 0.7
lgb_params2['subsample_freq'] = 2
lgb_params2['num_leaves'] = 16
lgb_params2['seed'] = 99
lgb_params2['num_threads'] = 4

# lgb3
lgb_params3 = {}
lgb_params3['n_estimators'] = 1100
lgb_params3['max_depth'] = 4
lgb_params3['learning_rate'] = 0.02
lgb_params3['seed'] = 99
lgb_params3['num_threads'] = 4

# XGBoost params
xgb_params = {}
xgb_params['objective'] = 'binary:logistic'
xgb_params['learning_rate'] = 0.04
xgb_params['n_estimators'] = 490
xgb_params['max_depth'] = 4
xgb_params['subsample'] = 0.9
xgb_params['colsample_bytree'] = 0.9  
xgb_params['min_child_weight'] = 10
#xgb_params['num_threads'] = 4

In [39]:
# Base models
lgb_model1 = LGBMClassifier(**lgb_params1)

lgb_model2 = LGBMClassifier(**lgb_params2)
       
lgb_model3 = LGBMClassifier(**lgb_params3)

xgb_model = XGBClassifier(**xgb_params)

# Stacking model
log_model = LogisticRegression()

In [40]:
stack = Ensemble(n_splits=5,
        stacker = log_model,
        base_models = (lgb_model1, lgb_model2, lgb_model3, xgb_model))

y_prediction = stack.fit_predict(trainset, target_train, testset)

submission = pd.DataFrame()
submission['id'] = id_test
submission['target'] = y_prediction
submission.to_csv('stacked.csv', index=False)