В credit_train.csv содержится 170746 строк с данными о клиентах сети магазинов электроники, подавших в этих магазинах заявки на кредит. Колонка open_account_flg содержит 1, если клиент выбрал Тинькофф, и 0 в противном случае. В credit_test.csv содержится 91940 строк с данными; для каждой строки следует предсказать, возьмёт ли соответствующий ей человек кредит в Тинькофф.

In [1]:
import datetime
import operator

import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.cross_validation import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn

%matplotlib inline
pd.options.display.max_columns=40



____

In [2]:
def _fillNa(dt, fill_na_train, is_train=True):
    """ 1
    Add missing-value indicators and fill NaNs in place.

    Creates ``is_nan_monthly_income``, ``is_nan_credit_count`` and
    ``is_nan_overdue_credit_count`` (1 where the source value is missing),
    then fills ``monthly_income``, ``credit_count``,
    ``overdue_credit_count`` and ``living_region`` in ``dt``.

    Params
    dt : DataFrame
        Data, modified in place.
    fill_na_train : dict
        Column -> fill value, as returned by the train call.
        Ignored (rebuilt) when ``is_train`` is True.
    is_train : bool
        Flag for train/test.

    ------
    Return
    fill_na_train : dict
        The fill values that were applied.

    Raises
    ValueError
        If ``is_train`` is False and no fill values were supplied.
    """
    if is_train:
        # Hand-picked constants. Ideally one would look up the nearest
        # clients by job, place, education and age and use their mode.
        fill_na_train = {
            'monthly_income': 30000.0,
            'credit_count': 1.0,
            'overdue_credit_count': 0.0,
            'living_region': 'ОБЛ МОСКОВСКАЯ',  # alternative tried: 'nan'
        }
    elif not fill_na_train:
        # Covers both an empty dict and None.
        raise ValueError('Run it on train data before!')

    # Indicators must be computed before the NaNs are overwritten.
    for column in ('monthly_income', 'credit_count', 'overdue_credit_count'):
        dt['is_nan_' + column] = dt[column].isnull().astype(int)

    dt.fillna(fill_na_train, inplace=True)
    return fill_na_train

In [3]:
def _floatCorrectoin(dt):
    """ 2
    Convert ``credit_sum`` and ``score_shk`` to float in place.

    Both columns are handled as text with a decimal comma; the comma is
    replaced by a dot before the cast, so any number of digits on either
    side of the separator is accepted. Columns that are already numeric
    are only cast, which makes repeated calls harmless.

    Params
    dt : DataFrame
        Data, modified in place.
    """
    for column in ('credit_sum', 'score_shk'):
        values = dt[column]
        if not pd.api.types.is_numeric_dtype(values):
            # ',' has no special regex meaning, so this is a literal
            # replacement under every pandas default for `regex`.
            values = values.str.replace(',', '.')
        dt[column] = values.astype(float)

In [4]:
def _get_new_str_features(dt):
    """ 3
    Add ``is_female`` and space-joined combinations of the categorical
    columns (education, job_position, gender, living_region,
    marital_status) to ``dt`` in place.

    Note that ``LivReg_MarSt`` is built from gender, living_region and
    marital_status, i.e. it also contains the gender.
    ------------
    Return

    str_cols : list of str
        Combined columns for encoding and deleting.
    """
    dt['is_female'] = (dt.gender == 'F').astype(int)

    # (new column, source columns joined left to right with ' ');
    # the order here fixes the order of the columns appended to dt.
    combinations = (
        ('edu_LivReg', ('education', 'living_region')),
        ('job_LivReg', ('job_position', 'living_region')),
        ('job_edu', ('job_position', 'education')),
        ('gender_job', ('gender', 'job_position')),
        ('gender_LivReg', ('gender', 'living_region')),
        ('gender_MarSt', ('gender', 'marital_status')),
        ('LivReg_MarSt', ('gender', 'living_region', 'marital_status')),
        ('job_LivReg_marSt_gen',
         ('gender', 'job_position', 'living_region', 'marital_status')),
    )
    for new_column, parts in combinations:
        joined = dt[parts[0]]
        for part in parts[1:]:
            joined = joined + ' ' + dt[part]
        dt[new_column] = joined

    # The raw categorical columns are deliberately not listed here.
    return [
        'LivReg_MarSt',
        'edu_LivReg',
        'job_LivReg',
        'job_edu',
        'gender_job',
        'gender_LivReg',
        'gender_MarSt',
        'job_LivReg_marSt_gen',
    ]

In [5]:
def _counter_encoder(dt, str_cols, counter_Encoders, is_train=True):
    """ 4
    Frequency encoder.

    For every column in ``str_cols`` adds ``<column>_enc_by_count`` to
    ``dt``: the number of times the category occurred in the data the
    encoder was fitted on (0 for categories not seen there).

    Params
    dt : DataFrame
        Data, modified in place.
    str_cols : list of str
        Columns to encode.
    counter_Encoders : dict
        Column -> {category: count}; rebuilt from ``dt`` when
        ``is_train`` is True.
    is_train : bool
        Flag for train/test.

    ------
    Return
    counter_Encoders : dict
        The frequency tables that were applied.
    """
    if is_train:
        counter_Encoders = {}
        for col in str_cols:
            counter_Encoders[col] = dt[col].value_counts().to_dict()

    for column in str_cols:
        frequencies = counter_Encoders[column]
        encoded = dt[column].map(
            lambda category: frequencies.get(category, 0))
        dt[column + '_enc_by_count'] = encoded
        # TODO: make this smoother
    return counter_Encoders

In [6]:
def _onehot_encoder(dt, str_cols, onehot_Encoders, is_train=True):
    """ 4
    One-hot encoder.

    For every column in ``str_cols`` and every category known to the
    encoder, adds a 0/1 column ``<column>_<category>`` to ``dt``.
    A missing value (NaN/None) among the categories produces the
    indicator ``<column>_nan``.

    Params
    dt : DataFrame
        Data, modified in place.
    str_cols : list of str
        Columns to encode.
    onehot_Encoders : dict
        Column -> array of categories; rebuilt from ``dt`` when
        ``is_train`` is True.
    is_train : bool
        Flag for train/test.

    ------
    Return
    onehot_Encoders : dict
        The category lists that were applied.
    """
    if is_train:
        onehot_Encoders = {col: dt[col].unique() for col in str_cols}
    for column in str_cols:
        for value in onehot_Encoders[column]:
            if pd.isnull(value):
                # NaN never compares equal to itself and cannot be
                # concatenated to a str, so handle it explicitly.
                indicator = dt[column].isnull()
                label = 'nan'
            else:
                indicator = dt[column] == value
                label = str(value)
            dt[column + '_' + label] = indicator.astype(int)
    return onehot_Encoders

In [7]:
def _target_encoder(dt, dt_train, cols_for_encoding, targ_encoders, is_train=True, target_mean=0.176, alpha=1, min_share=0.05):
    """ 5
    Encode categories by the rank of their mean target.

    When fitting, the categories of each column are sorted by the mean of
    ``open_account_flg`` and numbered 0, 1, 2, ... in ascending order.
    ``<column>_by_mean_target`` receives that rank. Categories that are
    unknown to the encoder, or whose count in ``dt_train`` is not greater
    than ``min_share`` of its rows, receive ``target_mean`` instead.

    Params
    dt : DataFrame
        Data, modified in place. Must contain ``open_account_flg`` when
        ``is_train`` is True.
    dt_train : DataFrame
        Train data used to count category frequencies.
    cols_for_encoding : list
        List of columns for encode.
    targ_encoders : dict
        Encoder for test dataset; rebuilt from ``dt`` when ``is_train``
        is True.
    is_train : bool
        Flag for train/test.
    target_mean : float
        Fallback value for rare or unseen categories.
    alpha : float
        Currently unused; reserved for the smoothing in the TODO below.
    min_share : float
        Minimal share of train rows a category must exceed to keep
        its rank.

    ------
    Return
    targ_encoders : dict
        Column -> {category: rank by mean target}.

    !!! TODO (mean(y) * K + glob_mean(y) * alpha) / (K + alpha)
    """
    if is_train:
        targ_encoders = {}
        for col in cols_for_encoding:
            ordered = dt.groupby(col).open_account_flg.mean()\
                                     .sort_values().index.values
            targ_encoders[col] = {category: rank
                                  for rank, category in enumerate(ordered)}

    for col in cols_for_encoding:
        train_values = dt_train[col].values
        categories, counts = np.unique(train_values, return_counts=True)
        # A set gives O(1) membership tests inside the per-row lambda.
        frequent = set(
            categories[counts > min_share * len(train_values)].tolist())
        ranks = targ_encoders[col]
        dt[col + '_by_mean_target'] = dt[col].apply(
            lambda value: ranks.get(value, target_mean)
            if value in frequent else target_mean)

    return targ_encoders

In [8]:
def _new_money_features(dt):
    """ 6
    Money features, added to ``dt`` in place.

    Requires the columns ``monthly_income``, ``credit_sum``,
    ``credit_month``, ``credit_count``, ``overdue_credit_count``,
    ``tariff_id``, ``marital_status`` and ``is_female``.

    Adds ``money_residual``, ``debts_persent``,
    ``all_credit_money_residual``, ``_tmp_add``,
    ``family_money_residual`` and ``strange_money_residual``, and finally
    overwrites ``credit_sum`` with ``credit_sum * tariff_id``.
    """
    # Money left for living after the monthly payment.
    dt['money_residual'] = dt.monthly_income - \
        (dt.credit_sum.values / dt.credit_month.values)
    # How "good" the client is: share of overdue credits (both +1).
    dt['debts_persent'] = (dt.overdue_credit_count.values +
                           1.0) / (dt.credit_count.values + 1.)
    # Same residual, corrected for the number of credits.
    dt['all_credit_money_residual'] = dt.monthly_income.values / (dt.credit_count.values + 1.) -\
        (dt.credit_sum.values * dt.tariff_id.values / (dt.credit_month.values + 1.))

    # Family-status divisor for the residual. Statuses other than the
    # five handled here keep 0.0, so the division below yields inf/NaN
    # for them.
    status = dt['marital_status']
    dt['_tmp_add'] = (status == 'MAR').astype(int) * 1.3
    # .loc replaces DataFrame.set_value, which was removed in pandas 1.0.
    divorced = status == 'DIV'
    dt.loc[divorced, '_tmp_add'] = \
        1.3 / (1.0 + 0.2 * dt.is_female[divorced].values)
    widowed = status == 'WID'
    dt.loc[widowed, '_tmp_add'] = \
        1.0 / (1.0 - 0.2 * dt.is_female[widowed].values)
    dt.loc[status == 'UNM', '_tmp_add'] = 1.0
    dt.loc[status == 'CIV', '_tmp_add'] = 1.2

    dt['family_money_residual'] = dt[
        'money_residual'].values / dt['_tmp_add'].values

    dt['strange_money_residual'] = dt['family_money_residual'] / (dt.credit_count + 1.) -\
        (dt.credit_sum.values * dt.tariff_id / (dt.credit_month.values + 1.))

    # Must stay last: the features above use the unscaled credit_sum.
    # '_tmp_add' is intentionally left in dt.
    dt['credit_sum'] = dt['credit_sum'] * dt.tariff_id.values

_____

In [9]:
def preproc_pipline(dt_tr, dt_ts, cv=True):
    """
    Full preprocessing pipeline.

    Works on copies of both frames. Every step is fitted on the train
    copy and then applied to the test copy.

    Params
    dt_tr : DataFrame
        Train data; must contain ``open_account_flg``.
    dt_ts : DataFrame
        Test data; must contain ``open_account_flg`` when ``cv`` is True.
    cv : bool
        True when the test frame is a validation fold with a target.

    ------
    Return
    (data_train, data_test, y_tr, y_ts) when ``cv`` is True,
    (data_train, data_test, y_tr) otherwise.
    """
    data_train, data_test = dt_tr.copy(), dt_ts.copy()
    y_tr = data_train.open_account_flg
    y_ts = data_test.open_account_flg if cv else None

    # 1. NaN indicators and filling
    na_fill_values = _fillNa(data_train, {})
    _fillNa(data_test, na_fill_values, is_train=False)

    # 2. decimal commas -> floats
    for frame in (data_train, data_test):
        _floatCorrectoin(frame)

    # 3. combined string features
    str_cols = _get_new_str_features(data_train)
    _get_new_str_features(data_test)

    # 4. all combined categories -> numbers (frequencies)
    count_maps = _counter_encoder(data_train, str_cols, {})
    _counter_encoder(data_test, str_cols, count_maps, is_train=False)

    # 5. target-based encoding. Careful here: some columns lead to
    #    overfitting (it is the target, after all), so of the combined
    #    columns only 'job_edu' is used.
    cols_for_encoding = [
        'marital_status',
        'living_region',
        'education',
        'job_position',
        'gender',
        'job_edu',
    ]
    rank_maps = _target_encoder(data_train, data_train,
                                cols_for_encoding, {}, is_train=True)
    _target_encoder(data_test, data_train, cols_for_encoding,
                    rank_maps, is_train=False)

    # 6. money features
    for frame in (data_train, data_test):
        _new_money_features(frame)

    # 7. drop everything non-numeric plus the target. The raw categorical
    #    columns are not part of str_cols, so they are listed explicitly.
    raw_categoricals = ['marital_status', 'living_region',
                        'education', 'job_position', 'gender']
    obsolete = str_cols + raw_categoricals
    data_train.drop(obsolete + ['open_account_flg'], axis=1, inplace=True)
    if not cv:
        data_test.drop(obsolete, axis=1, inplace=True)
        return data_train, data_test, y_tr

    data_test.drop(obsolete + ['open_account_flg'], axis=1, inplace=True)
    return data_train, data_test, y_tr, y_ts

_____

### CV

In [10]:
# Load the training set (cp1251, ';'-separated) without the id column.
data_train = pd.read_csv('credit_train.csv', sep=';', encoding='cp1251')
data_train = data_train.drop(['client_id'], axis=1)

In [11]:
# Stratified, shuffled 5-fold split on the target; iterated as
# (train_idx, test_idx) pairs by the CV loops in this notebook.
# NOTE(review): this is the legacy sklearn.cross_validation call style
# (labels passed to the constructor, `n_folds`) — confirm the installed
# scikit-learn version still ships it.
cv = StratifiedKFold(data_train.open_account_flg.values,
                     n_folds=5, shuffle=True, random_state=76)

In [13]:
# Parameters passed to xgb.train, both in the CV loop and in the final fit.
xgb_params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'gamma': 0.1,
    'eval_metric': 'auc',
    'eta': 0.015,
    'booster': 'gbtree',
    'seed': 1,
    'alpha': 0.1,
    'lambda': 0.2,
    'colsample_bytree': 0.9,
    'subsample': 0.9,
    'min_child_weight': 1,
    'silent': 1,
    'nthread': 9,
}

# Number of boosting rounds passed to xgb.train.
num_rounds = 1801

In [14]:
# Cross-validate XGBoost: preprocessing is re-fitted inside every fold,
# and the per-round metrics of each fold are collected in eval_res.
eval_res = []
for tr, ts in cv:
    eval_res.append({})
    # NOTE(review): tr/ts come from the splitter and are used as labels
    # with .loc; this presumably relies on the default integer index of
    # data_train — confirm.
    dt_tr, dt_ts, y_tr, y_ts = preproc_pipline(data_train.loc[tr], data_train.loc[ts])
    print (dt_tr.shape)
    dtrain = xgb.DMatrix(dt_tr.values, label=y_tr, feature_names=dt_tr.columns)
    dtest = xgb.DMatrix(dt_ts.values, label=y_ts, feature_names=dt_ts.columns)
    
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]

    # Early stopping is disabled: all num_rounds rounds are always run.
    gbdt = xgb.train(xgb_params, dtrain,
                     num_rounds, watchlist,
                     early_stopping_rounds=None,
                     verbose_eval=200,
                     evals_result=eval_res[-1])
    # plt.figure()
    # xgb.plot_importance(gbdt)

(136596, 32)
[0]	train-auc:0.704286	eval-auc:0.692771
[200]	train-auc:0.771731	eval-auc:0.751099
[400]	train-auc:0.787546	eval-auc:0.757835
[600]	train-auc:0.798182	eval-auc:0.76067
[800]	train-auc:0.806886	eval-auc:0.762102
[1000]	train-auc:0.814409	eval-auc:0.762769
[1200]	train-auc:0.821711	eval-auc:0.763395
[1400]	train-auc:0.828971	eval-auc:0.763733
[1600]	train-auc:0.835249	eval-auc:0.76386
[1800]	train-auc:0.841481	eval-auc:0.763986
(136597, 32)
[0]	train-auc:0.704267	eval-auc:0.705055
[200]	train-auc:0.770942	eval-auc:0.75852
[400]	train-auc:0.786728	eval-auc:0.764698
[600]	train-auc:0.797446	eval-auc:0.766991
[800]	train-auc:0.806432	eval-auc:0.768033
[1000]	train-auc:0.814287	eval-auc:0.76857
[1200]	train-auc:0.821831	eval-auc:0.768646
[1400]	train-auc:0.828607	eval-auc:0.768727
[1600]	train-auc:0.835291	eval-auc:0.768853
[1800]	train-auc:0.841597	eval-auc:0.768798
(136597, 32)
[0]	train-auc:0.705486	eval-auc:0.702512
[200]	train-auc:0.771025	eval-auc:0.761057
[400]	train-auc

In [15]:
# Last-round validation AUC of every fold, reported as mean ± std.
# Iterating eval_res avoids repeating the fold count; fixed-point
# formatting stays readable even when str() would switch to scientific
# notation for a very small std.
cv_values = [fold_result['eval']['auc'][-1] for fold_result in eval_res]
print ('{:.4f} ± {:.4f}'.format(np.mean(cv_values), np.std(cv_values)))

0.7670 ± 0.0029


### Light gbm

https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py

In [17]:
#https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters-tuning.md

# Parameters passed to lgb.train, both in the CV loop and in the final fit.
# NOTE(review): 'learning_rate' is presumably overridden by the
# per-iteration `learning_rates` schedule given to lgb.train — confirm.
lgb_params = { 
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 256,
    'max_depth':7, 
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l2':0.1,
    'lambda_l1':0.1,
    'verbose': 0, 
    'nthread': 9,
}

In [18]:
# Cross-validate LightGBM on the same folds: preprocessing is re-fitted
# inside every fold, and per-round metrics are collected in eval_res.
eval_res = []
for tr, ts in cv:
    eval_res.append({})
    dt_tr, dt_ts, y_tr, y_ts = preproc_pipline(data_train.loc[tr], data_train.loc[ts])
    print (dt_tr.shape)
    
    lgb_train = lgb.Dataset(dt_tr.values, y_tr, free_raw_data=False)
    lgb_eval = lgb.Dataset(dt_ts.values, y_ts, reference=lgb_train, free_raw_data=False)
        
    # generate a feature name
    feature_name = ['feature_' + str(col) for col in dt_tr.columns.values]

    print('Start training...')
    # The schedule 0.1 * 0.99 ** iter shrinks the step to about 0.0007
    # by round 500, so later rounds can barely change the model.
    gbm = lgb.train(lgb_params,
                    lgb_train,
                    num_boost_round=4001,
                    verbose_eval=500,
                    valid_sets=lgb_eval,  # eval
                    learning_rates=lambda iter: 0.1 * (0.99 ** iter),
                    feature_name=feature_name,
                    evals_result=eval_res[-1])

    # check feature name
    # NOTE: this message is printed after all 4001 rounds, not after 500.
    print('Finish first 500 rounds...')
    print('7th feature name is:', repr(lgb_train.feature_name[6]))

(136596, 32)
Start training...
[500]	valid_0's auc: 0.763051
[1000]	valid_0's auc: 0.763081
[1500]	valid_0's auc: 0.763081
[2000]	valid_0's auc: 0.763081
[2500]	valid_0's auc: 0.763081
[3000]	valid_0's auc: 0.763081
[3500]	valid_0's auc: 0.763081
[4000]	valid_0's auc: 0.763081
Finish first 500 rounds...
7th feature name is: 'feature_credit_count'
(136597, 32)
Start training...
[500]	valid_0's auc: 0.768939
[1000]	valid_0's auc: 0.768965
[1500]	valid_0's auc: 0.768966
[2000]	valid_0's auc: 0.768966
[2500]	valid_0's auc: 0.768966
[3000]	valid_0's auc: 0.768966
[3500]	valid_0's auc: 0.768966
[4000]	valid_0's auc: 0.768966
Finish first 500 rounds...
7th feature name is: 'feature_credit_count'
(136597, 32)
Start training...
[500]	valid_0's auc: 0.77023
[1000]	valid_0's auc: 0.770256
[1500]	valid_0's auc: 0.770256
[2000]	valid_0's auc: 0.770256
[2500]	valid_0's auc: 0.770256
[3000]	valid_0's auc: 0.770256
[3500]	valid_0's auc: 0.770256
[4000]	valid_0's auc: 0.770256
Finish first 500 rounds..

In [19]:
# Last-round validation AUC of every fold, reported as mean ± std.
# Iterating eval_res avoids repeating the fold count; fixed-point
# formatting stays readable even when str() would switch to scientific
# notation for a very small std.
cv_values = [fold_result['valid_0']['auc'][-1] for fold_result in eval_res]
print ('{:.4f} ± {:.4f}'.format(np.mean(cv_values), np.std(cv_values)))

0.7665 ± 0.0031


____

##### hyperopt for xgboost 

_____

## Training model

In [38]:
# Final fit: load both files without the id column and preprocess them
# in non-CV mode (the target is not read from the test frame).
data_train = pd.read_csv('credit_train.csv', sep=';', encoding='cp1251')
data_train = data_train.drop(['client_id'], axis=1)

data_test = pd.read_csv('credit_test.csv', sep=';', encoding='cp1251')
data_test = data_test.drop(['client_id'], axis=1)

data_train, data_test, y = preproc_pipline(data_train, data_test, cv=False)

In [39]:
# Train XGBoost on the whole preprocessed train set for num_rounds
# rounds (no validation set) and persist the booster to 'xgb.model'.
dtrain = xgb.DMatrix(data_train.values, label=y, 
                     feature_names=data_train.columns)
gbdt = xgb.train(xgb_params, dtrain, num_rounds)
gbdt.save_model('xgb.model') ### save model

In [40]:
#     importance = gbdt.get_fscore()
#     importance = sorted(importance.items(), key=operator.itemgetter(1))
#     df = pd.DataFrame(importance, columns=['feature', 'fscore'])
#     df['fscore'] = df['fscore'] / df['fscore'].sum()
#     pd.options.display.max_rows = 300
#     df.sort(ascending=False)

______

### test submission

In [41]:
# Score the preprocessed test set with the booster trained above.
# (Optional: reload a previously saved booster instead.)
# gbdt = xgb.Booster()  # init model
# gbdt.load_model(PATH_FOR_MODEL_DUMPING)  # load model
dtest = xgb.DMatrix(data_test.values, feature_names=data_test.columns)
ans = gbdt.predict(dtest)

In [42]:
# client_id was dropped before preprocessing, so re-read just that
# column and pair it with the predictions.
idx = pd.read_csv('credit_test.csv', encoding='cp1251',
                  sep=';', usecols=['client_id']).client_id
answer = pd.DataFrame({'_ID_': idx, '_VAL_': ans})
answer.head()

Unnamed: 0,_ID_,_VAL_
0,170747,0.061901
1,170748,0.120279
2,170749,0.269779
3,170750,0.171574
4,170751,0.101115


In [44]:
# Save the submission under a time-stamped name. The stamp
# (MM-DD_HH-MM-SS) avoids spaces and colons so the file name is valid
# on every OS.
answer.to_csv('answer__xgb' +
              datetime.datetime.now().strftime('%m-%d_%H-%M-%S') +
              '__.csv', index=False)

______

# train lightgbm & submit

In [23]:
# Final LightGBM fit: reload and preprocess both files in non-CV mode,
# then train on the whole train set.
data_train = pd.read_csv('credit_train.csv', encoding='cp1251', sep=';')
data_test = pd.read_csv('credit_test.csv', encoding='cp1251', sep=';')

data_train.drop(['client_id'], axis=1, inplace=True)
data_test.drop(['client_id'], axis=1, inplace=True)

data_train, data_test, y = preproc_pipline(data_train, data_test, cv=False)


lgb_train = lgb.Dataset(data_train.values, y)

# generate a feature name
feature_name = ['feature_' + str(col) for col in data_train.columns.values]

print('Start training...')
# Unlike the CV run (4001 rounds, schedule starting at 0.1), this uses
# 800 rounds and a schedule starting at 0.05.
gbm = lgb.train(lgb_params,
                lgb_train,
                num_boost_round=800,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                feature_name=feature_name)

Start training...


In [25]:
# Score the preprocessed test set; predict() is given the raw value
# matrix, so no lgb.Dataset is built for it.
#lgb_test = lgb.Dataset(data_test.values)
ans = gbm.predict(data_test.values)

In [26]:
# client_id was dropped before preprocessing, so re-read just that
# column and pair it with the predictions.
idx = pd.read_csv('credit_test.csv', encoding='cp1251',
                  sep=';', usecols=['client_id']).client_id
answer = pd.DataFrame({'_ID_': idx, '_VAL_': ans})
answer.head()

Unnamed: 0,_ID_,_VAL_
0,170747,0.075056
1,170748,0.146499
2,170749,0.235342
3,170750,0.183918
4,170751,0.101219


In [27]:
# Save the submission under a time-stamped name. The stamp
# (MM-DD_HH-MM-SS) avoids spaces and colons so the file name is valid
# on every OS.
answer.to_csv('answer__lgbm' +
              datetime.datetime.now().strftime('%m-%d_%H-%M-%S') +
              '__.csv', index=False)

### Не успел сблендить. Забавно даже было бы попробовать lightgbm в действии.