# HomeCredit.

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [2]:
%%time
dates = ['SK_DATE_DECISION', 'DTIME_CREDIT', 
         'DTIME_CREDIT_ENDDATE', 'DTIME_CREDIT_ENDDATE_FACT', 
         'DTIME_CREDIT_UPDATE']
train = pd.read_csv('data/train.csv', parse_dates=dates, dayfirst=True, infer_datetime_format=True)
test = pd.read_csv('data/test.csv', parse_dates=dates, dayfirst=True, infer_datetime_format=True)

CPU times: user 1min 1s, sys: 540 ms, total: 1min 2s
Wall time: 1min 2s


In [3]:
train.head()

Unnamed: 0,ID,SK_DATE_DECISION,DEF,NUM_SOURCE,CREDIT_ACTIVE,CREDIT_COLLATERAL,CREDIT_CURRENCY,DTIME_CREDIT,CREDIT_DAY_OVERDUE,DTIME_CREDIT_ENDDATE,...,CREDIT_DELAY90,CREDIT_DELAY_MORE,AMT_REQ_SOURCE_HOUR,AMT_REQ_SOURCE_DAY,AMT_REQ_SOURCE_WEEK,AMT_REQ_SOURCE_MON,AMT_REQ_SOURCE_QRT,AMT_REQ_SOURCE_YEAR,AMT_ANNUITY,TEXT_PAYMENT_DISCIPLINE
0,24368,2015-09-01,0,1,0,0,rur,2011-03-27,0,2012-01-27,...,0,0,0,0,0,0,0,3,0.0,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC0...
1,24368,2015-09-01,0,3,0,0,rur,2011-11-03,0,2014-11-30,...,0,0,0,0,0,0,0,3,0.0,CCCCCCCCCCCCCC0X000000000000000000000XXXXXXXXXX
2,24368,2015-09-01,0,4,0,0,rur,2011-11-03,0,2014-11-30,...,0,0,0,0,0,0,0,3,,CCCCCCCCCCCCCC000000000000000000000000XXXXXXXXX
3,24368,2015-09-01,0,3,0,0,rur,2012-02-24,0,2014-02-21,...,0,0,0,0,0,0,0,3,,CCCCCCCCCCCCCCCCCCCCX0X0XX00000X00X0XX0X00XX
4,24368,2015-09-01,0,4,0,0,rur,2012-02-24,0,2014-02-21,...,0,0,0,0,0,0,0,3,,CCCCCCCCCCCCCCCCCCCC000000000000000000000000


#### Выбросы в дате.

In [4]:
%%time
sorts_date = train['DTIME_CREDIT'].sort_values()
date_noise_idx = sorts_date.index[:3]
train.loc[date_noise_idx, ['DTIME_CREDIT']] = train.loc[sorts_date.index[3], ['DTIME_CREDIT']]

CPU times: user 288 ms, sys: 7.96 ms, total: 296 ms
Wall time: 295 ms


#### Замена пропущенных значений

In [5]:
%%time
idx = np.where(np.isnan(train['AMT_CREDIT_SUM']))[0]
user_id = train.loc[idx, 'ID']

for i, user in enumerate(user_id):
    user_idx = np.where(train['ID'] == user)[0]
    not_nan = np.where(np.logical_not(np.isnan(train.loc[user_idx, 'AMT_CREDIT_SUM'])))[0]
    median_not_nan = train.loc[user_idx[not_nan], 'AMT_CREDIT_SUM'].median()
    nan = np.where(np.isnan(train.loc[user_idx, 'AMT_CREDIT_SUM']))[0]
    train.loc[user_idx[nan], ['AMT_CREDIT_SUM']] = median_not_nan

CPU times: user 288 ms, sys: 10 µs, total: 288 ms
Wall time: 287 ms


In [6]:
%%time
idx = np.where(np.isnan(test['AMT_CREDIT_SUM']))[0]
user_id = test.loc[idx, 'ID']

for i, user in enumerate(user_id):
    user_idx = np.where(test['ID'] == user)[0]
    not_nan = np.where(np.logical_not(np.isnan(test.loc[user_idx, 'AMT_CREDIT_SUM'])))[0]
    median_not_nan = test.loc[user_idx[not_nan], 'AMT_CREDIT_SUM'].median()
    nan = np.where(np.isnan(test.loc[user_idx, 'AMT_CREDIT_SUM']))[0]
    test.loc[user_idx[nan], ['AMT_CREDIT_SUM']] = median_not_nan

CPU times: user 386 ms, sys: 9 µs, total: 386 ms
Wall time: 384 ms


## Добавляем признаки

In [7]:
%%time
train['CREDIT_MONTH'] = (train['DTIME_CREDIT_ENDDATE'] - train['DTIME_CREDIT']).dt.days // 30
train['CREDIT_MONTH_FACT'] = (train['DTIME_CREDIT_ENDDATE_FACT'] - train['DTIME_CREDIT']).dt.days // 30
train['CREDIT_MONTH_DECISION'] = (train['DTIME_CREDIT_ENDDATE'] - train['SK_DATE_DECISION']).dt.days // 30
train['CREDIT_MONTH_DECISION_FACT'] = (train['DTIME_CREDIT_ENDDATE_FACT'] - train['SK_DATE_DECISION']).dt.days // 30
train['CREDIT_LAST_UPDATE'] = (train['DTIME_CREDIT_UPDATE'] - train['DTIME_CREDIT']).dt.days
train['CREDIT_DECISION'] = (train['SK_DATE_DECISION'] - train['DTIME_CREDIT']).dt.days

test['CREDIT_MONTH'] = (test['DTIME_CREDIT_ENDDATE'] - test['DTIME_CREDIT']).dt.days // 30
test['CREDIT_MONTH_FACT'] = (test['DTIME_CREDIT_ENDDATE_FACT'] - test['DTIME_CREDIT']).dt.days // 30
test['CREDIT_MONTH_DECISION'] = (test['DTIME_CREDIT_ENDDATE'] - test['SK_DATE_DECISION']).dt.days // 30
test['CREDIT_MONTH_DECISION_FACT'] = (test['DTIME_CREDIT_ENDDATE_FACT'] - test['SK_DATE_DECISION']).dt.days // 30
test['CREDIT_LAST_UPDATE'] = (test['DTIME_CREDIT_UPDATE'] - test['DTIME_CREDIT']).dt.days
test['CREDIT_DECISION'] = (test['SK_DATE_DECISION'] - test['DTIME_CREDIT']).dt.days

CPU times: user 2min 40s, sys: 31.7 ms, total: 2min 40s
Wall time: 2min 40s


In [8]:
%%time
train.loc[train['CREDIT_MONTH'] < 0, 'CREDIT_MONTH'] *= -1
train.loc[train['CREDIT_MONTH_FACT'] < 0, 'CREDIT_MONTH_FACT'] *= -1
train.loc[train['CREDIT_MONTH_DECISION'] < 0, 'CREDIT_MONTH_DECISION'] *= -1
train.loc[train['CREDIT_MONTH_DECISION_FACT'] < 0, 'CREDIT_MONTH_DECISION_FACT'] *= -1

test.loc[test['CREDIT_MONTH'] < 0, 'CREDIT_MONTH'] *= -1
test.loc[test['CREDIT_MONTH_FACT'] < 0, 'CREDIT_MONTH_FACT'] *= -1
test.loc[test['CREDIT_MONTH_DECISION'] < 0, 'CREDIT_MONTH_DECISION'] *= -1
test.loc[test['CREDIT_MONTH_DECISION_FACT'] < 0, 'CREDIT_MONTH_DECISION_FACT'] *= -1

CPU times: user 419 ms, sys: 96 ms, total: 515 ms
Wall time: 501 ms


In [9]:
%%time
train.loc[train['CREDIT_MONTH'] == 0, 'CREDIT_MONTH'] = 1
train.loc[train['CREDIT_MONTH_FACT'] == 0, 'CREDIT_MONTH_FACT'] = 1
train.loc[train['CREDIT_MONTH_DECISION'] == 0, 'CREDIT_MONTH_DECISION'] = 1
train.loc[train['CREDIT_MONTH_DECISION_FACT'] == 0, 'CREDIT_MONTH_DECISION_FACT'] = 1

test.loc[test['CREDIT_MONTH'] == 0, 'CREDIT_MONTH'] = 1
test.loc[test['CREDIT_MONTH_FACT'] == 0, 'CREDIT_MONTH_FACT'] = 1
test.loc[test['CREDIT_MONTH_DECISION'] == 0, 'CREDIT_MONTH_DECISION'] = 1
test.loc[test['CREDIT_MONTH_DECISION_FACT'] == 0, 'CREDIT_MONTH_DECISION_FACT'] = 1

CPU times: user 96.8 ms, sys: 4 ms, total: 101 ms
Wall time: 100 ms


In [10]:
%%time
train['MONTH_PAY'] = train['AMT_CREDIT_SUM'] / train['CREDIT_MONTH']
train['MONTH_PAY_FACT'] = train['AMT_CREDIT_SUM'] / train['CREDIT_MONTH_FACT']

test['MONTH_PAY'] = test['AMT_CREDIT_SUM'] / test['CREDIT_MONTH']
test['MONTH_PAY_FACT'] = test['AMT_CREDIT_SUM'] / test['CREDIT_MONTH_FACT']

CPU times: user 128 ms, sys: 312 ms, total: 440 ms
Wall time: 354 ms


#### Удаляем столбец, так как признак не несёт никакой информации

In [11]:
%%time
train = train.drop("CREDIT_COLLATERAL", axis=1)
test = test.drop("CREDIT_COLLATERAL", axis=1)

CPU times: user 448 ms, sys: 572 ms, total: 1.02 s
Wall time: 1.28 s


#### Извлекаем признаки из text_payment_discipline

In [12]:
text_features = {0:'COUNT_0', 1:'COUNT_X', 2:'COUNT_1', 3:'COUNT_2', 4:'COUNT_3', 5:'COUNT_4', 6:'COUNT_5',
                 7: 'COUNT_10', 8: 'COUNT_20', 9: 'COUNT_30', 10: 'COUNT_40', 11: 'COUNT_50', 12: 'COUNT_X0',
                 13: 'COUNT_C', 14: 'WEIGHTS_0', 15: 'WEIGHTS_0_DESC', 16: 'WEIGHTS_MORE', 17: 'WEIGHTS_MORE_DESC',
                 18: 'SUM_INT', 19: 'WEIGHTS_X', 20: 'WEIGHTS_X_DESC', 
                 21: 'FIRST_0', 22: 'FIRST_1', 23: 'FIRST_2', 24: 'FIRST_3', 25: 'FIRST_4', 26: 'FIRST_5',
                 27: 'FIRST_C', 28: 'FIRST_10', 29: 'FIRST_20', 30: 'FIRST_30', 31: 'FIRST_40', 32: 'FIRST_50', 33: 'FIRST_X0', 
                 34: 'FIRST_0_DESC', 35: 'FIRST_1_DESC', 36: 'FIRST_2_DESC', 37: 'FIRST_3_DESC', 38: 'FIRST_4_DESC',
                 39: 'FIRST_5_DESC', 40: 'FIRST_C_DESC', 41: 'FIRST_10_DESC', 42: 'FIRST_20_DESC', 43: 'FIRST_30_DESC',
                 44: 'FIRST_40_DESC', 45: 'FIRST_50_DESC', 46: 'FIRST_X0_DESC', 47: 'LEN_TEXT'}

In [13]:
%%time
masks = train['TEXT_PAYMENT_DISCIPLINE'].notnull()
text_payment_features = train['TEXT_PAYMENT_DISCIPLINE'][masks].apply(
    lambda s:(s.count('0'),
              s.count('X'),
              s.count('1'), s.count('2'), s.count('3'), s.count('4'), s.count('5'),
              s.count('10'),
              s.count('20'),
              s.count('30'),
              s.count('40'),
              s.count('50'),
              s.count('X0'),
              s.count('C'),
              sum([1 / (i + 1) for i, ch in enumerate(s) if ch == '0']),
              sum([1 / (i + 1) for i, ch in enumerate(s[::-1]) if ch == '0']),
              sum([1 / (i + 1) for i, ch in enumerate(s) if ch in {'1', '2', '3', '4', '5'}]),
              sum([1 / (i + 1) for i, ch in enumerate(s[::-1]) if ch in {'1', '2', '3', '4', '5'}]),
              sum([int(ch) for ch in s if ch in {'1', '2', '3', '4', '5'}]),
              sum([1 / (i + 1) for i, ch in enumerate(s) if ch == 'X']),
              sum([1 / (i + 1) for i, ch in enumerate(s[::-1]) if ch == 'X']),
              s.find('0') / len(s),
              s.find('1') / len(s),
              s.find('2') / len(s),
              s.find('3') / len(s),
              s.find('4') / len(s),
              s.find('5') / len(s),
              s.find('C') / len(s),
              s.find('10') / len(s),
              s.find('20') / len(s),
              s.find('30') / len(s),
              s.find('40') / len(s),
              s.find('50') / len(s),
              s.find('X0') / len(s),
              s[::-1].find('0') / len(s),
              s[::-1].find('1') / len(s),
              s[::-1].find('2') / len(s),
              s[::-1].find('3') / len(s),
              s[::-1].find('4') / len(s),
              s[::-1].find('5') / len(s),
              s[::-1].find('C') / len(s),
              s[::-1].find('01') / len(s),
              s[::-1].find('02') / len(s),
              s[::-1].find('03') / len(s),
              s[::-1].find('04') / len(s),
              s[::-1].find('05') / len(s),
              s[::-1].find('0X') / len(s),
              len(s)))
text_payment_features = pd.DataFrame(np.vstack(text_payment_features), train[masks].index)
text_payment_features = text_payment_features.rename_axis(text_features, axis=1)
train_tmp = train.join(text_payment_features)

CPU times: user 1min 39s, sys: 2.45 s, total: 1min 41s
Wall time: 2min 34s


In [14]:
%%time
masks = test['TEXT_PAYMENT_DISCIPLINE'].notnull()
text_payment_features = test['TEXT_PAYMENT_DISCIPLINE'][masks].apply(
    lambda s:(s.count('0'),
              s.count('X'),
              s.count('1'), s.count('2'), s.count('3'), s.count('4'), s.count('5'),
              s.count('10'),
              s.count('20'),
              s.count('30'),
              s.count('40'),
              s.count('50'),
              s.count('X0'),
              s.count('C'),
              sum([1 / (i + 1) for i, ch in enumerate(s) if ch == '0']),
              sum([1 / (i + 1) for i, ch in enumerate(s[::-1]) if ch == '0']),
              sum([1 / (i + 1) for i, ch in enumerate(s) if ch in {'1', '2', '3', '4', '5'}]),
              sum([1 / (i + 1) for i, ch in enumerate(s[::-1]) if ch in {'1', '2', '3', '4', '5'}]),
              sum([int(ch) for ch in s if ch in {'1', '2', '3', '4', '5'}]),
              sum([1 / (i + 1) for i, ch in enumerate(s) if ch == 'X']),
              sum([1 / (i + 1) for i, ch in enumerate(s[::-1]) if ch == 'X']),
              s.find('0') / len(s),
              s.find('1') / len(s),
              s.find('2') / len(s),
              s.find('3') / len(s),
              s.find('4') / len(s),
              s.find('5') / len(s),
              s.find('C') / len(s),
              s.find('10') / len(s),
              s.find('20') / len(s),
              s.find('30') / len(s),
              s.find('40') / len(s),
              s.find('50') / len(s),
              s.find('X0') / len(s),
              s[::-1].find('0') / len(s),
              s[::-1].find('1') / len(s),
              s[::-1].find('2') / len(s),
              s[::-1].find('3') / len(s),
              s[::-1].find('4') / len(s),
              s[::-1].find('5') / len(s),
              s[::-1].find('C') / len(s),
              s[::-1].find('01') / len(s),
              s[::-1].find('02') / len(s),
              s[::-1].find('03') / len(s),
              s[::-1].find('04') / len(s),
              s[::-1].find('05') / len(s),
              s[::-1].find('0X') / len(s),
              len(s)))
text_payment_features = pd.DataFrame(np.vstack(text_payment_features), test[masks].index)
text_payment_features = text_payment_features.rename_axis(text_features, axis=1)
test_tmp = test.join(text_payment_features)

CPU times: user 1min 20s, sys: 1.95 s, total: 1min 22s
Wall time: 1min 44s


In [15]:
train_tmp.drop('CREDIT_CURRENCY', axis=1, inplace=True)
test_tmp.drop('CREDIT_CURRENCY', axis=1, inplace=True)

train_tmp.drop(dates, axis=1, inplace=True)
test_tmp.drop(dates, axis=1, inplace=True)

train_tmp.drop('TEXT_PAYMENT_DISCIPLINE', axis=1, inplace=True)
test_tmp.drop('TEXT_PAYMENT_DISCIPLINE', axis=1, inplace=True)

train_tmp.drop('CREDIT_FACILITY', axis=1, inplace=True)
test_tmp.drop('CREDIT_FACILITY', axis=1, inplace=True)

In [16]:
%%time
id_group_train = train_tmp.groupby('ID')
id_group_test = test_tmp.groupby('ID')

np_y_train = id_group_train['DEF'].first().values
train_tmp.drop('DEF', axis=1, inplace=True)

CPU times: user 228 ms, sys: 180 ms, total: 409 ms
Wall time: 2.4 s


#### "Тупые" счётчики

In [17]:
%%time
counts = id_group_train['NUM_SOURCE'].value_counts().reset_index(name='Count')
users = id_group_train['ID'].first().values
count_train_source = np.zeros((users.shape[0], 4))
for i, user in enumerate(users):
    user_idx = np.where(counts['ID'] == user)[0]
    for idx in user_idx:
        count_train_source[i, counts.loc[idx, 'NUM_SOURCE'] - 1] = counts.loc[idx, 'Count']
        
counts = id_group_test['NUM_SOURCE'].value_counts().reset_index(name='Count')
users = id_group_test['ID'].first().values
count_test_source = np.zeros((users.shape[0], 4))
for i, user in enumerate(users):
    user_idx = np.where(counts['ID'] == user)[0]
    for idx in user_idx:
        count_test_source[i, counts.loc[idx, 'NUM_SOURCE'] - 1] = counts.loc[idx, 'Count']

CPU times: user 2min 43s, sys: 62.4 ms, total: 2min 43s
Wall time: 2min 45s


In [18]:
%%time
dict_type = dict(np.vstack((np.unique(test['CREDIT_TYPE']), 
                 np.arange(len(np.unique(test['CREDIT_TYPE']))))).T)
counts = id_group_train['CREDIT_TYPE'].value_counts().reset_index(name='Count')
users = id_group_train['ID'].first().values
count_train_type = np.zeros((users.shape[0], 14))
for i, user in enumerate(users):
    user_idx = np.where(counts['ID'] == user)[0]
    for idx in user_idx:
        count_train_type[i, dict_type[counts.loc[idx, 'CREDIT_TYPE']]] = counts.loc[idx, 'Count']
        
counts = id_group_test['CREDIT_TYPE'].value_counts().reset_index(name='Count')
users = id_group_test['ID'].first().values
count_test_type = np.zeros((users.shape[0], 14))
for i, user in enumerate(users):
    user_idx = np.where(counts['ID'] == user)[0]
    for idx in user_idx:
        count_test_type[i, dict_type[counts.loc[idx, 'CREDIT_TYPE']]] = counts.loc[idx, 'Count']

CPU times: user 1min 28s, sys: 0 ns, total: 1min 28s
Wall time: 1min 28s


In [19]:
%%time
counts = id_group_train['CREDIT_ACTIVE'].value_counts().reset_index(name='Count')
users = id_group_train['ID'].first().values
count_train_active = np.zeros((users.shape[0], 4))
for i, user in enumerate(users):
    user_idx = np.where(counts['ID'] == user)[0]
    for idx in user_idx:
        count_train_active[i, counts.loc[idx, 'CREDIT_ACTIVE']] = counts.loc[idx, 'Count']
        
counts = id_group_test['CREDIT_ACTIVE'].value_counts().reset_index(name='Count')
users = id_group_test['ID'].first().values
count_test_active = np.zeros((users.shape[0], 4))
for i, user in enumerate(users):
    user_idx = np.where(counts['ID'] == user)[0]
    for idx in user_idx:
        count_test_active[i, counts.loc[idx, 'CREDIT_ACTIVE']] = counts.loc[idx, 'Count']

CPU times: user 1min 22s, sys: 15.6 ms, total: 1min 22s
Wall time: 1min 22s


In [20]:
%%time
train_tmp.drop('NUM_SOURCE', axis=1, inplace=True)
test_tmp.drop('NUM_SOURCE', axis=1, inplace=True)

train_tmp.drop('CREDIT_TYPE', axis=1, inplace=True)
test_tmp.drop('CREDIT_TYPE', axis=1, inplace=True)

train_tmp.drop('CREDIT_ACTIVE', axis=1, inplace=True)
test_tmp.drop('CREDIT_ACTIVE', axis=1, inplace=True)

CPU times: user 972 ms, sys: 864 ms, total: 1.84 s
Wall time: 5.63 s


In [21]:
%%time
train_tmp['PAYMENT_MONTH'] = train_tmp['MONTH_PAY'] * train_tmp['WEIGHTS_0']
train_tmp['PAYMENT_MONTH_FACT'] = train_tmp['MONTH_PAY_FACT'] * train_tmp['WEIGHTS_0']

test_tmp['PAYMENT_MONTH'] = test_tmp['MONTH_PAY'] * test_tmp['WEIGHTS_0']
test_tmp['PAYMENT_MONTH_FACT'] = test_tmp['MONTH_PAY_FACT'] * test_tmp['WEIGHTS_0']

CPU times: user 50.1 ms, sys: 7.91 ms, total: 58 ms
Wall time: 504 ms


#### Агрегация данных

In [22]:
%%time
id_group_train = train_tmp.groupby('ID')
id_group_test = test_tmp.groupby('ID')

median_train = id_group_train.median()
mean_train = id_group_train.mean()
var_train = id_group_train.var()
max_train = id_group_train.max()
min_train = id_group_train.min()

median_test = id_group_test.median()
mean_test = id_group_test.mean()
var_test = id_group_test.var()
max_test = id_group_test.max()
min_test = id_group_test.min()

CPU times: user 12.7 s, sys: 3.34 s, total: 16.1 s
Wall time: 2min 47s


In [23]:
sum_train = id_group_train['PAYMENT_MONTH'].sum() / id_group_train['MONTH_PAY'].sum()
sum_train_fact = id_group_train['PAYMENT_MONTH_FACT'].sum() / id_group_train['MONTH_PAY_FACT'].sum()

sum_test = id_group_test['PAYMENT_MONTH'].sum() / id_group_test['MONTH_PAY'].sum()
sum_test_fact = id_group_test['PAYMENT_MONTH_FACT'].sum() / id_group_test['MONTH_PAY_FACT'].sum()

In [24]:
%%time
median_train.columns = ['MEDIAN_' + column for column in median_train.columns]
mean_train.columns = ['MEAN_' + column for column in mean_train.columns]
var_train.columns = ['VAR_' + column for column in var_train.columns]
max_train.columns = ['MAX_' + column for column in max_train.columns]
min_train.columns = ['MIN_' + column for column in min_train.columns]
sum_train = pd.DataFrame(sum_train, columns=['SUM_PAYMENT_MONTH_PAY'])
sum_train_fact = pd.DataFrame(sum_train_fact, columns=['SUM_PAYMENT_MONTH_PAY_FACT'])


median_test.columns = ['MEDIAN_' + column for column in median_test.columns]
mean_test.columns = ['MEAN_' + column for column in mean_test.columns]
var_test.columns = ['VAR_' + column for column in var_test.columns]
max_test.columns = ['MAX_' + column for column in max_test.columns]
min_test.columns = ['MIN_' + column for column in min_test.columns]
sum_test = pd.DataFrame(sum_test, columns=['SUM_PAYMENT_MONTH_PAY'])
sum_test_fact = pd.DataFrame(sum_test_fact, columns=['SUM_PAYMENT_MONTH_PAY_FACT'])

CPU times: user 2.7 ms, sys: 0 ns, total: 2.7 ms
Wall time: 150 ms


In [25]:
%%time
sum_train = pd.DataFrame(sum_train, columns=['SUM_PAYMENT_MONTH_PAY'])
sum_train_fact = pd.DataFrame(sum_train_fact, columns=['SUM_PAYMENT_MONTH_PAY_FACT'])

sum_test = pd.DataFrame(sum_test, columns=['SUM_PAYMENT_MONTH_PAY'])
sum_test_fact = pd.DataFrame(sum_test_fact, columns=['SUM_PAYMENT_MONTH_PAY_FACT'])

CPU times: user 1.66 ms, sys: 0 ns, total: 1.66 ms
Wall time: 39.2 ms


In [26]:
%%time
count_train_source = pd.DataFrame(count_train_source, 
                                  columns=['NUM_SOURCE_' + str(i) for i in range(count_train_source.shape[1])])
count_train_type = pd.DataFrame(count_train_type, 
                                columns=['TYPE_' + str(i) for i in range(count_train_type.shape[1])])
count_train_active = pd.DataFrame(count_train_active, 
                                  columns=['ACTIVE_' + str(i) for i in range(count_train_active.shape[1])])

count_test_source = pd.DataFrame(count_test_source, 
                                  columns=['NUM_SOURCE_' + str(i) for i in range(count_test_source.shape[1])])
count_test_type = pd.DataFrame(count_test_type, 
                                columns=['TYPE_' + str(i) for i in range(count_test_type.shape[1])])
count_test_active = pd.DataFrame(count_test_active, 
                                  columns=['ACTIVE_' + str(i) for i in range(count_test_active.shape[1])])

CPU times: user 1.88 ms, sys: 0 ns, total: 1.88 ms
Wall time: 27.8 ms


#### Соединение таблиц

In [27]:
%%time
np_train = median_train.join(
           mean_train).join(
           var_train).join(
           max_train).join(
           min_train).join(
           sum_train).join(
           sum_train_fact).join(
           count_train_source).join(
           count_train_type).join(
           count_train_active)

np_test = median_test.join(
           mean_test).join(
           var_test).join(
           max_test).join(
           min_test).join(
           sum_test).join(
           sum_test_fact).join(
           count_test_source).join(
           count_test_type).join(
           count_test_active)

CPU times: user 1.49 s, sys: 2.57 s, total: 4.05 s
Wall time: 1min 11s


In [29]:
dtrain = lgb.Dataset(np_train, label=np_y_train)

In [30]:
%%time
params = {'max_depth':2, 
         'learning_rate':0.05, 
         'silent':1,
         'objective':'binary',
         'bagging_freq':1,
         'bagging_seed':517,
         'bagging_fraction':0.8,
         'feature_fraction_seed':417,
         'feature_fraction':0.6,
         'reg_lambda':3.0, 
         'reg_alpha':1.0,
         'seed':0,
         'metric':'auc'
        }
n_estimators = 1500
cv2 = lgb.cv(params, dtrain, metrics='auc',
            early_stopping_rounds=25, num_boost_round=n_estimators, verbose_eval=25)

[25]	cv_agg's auc: 0.644514 + 0.00935859
[50]	cv_agg's auc: 0.652913 + 0.00919033
[75]	cv_agg's auc: 0.66226 + 0.00821438
[100]	cv_agg's auc: 0.671426 + 0.00777875
[125]	cv_agg's auc: 0.678871 + 0.00680375
[150]	cv_agg's auc: 0.684078 + 0.00667705
[175]	cv_agg's auc: 0.687105 + 0.0065074
[200]	cv_agg's auc: 0.689092 + 0.00623089
[225]	cv_agg's auc: 0.690147 + 0.00625043
[250]	cv_agg's auc: 0.691332 + 0.00663036
[275]	cv_agg's auc: 0.692359 + 0.00668136
[300]	cv_agg's auc: 0.693197 + 0.00668115
[325]	cv_agg's auc: 0.693745 + 0.00706376
[350]	cv_agg's auc: 0.694359 + 0.00699342
[375]	cv_agg's auc: 0.695117 + 0.00702162
[400]	cv_agg's auc: 0.695414 + 0.00744108
[425]	cv_agg's auc: 0.695606 + 0.00749309
CPU times: user 6min 11s, sys: 1min, total: 7min 11s
Wall time: 2min 31s


In [31]:
%%time
params = {'max_depth':2, 
         'learning_rate':0.05, 
         'silent':1,
         'objective':'binary',
         'bagging_freq':1,
         'bagging_seed':517,
         'bagging_fraction':0.8,
         'feature_fraction_seed':417,
         'feature_fraction':0.8,
         'reg_lambda':3.0, 
         'reg_alpha':1.0,
         'seed':0,
         'metric':'auc'
        }
n_estimators = 1500
cv2 = lgb.cv(params, dtrain, metrics='auc',
            early_stopping_rounds=25, num_boost_round=n_estimators, verbose_eval=25)

[25]	cv_agg's auc: 0.645646 + 0.0107004
[50]	cv_agg's auc: 0.653389 + 0.00935794
[75]	cv_agg's auc: 0.663623 + 0.00772412
[100]	cv_agg's auc: 0.671514 + 0.007375
[125]	cv_agg's auc: 0.679146 + 0.0069445
[150]	cv_agg's auc: 0.683841 + 0.00699095
[175]	cv_agg's auc: 0.686515 + 0.00726227
[200]	cv_agg's auc: 0.688758 + 0.00699081
[225]	cv_agg's auc: 0.690124 + 0.00700885
[250]	cv_agg's auc: 0.691519 + 0.00723159
[275]	cv_agg's auc: 0.692607 + 0.00725552
[300]	cv_agg's auc: 0.693412 + 0.00722445
[325]	cv_agg's auc: 0.693881 + 0.00748369
[350]	cv_agg's auc: 0.694398 + 0.00741349
[375]	cv_agg's auc: 0.695135 + 0.00739855
[400]	cv_agg's auc: 0.695517 + 0.00750578
[425]	cv_agg's auc: 0.695833 + 0.00761385
[450]	cv_agg's auc: 0.696064 + 0.0076694
[475]	cv_agg's auc: 0.696211 + 0.00765792
[500]	cv_agg's auc: 0.696398 + 0.00759021
[525]	cv_agg's auc: 0.696652 + 0.00768809
[550]	cv_agg's auc: 0.696684 + 0.00757114
CPU times: user 7min 55s, sys: 1min 9s, total: 9min 4s
Wall time: 2min 27s


In [32]:
%%time
params = {'max_depth':3, 
         'learning_rate':0.05, 
         'silent':1,
         'objective':'binary',
         'bagging_freq':1,
         'bagging_seed':517,
         'bagging_fraction':0.8,
         'feature_fraction_seed':417,
         'feature_fraction':0.8,
         'reg_lambda':3.0, 
         'reg_alpha':1.0,
         'seed':0,
         'metric':'auc'
        }
n_estimators = 1500
cv3 = lgb.cv(params, dtrain, metrics='auc',
            early_stopping_rounds=25, num_boost_round=n_estimators, verbose_eval=25)

[25]	cv_agg's auc: 0.652359 + 0.0107474
[50]	cv_agg's auc: 0.662015 + 0.00872783
[75]	cv_agg's auc: 0.671749 + 0.00776616
[100]	cv_agg's auc: 0.680869 + 0.00725063
[125]	cv_agg's auc: 0.68701 + 0.0072419
[150]	cv_agg's auc: 0.690348 + 0.00766499
[175]	cv_agg's auc: 0.691771 + 0.00802829
[200]	cv_agg's auc: 0.693048 + 0.00816948
[225]	cv_agg's auc: 0.694003 + 0.00838487
[250]	cv_agg's auc: 0.694455 + 0.00922243
[275]	cv_agg's auc: 0.694817 + 0.00944424
[300]	cv_agg's auc: 0.695259 + 0.00963349
[325]	cv_agg's auc: 0.695733 + 0.00983453
[350]	cv_agg's auc: 0.695834 + 0.00989603
[375]	cv_agg's auc: 0.696108 + 0.00951338
CPU times: user 6min 42s, sys: 56.7 s, total: 7min 39s
Wall time: 2min 3s


In [33]:
%%time
params = {'max_depth':4, 
         'learning_rate':0.05, 
         'silent':1,
         'objective':'binary',
         'bagging_freq':1,
         'bagging_seed':517,
         'bagging_fraction':0.8,
         'feature_fraction_seed':417,
         'feature_fraction':0.8,
         'reg_lambda':3.0, 
         'reg_alpha':1.0,
         'seed':0,
         'metric':'auc'
        }
n_estimators = 1500
cv4 = lgb.cv(params, dtrain, metrics='auc',
            early_stopping_rounds=25, num_boost_round=n_estimators, verbose_eval=25)

[25]	cv_agg's auc: 0.654524 + 0.00980555
[50]	cv_agg's auc: 0.666646 + 0.00899023
[75]	cv_agg's auc: 0.677204 + 0.00830135
[100]	cv_agg's auc: 0.687282 + 0.00796424
[125]	cv_agg's auc: 0.692347 + 0.00795889
[150]	cv_agg's auc: 0.694328 + 0.00794711
[175]	cv_agg's auc: 0.695782 + 0.00816395
[200]	cv_agg's auc: 0.69661 + 0.00845973
[225]	cv_agg's auc: 0.697353 + 0.00918043
[250]	cv_agg's auc: 0.696996 + 0.00860766
CPU times: user 5min 47s, sys: 41.9 s, total: 6min 29s
Wall time: 1min 42s


#### Итоговая модель

In [36]:
%%time
params = {'max_depth':2, 
         'learning_rate':0.05, 
         'silent':1,
         'objective':'binary',
         'bagging_freq':1,
         'bagging_seed':517,
         'bagging_fraction':0.7,
         'feature_fraction_seed':517,
         'feature_fraction':0.7,
         'reg_lambda':3.0, 
         'reg_alpha':1.0,
         'seed':0,
         'metric':'auc'
        }
models = []
for seed in range(100, 110):
    params['feature_fraction_seed'] = seed
    models.append(lgb.train(params, dtrain, num_boost_round=500))
    print('seed:', seed)

seed: 100
seed: 101
seed: 102
seed: 103
seed: 104
seed: 105
seed: 106
seed: 107
seed: 108
seed: 109
CPU times: user 15min 56s, sys: 4min 33s, total: 20min 29s
Wall time: 6min 42s


#### Сохраняем предсказания

In [46]:
%%time
y_pred = np.zeros(np_test.shape[0])
for model in models:
    y_pred += model.predict(np_test)


id_test = id_group_test['ID'].first().values
pd.DataFrame({'ID': id_test, 'Score': y_pred}).to_csv('final/msu_submit.csv', index=False)

CPU times: user 4.37 s, sys: 2.53 s, total: 6.9 s
Wall time: 43.7 s
