In [2]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import numpy as np

from category_encoders.target_encoder import TargetEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

In [3]:
train = pd.read_csv('train_base.csv', sep=';')
test = pd.read_csv('test_base.csv', sep=';')

In [3]:
import analytics as als

In [4]:
bureau = pd.read_csv('bureau.csv')

In [5]:
bureau_balance = pd.read_csv('bureau_balance.csv')

In [6]:
list(bureau)

['SK_ID_CURR',
 'SK_ID_BUREAU',
 'CREDIT_ACTIVE',
 'CREDIT_CURRENCY',
 'DAYS_CREDIT',
 'CREDIT_DAY_OVERDUE',
 'DAYS_CREDIT_ENDDATE',
 'DAYS_ENDDATE_FACT',
 'AMT_CREDIT_MAX_OVERDUE',
 'CNT_CREDIT_PROLONG',
 'AMT_CREDIT_SUM',
 'AMT_CREDIT_SUM_DEBT',
 'AMT_CREDIT_SUM_LIMIT',
 'AMT_CREDIT_SUM_OVERDUE',
 'CREDIT_TYPE',
 'DAYS_CREDIT_UPDATE',
 'AMT_ANNUITY']

In [7]:
bureau.shape

(1716428, 17)

In [8]:
als.check_missing_values(bureau, 10, 0)

AMT_ANNUITY               1226791
AMT_CREDIT_MAX_OVERDUE    1124488
DAYS_ENDDATE_FACT          633653
AMT_CREDIT_SUM_LIMIT       591780
AMT_CREDIT_SUM_DEBT        257669
DAYS_CREDIT_ENDDATE        105553
AMT_CREDIT_SUM                 13
CREDIT_TYPE                     0
AMT_CREDIT_SUM_OVERDUE          0
CNT_CREDIT_PROLONG              0
dtype: int64

In [9]:
als.check_missing_values(bureau, 15, 1)

793046     6
773821     6
871866     6
468959     6
701505     6
871863     6
1291779    6
552590     6
311889     6
1315567    6
1174161    6
871824     6
871823     6
311862     6
1011092    6
dtype: int64

In [10]:
als.get_unique_value(bureau, None)

SK_ID_CURR <class 'numpy.int64'> [215354 162297 402440 ... 448157 345866 235871]
SK_ID_BUREAU <class 'numpy.int64'> [5714462 5714463 5714464 ... 5057762 5057770 5057778]
CREDIT_ACTIVE <class 'str'> ['Closed' 'Active' 'Sold' 'Bad debt']
CREDIT_CURRENCY <class 'str'> ['currency 1' 'currency 2' 'currency 4' 'currency 3']
DAYS_CREDIT <class 'numpy.int64'> [-497 -208 -203 ...    0   -3   -1]
CREDIT_DAY_OVERDUE <class 'numpy.int64'> [   0 2603    6   30 2156  496  186 2264   41    8   71   13  252 1201
   12   38 1777  150   18   37   64 2050    9    4   29   22   49 1930
   34  951  936  664   45 2625  837   21   15   31 1715   19   67    7
   26   23   48   46   73   90 1127   53   33 1572   39 2582   14   25
   10   50   17  193   40   20   28  123  138  700  122 1504   32  670
   75  140   24  277 2019  161  962  187   58  106  634   83  120   60
   93 1694  227   44  344 2085 1248  124 1651 2541 1541  642  745  163
  234 1472  688 2455  614  319 2187   68  831  139  283  180   47  164
 

# Анализ данных

## Категориальные признаки:
### Агрегирование:
#### Для агрегирования категориальных признаков можно попробовать создать признаки для каждого значения категориальных фичей (признак_категория_i) и присвоить ему количество данных категорий для каждого клиента. Также можно упорядочить статус кредита и подсчитать среднее значение (не стоит считать сумму, так как у всех клиентов разное количество строк и результаты будут искажаться)

#### CREDIT_ACTIVE - статус кредита

#### CREDIT_CURRENCY - валюта кредита

#### CREDIT_TYPE - тип кредита 


## Числовые признаки: 
### Агрегирование:
#### Для агрегирования числовых признаков можно создавать признаки с минимальным, средним и максимальным значение для каждого клиента. 

#### SK_ID_CURR - номер клиента

#### SK_ID_BUREAU - номер заявки

#### DAYS_CREDIT - количество прошедших дней с подачи заявки в отношении текущей заявки

#### CREDIT_DAY_OVERDUE - количество просроченных дней

#### DAYS_CREDIT_ENDDATE - количество дней, в течении которых кредит действителен 

#### DAYS_ENDDATE_FACT - количество дней с момента окончания срока кредита 

#### AMT_CREDIT_MAX_OVERDUE - максимальная задолженность по кредиту

#### CNT_CREDIT_PROLONG - сколько раз была продлена дата до погашения

#### AMT_CREDIT_SUM - сумма кредитов

#### AMT_CREDIT_SUM_DEBT - сумма задолженностей

#### AMT_CREDIT_SUM_LIMIT - текущий кредит

#### AMT_CREDIT_SUM_OVERDUE - текущий просроченный платеж по кредиту

#### DAYS_CREDIT_UPDATE - количество дней, когда было получено последнее обновление по кредиту

#### AMT_ANNUITY - аннуитет данных кредитных бюро. 

In [6]:
import treatment as trm

In [7]:
feature_str, feature_int = als.get_lists_type_feature(bureau, list(bureau))

In [8]:
import aggregation as agt

In [9]:
agr_str = agt.aggregate_category_entries(bureau, feature_str)

In [10]:
agr_str.to_csv('agr_str.csv', sep=',', index=False)

In [11]:
agr_str.shape

(305811, 23)

### Модель

In [12]:
trm.fill_gaps(agr_str, list(agr_str), 0)

In [13]:
train

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,Target
0,100002,0.071954,0.088068,0.072761,0.068401,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,100003,0.071954,0.059883,0.072761,0.070987,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,100016,0.071954,0.059883,0.072761,0.068401,0,67500.0,80865.0,5881.5,67500.0,...,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0,0
3,100017,0.071954,0.088068,0.061824,0.070987,1,225000.0,918468.0,28966.5,697500.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,100018,0.071954,0.059883,0.072761,0.068401,0,189000.0,773680.5,32778.0,679500.0,...,0,0,0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146789,456249,0.071954,0.059883,0.072761,0.068401,0,112500.0,225000.0,22050.0,225000.0,...,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0,0
146790,456251,0.071954,0.088068,0.072761,0.070987,0,157500.0,254700.0,27558.0,225000.0,...,0,0,0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,0
146791,456252,0.071954,0.059883,0.072761,0.068401,0,72000.0,269550.0,12001.5,225000.0,...,0,0,0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,0
146792,456253,0.071954,0.059883,0.072761,0.068401,0,153000.0,677664.0,29979.0,585000.0,...,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0,0


In [14]:
agr_train = agr_str.loc[agr_str.index.isin(list(train.SK_ID_CURR.unique()))]

In [15]:
agr_test = agr_str.loc[agr_str.index.isin(list(test.SK_ID_CURR.unique()))]

In [16]:
agr_train.shape

(128833, 23)

In [33]:
train.shape

(146794, 103)

In [34]:
agr_test.shape

(42320, 23)

In [35]:
test.shape

(48744, 102)

In [38]:
agr_train

Unnamed: 0_level_0,Closed,Active,Sold,Bad debt,currency 1,currency 2,currency 4,currency 3,Consumer credit,Credit card,...,Loan for working capital replenishment,Loan for business development,Real estate loan,Unknown type of loan,Another type of loan,Cash loan (non-earmarked),Loan for the purchase of equipment,Mobile operator loan,Interbank credit,Loan for purchase of shares (margin lending)
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
402440,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
426155,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136226,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
389599,5.0,3.0,0.0,0.0,8.0,0.0,0.0,0.0,6.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
311918,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412937,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
322864,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
207190,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
324956,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Создаем агрегированные данные

In [17]:
agr_min = agt.aggregate_min_value(bureau, feature_int)

In [18]:
agr_min.to_csv('agr_min.csv', sep=',', index=False)

In [19]:
agr_max = agt.aggregate_max_value(bureau, feature_int)

In [20]:
agr_max.to_csv('agr_max.csv', sep=',', index=False)

In [21]:
agr_mean = agt.aggregate_mean_value(bureau, feature_int)

In [22]:
agr_mean.to_csv('agr_mean.csv', sep=',', index=False)

In [46]:
agr_min.shape

(305811, 14)

In [23]:
agr_max.shape  

(305811, 14)

In [45]:
agr_str.shape

(305811, 23)

In [24]:
agr_min_df = agr_min.loc[agr_min.index.isin(list(train.SK_ID_CURR.unique()))]

In [25]:
agr_min_df.shape

(128833, 14)

In [26]:
agr_max_df = agr_max.loc[agr_max.index.isin(list(train.SK_ID_CURR.unique()))]

In [27]:
agr_mean_df = agr_mean.loc[agr_mean.index.isin(list(train.SK_ID_CURR.unique()))]

In [28]:
# Добавляет минимальное значение и создаем бинарный признак для наличия кредита

In [29]:
train.shape

(146794, 103)

In [30]:
without_credit = list(set(train.SK_ID_CURR) - set(agr_min_df.index))

In [31]:
empty_df = pd.DataFrame(np.nan, index=without_credit, columns=list(agr_min_df))

In [32]:
empty_df.fillna(9999, inplace=True)

In [33]:
empty_df['dummy_credit_history'] = 0

In [34]:
agr_min_df['dummy_credit_history'] = 1

In [35]:
min_features = pd.concat([empty_df, agr_min_df])

In [36]:
min_features['SK_ID_CURR'] = min_features.index

In [37]:
del min_features['SK_ID_CURR_MIN']

In [38]:
min_df = train.merge(min_features, on='SK_ID_CURR', how='left')

In [79]:
# Анализ пропусков

In [81]:
als.check_missing_values(min_df, 15, 1)

124775    6
139961    6
135091    6
68101     6
30096     6
82063     6
20955     6
112294    6
72623     6
103027    6
106272    6
143206    6
137502    6
138399    6
38368     6
dtype: int64

In [None]:
# Заполняем пропуски отдельным числом 9999

In [40]:
min_df.fillna(9999, inplace=True)

In [83]:
#### Модель

In [84]:
min_df

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,DAYS_ENDDATE_FACT_MIN,AMT_CREDIT_MAX_OVERDUE_MIN,CNT_CREDIT_PROLONG_MIN,AMT_CREDIT_SUM_MIN,AMT_CREDIT_SUM_DEBT_MIN,AMT_CREDIT_SUM_LIMIT_MIN,AMT_CREDIT_SUM_OVERDUE_MIN,DAYS_CREDIT_UPDATE_MIN,AMT_ANNUITY_MIN,dummy_credit_history
0,100002,0.071954,0.088068,0.072761,0.068401,0,202500.0,406597.5,24700.5,351000.0,...,-1185.0,0.0,0.0,0.0,0.0,0.0,0.0,-1185.0,0.0,1
1,100003,0.071954,0.059883,0.072761,0.070987,0,270000.0,1293502.5,35698.5,1129500.0,...,-2131.0,0.0,0.0,22248.0,0.0,0.0,0.0,-2131.0,9999.0,1
2,100016,0.071954,0.059883,0.072761,0.068401,0,67500.0,80865.0,5881.5,67500.0,...,-1369.0,0.0,0.0,22086.0,0.0,0.0,0.0,-1367.0,9999.0,1
3,100017,0.071954,0.088068,0.061824,0.070987,1,225000.0,918468.0,28966.5,697500.0,...,-2575.0,0.0,0.0,25231.5,0.0,0.0,0.0,-2575.0,9999.0,1
4,100018,0.071954,0.059883,0.072761,0.068401,0,189000.0,773680.5,32778.0,679500.0,...,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146789,456249,0.071954,0.059883,0.072761,0.068401,0,112500.0,225000.0,22050.0,225000.0,...,-2525.0,0.0,0.0,43447.5,0.0,0.0,0.0,-2498.0,9999.0,1
146790,456251,0.071954,0.088068,0.072761,0.070987,0,157500.0,254700.0,27558.0,225000.0,...,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,0
146791,456252,0.071954,0.059883,0.072761,0.068401,0,72000.0,269550.0,12001.5,225000.0,...,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,9999.0,0
146792,456253,0.071954,0.059883,0.072761,0.068401,0,153000.0,677664.0,29979.0,585000.0,...,-794.0,9999.0,0.0,360000.0,0.0,0.0,0.0,-701.0,58369.5,1


In [119]:
X = min_df.drop(columns=['Target'])
y = min_df['Target']

In [47]:
kf = StratifiedKFold(n_splits=5,  shuffle=True)

In [121]:
for train_indices, test_indices in kf.split(X, y):
    X_train, X_val = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[test_indices]
    lgbmc = lgb.LGBMClassifier(mobjective='binary',
                               max_depth=47, learning_rate=0.05,
                               n_estimators=100, class_weight='balanced',
                               subsample=0.8,
                               colsample_bytree=0.8,
                               random_state=42,
                               importance_type='gain')
    model = lgbmc.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    score_train = roc_auc_score(y_train, y_pred_train)
    print(f'train_score {score_train}')   
    y_pred_val = model.predict(X_val)
    score_val = roc_auc_score(y_val, y_pred_val)
    print(f'test_score {score_val}')

train_score 0.7264852781888265
test_score 0.6911618585495027
train_score 0.7284481055242524
test_score 0.6844759293739773
train_score 0.7279626651186524
test_score 0.6934403636203254
train_score 0.7281409621021702
test_score 0.683263723451506
train_score 0.7277401482040815
test_score 0.6848992822821611


In [94]:
# Прогнозируем на тест

In [122]:
agr_min_test = agr_min.loc[agr_min.index.isin(list(test.SK_ID_CURR.unique()))]

In [123]:
agr_min_test.shape

(42320, 14)

In [124]:
without_credit = list(set(test.SK_ID_CURR) - set(agr_min_test.index))

In [125]:
empty_df = pd.DataFrame(np.nan, index=without_credit, columns=list(agr_min_test))

In [126]:
empty_df.fillna(9999, inplace=True)

In [127]:
empty_df['dummy_credit_history'] = 0

In [128]:
agr_min_test['dummy_credit_history'] = 1

In [129]:
min_features = pd.concat([empty_df, agr_min_df])

In [130]:
min_features['SK_ID_CURR'] = min_features.index

In [131]:
del min_features['SK_ID_CURR_MIN']

In [132]:
min_df_test = test.merge(min_features, on='SK_ID_CURR', how='left')

In [133]:
min_df_test.fillna(9999, inplace=True)

In [134]:
pred = model.predict(min_df_test)

In [135]:
submission = {
    'SK_ID_CURR': min_df_test.SK_ID_CURR.values,
    'TARGET': pred}

In [136]:
solution = pd.DataFrame(submission)

In [137]:
solution.to_csv('submission_db.csv',index=False)

In [138]:
# ROC-AUC = 0.67276

In [139]:
### Добавим сразу максимум и медиану

In [78]:
without_credit = list(set(train.SK_ID_CURR) - set(agr_max_df.index))
empty_df = pd.DataFrame(np.nan, index=without_credit, columns=list(agr_max_df))
empty_df.fillna(0, inplace=True)
empty_df['dummy_credit_history'] = 0
agr_max_df['dummy_credit_history'] = 1
max_features = pd.concat([empty_df, agr_max_df])
max_features['SK_ID_CURR'] = max_features.index
del max_features['SK_ID_CURR_MAX']
max_df = train.merge(max_features, on='SK_ID_CURR', how='left')

In [79]:
without_credit = list(set(train.SK_ID_CURR) - set(agr_mean_df.index))
empty_df = pd.DataFrame(np.nan, index=without_credit, columns=list(agr_mean_df))
empty_df.fillna(0, inplace=True)
empty_df['dummy_credit_history'] = 0
agr_mean_df['dummy_credit_history'] = 1
mean_features = pd.concat([empty_df, agr_mean_df])
mean_features['SK_ID_CURR'] = mean_features.index
del mean_features['SK_ID_CURR_MEAN']
mean_df = train.merge(mean_features, on='SK_ID_CURR', how='left')

In [80]:
df = max_df.merge(mean_features, on='SK_ID_CURR', how='left')

In [44]:
df.shape

(146794, 131)

In [45]:
X = df.drop(columns=['Target'])
y = df['Target']

In [49]:
for train_indices, test_indices in kf.split(X, y):
    X_train, X_val = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[test_indices]
    lgbmc = lgb.LGBMClassifier(mobjective='binary',
                               max_depth=47, learning_rate=0.05,
                               n_estimators=100, class_weight='balanced',
                               subsample=0.8,
                               colsample_bytree=0.8,
                               random_state=42,
                               importance_type='gain')
    model = lgbmc.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    score_train = roc_auc_score(y_train, y_pred_train)
    print(f'train_score {score_train}')   
    y_pred_val = model.predict(X_val)
    score_val = roc_auc_score(y_val, y_pred_val)
    print(f'test_score {score_val}')

train_score 0.7339726153436151
test_score 0.6782367720184194
train_score 0.730125424847575
test_score 0.6983104279741293
train_score 0.7332765105318176
test_score 0.6876765034385065
train_score 0.7327816494437821
test_score 0.6989668295243477
train_score 0.7343749866792819
test_score 0.6867429136719696


In [81]:
without_credit = list(set(train.SK_ID_CURR) - set(agr_train.index))
empty_df = pd.DataFrame(np.nan, index=without_credit, columns=list(agr_train))
empty_df.fillna(0, inplace=True)
empty_df['dummy_credit_history'] = 0
agr_train['dummy_credit_history'] = 1
max_features = pd.concat([empty_df, agr_train])
max_features['SK_ID_CURR'] = max_features.index
df2 = df.merge(max_features, on='SK_ID_CURR', how='left')

In [82]:
without_credit = list(set(train.SK_ID_CURR) - set(agr_min_df.index))
empty_df = pd.DataFrame(np.nan, index=without_credit, columns=list(agr_min_df))
empty_df.fillna(0, inplace=True)
empty_df['dummy_credit_history'] = 0
agr_min_df['dummy_credit_history'] = 1
min_features = pd.concat([empty_df, agr_min_df])
min_features['SK_ID_CURR'] = min_features.index
del min_features['SK_ID_CURR_MIN']
df3 = df2.merge(min_features, on='SK_ID_CURR', how='left')

In [83]:
df2.shape

(146794, 155)

In [84]:
X = df3.drop(columns=['Target'])
y = df3['Target']

In [86]:
for train_indices, test_indices in kf.split(X, y):
    X_train, X_val = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[test_indices]
    lgbmc = lgb.LGBMClassifier(mobjective='binary',
                               max_depth=50, learning_rate=0.05,
                               n_estimators=100, class_weight='balanced',
                               subsample=0.8,
                               colsample_bytree=0.8,
                               random_state=42,
                               importance_type='gain',
                              reg_alpha=20,
                             reg_lambda=30)
    model = lgbmc.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    score_train = roc_auc_score(y_train, y_pred_train)
    print(f'train_score {score_train}')   
    y_pred_val = model.predict(X_val)
    score_val = roc_auc_score(y_val, y_pred_val)
    print(f'test_score {score_val}')

train_score 0.7259893465618291
test_score 0.695458095736221
train_score 0.7234559463835757
test_score 0.6951632923007205
train_score 0.7252883673819198
test_score 0.6919549258881944
train_score 0.7295376164179805
test_score 0.6836030243075184
train_score 0.7272338626546149
test_score 0.6862997658079625


In [87]:
agr_min_test = agr_min.loc[agr_min.index.isin(list(test.SK_ID_CURR.unique()))]
without_credit = list(set(test.SK_ID_CURR) - set(agr_min_test.index))
empty_df = pd.DataFrame(np.nan, index=without_credit, columns=list(agr_min_test))
empty_df.fillna(0, inplace=True)
empty_df['dummy_credit_history'] = 0
agr_min_test['dummy_credit_history'] = 1
min_features = pd.concat([empty_df, agr_min_df])
min_features['SK_ID_CURR'] = min_features.index
del min_features['SK_ID_CURR_MIN']
min_df_test = test.merge(min_features, on='SK_ID_CURR', how='left')


agr_max_test = agr_max.loc[agr_max.index.isin(list(test.SK_ID_CURR.unique()))]
without_credit = list(set(test.SK_ID_CURR) - set(agr_max_test.index))
empty_df = pd.DataFrame(np.nan, index=without_credit, columns=list(agr_max_test))
empty_df.fillna(0, inplace=True)
empty_df['dummy_credit_history'] = 0
agr_max_test['dummy_credit_history'] = 1
max_features = pd.concat([empty_df, agr_max_df])
max_features['SK_ID_CURR'] = max_features.index
del max_features['SK_ID_CURR_MAX']
max_df_test = min_df_test.merge(max_features, on='SK_ID_CURR', how='left')


agr_mean_test = agr_mean.loc[agr_mean.index.isin(list(test.SK_ID_CURR.unique()))]
without_credit = list(set(test.SK_ID_CURR) - set(agr_mean_test.index))
empty_df = pd.DataFrame(np.nan, index=without_credit, columns=list(agr_mean_test))
empty_df.fillna(0, inplace=True)
empty_df['dummy_credit_history'] = 0
agr_mean_test['dummy_credit_history'] = 1
mean_features = pd.concat([empty_df, agr_mean_df])
mean_features['SK_ID_CURR'] = mean_features.index
del mean_features['SK_ID_CURR_MEAN']
mean_df_test = max_df_test.merge(mean_features, on='SK_ID_CURR', how='left')


agr_s_test = agr_test.loc[agr_test.index.isin(list(test.SK_ID_CURR.unique()))]
without_credit = list(set(test.SK_ID_CURR) - set(agr_s_test.index))
empty_df = pd.DataFrame(np.nan, index=without_credit, columns=list(agr_s_test))
empty_df.fillna(0, inplace=True)
empty_df['dummy_credit_history'] = 0
agr_s_test['dummy_credit_history'] = 1
s_features = pd.concat([empty_df, agr_s_test])
s_features['SK_ID_CURR'] = s_features.index
s_test = mean_df_test.merge(s_features, on='SK_ID_CURR', how='left')
s_test.fillna(0, inplace=True)

In [88]:
s_test.shape

(48744, 168)

In [89]:
pred = model.predict(s_test)

In [90]:
submission = {
    'SK_ID_CURR': min_df_test.SK_ID_CURR.values,
    'TARGET': pred}
solution = pd.DataFrame(submission)
solution.to_csv('submission_db.csv',index=False)

In [9]:
# тут аук 0.65