In [65]:
import pandas as pd
import numpy as np
# from sklearn.model_selection import StratifiedKFold
# from sklearn import metrics

from catboost import CatBoostClassifier, Pool
import random
import warnings

from pandas.tseries.offsets import DateOffset
from sklearn.model_selection import train_test_split

warnings.simplefilter('ignore')

pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

random.seed(42)
np.random.seed(42)

In [3]:
PATH = '../data/'
clients = pd.read_csv(PATH + '/clients.csv')
report_dates = pd.read_csv(PATH + 'report_dates.csv', parse_dates=['report_dt'])

transactions = pd.read_csv(PATH + 'transactions.csv', parse_dates=['transaction_dttm'])
train_data = pd.read_csv(PATH + 'train.csv')

In [4]:
transactions[:3]

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm
0,3,3,1,-183.883957,2022-01-28 12:05:33
1,3,3,1,-3206.437012,2022-01-28 12:52:30
2,3,16,1,-153866.890625,2022-02-16 14:45:56


In [7]:
transactions['positive'] = transactions['transaction_amt'].apply(lambda x: x if x>0 else  0)
transactions['negative'] = transactions['transaction_amt'].apply(lambda x: x if x<0 else  0)

In [23]:
all_positive = transactions.groupby('user_id')['positive'].agg(['sum','mean','count'])
all_negative = transactions.groupby('user_id')['negative'].agg(['sum','mean','count'])
all_negative

Unnamed: 0_level_0,sum,mean,count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,-172401.813156,-15672.892105,11
9,-323434.666813,-3593.718520,90
13,-253484.063477,-11522.002885,22
37,-342598.388037,-1087.613930,315
41,-108586.614166,-6786.663385,16
...,...,...,...
562043,-29581.256115,-799.493409,37
562205,-42460.814964,-281.197450,151
562312,-18537.821270,-331.032523,56
562721,-198395.925579,-2334.069713,85


In [48]:
pivot_positive_currency = transactions.pivot_table(
    index='user_id',
    columns='currency_rk',
    values='positive',
    aggfunc=['count', 'median', 'sum']
)

pivot_negative_currency = transactions.pivot_table(
    index='user_id',
    columns='currency_rk',
    values='negative',
    aggfunc=['count', 'median', 'sum']
)


In [49]:
pivot_negative_currency.columns = ['negative_currency_' + '_'.join(map(str, x)) for x in pivot_negative_currency.columns]
pivot_positive_currency.columns = ['positive_currency_' + '_'.join(map(str, x)) for x in pivot_positive_currency.columns]


In [50]:
data = clients.merge(all_positive,how='left', on='user_id')
data = data.merge(all_negative,how='left', on='user_id', suffixes=('_pos','_neg'))
data = data.merge(pivot_positive_currency,how='left', on='user_id')
data = data.merge(pivot_negative_currency,how='left', on='user_id')

#employee_count code labls
data.replace({'employee_count_nm':{'ОТ 101 ДО 500':1,'БОЛЕЕ 1001':2,'ОТ 501 ДО 1000':3,'ДО 10':4,
                                      'ОТ 11 ДО 50':5,'ОТ 51 ДО 100':6,'БОЛЕЕ 500':7,'ОТ 11 ДО 30':8,
                                      'ОТ 31 ДО 50':9}}, inplace=True)
data['employee_count_nm'].fillna(0, inplace = True)

data.fillna(0, inplace=True)
data.head()

Unnamed: 0,user_id,report,employee_count_nm,bankemplstatus,customer_age,sum_pos,mean_pos,count_pos,sum_neg,mean_neg,count_neg,positive_currency_count_0,positive_currency_count_1,positive_currency_count_2,positive_currency_count_3,positive_currency_median_0,positive_currency_median_1,positive_currency_median_2,positive_currency_median_3,positive_currency_sum_0,positive_currency_sum_1,positive_currency_sum_2,positive_currency_sum_3,negative_currency_count_0,negative_currency_count_1,negative_currency_count_2,negative_currency_count_3,negative_currency_median_0,negative_currency_median_1,negative_currency_median_2,negative_currency_median_3,negative_currency_sum_0,negative_currency_sum_1,negative_currency_sum_2,negative_currency_sum_3
0,3,2,1.0,0,3,186108.229797,16918.929982,11,-172401.813156,-15672.892105,11,0.0,11.0,0.0,0.0,0.0,4549.455078,0.0,0.0,0.0,186108.229797,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-172401.813156,0.0,0.0
1,9,1,2.0,0,3,0.0,0.0,90,-323434.666813,-3593.71852,90,0.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90.0,0.0,0.0,0.0,-840.509674,0.0,0.0,0.0,-323434.666813,0.0,0.0
2,13,6,3.0,0,2,128766.684326,5853.031106,22,-253484.063477,-11522.002885,22,1.0,21.0,0.0,0.0,10772.799805,0.0,0.0,0.0,10772.799805,117993.884521,0.0,0.0,1.0,21.0,0.0,0.0,0.0,-10642.210938,0.0,0.0,0.0,-253484.063477,0.0,0.0
3,37,5,2.0,0,2,10738.788574,34.091392,315,-342598.388037,-1087.61393,315,0.0,315.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10738.788574,0.0,0.0,0.0,315.0,0.0,0.0,0.0,-236.420776,0.0,0.0,0.0,-342598.388037,0.0,0.0
4,41,1,1.0,0,2,0.0,0.0,16,-108586.614166,-6786.663385,16,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,-6328.293701,0.0,0.0,0.0,-108586.614166,0.0,0.0


In [51]:
data = data.merge(train_data,how='left', on='user_id')



In [125]:
test_idx = data[data['target'].isna()].index
train_idx = data[data['target'].notna()].index

test_features = data.iloc[test_idx].drop(columns=['target', 'time', 'user_id', 'preds'])
train_features = data.iloc[train_idx].drop(columns=['target', 'time', 'user_id', 'preds'])
X_train, X_valid, y_train, y_valid = train_test_split(train_features, data['target'][train_idx].to_numpy(), test_size=0.2, random_state=1)

In [126]:
cat_cols = ['report', 'employee_count_nm', 'bankemplstatus', 'customer_age']
data['employee_count_nm']= data['employee_count_nm'].astype(int)

In [127]:
model = CatBoostClassifier(
    iterations = 1400,
    depth=5,
    learning_rate=0.03,

    eval_metric='AUC',
    cat_features = cat_cols,
    thread_count=6,
    early_stopping_rounds=200,
)
model.fit(Pool(X_train, y_train, cat_features = cat_cols),
          eval_set=Pool(X_valid, y_valid, cat_features=cat_cols),
           verbose=100)


df_imp = pd.DataFrame({
    'name': X_train.columns,
    'imp': model.get_feature_importance()
}).sort_values('imp', ascending=False)
# display(df_imp) # Можно посмотреть на предварительный feature_importance()

# df_imp = df_imp[df_imp['imp'] > 0.3] # Берем все фичи, у которых важность больше 0.3

# # Добавляем статистические фичи, их нельзя было использовать для тренировки здесь, т.к. получился бы лик в данных
# good_cols = df_imp['name'].tolist() + ['group_employee_age_mean', 'group_report_age_mean']
df_imp

0:	test: 0.6154307	best: 0.6154307 (0)	total: 39.9ms	remaining: 55.9s
100:	test: 0.7107350	best: 0.7108182 (99)	total: 2.04s	remaining: 26.2s
200:	test: 0.7189026	best: 0.7189026 (200)	total: 4.14s	remaining: 24.7s
300:	test: 0.7234805	best: 0.7234937 (299)	total: 6.19s	remaining: 22.6s
400:	test: 0.7267112	best: 0.7267112 (400)	total: 9.11s	remaining: 22.7s
500:	test: 0.7282214	best: 0.7282214 (500)	total: 13s	remaining: 23.4s
600:	test: 0.7287093	best: 0.7287151 (599)	total: 17.1s	remaining: 22.7s
700:	test: 0.7296129	best: 0.7297030 (696)	total: 19.8s	remaining: 19.7s
800:	test: 0.7302941	best: 0.7303629 (790)	total: 21.9s	remaining: 16.4s
900:	test: 0.7305709	best: 0.7307666 (880)	total: 24.3s	remaining: 13.5s
1000:	test: 0.7306180	best: 0.7307666 (880)	total: 26.4s	remaining: 10.5s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.7307666325
bestIteration = 880

Shrink model to first 881 iterations.


Unnamed: 0,name,imp
1,employee_count_nm,14.835986
3,customer_age,12.615139
27,negative_currency_median_1,8.236785
5,mean_pos,7.969178
31,negative_currency_sum_1,6.851138
8,mean_neg,6.749628
7,sum_neg,6.656726
4,sum_pos,5.352975
19,positive_currency_sum_1,4.239781
0,report,3.70839


In [68]:
import xgboost as xgb

In [69]:
# 4-by-2 Data matrix
X = np.array([[1, -1], [-1, 1], [0, 1], [1, 0]])
dtrain = xgb.DMatrix(X)

# Associate ranged labels with the data matrix.
# This example shows each kind of censored labels.
#                         uncensored    right     left  interval
y_lower_bound = np.array([      2.0,     3.0,     0.0,     4.0])
y_upper_bound = np.array([      2.0, +np.inf,     4.0,     5.0])
dtrain.set_float_info('label_lower_bound', y_lower_bound)
dtrain.set_float_info('label_upper_bound', y_upper_bound)

In [113]:
data

Unnamed: 0,user_id,report,employee_count_nm,bankemplstatus,customer_age,sum_pos,mean_pos,count_pos,sum_neg,mean_neg,count_neg,positive_currency_count_0,positive_currency_count_1,positive_currency_count_2,positive_currency_count_3,positive_currency_median_0,positive_currency_median_1,positive_currency_median_2,positive_currency_median_3,positive_currency_sum_0,positive_currency_sum_1,positive_currency_sum_2,positive_currency_sum_3,negative_currency_count_0,negative_currency_count_1,negative_currency_count_2,negative_currency_count_3,negative_currency_median_0,negative_currency_median_1,negative_currency_median_2,negative_currency_median_3,negative_currency_sum_0,negative_currency_sum_1,negative_currency_sum_2,negative_currency_sum_3,target,time
0,3,2,1,0,3,186108.229797,16918.929982,11,-172401.813156,-15672.892105,11,0.0,11.0,0.0,0.0,0.000000,4549.455078,0.0,0.0,0.000000,186108.229797,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,-172401.813156,0.0,0.0,0.0,77.0
1,9,1,2,0,3,0.000000,0.000000,90,-323434.666813,-3593.718520,90,0.0,90.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,90.0,0.0,0.0,0.0,-840.509674,0.0,0.0,0.0,-323434.666813,0.0,0.0,,
2,13,6,3,0,2,128766.684326,5853.031106,22,-253484.063477,-11522.002885,22,1.0,21.0,0.0,0.0,10772.799805,0.000000,0.0,0.0,10772.799805,117993.884521,0.0,0.0,1.0,21.0,0.0,0.0,0.0,-10642.210938,0.0,0.0,0.0,-253484.063477,0.0,0.0,0.0,86.0
3,37,5,2,0,2,10738.788574,34.091392,315,-342598.388037,-1087.613930,315,0.0,315.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,10738.788574,0.0,0.0,0.0,315.0,0.0,0.0,0.0,-236.420776,0.0,0.0,0.0,-342598.388037,0.0,0.0,0.0,89.0
4,41,1,1,0,2,0.000000,0.000000,16,-108586.614166,-6786.663385,16,0.0,16.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,16.0,0.0,0.0,0.0,-6328.293701,0.0,0.0,0.0,-108586.614166,0.0,0.0,0.0,57.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95995,562043,12,0,0,2,0.000000,0.000000,37,-29581.256115,-799.493409,37,0.0,37.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,37.0,0.0,0.0,0.0,-239.170517,0.0,0.0,0.0,-29581.256115,0.0,0.0,0.0,75.0
95996,562205,12,0,0,1,1968.906334,13.039115,151,-42460.814964,-281.197450,151,0.0,151.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,1968.906334,0.0,0.0,0.0,151.0,0.0,0.0,0.0,-147.305283,0.0,0.0,0.0,-42460.814964,0.0,0.0,,
95997,562312,12,0,0,0,0.000000,0.000000,56,-18537.821270,-331.032523,56,0.0,56.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,56.0,0.0,0.0,0.0,-271.973526,0.0,0.0,0.0,-18537.821270,0.0,0.0,0.0,91.0
95998,562721,12,0,0,2,34391.163893,404.601928,85,-198395.925579,-2334.069713,85,0.0,85.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,34391.163893,0.0,0.0,0.0,85.0,0.0,0.0,0.0,-399.343048,0.0,0.0,0.0,-198395.925579,0.0,0.0,0.0,29.0


In [97]:
train_data['upper']=train_data.apply(lambda row: +np.inf if row['target']==0 else row['time'], axis=1)

In [101]:
# 4-by-2 Data matrix
dtrain = xgb.DMatrix(train_features)

# Associate ranged labels with the data matrix.
# This example shows each kind of censored labels.
#                         uncensored    right     left  interval
y_lower_bound = np.array(data['time'][train_idx])
y_upper_bound = np.array(train_data['upper'])
dtrain.set_float_info('label_lower_bound', y_lower_bound)
dtrain.set_float_info('label_upper_bound', y_upper_bound)

In [102]:
params = {'objective': 'survival:aft',
          'eval_metric': 'aft-nloglik',
          'aft_loss_distribution': 'normal',
          'aft_loss_distribution_scale': 1.20,
          'tree_method': 'hist', 'learning_rate': 0.05, 'max_depth': 2}
bst = xgb.train(params, dtrain, num_boost_round=5,
                evals=[(dtrain, 'train')])

[0]	train-aft-nloglik:10.43721
[1]	train-aft-nloglik:9.55558
[2]	train-aft-nloglik:8.75650
[3]	train-aft-nloglik:8.03200
[4]	train-aft-nloglik:7.37488


In [114]:
dtrain = xgb.DMatrix(data.drop(columns = ['target', 'time']))
predictions = bst.predict(dtrain)


In [116]:
data['preds'] = predictions

In [120]:
test_features = data.iloc[test_idx].drop(columns=['target', 'time','user_id'])
train_features = data.iloc[train_idx].drop(columns=['target', 'time','user_id'])
X_train, X_valid, y_train, y_valid = train_test_split(train_features, data['target'][train_idx].to_numpy(), test_size=0.2, random_state=1)

In [121]:
model = CatBoostClassifier(
    iterations = 1400,
    depth=5,
    learning_rate=0.03,

    eval_metric='AUC',
    cat_features = cat_cols,
    thread_count=6,
    early_stopping_rounds=200,
)
model.fit(Pool(X_train, y_train, cat_features = cat_cols),
          eval_set=Pool(X_valid, y_valid, cat_features=cat_cols),
           verbose=100)


df_imp = pd.DataFrame({
    'name': X_train.columns,
    'imp': model.get_feature_importance()
}).sort_values('imp', ascending=False)

0:	test: 0.5981734	best: 0.5981734 (0)	total: 21.8ms	remaining: 30.5s
100:	test: 0.7132454	best: 0.7132454 (100)	total: 2.14s	remaining: 27.5s
200:	test: 0.7251990	best: 0.7252115 (197)	total: 4.31s	remaining: 25.7s
300:	test: 0.7304153	best: 0.7304153 (300)	total: 6.32s	remaining: 23.1s
400:	test: 0.7337792	best: 0.7337792 (400)	total: 8.39s	remaining: 20.9s
500:	test: 0.7354089	best: 0.7354089 (500)	total: 10.5s	remaining: 18.9s
600:	test: 0.7365494	best: 0.7366484 (587)	total: 12.6s	remaining: 16.8s
700:	test: 0.7371679	best: 0.7371679 (700)	total: 14.7s	remaining: 14.7s
800:	test: 0.7375825	best: 0.7376302 (798)	total: 16.8s	remaining: 12.5s
900:	test: 0.7375485	best: 0.7376356 (828)	total: 18.8s	remaining: 10.4s
1000:	test: 0.7378321	best: 0.7381056 (964)	total: 20.9s	remaining: 8.33s
1100:	test: 0.7377093	best: 0.7381056 (964)	total: 23.1s	remaining: 6.26s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.7381055653
bestIteration = 964

Shrink model to first 96

In [123]:
df_imp

Unnamed: 0,name,imp
34,preds,17.189065
1,employee_count_nm,10.781627
31,negative_currency_sum_1,6.971022
5,mean_pos,6.762263
3,customer_age,6.758532
27,negative_currency_median_1,6.560736
7,sum_neg,5.656189
8,mean_neg,5.179847
19,positive_currency_sum_1,4.753823
4,sum_pos,4.727602
