In [1]:

import pickle

import hydra
import pandas as pd
from hydra.utils import to_absolute_path
from omegaconf import OmegaConf

from preprocess import  trans_preprocess
from train import training_with_resampling




In [3]:
@hydra.main(version_base=None,config_path='config', config_name='config')
def main(config):
    """Train a model from the Hydra config and save it with its feature list.

    Steps, in order:
      1. Print the resolved configuration as YAML.
      2. Load the pickled transactions bundle and the ``train.csv`` table.
      3. Build the transaction feature frame with ``trans_preprocess`` and
         join every feature group named in ``config.trans_time_features``.
      4. Fit a model with ``training_with_resampling``.
      5. Save the model and the list of feature column names under
         ``submit/data``, both suffixed with ``config.run_number``.

    Args:
        config: Hydra/OmegaConf configuration. Keys read here:
            ``trans_data``, ``trans_params``, ``trans_time_features``,
            ``catboost_params``, ``train_params`` and ``run_number``.
    """
    import os  # local import: only needed to prepare the output directory

    print(OmegaConf.to_yaml(config))

    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at a transactions file you produced yourself.
    with open(to_absolute_path("../data/transactions.pkl"), "rb") as file_:
        transactions_data = pickle.load(file_)

    matching = pd.read_csv(to_absolute_path('../data/train.csv'))

    df_trans = trans_preprocess(transactions_data[config.trans_data], **config.trans_params)
    print(df_trans.shape)

    for feature_group in config.trans_time_features:
        df_trans = df_trans.join(transactions_data[feature_group])
    print('df_trans', df_trans.shape)

    # Resolve the output locations and make sure the directory exists BEFORE
    # training, so a missing `submit/data` cannot cost a finished model.
    model_path = to_absolute_path(f'submit/data/model_{config.run_number}.cbm')
    trans_filename = f'submit/data/trans_features_{config.run_number}.pkl'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    clf = training_with_resampling(
        matching, test=None, df_trans=df_trans,
        catboost_params=config.catboost_params, **config.train_params)

    clf.save_model(model_path)

    # Persist the feature column order next to the model.
    with open(to_absolute_path(trans_filename), 'wb') as file_:
        pickle.dump(df_trans.columns.tolist(), file_)


if __name__ == '__main__':

    main()

In [5]:
# Load the pickled transactions bundle into `transactions_data`.
# NOTE: pickle can execute arbitrary code while loading; use trusted files only.
with open(
    to_absolute_path("../data/transactions.pkl"),
    "rb",
) as file_:
    transactions_data = pickle.load(file_)

In [37]:
# Build the feature frame from the 'grouped' entry of the transactions bundle
# (trans_preprocess is called with its default parameters here), then report
# the resulting shape.
grouped_transactions = transactions_data['grouped']
df_trans = trans_preprocess(grouped_transactions)
print(df_trans.shape)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  transactions['sum'].loc[idx] = transactions['sum'].loc[idx] * 100
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

(96000, 277)


In [38]:
# Show the preprocessed feature frame as this cell's output.
df_trans

mcc_code,count-mcc0,count-mcc1,count-mcc2,count-mcc3,count-mcc4,count-mcc6,count-mcc7,count-mcc8,count-mcc9,count-mcc10,...,count-mcc303,count-mcc305,count-mcc316,count-mcc326,count-mcc328,count-mcc332,count-mcc334,count-mcc360,count-mcc364,count-mcc413
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,7.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,27.0,11.0,5.0,3.0,2.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,14.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,2.0,97.0,129.0,28.0,1.0,2.0,0.0,7.0,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
41,0.0,2.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
562043,0.0,19.0,0.0,1.0,0.0,1.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
562205,30.0,33.0,2.0,0.0,1.0,10.0,0.0,6.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
562312,0.0,24.0,3.0,0.0,1.0,6.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
562721,19.0,12.0,6.0,29.0,0.0,0.0,0.0,2.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Feature groups from the transactions bundle to attach to the feature frame.
trans_time_features = ['hour']

# Index-aligned join of each group onto df_trans, one group at a time.
# NOTE: this reassigns df_trans, so the cell is meant to run once per kernel.
for feature_group in trans_time_features:
    group_features = transactions_data[feature_group]
    df_trans = df_trans.join(group_features)

print('df_trans', df_trans.shape)

df_trans (96000, 302)


In [40]:
# Labels table and client attributes; missing client values are filled with 0.
matching = pd.read_csv(to_absolute_path('../data/train.csv'))
clients = pd.read_csv(to_absolute_path('../data/clients.csv')).fillna(0)

# Left-merge both tables onto the feature frame by user_id: clients first,
# then the labels, so the label columns end up last.
df_trans = (
    df_trans
    .merge(clients, how='left', on='user_id')
    .merge(matching, how='left', on='user_id')
)


In [34]:
# Split the merged frame by label availability: rows with a non-null `target`
# go to train_df, the remaining rows are kept aside as submit_df.
# The null mask is computed once and reused for both halves.
has_target = df_trans['target'].notna()
train_df = df_trans[has_target]
submit_df = df_trans[~has_target]
train_df

Unnamed: 0,user_id,count-mcc0,count-mcc1,count-mcc2,count-mcc3,count-mcc4,count-mcc6,count-mcc7,count-mcc8,count-mcc9,...,trans_hour_21,trans_hour_22,trans_hour_23,trans_hour_total,report,employee_count_nm,bankemplstatus,customer_age,target,time
0,3,7.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,8.0,2,ОТ 101 ДО 500,0,3,0.0,77.0
2,13,0.0,0.0,0.0,14.0,0.0,0.0,1.0,0.0,1.0,...,0.095238,0.000000,0.000000,21.0,6,ОТ 501 ДО 1000,0,2,0.0,86.0
3,37,2.0,97.0,129.0,28.0,1.0,2.0,0.0,7.0,1.0,...,0.014545,0.007273,0.000000,275.0,5,БОЛЕЕ 1001,0,2,0.0,89.0
4,41,0.0,2.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,...,0.066667,0.066667,0.000000,15.0,1,ОТ 101 ДО 500,0,2,0.0,57.0
5,42,1.0,13.0,8.0,0.0,2.0,0.0,1.0,2.0,5.0,...,0.036364,0.000000,0.000000,55.0,12,ДО 10,0,3,0.0,84.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95992,561824,4.0,35.0,16.0,23.0,4.0,6.0,1.0,7.0,1.0,...,0.015873,0.015873,0.000000,126.0,12,0,0,0,0.0,91.0
95995,562043,0.0,19.0,0.0,1.0,0.0,1.0,3.0,4.0,0.0,...,0.000000,0.000000,0.000000,32.0,12,0,0,2,0.0,75.0
95997,562312,0.0,24.0,3.0,0.0,1.0,6.0,0.0,4.0,0.0,...,0.180000,0.140000,0.080000,50.0,12,0,0,0,0.0,91.0
95998,562721,19.0,12.0,6.0,29.0,0.0,0.0,0.0,2.0,4.0,...,0.000000,0.000000,0.012658,79.0,12,0,0,2,0.0,29.0


In [41]:
from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split

In [43]:
# Select model inputs by NAME rather than by position: everything except the
# user id and the two columns that came from train.csv (`target`, `time`).
# This matches the previous `iloc[:, 1:-2]` slice for the current column
# layout, but fails loudly (KeyError) instead of silently shifting if the
# merge order or the input tables change.
feature_df = train_df.drop(columns=['user_id', 'target', 'time'])

# Hold out 25% of the labelled rows for validation; fixed seed for a repeatable split.
X_train, X_valid, y_train, y_valid = train_test_split(
    feature_df, train_df['target'].to_numpy(), test_size=0.25, random_state=1)
cat_cols = ['employee_count_nm', 'bankemplstatus', 'customer_age', 'report']

model = CatBoostClassifier(
    iterations = 1400,
    depth=5,
    learning_rate=0.03,

    eval_metric='AUC',
    cat_features = cat_cols,
    thread_count=-1,
    early_stopping_rounds=200,
)
model.fit(Pool(X_train, y_train,cat_features = cat_cols,),
          eval_set=Pool(X_valid, y_valid,cat_features = cat_cols,),
           verbose=100)


# Feature importances of the fitted model, most important first.
df_imp = pd.DataFrame({
    'name': X_train.columns,
    'imp': model.get_feature_importance()
}).sort_values('imp', ascending=False)
# display(df_imp) # Optional: inspect the preliminary feature_importance()

# df_imp = df_imp[df_imp['imp'] > 0.3] # Keep every feature whose importance exceeds 0.3

# # Add the statistical features; they could not be used for training here because that would have leaked data
# good_cols = df_imp['name'].tolist() + ['group_employee_age_mean', 'group_report_age_mean']
df_imp

0:	test: 0.5009770	best: 0.5009770 (0)	total: 34.5ms	remaining: 48.2s
100:	test: 0.7038738	best: 0.7038738 (100)	total: 2.76s	remaining: 35.5s
200:	test: 0.7142951	best: 0.7143160 (193)	total: 5.47s	remaining: 32.6s
300:	test: 0.7166487	best: 0.7166487 (300)	total: 8.14s	remaining: 29.7s
400:	test: 0.7176711	best: 0.7176764 (398)	total: 10.8s	remaining: 26.9s
500:	test: 0.7186587	best: 0.7186761 (495)	total: 13.4s	remaining: 24.1s
600:	test: 0.7199619	best: 0.7199619 (600)	total: 16.2s	remaining: 21.5s
700:	test: 0.7208518	best: 0.7208739 (691)	total: 19.6s	remaining: 19.5s
800:	test: 0.7215773	best: 0.7217762 (784)	total: 22.3s	remaining: 16.7s
900:	test: 0.7215687	best: 0.7218441 (885)	total: 25s	remaining: 13.8s
1000:	test: 0.7219673	best: 0.7222438 (969)	total: 27.7s	remaining: 11s
1100:	test: 0.7224684	best: 0.7224684 (1100)	total: 30.4s	remaining: 8.24s
1200:	test: 0.7231175	best: 0.7231642 (1197)	total: 33s	remaining: 5.47s
1300:	test: 0.7226472	best: 0.7231642 (1197)	total: 35.

Unnamed: 0,name,imp
303,employee_count_nm,14.594790
305,customer_age,9.701391
154,count-mcc155,5.113162
0,count-mcc0,4.263799
50,count-mcc51,3.940006
...,...,...
265,count-mcc297,0.000000
267,count-mcc303,0.000000
269,count-mcc316,0.000000
194,count-mcc195,0.000000


In [None]:
# `config` is only a parameter of `main`, so at notebook scope it does not
# exist in a fresh kernel. Compose it from the same config directory/name that
# `main` uses; an already-defined `config` is left untouched.
try:
    config
except NameError:
    with hydra.initialize(version_base=None, config_path='config'):
        config = hydra.compose(config_name='config')

# NOTE(review): at this point df_trans has already been merged with `clients`
# and `matching`, whereas `main` passes the un-merged feature frame — confirm
# which form training_with_resampling expects.
clf = training_with_resampling(
    matching, test=None, df_trans=df_trans,
    catboost_params=config.catboost_params, **config.train_params)