In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
import joblib
import os

from tqdm.notebook import tqdm_notebook
from warnings import filterwarnings
filterwarnings('ignore')




In [35]:
# --- Configuration -----------------------------------------------------------
# Directory holding the input CSV files.
PATH_DATA = "../data"
# File name of the serialized CatBoost model.
MODEL = "model_94.84"
# Directory in which the serialized model is stored.
MODEL_PATH = "../submission/models/"



In [36]:
# Restore the pre-trained classifier from disk.
# The file name comes from the MODEL constant rather than a repeated literal,
# so changing the configuration actually changes which model is loaded.
model = CatBoostClassifier()
model.load_model(os.path.join(MODEL_PATH, MODEL))

<catboost.core.CatBoostClassifier at 0x186309a2860>

In [7]:
# Lookup tables: semicolon-separated files indexed by their code column.
tr_mcc_codes = pd.read_csv(
    os.path.join(PATH_DATA, 'mcc_codes.csv'),
    sep=';',
    index_col='mcc_code',
)
tr_types = pd.read_csv(
    os.path.join(PATH_DATA, 'trans_types.csv'),
    sep=';',
    index_col='trans_type',
)

# Transaction log and the train/test client tables, all indexed by client_id.
transactions = pd.read_csv(
    os.path.join(PATH_DATA, 'transactions.csv'),
    index_col='client_id',
)
gender_train = pd.read_csv(
    os.path.join(PATH_DATA, 'train.csv'),
    index_col='client_id',
)
gender_test = pd.read_csv(
    os.path.join(PATH_DATA, 'test.csv'),
    index_col='client_id',
)

# Inner joins keep only the transactions whose client_id is present in the
# respective client table.
transactions_train = transactions.join(gender_train, how='inner')
transactions_test = transactions.join(gender_test, how='inner')

In [9]:
# Derive time features from `trans_time`, whose values look like
# '<day number> HH:MM:SS' (e.g. '313 14:52:03').
for df in [transactions_test]:
    # Day number modulo 7 (presumably a day-of-week proxy).
    df['day'] = df['trans_time'].str.split().apply(lambda x: int(x[0]) % 7)
    # Hour = the digits following the first space. The pattern is a raw string
    # so that `\d` is not treated as an (invalid) string escape sequence.
    df['hour'] = df['trans_time'].apply(lambda x: re.search(r' \d*', x).group(0)).astype(int)
    # NOTE(review): `.astype(int)` binds tighter than `~`, so this is a bitwise
    # NOT on integers: hours within 6..22 (inclusive) become -2, all others -1,
    # not a 0/1 flag. Left unchanged on purpose because the resulting feature
    # labels ('night_-1', 'night_-2') are fed to a pre-trained model that was
    # presumably fit on features built the same way -- confirm against the
    # training code before changing it.
    df['night'] = ~df['hour'].between(6, 22).astype(int)

transactions_train.head()

Unnamed: 0_level_0,trans_time,mcc_code,trans_type,amount,term_id,trans_city,Unnamed: 0,gender
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0002cf30347684df542e1a931f356875,313 14:52:03,4829,2370,-2170.07,888990.0,Saint Petersburg,6806,0
0002cf30347684df542e1a931f356875,150 14:10:49,6011,2010,-1445.64,,Saint Petersburg,6806,0
0002cf30347684df542e1a931f356875,122 12:38:32,5912,1010,-107.07,,Saint Petersburg,6806,0
0002cf30347684df542e1a931f356875,159 13:22:34,6011,2010,-2892.86,,Saint Petersburg,6806,0
0002cf30347684df542e1a931f356875,257 12:06:54,5912,1010,-164.49,469965.0,Saint Petersburg,6806,0


In [10]:
def features_creation_advanced(x):
    """Build a flat feature vector from one group of transactions.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single group; must contain the columns ``day``, ``hour``,
        ``night``, ``amount`` and ``mcc_code``.

    Returns
    -------
    pandas.Series
        Concatenation, in this order, of:
        * relative frequency of every distinct ``day``, ``hour`` and ``night``
          value (labels ``day_<v>``, ``hour_<v>``, ``night_<v>``);
        * min/max/mean/median/std/count/sum of the amounts that are >= 0
          (labels ``positive_transactions_<stat>``);
        * the same statistics for the amounts that are < 0
          (labels ``negative_transactions_<stat>``);
        * relative frequency of every distinct ``mcc_code`` (labels ``mcc_<v>``).
    """
    def _share(column, prefix):
        # Normalised value counts of one column, labelled "<prefix><value>".
        return x[column].value_counts(normalize=True).add_prefix(prefix)

    stat_names = ['min', 'max', 'mean', 'median', 'std', 'count', 'sum']
    amounts = x['amount']
    non_negative = amounts[amounts >= 0].agg(stat_names).add_prefix('positive_transactions_')
    negative = amounts[amounts < 0].agg(stat_names).add_prefix('negative_transactions_')

    parts = [
        _share('day', 'day_'),
        _share('hour', 'hour_'),
        _share('night', 'night_'),
        non_negative,
        negative,
        _share('mcc_code', 'mcc_'),
    ]
    return pd.concat(parts)

In [12]:
# One feature vector per index value (client_id) of the test transactions;
# unstacking the innermost level turns the feature labels into columns, so a
# feature that is absent for a given client ends up as NaN.
data_test = (
    transactions_test
    .groupby(transactions_test.index)
    .apply(features_creation_advanced)
    .unstack(-1)
)

In [19]:
# Class-probability matrix for every row of the test feature table.
predict = model.predict_proba(data_test)

In [21]:
# Keep the second probability column (index 1) for each test-set row and write
# it to disk together with the index of the feature table.
# NOTE(review): column 1 is presumably the probability of class 1 -- confirm
# the class order via model.classes_.
submission = pd.DataFrame({'probability': predict[:, 1]}, index=data_test.index)
submission.to_csv('result.csv')