In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
import catboost
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
import joblib

from tqdm.notebook import tqdm_notebook
from warnings import filterwarnings
filterwarnings('ignore')
import os

In [2]:
# Directory holding the input CSV files and directory holding the serialized model.
PATH_DATA = './data'
MODEL_PATH = "./submission/models/"

# NOTE(review): joblib.load unpickles the file, so only point this at a trusted artifact.
model_file = os.path.join(MODEL_PATH, "model.pkl")
clf = joblib.load(model_file)

FileNotFoundError: [Errno 2] No such file or directory: './submission/models/model.pkl'

In [None]:
# Reference tables (';'-separated), each indexed by its own code column.
mcc_codes_file = os.path.join(PATH_DATA, 'mcc_codes.csv')
tr_mcc_codes = pd.read_csv(mcc_codes_file, sep=';', index_col='mcc_code')

trans_types_file = os.path.join(PATH_DATA, 'trans_types.csv')
tr_types = pd.read_csv(trans_types_file, sep=';', index_col='trans_type')

# Transaction log and the list of test clients, both indexed by client_id.
transactions_file = os.path.join(PATH_DATA, 'transactions.csv')
transactions = pd.read_csv(transactions_file, index_col='client_id')

test_clients_file = os.path.join(PATH_DATA, 'test.csv')
gender_test = pd.read_csv(test_clients_file, index_col='client_id')

# Inner join on the index keeps only transactions whose client_id appears in test.csv.
transactions_test = transactions.join(gender_test, how='inner')

In [None]:
def conditions_day(x):
    """Map a day-of-week number to a coarse day category.

    Returns 'friday' for 4, 'weekend' for 5 or 6, and 'weekday' for
    any other value.
    """
    if x == 4:
        return 'friday'
    if x in (5, 6):
        return 'weekend'
    return 'weekday'


# Element-wise version of conditions_day for array input.
conds_d = np.vectorize(conditions_day)

def conditions_hour(x):
    """Map an hour number to a part-of-day label.

    6 <= x < 12 is 'morning', 12 <= x < 18 is 'daytime',
    18 <= x <= 23 is 'evening'; every other value is 'night'.
    """
    if 6 <= x < 12:
        return 'morning'
    if 12 <= x < 18:
        return 'daytime'
    if 18 <= x <= 23:
        return 'evening'
    return 'night'


# Element-wise version of conditions_hour for array input.
conds_h = np.vectorize(conditions_hour)

In [None]:
# Derive calendar features for every transaction in one vectorised pass.
#
# The previous row-by-row loop wrote with `.at[idx, col]`, where idx is the
# client_id index label. That label repeats for each transaction of a client,
# so every write landed on all of the client's rows and each transaction ended
# up with the values of the client's last-iterated transaction. Computing the
# columns from the whole 'trans_time' Series gives each row its own values.
#
# NOTE(review): if the model in `clf` was trained on features built with the
# old loop, these corrected features differ from what it saw — confirm against
# the training notebook.
#
# 'trans_time' is assumed to look like "<day number> <HH>:<MM>:<SS>"
# (TODO confirm against the data): the first whitespace-separated token is the
# day counter, the digits after the first space are the hour.
trans_time = transactions_test['trans_time']
day_number = trans_time.str.split().str[0].astype(int)

# Numeric columns are kept as float: the old loop created them through
# enlargement, which yields float columns, and downstream feature names are
# built from the values (e.g. 'hour_6.0').
hour_of_day = trans_time.str.extract(r' (\d+)', expand=False).astype(float)
day_of_week = (day_number % 7).astype(float)

transactions_test = transactions_test.assign(
    day_month=(day_number % 30).astype(float),
    month=((day_number // 30) % 12).astype(float),
    day=day_of_week,
    hour=hour_of_day,
    # Series.map applies the scalar helpers element-wise and, unlike
    # np.vectorize, also accepts an empty input.
    type_day=day_of_week.map(conditions_day),
    part_day=hour_of_day.map(conditions_hour),
)

In [None]:
# Register tqdm's pandas integration so that `progress_apply` (used by the
# per-client feature extraction) is available and shows a bar labelled "Progress:".
tqdm_notebook.pandas(desc="Progress:")

In [None]:
def features_advanced(x):
    """Build one flat feature Series from a frame of transactions.

    `x` must provide the columns 'day_month', 'month', 'day', 'hour',
    'type_day', 'part_day', 'amount', 'mcc_code' and 'trans_type'.

    The result concatenates, in this order:
      * normalized value counts of the six calendar columns,
      * min/max/mean/median/std/count/sum of the positive amounts and of the
        negative amounts,
      * normalized value counts of 'mcc_code' and 'trans_type',
      * normalized value counts of the ('day_month', 'part_day') pairs.
    Every entry's label is prefixed with the name of its feature family.
    """
    calendar_prefixes = [
        ('day_month', 'day_month_'),
        ('month', 'month_'),
        ('day', 'day_'),
        ('hour', 'hour_'),
        ('type_day', 'type_day_'),
        ('part_day', 'part_day_'),
    ]
    pieces = [
        x[column].value_counts(normalize=True).add_prefix(prefix)
        for column, prefix in calendar_prefixes
    ]

    # Summary statistics of incoming (> 0) and outgoing (< 0) amounts; zeros are ignored.
    amount_stats = ['min', 'max', 'mean', 'median', 'std', 'count', 'sum']
    incoming = x[x['amount'] > 0]['amount']
    outgoing = x[x['amount'] < 0]['amount']
    pieces.append(incoming.agg(amount_stats).add_prefix('positive_transactions_'))
    pieces.append(outgoing.agg(amount_stats).add_prefix('negative_transactions_'))

    for column, prefix in [('mcc_code', 'mcc_'), ('trans_type', 'tr_')]:
        pieces.append(x[column].value_counts(normalize=True).add_prefix(prefix))

    # Joint share of each (day_month, part_day) combination; these entries are
    # keyed by a pair of labels rather than a single string.
    pair_share = x[['day_month', 'part_day']].value_counts(normalize=True)
    pieces.append(pair_share.add_prefix('dm_pd_'))

    return pd.concat(pieces)

# One row per client: apply features_advanced to each client's transactions
# (with a progress bar), then pivot the innermost index level — the feature
# labels — into columns.
per_client = transactions_test.groupby(transactions_test.index)
client_features = per_client.progress_apply(features_advanced)
data_test = client_features.unstack(-1)

# 'gender' column of the clients present in both the feature table and gender_test.
features_with_labels = data_test.join(gender_test, how='inner')
target = features_with_labels['gender']
target.value_counts()

In [None]:

# The submission cell reads column 1 of y_pred as a probability, so a 2-D
# array of class probabilities is required. `predict` returns class labels
# (1-D), which makes `y_pred[:, 1]` fail; `predict_proba` returns one column
# per class.
# NOTE(review): clf is presumably the CatBoostClassifier imported above —
# confirm the pickled object exposes predict_proba.
y_pred = clf.predict_proba(data_test)

In [None]:

# Column 1 of y_pred is written out as 'probability', one row per entry of
# data_test's index, to result.csv.
submission = pd.DataFrame({'probability': y_pred[:, 1]}, index=data_test.index)
submission.to_csv('result.csv')