In [1]:
#####################
# IMPORT LIBS
#####################

import pandas as pd
import numpy as np
from pathlib import Path
import wandb
import datetime
import os
import random
import joblib
import shutil

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scoring import local_scorer

from lightgbm import LGBMClassifier


#####################
# SET CONSTANTS
#####################

INPUT_PATH = Path('../input')
OUTPUT_PATH = Path('../output')
TRAIN_PATH = INPUT_PATH 

TARGET_COLUMNS = ['sale_flg', 'sale_amount', 'contacts']
FIXED_SEEDS = [948, 534, 432, 597, 103, 21, 2242, 17, 20, 29]

RANDOM_SEED = 4444
USE_WANDB = False
CURRENT_TIME = str(datetime.datetime.now()).replace(' ', '_').split('.')[0]

def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(RANDOM_SEED)

In [2]:
###############
# Config
###############

n_seed = 3
n_fold = 3
prediction_threshold = 0.2
retrain_after_valid = True
make_submission = True

In [3]:
if USE_WANDB:
    wandb.login()
    run = wandb.init(project="idao-2021-finals", name = f'{CURRENT_TIME}') # todo add config here

In [4]:
%%time

transactions = pd.read_csv(INPUT_PATH / 'trxn.csv')
assets_under_management = pd.read_csv(INPUT_PATH / 'aum.csv')
balance = pd.read_csv(INPUT_PATH / 'balance.csv')
client = pd.read_csv(INPUT_PATH / 'client.csv')
campaigns = pd.read_csv(INPUT_PATH / 'com.csv')
deals = pd.read_csv(INPUT_PATH / 'deals.csv')
dict_merchant_category_code = pd.read_csv(INPUT_PATH / 'dict_mcc.csv')
payments = pd.read_csv(INPUT_PATH / 'payments.csv')
funnel = pd.read_csv(INPUT_PATH / 'funnel.csv')
appl = pd.read_csv(INPUT_PATH / 'appl.csv')

In [5]:
payments = payments.sort_values(by='day_dt', ascending = False).reset_index(drop = True)

In [6]:
def most_common(x, default='unknown'):
    try:
        return x.value_counts().index[0] 
    except: 
        return default


def create_features_transactions(data):
    
    data = data.copy()
    
    # transaction features
    data['cards_count'] = data['client_id'].map(transactions.groupby('client_id')['card_id'].count()).fillna(0)
    data['total_transaction_amount'] = data['client_id'].map(transactions.groupby('client_id')['tran_amt_rur'].sum()).fillna(0) # add monthly, daily, etc
    data['mean_transaction_amount'] = data['client_id'].map(transactions.groupby('client_id')['tran_amt_rur'].mean()).fillna(0) # add monthly, daily, etc
    data['std_transaction_amount'] = data['client_id'].map(transactions.groupby('client_id')['tran_amt_rur'].std()).fillna(0) # add monthly, daily, etc
    
    data['most_common_mcc_cd'] = data['client_id'].map(transactions.groupby('client_id')['mcc_cd'].agg(lambda x:most_common(x))).fillna(-1)
    data['most_common_txn_comment_1'] = data['client_id'].map(transactions.groupby('client_id')['txn_comment_1'].agg(lambda x:most_common(x, 'unknown'))).fillna('unknown')
    data['most_common_txn_city'] = data['client_id'].map(transactions.groupby('client_id')['txn_city'].agg(lambda x: most_common(x, 'unknown'))).fillna('unknown')
    data['most_common_txn_country'] = data['client_id'].map(transactions.groupby('client_id')['txn_country'].agg(lambda x:most_common(x, 'unknown'))).fillna('unknown')
    
    data['number_of_transaction_countries'] = data['client_id'].map(transactions.groupby('client_id')['txn_city'].agg(lambda x:x.value_counts().shape[0]))
    data['number_of_transaction_cities'] = data['client_id'].map(transactions.groupby('client_id')['txn_country'].agg(lambda x:x.value_counts().shape[0]))
    
    return data

def create_feautures_payments(data):
    data = data.copy()
    
    # payments 
    data['last_known_salary'] = data['client_id'].map(payments.groupby('client_id').apply(lambda x: x['sum_rur'].iloc[0])).fillna(-1)
    data['total_recieved_salary'] = data['client_id'].map(payments.groupby('client_id').apply(lambda x: x['sum_rur'].sum())).fillna(-1)
    
    return data

def create_features_deals(data):
    data = data.copy()
    
    data['number_of_deals'] = data['client_id'].map(deals.groupby('client_id')['crncy_cd'].count()).fillna(0)
    data['mean_deal_sum'] = data['client_id'].map(deals.groupby('client_id')['agrmnt_sum_rur'].mean()).fillna(0)
    
    return data

In [7]:
# data merge
data = funnel.copy()
data = funnel.merge(client, on=['client_id'])

In [8]:
%%time
# create features

# data = create_features_transactions(data)
# data = create_feautures_payments(data)
# data = create_features_deals(data)

In [9]:
try:
    os.mkdir(OUTPUT_PATH / 'preprocessors')
except:
    shutil.rmtree(OUTPUT_PATH / 'preprocessors')
    os.mkdir(OUTPUT_PATH / 'preprocessors')

In [10]:
data

                 client_id  sale_flg  sale_amount  contacts  feature_1  \
0      7513301859607023584         0          NaN         1          7   
1      9157009756404187626         0          NaN         1          3   
2     -1893104556496814867         0          NaN         1          5   
3      6886062013213911831         0          NaN         1          4   
4     -8156468515495593794         1    138018.05         1          7   
...                    ...       ...          ...       ...        ...   
21493  4662551505651924284         0          NaN         2          3   
21494 -8968737688687691353         0          NaN         1          9   
21495 -5799097497074119478         0          NaN         1          2   
21496 -2264802671063321355         1     43882.85         2          8   
21497  -174115151336149439         0          NaN         1          3   

       client_segment  feature_2  feature_3  feature_4  feature_5  ...  \
0                13.0   571533.0    1

In [11]:
for c in fill_cols:
    col_type = data[c].dtype
    print(col_type)

In [12]:
fill_cols = ['gender', 'citizenship', 'education', 'job_type']
for c in fill_cols:
    col_type = data[c].dtype
    print(col_type)

In [13]:
fill_cols = ['gender', 'citizenship', 'education', 'job_type']
for c in data.columns:
    col_type = data[c].dtype
    print(col_type)

In [14]:
#####################
# IMPORT LIBS
#####################

import pandas as pd
import numpy as np
from pathlib import Path
import wandb
import datetime
import os
import random
import joblib
import shutil

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scoring import local_scorer

from lightgbm import LGBMClassifier


#####################
# SET CONSTANTS
#####################

INPUT_PATH = Path('../input')
OUTPUT_PATH = Path('../output')
TRAIN_PATH = INPUT_PATH 

TARGET_COLUMNS = ['sale_flg', 'sale_amount', 'contacts']
FIXED_SEEDS = [948, 534, 432, 597, 103, 21, 2242, 17, 20, 29]

RANDOM_SEED = 4444
USE_WANDB = True
CURRENT_TIME = str(datetime.datetime.now()).replace(' ', '_').split('.')[0]

def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(RANDOM_SEED)

In [15]:
###############
# Config
###############

n_seed = 3
n_fold = 3
prediction_threshold = 0.2
retrain_after_valid = True
make_submission = True

In [16]:
if USE_WANDB:
    wandb.login()
    run = wandb.init(project="idao-2021-finals", name = f'{CURRENT_TIME}') # todo add config here

In [17]:
%%time

transactions = pd.read_csv(INPUT_PATH / 'trxn.csv')
assets_under_management = pd.read_csv(INPUT_PATH / 'aum.csv')
balance = pd.read_csv(INPUT_PATH / 'balance.csv')
client = pd.read_csv(INPUT_PATH / 'client.csv')
campaigns = pd.read_csv(INPUT_PATH / 'com.csv')
deals = pd.read_csv(INPUT_PATH / 'deals.csv')
dict_merchant_category_code = pd.read_csv(INPUT_PATH / 'dict_mcc.csv')
payments = pd.read_csv(INPUT_PATH / 'payments.csv')
funnel = pd.read_csv(INPUT_PATH / 'funnel.csv')
appl = pd.read_csv(INPUT_PATH / 'appl.csv')

In [18]:
payments = payments.sort_values(by='day_dt', ascending = False).reset_index(drop = True)

In [19]:
def most_common(x, default='unknown'):
    try:
        return x.value_counts().index[0] 
    except: 
        return default


def create_features_transactions(data):
    
    data = data.copy()
    
    # transaction features
    data['cards_count'] = data['client_id'].map(transactions.groupby('client_id')['card_id'].count()).fillna(0)
    data['total_transaction_amount'] = data['client_id'].map(transactions.groupby('client_id')['tran_amt_rur'].sum()).fillna(0) # add monthly, daily, etc
    data['mean_transaction_amount'] = data['client_id'].map(transactions.groupby('client_id')['tran_amt_rur'].mean()).fillna(0) # add monthly, daily, etc
    data['std_transaction_amount'] = data['client_id'].map(transactions.groupby('client_id')['tran_amt_rur'].std()).fillna(0) # add monthly, daily, etc
    
    data['most_common_mcc_cd'] = data['client_id'].map(transactions.groupby('client_id')['mcc_cd'].agg(lambda x:most_common(x))).fillna(-1)
    data['most_common_txn_comment_1'] = data['client_id'].map(transactions.groupby('client_id')['txn_comment_1'].agg(lambda x:most_common(x, 'unknown'))).fillna('unknown')
    data['most_common_txn_city'] = data['client_id'].map(transactions.groupby('client_id')['txn_city'].agg(lambda x: most_common(x, 'unknown'))).fillna('unknown')
    data['most_common_txn_country'] = data['client_id'].map(transactions.groupby('client_id')['txn_country'].agg(lambda x:most_common(x, 'unknown'))).fillna('unknown')
    
    data['number_of_transaction_countries'] = data['client_id'].map(transactions.groupby('client_id')['txn_city'].agg(lambda x:x.value_counts().shape[0]))
    data['number_of_transaction_cities'] = data['client_id'].map(transactions.groupby('client_id')['txn_country'].agg(lambda x:x.value_counts().shape[0]))
    
    return data

def create_feautures_payments(data):
    data = data.copy()
    
    # payments 
    data['last_known_salary'] = data['client_id'].map(payments.groupby('client_id').apply(lambda x: x['sum_rur'].iloc[0])).fillna(-1)
    data['total_recieved_salary'] = data['client_id'].map(payments.groupby('client_id').apply(lambda x: x['sum_rur'].sum())).fillna(-1)
    
    return data

def create_features_deals(data):
    data = data.copy()
    
    data['number_of_deals'] = data['client_id'].map(deals.groupby('client_id')['crncy_cd'].count()).fillna(0)
    data['mean_deal_sum'] = data['client_id'].map(deals.groupby('client_id')['agrmnt_sum_rur'].mean()).fillna(0)
    
    return data

In [20]:
# data merge
data = funnel.copy()
data = funnel.merge(client, on=['client_id'])

In [21]:
%%time
# create features

# data = create_features_transactions(data)
# data = create_feautures_payments(data)
# data = create_features_deals(data)

In [22]:
try:
    os.mkdir(OUTPUT_PATH / 'preprocessors')
except:
    shutil.rmtree(OUTPUT_PATH / 'preprocessors')
    os.mkdir(OUTPUT_PATH / 'preprocessors')

In [23]:
# data encode

fill_cols = ['gender', 'citizenship', 'education', 'job_type']
for c in fill_cols:
    col_type = data[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        data[c] = data[c].astype('category')
# for col in fill_cols:
#     try:
#         le = LabelEncoder()
#         data[col] = le.fit_transform(data[col].astype(str))
#         joblib.dump(le, OUTPUT_PATH / 'preprocessors' / f'{col}.pkl')
#     except:
#         print(f'{col} is missing')

In [24]:
X = data.drop(columns = TARGET_COLUMNS + ['client_id'])
Y = data[TARGET_COLUMNS[0]]

In [25]:
import shutil
try:
    os.mkdir(OUTPUT_PATH / 'models')
except:
    shutil.rmtree(OUTPUT_PATH / 'models')
    os.mkdir(OUTPUT_PATH / 'models')

In [26]:
def running_train(X_train, Y_train, X_val, Y_val, i_fold=None, seed=None, params = None):
    # prepare for train
    
    params = {
              "n_jobs":-1,
              "random_state": seed,
              }
    
    
    model = LGBMClassifier(**params) # define model here
    
    # Fit and save model
    
    if X_val is None:
        model.fit(X_train, Y_train, verbose=False)
    else:
        model.fit(X_train, Y_train,   eval_set=(X_val, Y_val), early_stopping_rounds=500, verbose=False)
    joblib.dump(model, OUTPUT_PATH / 'models' / f'lightgbm_{i_fold}_{seed}_{CURRENT_TIME}.pkl')

In [27]:
oof = np.zeros((X.shape[0], n_seed)) # cv_score
seeds = []
for i_seed in range(n_seed):
    seed = FIXED_SEEDS[i_seed]
    seed_everything(seed)

    seeds.append(seed)
    print('Seed: {}, {}/{}'.format(seed, i_seed + 1, n_seed))
    
    if n_fold != 1:
        kf = KFold(n_splits=n_fold, random_state=seed, shuffle=True)
        split_indexes = kf.split(X, Y)
    else:
        split_indexes = [train_test_split(np.arange(X.shape[0]), random_state=seed, shuffle = True)]
    
    for i_fold, (train_idx, val_idx) in enumerate(split_indexes):
        print("# Fold: {}/{} (seed: {}/{})".format(i_fold + 1, n_fold, i_seed + 1, n_seed))

        # dataset
        X_train, Y_train = X.iloc[train_idx], Y[train_idx]
        X_val, Y_val = X.iloc[val_idx], Y[val_idx]


        # train
        running_train(X_train, Y_train, X_val, Y_val, i_fold=i_fold, seed=seed)

        # predict on oof
        print('predict on oof...', end='')
        model = joblib.load( OUTPUT_PATH / 'models' / f'lightgbm_{i_fold}_{seed}_{CURRENT_TIME}.pkl')

        prediction = model.predict_proba(X_val)[:, 1]

        oof[val_idx, i_seed] = prediction
        print('  done.')

In [28]:
if n_fold != 1:
    Y_predicted = (np.mean(oof, axis = 1) > prediction_threshold).astype(int)
    Y_test = funnel[['client_id', 'sale_flg']].set_index('client_id')
    test_funnel =  funnel.set_index('client_id')
if n_fold == 1 and n_seed == 1:
    Y_predicted = (prediction > prediction_threshold).astype(int)
    Y_test = funnel[['client_id', 'sale_flg']].iloc[split_indexes[0][1]].set_index('client_id')
    test_funnel = funnel.iloc[split_indexes[0][1]].set_index('client_id')

In [29]:
try: 
    os.mkdir(OUTPUT_PATH / 'scoring')
except:
    shutil.rmtree(OUTPUT_PATH / 'scoring')
    os.mkdir(OUTPUT_PATH / 'scoring')

In [30]:
public_score, private_score = local_scorer.get_score(test_funnel, Y_predicted, Y_test)

In [31]:
validation_accuracy = accuracy_score(Y_test['sale_flg'], Y_predicted)
print(f'Public ANIC {public_score} Private ANIC {private_score}')
print(f'ANIC {1/3*public_score+ 2/3 * private_score}')
print(f'Accuracy score: {validation_accuracy}')

In [32]:
if retrain_after_valid:
    running_train(X, Y, None, None, i_fold=-1, seed=4444)

In [33]:
if USE_WANDB:
    wandb.run.summary["validation_accuracy"] = validation_accuracy
    wandb.run.summary["anic"] = (public_score + private_score) / 2

In [34]:
if make_submission:
    public_anic = float(input())
    wandb.run.summary["public_anic"] = public_anic

In [35]:
if USE_WANDB:
    run.finish()

In [36]:
#####################
# IMPORT LIBS
#####################

import pandas as pd
import numpy as np
from pathlib import Path
import wandb
import datetime
import os
import random
import joblib
import shutil

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scoring import local_scorer

from lightgbm import LGBMClassifier


#####################
# SET CONSTANTS
#####################

INPUT_PATH = Path('../input')
OUTPUT_PATH = Path('../output')
TRAIN_PATH = INPUT_PATH 

TARGET_COLUMNS = ['sale_flg', 'sale_amount', 'contacts']
FIXED_SEEDS = [948, 534, 432, 597, 103, 21, 2242, 17, 20, 29]

RANDOM_SEED = 4444
USE_WANDB = True
CURRENT_TIME = str(datetime.datetime.now()).replace(' ', '_').split('.')[0]

def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(RANDOM_SEED)

In [37]:
###############
# Config
###############

n_seed = 3
n_fold = 3
prediction_threshold = 0.1
retrain_after_valid = True
make_submission = False

In [38]:
if USE_WANDB:
    wandb.login()
    run = wandb.init(project="idao-2021-finals", name = f'{CURRENT_TIME}') # todo add config here

In [39]:
%%time

transactions = pd.read_csv(INPUT_PATH / 'trxn.csv')
assets_under_management = pd.read_csv(INPUT_PATH / 'aum.csv')
balance = pd.read_csv(INPUT_PATH / 'balance.csv')
client = pd.read_csv(INPUT_PATH / 'client.csv')
campaigns = pd.read_csv(INPUT_PATH / 'com.csv')
deals = pd.read_csv(INPUT_PATH / 'deals.csv')
dict_merchant_category_code = pd.read_csv(INPUT_PATH / 'dict_mcc.csv')
payments = pd.read_csv(INPUT_PATH / 'payments.csv')
funnel = pd.read_csv(INPUT_PATH / 'funnel.csv')
appl = pd.read_csv(INPUT_PATH / 'appl.csv')

In [40]:
payments = payments.sort_values(by='day_dt', ascending = False).reset_index(drop = True)

In [41]:
def most_common(x, default='unknown'):
    try:
        return x.value_counts().index[0] 
    except: 
        return default


def create_features_transactions(data):
    
    data = data.copy()
    
    # transaction features
    data['cards_count'] = data['client_id'].map(transactions.groupby('client_id')['card_id'].count()).fillna(0)
    data['total_transaction_amount'] = data['client_id'].map(transactions.groupby('client_id')['tran_amt_rur'].sum()).fillna(0) # add monthly, daily, etc
    data['mean_transaction_amount'] = data['client_id'].map(transactions.groupby('client_id')['tran_amt_rur'].mean()).fillna(0) # add monthly, daily, etc
    data['std_transaction_amount'] = data['client_id'].map(transactions.groupby('client_id')['tran_amt_rur'].std()).fillna(0) # add monthly, daily, etc
    
    data['most_common_mcc_cd'] = data['client_id'].map(transactions.groupby('client_id')['mcc_cd'].agg(lambda x:most_common(x))).fillna(-1)
    data['most_common_txn_comment_1'] = data['client_id'].map(transactions.groupby('client_id')['txn_comment_1'].agg(lambda x:most_common(x, 'unknown'))).fillna('unknown')
    data['most_common_txn_city'] = data['client_id'].map(transactions.groupby('client_id')['txn_city'].agg(lambda x: most_common(x, 'unknown'))).fillna('unknown')
    data['most_common_txn_country'] = data['client_id'].map(transactions.groupby('client_id')['txn_country'].agg(lambda x:most_common(x, 'unknown'))).fillna('unknown')
    
    data['number_of_transaction_countries'] = data['client_id'].map(transactions.groupby('client_id')['txn_city'].agg(lambda x:x.value_counts().shape[0]))
    data['number_of_transaction_cities'] = data['client_id'].map(transactions.groupby('client_id')['txn_country'].agg(lambda x:x.value_counts().shape[0]))
    
    return data

def create_feautures_payments(data):
    data = data.copy()
    
    # payments 
    data['last_known_salary'] = data['client_id'].map(payments.groupby('client_id').apply(lambda x: x['sum_rur'].iloc[0])).fillna(-1)
    data['total_recieved_salary'] = data['client_id'].map(payments.groupby('client_id').apply(lambda x: x['sum_rur'].sum())).fillna(-1)
    
    return data

def create_features_deals(data):
    data = data.copy()
    
    data['number_of_deals'] = data['client_id'].map(deals.groupby('client_id')['crncy_cd'].count()).fillna(0)
    data['mean_deal_sum'] = data['client_id'].map(deals.groupby('client_id')['agrmnt_sum_rur'].mean()).fillna(0)
    
    return data

In [42]:
# data merge
data = funnel.copy()
data = funnel.merge(client, on=['client_id'])

In [43]:
%%time
# create features

# data = create_features_transactions(data)
# data = create_feautures_payments(data)
# data = create_features_deals(data)

In [44]:
try:
    os.mkdir(OUTPUT_PATH / 'preprocessors')
except:
    shutil.rmtree(OUTPUT_PATH / 'preprocessors')
    os.mkdir(OUTPUT_PATH / 'preprocessors')

In [45]:
# data encode

fill_cols = ['gender', 'citizenship', 'education', 'job_type']
for c in fill_cols:
    col_type = data[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        data[c] = data[c].astype('category')
# for col in fill_cols:
#     try:
#         le = LabelEncoder()
#         data[col] = le.fit_transform(data[col].astype(str))
#         joblib.dump(le, OUTPUT_PATH / 'preprocessors' / f'{col}.pkl')
#     except:
#         print(f'{col} is missing')

In [46]:
X = data.drop(columns = TARGET_COLUMNS + ['client_id'])
Y = data[TARGET_COLUMNS[0]]

In [47]:
import shutil
try:
    os.mkdir(OUTPUT_PATH / 'models')
except:
    shutil.rmtree(OUTPUT_PATH / 'models')
    os.mkdir(OUTPUT_PATH / 'models')

In [48]:
def running_train(X_train, Y_train, X_val, Y_val, i_fold=None, seed=None, params = None):
    # prepare for train
    
    params = {
              "n_jobs":-1,
              "random_state": seed,
              }
    
    
    model = LGBMClassifier(**params) # define model here
    
    # Fit and save model
    
    if X_val is None:
        model.fit(X_train, Y_train, verbose=False)
    else:
        model.fit(X_train, Y_train,   eval_set=(X_val, Y_val), early_stopping_rounds=500, verbose=False)
    joblib.dump(model, OUTPUT_PATH / 'models' / f'lightgbm_{i_fold}_{seed}_{CURRENT_TIME}.pkl')

In [49]:
oof = np.zeros((X.shape[0], n_seed)) # cv_score
seeds = []
for i_seed in range(n_seed):
    seed = FIXED_SEEDS[i_seed]
    seed_everything(seed)

    seeds.append(seed)
    print('Seed: {}, {}/{}'.format(seed, i_seed + 1, n_seed))
    
    if n_fold != 1:
        kf = KFold(n_splits=n_fold, random_state=seed, shuffle=True)
        split_indexes = kf.split(X, Y)
    else:
        split_indexes = [train_test_split(np.arange(X.shape[0]), random_state=seed, shuffle = True)]
    
    for i_fold, (train_idx, val_idx) in enumerate(split_indexes):
        print("# Fold: {}/{} (seed: {}/{})".format(i_fold + 1, n_fold, i_seed + 1, n_seed))

        # dataset
        X_train, Y_train = X.iloc[train_idx], Y[train_idx]
        X_val, Y_val = X.iloc[val_idx], Y[val_idx]


        # train
        running_train(X_train, Y_train, X_val, Y_val, i_fold=i_fold, seed=seed)

        # predict on oof
        print('predict on oof...', end='')
        model = joblib.load( OUTPUT_PATH / 'models' / f'lightgbm_{i_fold}_{seed}_{CURRENT_TIME}.pkl')

        prediction = model.predict_proba(X_val)[:, 1]

        oof[val_idx, i_seed] = prediction
        print('  done.')

In [50]:
if n_fold != 1:
    Y_predicted = (np.mean(oof, axis = 1) > prediction_threshold).astype(int)
    Y_test = funnel[['client_id', 'sale_flg']].set_index('client_id')
    test_funnel =  funnel.set_index('client_id')
if n_fold == 1 and n_seed == 1:
    Y_predicted = (prediction > prediction_threshold).astype(int)
    Y_test = funnel[['client_id', 'sale_flg']].iloc[split_indexes[0][1]].set_index('client_id')
    test_funnel = funnel.iloc[split_indexes[0][1]].set_index('client_id')

In [51]:
try: 
    os.mkdir(OUTPUT_PATH / 'scoring')
except:
    shutil.rmtree(OUTPUT_PATH / 'scoring')
    os.mkdir(OUTPUT_PATH / 'scoring')

In [52]:
public_score, private_score = local_scorer.get_score(test_funnel, Y_predicted, Y_test)

In [53]:
validation_accuracy = accuracy_score(Y_test['sale_flg'], Y_predicted)
print(f'Public ANIC {public_score} Private ANIC {private_score}')
print(f'ANIC {1/3*public_score+ 2/3 * private_score}')
print(f'Accuracy score: {validation_accuracy}')

In [54]:
if retrain_after_valid:
    running_train(X, Y, None, None, i_fold=-1, seed=4444)

In [55]:
if USE_WANDB:
    wandb.run.summary["validation_accuracy"] = validation_accuracy
    wandb.run.summary["anic"] = 1/3*public_score+ 2/3 * private_score

In [56]:
if make_submission:
    public_anic = float(input())
    wandb.run.summary["public_anic"] = public_anic

In [57]:
if USE_WANDB:
    run.finish()