In [1]:
#####################
# IMPORT LIBS
#####################

import pandas as pd
import numpy as np
from pathlib import Path
import wandb
import datetime
import os
import random
import joblib

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scoring import local_scorer

from lightgbm import LGBMClassifier


#####################
# SET CONSTANTS
#####################

INPUT_PATH = Path('../input')
OUTPUT_PATH = Path('../output')
TRAIN_PATH = INPUT_PATH 

TARGET_COLUMNS = ['sale_flg', 'sale_amount', 'contacts']
FIXED_SEEDS = [948, 534, 432, 597, 103, 21, 2242, 17, 20, 29]

RANDOM_SEED = 4444
USE_WANDB = False
CURRENT_TIME = str(datetime.datetime.now()).replace(' ', '_').split('.')[0]

def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(RANDOM_SEED)

In [151]:
###############
# Config
###############

n_seed = 3
n_fold = 3
prediction_threshold = 0.2
retrain_after_valid = True

In [3]:
if USE_WANDB:
    wandb.login()
    run = wandb.init(project="idao-2021-finals", name = f'{CURRENT_TIME}') # todo add config here

In [4]:
%%time

transactions = pd.read_csv(INPUT_PATH / 'trxn.csv')
assets_under_management = pd.read_csv(INPUT_PATH / 'aum.csv')
balance = pd.read_csv(INPUT_PATH / 'balance.csv')
client = pd.read_csv(INPUT_PATH / 'client.csv')
campaigns = pd.read_csv(INPUT_PATH / 'com.csv')
deals = pd.read_csv(INPUT_PATH / 'deals.csv')
dict_merchant_category_code = pd.read_csv(INPUT_PATH / 'dict_mcc.csv')
payments = pd.read_csv(INPUT_PATH / 'payments.csv')
funnel = pd.read_csv(INPUT_PATH / 'funnel.csv')
appl = pd.read_csv(INPUT_PATH / 'appl.csv')

  return caller(func, *(extras + args), **kw)


CPU times: user 6.58 s, sys: 1.17 s, total: 7.75 s
Wall time: 7.75 s


In [None]:
payments = payments.sort_values(by='day_dt', ascending = False).reset_index(drop = True)

## Data prep

In [197]:
deals

Unnamed: 0,client_id,agrmnt_start_dt,agrmnt_close_dt,crncy_cd,agrmnt_rate_active,agrmnt_rate_passive,agrmnt_sum_rur,prod_type_name
0,7513301859607023584,2010-08-12,2014-10-30,810.0,,,0.0,Cash on demand
1,7513301859607023584,2013-02-15,2013-08-16,810.0,,,0.0,Cash on demand
2,7513301859607023584,2013-08-16,2014-02-14,810.0,,,0.0,Cash on demand
3,7513301859607023584,2015-07-12,2015-07-12,810.0,,,0.0,Cash on demand
4,7513301859607023584,2015-07-12,2015-07-12,810.0,,,0.0,Cash on demand
...,...,...,...,...,...,...,...,...
109011,-8242641659611256965,2011-08-10,2011-08-10,810.0,,,0.0,POST OFFICE
109012,-8242641659611256965,2011-08-10,2018-07-03,810.0,,3.25,13089.0,POST OFFICE
109013,-8242641659611256965,2011-08-10,2011-08-10,810.0,,,0.0,POST OFFICE
109014,-8242641659611256965,2011-08-23,2012-09-18,810.0,,7.00,5403.0,POST OFFICE


In [198]:
deals.groupby('client_id')['agrmnt_sum_rur'].mean()

client_id
-9221941791080978530    0.000000e+00
-9220369594510368140    6.634224e+07
-9220236243053692422    0.000000e+00
-9220233431709087652    9.242713e+04
-9219699286371310531    0.000000e+00
                            ...     
 9218801691173598782    2.723500e+03
 9219024469308275500    0.000000e+00
 9219968212912398941    0.000000e+00
 9220335314469087849    9.004667e+03
 9223107459698100059    0.000000e+00
Name: agrmnt_sum_rur, Length: 18652, dtype: float64

In [199]:
def most_common(x, default='unknown'):
    try:
        return x.value_counts().index[0] 
    except: 
        return default


def create_features_transactions(data):
    
    data = data.copy()
    
    # transaction features
    data['cards_count'] = data['client_id'].map(transactions.groupby('client_id')['card_id'].count()).fillna(0)
    data['total_transaction_amount'] = data['client_id'].map(transactions.groupby('client_id')['tran_amt_rur'].sum()).fillna(0) # add monthly, daily, etc
    data['mean_transaction_amount'] = data['client_id'].map(transactions.groupby('client_id')['tran_amt_rur'].mean()).fillna(0) # add monthly, daily, etc
    data['std_transaction_amount'] = data['client_id'].map(transactions.groupby('client_id')['tran_amt_rur'].std()).fillna(0) # add monthly, daily, etc
    
    data['most_common_mcc_cd'] = data['client_id'].map(transactions.groupby('client_id')['mcc_cd'].agg(lambda x:most_common(x))).fillna(-1)
    data['most_common_txn_comment_1'] = data['client_id'].map(transactions.groupby('client_id')['txn_comment_1'].agg(lambda x:most_common(x, 'unknown'))).fillna('unknown')
    data['most_common_txn_city'] = data['client_id'].map(transactions.groupby('client_id')['txn_city'].agg(lambda x: most_common(x, 'unknown'))).fillna('unknown')
    data['most_common_txn_country'] = data['client_id'].map(transactions.groupby('client_id')['txn_country'].agg(lambda x:most_common(x, 'unknown'))).fillna('unknown')
    
    data['number_of_transaction_countries'] = data['client_id'].map(transactions.groupby('client_id')['txn_city'].agg(lambda x:x.value_counts().shape[0]))
    data['number_of_transaction_cities'] = data['client_id'].map(transactions.groupby('client_id')['txn_country'].agg(lambda x:x.value_counts().shape[0]))
    
    return data

def create_feautures_payments(data):
    data = data.copy()
    
    # payments 
    data['last_known_salary'] = data['client_id'].map(payments.groupby('client_id').apply(lambda x: x['sum_rur'].iloc[0])).fillna(-1)
    data['total_recieved_salary'] = data['client_id'].map(payments.groupby('client_id').apply(lambda x: x['sum_rur'].sum())).fillna(-1)
    
    return data

def create_features_deals(data):
    data = data.copy()
    
    data['number_of_deals'] = data['client_id'].map(deals.groupby('client_id')['crncy_cd'].count()).fillna(0)
    data['mean_deal_sum'] = data['client_id'].map(deals.groupby('client_id')['agrmnt_sum_rur'].mean()).fillna(0)
    
    return data

In [200]:
# data merge

data = funnel.merge(client, on=['client_id'])

In [201]:
%%time
# create features

data = create_features_transactions(data)
data = create_feautures_payments(data)

CPU times: user 38 s, sys: 196 ms, total: 38.2 s
Wall time: 38.1 s


In [212]:
data = create_features_deals(data)

In [213]:
# data encode

le = LabelEncoder()
fill_cols = ['gender', 'citizenship', 'education', 'job_type', 'most_common_txn_comment_1', 'most_common_txn_city', 'most_common_txn_country']
for col in fill_cols:
    data[col] = le.fit_transform(data[col].astype(str))
    joblib.dump(le, OUTPUT_PATH / 'preprocessors' / f'{col}.pkl')

In [214]:
X = data.drop(columns = TARGET_COLUMNS + ['client_id'])
Y = data[TARGET_COLUMNS[0]]

## Train

In [215]:
import shutil
try:
    os.mkdir(OUTPUT_PATH / 'models')
except:
    shutil.rmtree(OUTPUT_PATH / 'models')
    os.mkdir(OUTPUT_PATH / 'models')
    
try:
    os.mkdir(OUTPUT_PATH / 'preprocessors')
except:
    shutil.rmtree(OUTPUT_PATH / 'preprocessors')
    os.mkdir(OUTPUT_PATH / 'preprocessors')

In [216]:
def running_train(X_train, Y_train, X_val, Y_val, i_fold=None, seed=None, params = None):
    # prepare for train
    
    params = {
              "n_jobs":-1,
              "random_state": seed
              }
    
    
    model = LGBMClassifier(**params) # define model here
    
    # Fit and save model
    
    if X_val is None:
        model.fit(X_train, Y_train, verbose=False)
    else:
        model.fit(X_train, Y_train,   eval_set=(X_val, Y_val), early_stopping_rounds=500, verbose=False)
    joblib.dump(model, OUTPUT_PATH / 'models' / f'lightgbm_{i_fold}_{seed}_{CURRENT_TIME}.pkl')

In [217]:
oof = np.zeros((X.shape[0], n_seed)) # cv_score
seeds = []
for i_seed in range(n_seed):
    seed = FIXED_SEEDS[i_seed]
    seed_everything(seed)

    seeds.append(seed)
    print('Seed: {}, {}/{}'.format(seed, i_seed + 1, n_seed))
    
    if n_fold != 1:
        kf = KFold(n_splits=n_fold, random_state=seed, shuffle=True)
        split_indexes = kf.split(X, Y)
    else:
        split_indexes = [train_test_split(np.arange(X.shape[0]), random_state=seed, shuffle = True)]
    
    for i_fold, (train_idx, val_idx) in enumerate(split_indexes):
        print("# Fold: {}/{} (seed: {}/{})".format(i_fold + 1, n_fold, i_seed + 1, n_seed))

        # dataset
        X_train, Y_train = X.iloc[train_idx], Y[train_idx]
        X_val, Y_val = X.iloc[val_idx], Y[val_idx]


        # train
        running_train(X_train, Y_train, X_val, Y_val, i_fold=i_fold, seed=seed)

        # predict on oof
        print('predict on oof...', end='')
        model = joblib.load( OUTPUT_PATH / 'models' / f'lightgbm_{i_fold}_{seed}_{CURRENT_TIME}.pkl')

        prediction = model.predict_proba(X_val)[:, 1]

        oof[val_idx, i_seed] = prediction
        print('  done.')

Seed: 948, 1/3
# Fold: 1/3 (seed: 1/3)
predict on oof...  done.
# Fold: 2/3 (seed: 1/3)
predict on oof...  done.
# Fold: 3/3 (seed: 1/3)
predict on oof...  done.
Seed: 534, 2/3
# Fold: 1/3 (seed: 2/3)
predict on oof...  done.
# Fold: 2/3 (seed: 2/3)
predict on oof...  done.
# Fold: 3/3 (seed: 2/3)
predict on oof...  done.
Seed: 432, 3/3
# Fold: 1/3 (seed: 3/3)
predict on oof...  done.
# Fold: 2/3 (seed: 3/3)
predict on oof...  done.
# Fold: 3/3 (seed: 3/3)
predict on oof...  done.


In [218]:
if n_fold != 1:
    Y_predicted = (np.mean(oof, axis = 1) > prediction_threshold).astype(int)
    Y_test = funnel[['client_id', 'sale_flg']].set_index('client_id')
    test_funnel =  funnel.set_index('client_id')
if n_fold == 1 and n_seed == 1:
    Y_predicted = (prediction > prediction_threshold).astype(int)
    Y_test = funnel[['client_id', 'sale_flg']].iloc[split_indexes[0][1]].set_index('client_id')
    test_funnel = funnel.iloc[split_indexes[0][1]].set_index('client_id')

In [219]:
try: 
    os.mkdir(OUTPUT_PATH / 'scoring')
except:
    shutil.rmtree(OUTPUT_PATH / 'scoring')
    os.mkdir(OUTPUT_PATH / 'scoring')

In [220]:
public_score, private_score = local_scorer.get_score(test_funnel, Y_predicted, Y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inste

In [221]:
validation_accuracy = accuracy_score(Y_test['sale_flg'], Y_predicted)
print(f'Public ANIC {public_score} Private ANIC {private_score}')
print(f'ANIC {1/3*public_score+ 2/3 * private_score}')
print(f'Accuracy score: {validation_accuracy}')

Public ANIC 5365.358979905113 Private ANIC 5653.409841613172
ANIC 5557.392887710485
Accuracy score: 0.8324495301888548


In [211]:
if USE_WANDB:
    wandb.run.summary["validation_accuracy"] = validation_accuracy
    wandb.run.summary["anic"] = (public_score + private_score) / 2

In [22]:
if retrain_after_valid:
    running_train(X, Y, None, None, i_fold=-1, seed=4444)

In [23]:
if USE_WANDB:
    run.finish()