In [1]:
import os
import gc
import re
import sys
import math
import json
import time
import eli5
import lofo
import optuna
import random
import joblib
import pickle
import warnings
import difflib
import Levenshtein
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from glob import glob
from pathlib import Path
from unidecode import unidecode
import multiprocessing
from tqdm.auto import tqdm
from argparse import Namespace
import matplotlib.pyplot as plt
from BorutaShap import BorutaShap
from sklearn.metrics import f1_score, fbeta_score, roc_auc_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics.pairwise import haversine_distances
from lofo import LOFOImportance, Dataset, plot_importance
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold, StratifiedGroupKFold
from eli5.sklearn import PermutationImportance

from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore", module="lightgbm")

plt.rcParams["font.size"] = 13
sns.set_style("darkgrid")

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 600)

  from tqdm.autonotebook import tqdm


# Config

In [2]:
CFG = Namespace(
    train = True,
    full = False,
    debug = False,
    optimize = True,
    select_features = False,
    selection_type = 'corr', # feasible values: lofo, perm, shap, corr, gain
    test = False,
    folds = 0,
    seed = 42,
    pos_frac = 0,
    target = 'label',
    threshold = 0.5,
    train_path = 'train_dataset',
    model_dir = 'fsq_lgbm_models',
    es_rounds = 50
)

bad_features = ['text_sim'] + ['main_categories_te', 'city_decoded_te', 'categories_te', 'country_te'] + ['main_categories_vc', 'city_decoded_vc', 'categories_vc', 'country_vc']

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
seed_everything(CFG.seed)

# Prepare data

## Load train dataset

In [3]:
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']

def downcast_floats(df):
    floats = ['float32', 'float64']
    float_features = list(df.select_dtypes(include=floats).columns)
    for f in float_features:
        df[f] = df[f].astype('float16')
    return df
    
if CFG.full or CFG.folds:
    train_files = glob(os.path.join(CFG.train_path, "train_*.parquet"))
    valid_files = glob(os.path.join(CFG.train_path, "valid_*.parquet"))
    train_files = train_files + valid_files
else:
    train_files = glob(os.path.join(CFG.train_path, "train_*.parquet"))

train = list()
for filename in tqdm(train_files):
    # load data
    df = pd.read_parquet(filename)
    
    # set features
    features = list(df.select_dtypes(include=numerics).columns)
    features.remove(CFG.target)
    
    if CFG.debug:
        df = df.sample(n = 10000, random_state = CFG.seed)
        df = df.reset_index(drop = True)
    df = downcast_floats(df)
    train.append(df)

train = pd.concat(train, axis=0, ignore_index=True)

train['kdist'] = train['kdist'].fillna(train['kdist_country'])
train['kdist_country'] = train['kdist_country'].fillna(train['kdist'])

gc.collect()

  0%|          | 0/20 [00:00<?, ?it/s]

20

## Load validation dataset

In [4]:
if not CFG.full and not CFG.folds:
    valid_files = glob(os.path.join(CFG.train_path, "valid_*.parquet"))

    valid = list()
    for filename in tqdm(valid_files):
        # load data
        df = pd.read_parquet(filename)
        # set features
        features = list(df.select_dtypes(include=numerics).columns)
        features.remove(CFG.target)
        
        if CFG.debug:
            df = df.sample(n = 10000, random_state = CFG.seed)
            df = df.reset_index(drop = True)
        df = downcast_floats(df)
        valid.append(df)

    valid = pd.concat(valid, axis=0, ignore_index=True)

    valid['kdist'] = valid['kdist'].fillna(valid['kdist_country'])
    valid['kdist_country'] = valid['kdist_country'].fillna(valid['kdist'])
    
    gc.collect()

  0%|          | 0/20 [00:00<?, ?it/s]

## Add the rest matches

In [5]:
# if not CFG.full and not CFG.folds:
#     all_matches_files = glob(os.path.join(CFG.train_path, "all_matches_*.parquet"))
    
#     all_matches = list()
#     for filename in tqdm(all_matches_files):
#         df = pd.read_parquet(filename)
#         if CFG.debug:
#             df = df.sample(n = 10000, random_state = CFG.seed)
#             df = df.reset_index(drop = True)
#         df = downcast_floats(df)
#         all_matches.append(df)

#     all_matches = pd.concat(all_matches, axis=0, ignore_index=True)
    
    
# all_matches['label'] = 1
# all_matches = all_matches[all_matches['id'] != all_matches['match_id']]

## Add matches to train and test dataset

In [6]:
# %%time

# all_matches_ids = set(all_matches['id'].unique())

# train_ids = set(train['id'].unique())
# all_matches_train_ids = list(train_ids.intersection(all_matches_ids))

# if not CFG.full and not CFG.folds:
#     valid_ids = set(valid['id'].unique())
#     all_matches_valid_ids = list(valid_ids.intersection(all_matches_ids))

# all_matches = all_matches.set_index('id')
# all_matches_train = all_matches.loc[all_matches_train_ids]
# all_matches_train = all_matches_train.reset_index()
# train = pd.concat([train, all_matches_train], axis=0, ignore_index=True)
# train = train.drop_duplicates(['id', 'match_id'])
# del train_ids, all_matches_ids, all_matches_train_ids

# if not CFG.full and not CFG.folds:
#     all_matches_valid = all_matches.loc[all_matches_valid_ids]
#     all_matches_valid = all_matches_valid.reset_index()
#     valid = pd.concat([valid, all_matches_valid], axis=0, ignore_index=True)
#     valid = valid.drop_duplicates(['id', 'match_id'])
#     del valid_ids, all_matches_valid_ids, all_matches_valid

# del all_matches, all_matches_train
# gc.collect()

## Increase fraction of positive targets

In [7]:
%%time

if CFG.pos_frac:
    train_pos_index = train[train['label'] == 1].index
    train_neg_index = train[train['label'] == 0].index
    train_neg_index = np.random.choice(train_neg_index, size=int(len(train_pos_index)*((1-CFG.pos_frac)/CFG.pos_frac)))
    train_pos_index = np.concatenate([train_pos_index, train_neg_index])
    np.random.shuffle(train_pos_index)
    train = train.loc[train_pos_index].reset_index(drop=True)
    del train_pos_index, train_neg_index
    gc.collect()

    if not CFG.full and not CFG.folds:
        valid_pos_index = valid[valid['label'] == 1].index
        valid_neg_index = valid[valid['label'] == 0].index
        valid_neg_index = np.random.choice(valid_neg_index, size=int(len(valid_pos_index)*((1-CFG.pos_frac)/CFG.pos_frac)))
        valid_pos_index = np.concatenate([valid_pos_index, valid_neg_index])
        np.random.shuffle(valid_pos_index)
        valid = valid.loc[valid_pos_index].reset_index(drop=True)
        del valid_pos_index, valid_neg_index
        gc.collect() 

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.48 µs


##  Drop bad features

In [8]:
if bad_features:
    train = train.drop(bad_features, axis=1)
    if not CFG.full and not CFG.folds:
        valid = valid.drop(bad_features, axis=1)
    for f in bad_features:
        features.remove(f)
        
gc.collect()

0

## Split dataset by folds

In [9]:
if CFG.folds > 0:
    kf = StratifiedGroupKFold(n_splits=CFG.folds, shuffle=True, random_state=CFG.seed)
    for i, (trn_idx, val_idx) in tqdm(enumerate(kf.split(train, train["label"], train["id"]))):
        train.loc[val_idx, "fold"] = i

# Optimize with Optuna

In [None]:
# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).
global global_preds

def objective(trial):
    global global_preds
    dtrain = lgb.Dataset(train[features], label=train[CFG.target])
    dvalid = lgb.Dataset(valid[features], label=valid[CFG.target])

    param = {
        'seed': CFG.seed,
#         'device': 'gpu',
#         'gpu_platform_id': 0,
#         'gpu_device_id': 0,
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': trial.suggest_categorical("boosting_type", ['gbdt']),# 'dart', 'goss']),
        'force_col_wise': False, # Use only with CPU devices
        'subsample_for_bin': 300000, # Number of data that sampled to construct feature discrete bins; setting this 
                                     # to larger value will give better training result but may increase train time
        'n_estimators': 300, #trial.suggest_int('n_estimators', 300, 1000),      
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 3e-1),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256), # Max number of leaves in one tree
        'max_bin': trial.suggest_int('max_bin', 32, 255), # Max number of bins that feature values will be 
                                                           # bucketed in. small number of bins may reduce training 
                                                           # accuracy but may deal with overfitting
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0), # Randomly select a subset of features 
                                                                               # if feature_fraction < 1.0
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0), # Randomly select part of data without 
                                                                               # resampling if bagging_fraction < 1.0
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7), # Perform bagging at every k iteration
        'min_data_in_leaf': trial.suggest_int('min_child_samples', 5, 64), # Minimal number of data in one leaf
                                                                            # aliases: min_child_samples, 
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-4, 1e-1), # Stop trying to split 
                                                                                               # leave if sum of it's
                                                                                               # hessian less than k
#         'cat_smooth': trial.suggest_float('cat_smooth', 10.0, 100.0), # this can reduce the effect of noises in 
#                                                                       # categorical features, especially for 
#                                                                       # categories with few data
    }

    # Add a callback for pruning.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'auc')
    gbm = lgb.train(
        param, 
        dtrain, 
        valid_sets=[dvalid],
        callbacks = [lgb.log_evaluation(100), 
                     lgb.early_stopping(stopping_rounds=50)]
    )

    # Evaluation
    preds = gbm.predict(valid[features])
    global_preds = preds
    roc_auc = roc_auc_score(valid[CFG.target], preds)
    return roc_auc


if CFG.optimize:
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), direction="maximize"
    )
    study.optimize(objective, timeout=9*3600)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
        
    # Save study to dataframe
    study_df = study.trials_dataframe()
    study_df.to_csv('optuna_lgbm.csv')

[32m[I 2022-06-19 12:37:39,289][0m A new study created in memory with name: no-name-f152b86c-0e02-4fba-81ea-9d088ceb3b3d[0m


Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.99284
[200]	valid_0's auc: 0.993092
[300]	valid_0's auc: 0.993185
Did not meet early stopping. Best iteration is:
[296]	valid_0's auc: 0.993186


[32m[I 2022-06-19 12:53:24,079][0m Trial 0 finished with value: 0.9931859538457324 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.10004951803381777, 'lambda_l1': 1.1314188547416955e-07, 'lambda_l2': 1.3913200302937685e-05, 'num_leaves': 214, 'max_bin': 162, 'feature_fraction': 0.5432595795095669, 'bagging_fraction': 0.5654056931776374, 'bagging_freq': 6, 'min_child_samples': 62, 'min_sum_hessian_in_leaf': 0.062014166639611325}. Best is trial 0 with value: 0.9931859538457324.[0m


Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.988857
[200]	valid_0's auc: 0.989891
[300]	valid_0's auc: 0.990659
Did not meet early stopping. Best iteration is:
[300]	valid_0's auc: 0.990659


[32m[I 2022-06-19 13:07:27,428][0m Trial 1 finished with value: 0.9906587780644746 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.009186596187905877, 'lambda_l1': 0.03668989971358215, 'lambda_l2': 5.8466895187326795, 'num_leaves': 100, 'max_bin': 177, 'feature_fraction': 0.41950027723877675, 'bagging_fraction': 0.9016507264931654, 'bagging_freq': 1, 'min_child_samples': 37, 'min_sum_hessian_in_leaf': 0.09466005539904067}. Best is trial 0 with value: 0.9931859538457324.[0m


Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.988732
[200]	valid_0's auc: 0.989748
[300]	valid_0's auc: 0.990548
Did not meet early stopping. Best iteration is:
[300]	valid_0's auc: 0.990548


[32m[I 2022-06-19 13:22:46,683][0m Trial 2 finished with value: 0.9905482940335918 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.007615292094901635, 'lambda_l1': 0.347733110095675, 'lambda_l2': 4.826131967459972e-05, 'num_leaves': 185, 'max_bin': 105, 'feature_fraction': 0.7508316736623503, 'bagging_fraction': 0.8933627966990414, 'bagging_freq': 4, 'min_child_samples': 41, 'min_sum_hessian_in_leaf': 0.07612104627235977}. Best is trial 0 with value: 0.9931859538457324.[0m


Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.988694
[200]	valid_0's auc: 0.989709
[300]	valid_0's auc: 0.990273
Did not meet early stopping. Best iteration is:
[300]	valid_0's auc: 0.990273


[32m[I 2022-06-19 13:37:31,816][0m Trial 3 finished with value: 0.9902728797254972 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.006237581565606524, 'lambda_l1': 4.675311171728331e-06, 'lambda_l2': 0.001130973876577991, 'num_leaves': 179, 'max_bin': 77, 'feature_fraction': 0.646212767102985, 'bagging_fraction': 0.905818155588652, 'bagging_freq': 4, 'min_child_samples': 64, 'min_sum_hessian_in_leaf': 0.046425116351134346}. Best is trial 0 with value: 0.9931859538457324.[0m


Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.988623
[200]	valid_0's auc: 0.989361
[300]	valid_0's auc: 0.989846
Did not meet early stopping. Best iteration is:
[300]	valid_0's auc: 0.989846


[32m[I 2022-06-19 13:50:23,193][0m Trial 4 finished with value: 0.9898461367245553 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.003814445509142491, 'lambda_l1': 0.0001228165164517989, 'lambda_l2': 0.7315135619904908, 'num_leaves': 169, 'max_bin': 155, 'feature_fraction': 0.4725725835931315, 'bagging_fraction': 0.5217881202465326, 'bagging_freq': 1, 'min_child_samples': 13, 'min_sum_hessian_in_leaf': 0.021463939375485908}. Best is trial 0 with value: 0.9931859538457324.[0m


Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.985943
[200]	valid_0's auc: 0.987978
[300]	valid_0's auc: 0.98887
Did not meet early stopping. Best iteration is:
[300]	valid_0's auc: 0.98887


[32m[I 2022-06-19 14:01:51,497][0m Trial 5 finished with value: 0.9888695534789962 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.004094627778282805, 'lambda_l1': 6.311473328883248e-07, 'lambda_l2': 0.13954072426335165, 'num_leaves': 134, 'max_bin': 231, 'feature_fraction': 0.8800930434222787, 'bagging_fraction': 0.7096779098929258, 'bagging_freq': 6, 'min_child_samples': 57, 'min_sum_hessian_in_leaf': 0.06893846654806618}. Best is trial 0 with value: 0.9931859538457324.[0m


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[46]	valid_0's auc: 0.991891


[32m[I 2022-06-19 14:07:16,253][0m Trial 6 finished with value: 0.9918908813273462 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.13875721932561777, 'lambda_l1': 0.00018893687572404892, 'lambda_l2': 1.3625128377372447e-06, 'num_leaves': 253, 'max_bin': 119, 'feature_fraction': 0.9109771750880176, 'bagging_fraction': 0.749581797460271, 'bagging_freq': 6, 'min_child_samples': 12, 'min_sum_hessian_in_leaf': 0.015143891394664682}. Best is trial 0 with value: 0.9931859538457324.[0m


Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.992412
Early stopping, best iteration is:
[77]	valid_0's auc: 0.992623


[32m[I 2022-06-19 14:14:39,029][0m Trial 7 finished with value: 0.9926230202353975 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.13846726791513625, 'lambda_l1': 1.7028908972727315, 'lambda_l2': 3.0582119990359434e-07, 'num_leaves': 136, 'max_bin': 189, 'feature_fraction': 0.4020444778209793, 'bagging_fraction': 0.9928425657019126, 'bagging_freq': 4, 'min_child_samples': 30, 'min_sum_hessian_in_leaf': 0.015543522693083473}. Best is trial 0 with value: 0.9931859538457324.[0m


Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.992643
[200]	valid_0's auc: 0.993291
[300]	valid_0's auc: 0.993442
Did not meet early stopping. Best iteration is:
[300]	valid_0's auc: 0.993442


[32m[I 2022-06-19 14:31:39,202][0m Trial 8 finished with value: 0.9934420824158684 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.06412538431756477, 'lambda_l1': 0.5075360165194788, 'lambda_l2': 2.1181029181051983e-07, 'num_leaves': 240, 'max_bin': 133, 'feature_fraction': 0.9401108993466838, 'bagging_fraction': 0.842829737558284, 'bagging_freq': 3, 'min_child_samples': 58, 'min_sum_hessian_in_leaf': 0.05798770200720065}. Best is trial 8 with value: 0.9934420824158684.[0m


Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.992706
[200]	valid_0's auc: 0.99333
[300]	valid_0's auc: 0.993442
Did not meet early stopping. Best iteration is:
[300]	valid_0's auc: 0.993442


[32m[I 2022-06-19 14:46:36,818][0m Trial 9 finished with value: 0.9934420196592714 and parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.07046907775298634, 'lambda_l1': 0.5275446153443103, 'lambda_l2': 0.03315437976194136, 'num_leaves': 231, 'max_bin': 146, 'feature_fraction': 0.5088425783894458, 'bagging_fraction': 0.4685956731814398, 'bagging_freq': 3, 'min_child_samples': 51, 'min_sum_hessian_in_leaf': 0.046049126977302124}. Best is trial 8 with value: 0.9934420824158684.[0m


Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.94314
[200]	valid_0's auc: 0.949908


# Train

In [None]:
def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.where(y_hat < 0.5, 0, 1)  
    return 'f1', f1_score(y_true, y_hat), True

def lgb_f2_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.where(y_hat < 0.5, 0, 1)  
    return 'f2', fbeta_score(y_true, y_hat, beta=2), True

def fit_lgbm(X_train, y_train, X_val, y_val, init_model=None, 
             params=None, es_rounds=50, num_iter=0):
    train_dataset = lgb.Dataset(X_train, y_train)
    valid_dataset = lgb.Dataset(X_val, y_val)

    model = lgb.train(
        params,
        train_set = train_dataset, 
        valid_sets = [train_dataset, valid_dataset],
        init_model = init_model,
        callbacks = [lgb.log_evaluation(10), 
                     lgb.early_stopping(stopping_rounds=es_rounds),
                    ]
        )

    file = f'{CFG.model_dir}/lgbm.pkl'
    pickle.dump(model, open(file, 'wb'))

    return model

def fit_lgbm_folds(X, y, folds, init_model=None, params=None, es_rounds=50, num_iter=0):
    models = []
    
    for i in tqdm(range(CFG.folds)):
        print(f"== fold {i} ==")
        trn_idx = folds != i
        val_idx = folds == i
    
        train_dataset = lgb.Dataset(X.iloc[trn_idx], y.iloc[trn_idx])
        valid_dataset = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

        model = lgb.train(
            params,
            train_set = train_dataset, 
            valid_sets = [train_dataset, valid_dataset],
            init_model = init_model,
            callbacks = [lgb.log_evaluation(50), 
                         lgb.early_stopping(stopping_rounds=es_rounds),
                        ]
            )

        models.append(model)
    
        file = f'{CFG.model_dir}/lgbm_fold_{i}.pkl'
        pickle.dump(model, open(file, 'wb'))

    return models

def predict_(model, X_val, y_val, threshold):
    pred = model.predict(X_val)
    return pred

def predict_folds(models, X, y, folds, threshold):
    oof = np.zeros((len(y)), dtype=np.float64)
    
    for i in tqdm(range(CFG.folds)):
        trn_idx = folds != i
        val_idx = folds == i
        
        pred = models[i].predict(X.iloc[val_idx])
        oof[val_idx] = pred
    
    return oof

def show_metrics(pred, threshold, y):
    y_hat = np.where(pred < threshold, 0, 1)  
    acc = (y_hat == y).mean()
    f1 = f1_score(y, y_hat)
    f2 = fbeta_score(y, y_hat, beta=2)
    return acc, f1, f2

## Load best LGBM parameters

In [None]:
lgb_params = pd.read_csv('optuna_lgbm.csv')
# lgb_params.to_pickle('LGBM_Optuna_params.pkl')

param_cols = [c for c in lgb_params.columns if c.startswith('params_')]
lgb_params = lgb_params.sort_values('value')[param_cols].head(10)

best_params = list()

def param_to_set(row):
    row_dict = {k[7:]: v for k, v in row.items()}
    row_dict['seed'] = CFG.seed
    row_dict['objective'] = 'binary'
    row_dict['metric'] = 'auc'
    row_dict['n_estimators'] = 1500
    row_dict['verbose'] = -1
#     row_dict['device'] = 'gpu'
#     row_dict['gpu_platform_id'] = 0
#     row_dict['gpu_device_id'] = 0
    best_params.append(row_dict)
    
x = lgb_params.apply(param_to_set, axis=1)

## Set parameters

In [None]:
warnings.filterwarnings("ignore", module="lightgbm")

# params = best_params[0]
params = {
    'seed': CFG.seed,
#     'device': 'gpu',
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.2,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'max_depth': 7,   
    'num_leaves': 35, 
    'n_estimators': 1500, 
    'colsample_bytree': 0.9,
    'verbose': -1,
}  

if CFG.test:
    params['n_estimators'] = 400
    
if CFG.select_features:
    # extract a sample of the data
    train = train.sample(frac=0.1, random_state=CFG.seed)
    valid = valid.sample(frac=0.1, random_state=CFG.seed)

## LOFO importance

In [None]:
if CFG.select_features and CFG.selection_type=='lofo':
    # define the validation scheme
    cv = KFold(n_splits=2)
    train = pd.concat([train, valid], ignore_index=True)
    del valid
    gc.collect()
    # define the binary target and the features
    dataset = lofo.Dataset(df=train, target=CFG.target, features=features)
    # define the validation scheme and scorer
    lofo_imp = lofo.LOFOImportance(dataset, scoring="roc_auc", cv=cv, model=lgb.LGBMClassifier(**params))
    # get the mean and standard deviation of the importances in pandas format
    importance_df = lofo_imp.get_importance()
    importance_df.to_csv('importance_df.csv')
    # plot the means and standard deviations of the importances
    lofo.plot_importance(importance_df, figsize=(12, 20))

## Permutation importance

In [None]:
if CFG.select_features and CFG.selection_type=='perm':   
    # fit model
    model=lgb.LGBMClassifier(**params)
    model.fit(train[features], train[CFG.target], eval_set=(valid[features], valid[CFG.target]))
    # get permutation importance
    perm = PermutationImportance(model, random_state=CFG.seed).fit(valid[features], valid[CFG.target])
    eli5.show_weights(perm, feature_names = features)

## SHAP importance

In [None]:
if CFG.select_features and CFG.selection_type=='perm':   
    train[features] = train[features].fillna(-9999)
    # fit model
    model=lgb.LGBMClassifier(**params)
    # calculate importance
    feature_selector = BorutaShap(importance_measure='shap', classification=True)
    feature_selector.fit(X=train[features], y=train[CFG.target], n_trials=50, sample=False, train_or_test = 'test', normalize=True, verbose=True)
    feature_selector.plot(which_features='all', figsize=(16,12))

## Gain importance

In [None]:
if CFG.select_features and CFG.selection_type=='gain':   
    train[features] = train[features].fillna(-9999)
    # fit model
    model=lgb.LGBMClassifier(**params)
    # calculate importance
    feature_selector = BorutaShap(importance_measure='gini', classification=True)
    feature_selector.fit(X=train[features], y=train[CFG.target], n_trials=50, sample=False, train_or_test = 'test', normalize=True, verbose=True)
    feature_selector.plot(which_features='all', figsize=(16,12))

## Check correlation between features

In [None]:
if CFG.select_features and CFG.selection_type=='corr':
    features_corr = train.fillna(0).corr()
    # transform to low triangle matrix
    for i in range(features_corr.shape[0]):
        for j in range(features_corr.shape[1]):
            if j >= i:
                features_corr.iloc[i, j] = 0
    # unstack
    features_corr = features_corr.abs().unstack()
    features_corr = features_corr.reset_index()
    # select features with corr > 0 and sort them 
    features_corr = features_corr[features_corr[0] > 0]
    features_corr = features_corr.sort_values(0, kind="quicksort", ascending=False)
    display(features_corr.head(100))

## Train model

In [None]:
print(f'Train shape is {train.shape}')

if CFG.folds and CFG.train:
    models = fit_lgbm_folds(train[features], train[CFG.target], folds=train['fold'].values,
                            params=params, es_rounds=CFG.es_rounds)
elif CFG.full and CFG.train:
    model = fit_lgbm(train[features], train[CFG.target], 
                     train[features], train[CFG.target], 
                     params=params, es_rounds=CFG.es_rounds)
elif CFG.train:
    assert train.shape[1] == valid.shape[1]
    model = fit_lgbm(train[features], train[CFG.target], 
                     valid[features], valid[CFG.target], 
                     params=params, es_rounds=CFG.es_rounds)
elif CFG.folds:
    model_files = glob(os.path.join(CFG.model_dir, "lgbm*.pkl"))
    models = list()
    for model_file in model_files:
        with open(model_file, 'rb') as f:
            model = pickle.load(f)
            models.append(model)
else:
    model_file = f'{CFG.model_dir}/lgbm.pkl'
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

## Calculate metrics for the prediction

In [None]:
best_thr = 0.5
best_cv = 0

if CFG.folds:
    pred = predict_folds(models, train[features], train[CFG.target], train['fold'].values, best_thr)
    acc, f1, f2 = show_metrics(pred, best_thr, train[CFG.target])
else:
    pred = predict_(model, valid[features], valid[CFG.target], best_thr)      
    acc, f1, f2 = show_metrics(pred, best_thr, valid[CFG.target])

print(f'Best threshold is {best_thr}, Accuracy is {acc:.6f}, F1 score is {f1:.6f}, F2 score is {f2:.6f}')

## Fuctions for postprocessing and validation

In [None]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    return dict(zip(input_df['id'], input_df['point_of_interest']))

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    return input_df.groupby('point_of_interest')['id'].apply(set).to_dict()

def get_score(input_df: pd.DataFrame):
    scores = []
    id2poi = get_id2poi(input_df)
    poi2ids = get_poi2ids(input_df)
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        score = len((targets & preds)) / len((targets | preds))
        scores.append(score)
    scores = np.array(scores)
    
    return scores.mean()

def postprocess(df):
    id2match = dict(zip(df["id"].values, df["matches"].str.split()))

    for match in df["matches"].values:
        match = match.split()
        if len(match) == 1:        
            continue

        base = match[0]
        for m in match[1:]:
            if not base in id2match[m]:
                id2match[m].append(base)
    df["matches"] = df["id"].map(id2match).map(" ".join)
    
    return df 

def get_matches(df, preds):
    match_id = df["match_id"].values
    matches = []

    for df_id, pred, match_idx in tqdm(zip(df["id"], preds, match_id), total=df.shape[0]):
        idx = np.round(pred)
        if pred == 1:
            matches.append(df_id + " " + match_idx)
        else:
            matches.append(df_id)
    
    df['matches'] = matches
    df = postprocess(df)
    
    return df[['id', 'matches', 'point_of_interest']]

## Add POI column to validation dataset

In [None]:
if not CFG.full:
    data_root = 'foursquare_location_matching'
    data = pd.read_csv(os.path.join(data_root, 'train.csv'))[['id', 'point_of_interest']]

    if CFG.folds:
        valid = train.merge(data, how='left', on='id')
    else:
        valid = valid.merge(data, how='left', on='id')

    del data
    gc.collect()

##  Find best threshold and calculate IOU

In [None]:
%%time

best_thr = 0.5
best_cv = 0

if not CFG.full:
#     for thr in tqdm(np.arange(0.4, 0.6, 0.01)):
#         if thr == 0.5:
#             continue
    y_hat = np.where(pred < CFG.threshold, 0, 1) 
    res = get_matches(valid, y_hat)
    res = res.drop_duplicates()
    cv = get_score(res)
    print(f'Threshold is {CFG.threshold:.3f}, score is {cv:.6f}')
#     if cv > best_cv:
#         best_cv = cv
#         best_thr = thr

# Plot importance

In [None]:
def plot_importance(model):
    importance_df = pd.DataFrame(model.feature_importance(), 
                                 index=features, 
                                 columns=['importance'])\
                        .sort_values("importance", ascending=False)

    plt.subplots(figsize=(len(features) // 4, 5))
    plt.bar(importance_df.index, importance_df.importance)
    plt.grid()
    plt.xticks(rotation=90)
    plt.ylabel("importance")
    plt.tight_layout()
    plt.show()
    
def plot_importances(models):
    importance_df = pd.DataFrame(models[0].feature_importance(), 
                                 index=features, 
                                 columns=['importance'])\
                        .sort_values("importance", ascending=False)

    plt.subplots(figsize=(len(features) // 4, 5))
    plt.bar(importance_df.index, importance_df.importance)
    plt.grid()
    plt.xticks(rotation=90)
    plt.ylabel("importance")
    plt.tight_layout()
    plt.show()
    
if CFG.folds:
    plot_importances(models)
else:
    plot_importance(model)

In [None]:
# Baseline
# IOU: 0.859097
# LB: 0.858

# 2 group folds
# IOU: 0.859097/0.857005
# LB: 0.863

# 5-stratified folds
# IOU: 0.882
# LB: 0.862

# 5-stratified group folds
# IOU: 0.882
# LB: 0.861

# 5-stratified group folds, thr 0.43
# IOU: 0.883
# LB: 0.854

# Return cluster feature, n_iter 1429/1183      
# IOU: 0.859382/0.859301
# LB: 0.865

#########################################################################

# Test baseline
# IOU: 0.854927

# Drop bad features
# IOU: 0.854878

# Add text similarity
# IOU: 0.854449

# Add all matches to train
# IOU: 0.882953

# Add text similarity + add all matches to train
# IOU: 0.882987

# Fill NaNs with -9999
# IOU: 0.851635

# Baseline with fixed TF-IDF (don' count unknowns)
# IOU: 0.854954

# Fix haversine distance for KNNs
# IOU: 0.856982

# Fillna for kdist and kdist_country
# IOU: 0.857143

# Add text similarity by cleaned columns
# IOU: 0.856037

# 5-folds
# IOU: 0.869376
# LB: 0.867

# Add Japaneese text
# IOU: 