In [2]:
import os
import gc
import re
import sys
import math
import json
import time
import eli5
import lofo
import random
import joblib
import pickle
import warnings
import difflib
import Levenshtein
import numpy as np
import pandas as pd
import seaborn as sns
from glob import glob
from pathlib import Path
from unidecode import unidecode
import multiprocessing
from tqdm.auto import tqdm
from argparse import Namespace
import matplotlib.pyplot as plt
from BorutaShap import BorutaShap
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import f1_score, fbeta_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics.pairwise import haversine_distances
from lofo import LOFOImportance, Dataset, plot_importance
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold, StratifiedGroupKFold
from eli5.sklearn import PermutationImportance

plt.rcParams["font.size"] = 13

sns.set_style("darkgrid")

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 600)

  from tqdm.autonotebook import tqdm


# Config

In [3]:
CFG = Namespace(
    train = True,
    full = False,
    debug = False,
    select_features = False,
    selection_type = 'perm', # feasible values: lofo, perm, shap, corr, gain
    folds = 0,
    seed = 42,
    pos_frac = 0,
    target = 'label',
    threshold = 0.5,
    train_path = 'train_dataset',
    model_dir = 'fsq_cb_models',
    es_rounds = 50
)

# bad_features = ['category_venn',
#                 'city_decoded_leven',
#                 'city_decoded_nlcs',
#                 'closest_city_gesh', 
#                 'closest_city_leven',
#                 'city_lcs',
#                 'city_len_diff',
#                 'city_leven',
#                 'kdist_diff',
#                 'kneighbors_country',
#                 'phone_gesh',
#                 'state_nlcsk',
#                 'url_gesh',
#                 'url_lcs',
#                 'url_leven']

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
bad_features = []
    
seed_everything(CFG.seed)

In [6]:
def vectorized_haversine(lats1, lats2, longs1, longs2):
    radius = 6371
    dlat=np.radians(lats2 - lats1)
    dlon=np.radians(longs2 - longs1)
    a = np.sin(dlat/2) * np.sin(dlat/2) + np.cos(np.radians(lats1)) \
        * np.cos(np.radians(lats2)) * np.sin(dlon/2) * np.sin(dlon/2)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = radius * c
    return d

lat1 = pairs['latitude_1']
lat2 = pairs['latitude_2']
lon1 = pairs['longitude_1']
lon2 = pairs['longitude_2']
pairs['haversine'] = vectorized_haversine(lat1, lat2, lon1, lon2)

In [61]:
def decode(col):
    if col != col:
        return np.nan
    return unidecode(col)

pairs['name_1_decoded'] = pairs['name_1'].apply(decode)
pairs['name_2_decoded'] = pairs['name_2'].apply(decode)

symbols_to_replace = [',', '.', "'", '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', 
                      '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', 
                      ']', '^', '_', '`', '{', '|', '}', '~']

def clean(col):
    if col != col:
        return np.nan
    col = col.lower()
    for s in symbols_to_replace:
        col = col.replace(s, '')
    return col

pairs['name_1_clean'] = pairs['name_1_decoded'].apply(clean)
pairs['name_2_clean'] = pairs['name_2_decoded'].apply(clean)

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(ngram_range=(3, 3), analyzer="char_wb", use_idf=False)
x_name_1 = pairs.loc[pairs['match']==1, 'name_1_clean']
x_name_1[x_name_1.isnull()] = ''
x_name_1 = tfidf.fit_transform(x_name_1.values)

x_name_2 = pairs.loc[pairs['match']==1, 'name_2_clean']
x_name_2[x_name_2.isnull()] = ''
x_name_2 = tfidf.transform(x_name_2.values)

In [125]:
def cosine_dist(name_1, name_2):
    return 1 - name_1.dot(name_2.T).A.ravel()[0]

cosines = np.zeros(x_name_1.shape[0])

for i in tqdm(range(x_name_1.shape[0])):
    cosines[i] = cosine_dist(x_name_1[i], x_name_2[i])

  0%|          | 0/398786 [00:00<?, ?it/s]

In [165]:
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 600)

gr1 = pairs.groupby('country_1')
country_name_dict1 = gr1.agg({'cosine': ['max', lambda x: x.quantile(0.98), 
                                            lambda x: x.quantile(0.99)]}).sort_values([('cosine', 'max')], ascending=False)#[('cosine', '<lambda_1>')]
# country_name_dict1 = country_name_dict1.to_dict()
country_name_dict1

Unnamed: 0_level_0,cosine,cosine,cosine
Unnamed: 0_level_1,max,<lambda_0>,<lambda_1>
country_1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
IT,1.0,1.0,1.0
VE,1.0,1.0,1.0
LA,1.0,0.994405,1.0
LB,1.0,0.8821963,0.9264458
LI,1.0,0.9651845,0.9825922
LK,1.0,1.0,1.0
LT,1.0,1.0,1.0
LU,1.0,0.8236052,0.9073874
LV,1.0,1.0,1.0
MA,1.0,1.0,1.0


In [149]:
gr2 = pairs.groupby('country_2')
country_name_dict2 = gr2.agg({'cosine': ['max', lambda x: x.quantile(0.998), 
                                         lambda x: x.quantile(0.999999)]}).sort_values([('cosine', '<lambda_1>')], ascending=True)[('cosine', '<lambda_1>')]
country_name_dict2 = country_name_dict2.to_dict()
country_name_dict2

{'SJ': -4.440892098500626e-16,
 'BJ': -2.220446049250313e-16,
 'LY': 0.0,
 'SH': 0.0,
 'EU': 1.1102230246251565e-16,
 'LR': 1.1102230246251565e-16,
 'SZ': 2.2204371674661158e-16,
 'TG': 2.220446049250313e-16,
 'PW': 0.06905066374873725,
 'AO': 0.10557280900008414,
 'DM': 0.12328549933682115,
 'ST': 0.19560033346015682,
 'EH': 0.2022759647825345,
 'TJ': 0.23623714793664222,
 'GL': 0.29952719645023596,
 'GW': 0.30973151003736654,
 'MF': 0.325798834336208,
 'TL': 0.3453463292920228,
 'DJ': 0.3465897371964518,
 'CI': 0.3750000000000001,
 'GD': 0.41612537791656506,
 'RE': 0.4206402254802049,
 'ZM': 0.422649286409732,
 'CD': 0.4343135935356084,
 'BI': 0.4427213927312214,
 'SN': 0.46966897477026115,
 'AI': 0.473764714650594,
 'AD': 0.47618995410735104,
 'WS': 0.47776696098576027,
 'XK': 0.4836008793438563,
 'BF': 0.48924497040384307,
 'TC': 0.49290654219976804,
 'CV': 0.49999924208117225,
 'KG': 0.4999995713571906,
 'SC': 0.5101011826080339,
 'PM': 0.5398210066915777,
 'VC': 0.543563915684684

In [156]:
country_name_dict = dict()
countries = set(country_name_dict1.keys()).union(set(country_name_dict2.keys()))

for c in countries:
    v1 = country_name_dict1.get(c, 0)
    v2 = country_name_dict2.get(c, 0)
    country_name_dict[c] = max(v1, v2, 0.001)
    

In [157]:
country_name_dict

{'SR': 0.5462567637505746,
 'EU': 0.001,
 'JP': 1.0,
 'JO': 1.0,
 'FR': 1.0,
 'MY': 1.0,
 'EH': 0.2022759647825345,
 'IR': 1.0,
 'PT': 1.0,
 'BS': 0.8121275764052539,
 'BY': 1.0,
 'PF': 0.7171554816657874,
 'PA': 1.0,
 'NO': 1.0,
 'BA': 0.9999949804219396,
 'LC': 0.6596960674720104,
 'PY': 1.0,
 'SI': 0.9999918097164066,
 'MN': 1.0,
 'GT': 0.6603164659645979,
 'VU': 1.0,
 'KG': 0.4999995713571906,
 'MU': 0.9999946455032991,
 'MC': 0.5477317347810394,
 'IE': 1.0,
 'QA': 1.0,
 'MT': 1.0,
 'AT': 1.0,
 'LR': 0.001,
 'PH': 1.0,
 'AD': 0.47618995410735104,
 'MM': 1.0,
 'GU': 1.0,
 'LU': 0.9999909041194629,
 'GI': 0.6885987523288479,
 'LT': 1.0,
 'AE': 1.0,
 'IT': 1.0,
 'RW': 0.6531549116767437,
 'SK': 1.0,
 'IQ': 1.0,
 'KW': 1.0,
 'SZ': 0.001,
 'XK': 0.6365081361429306,
 'GR': 1.0,
 'MD': 1.0,
 'AG': 0.9562280981356792,
 'NZ': 1.0,
 'CN': 1.0,
 'LK': 1.0,
 'PW': 0.06905066374873725,
 'RU': 1.0,
 'ZA': 1.0,
 'RE': 0.5914894001512416,
 'AN': 0.6315786842105264,
 'BJ': 0.001,
 'VC': 0.543563915

In [36]:
country_dist_dict1 = gr1.agg({'haversine': ['max', lambda x: x.quantile(0.98), 
                                            lambda x: x.quantile(0.99)]}).sort_values([('haversine', 'max')], ascending=False)[('haversine', '<lambda_1>')]
country_dist_dict1 = country_dist_dict1.to_dict()

In [40]:
country_dist_dict = dict()
countries = set(country_dist_dict1.keys()).union(set(country_dist_dict1.keys()))

for c in countries:
    v1 = country_dist_dict1.get(c, 0)
    v2 = country_dist_dict2.get(c, 0)
    country_dist_dict[c] = max(v1, v2)

In [32]:
country_dist_dict2 = gr2.agg({'haversine': ['max', lambda x: x.quantile(0.98), 
                                            lambda x: x.quantile(0.99)]}).sort_values([('haversine', 'max')], ascending=False)[('haversine', '<lambda_1>')]
country_dist_dict2 = country_dist_dict2.to_dict()

In [166]:
import json

with open('country_name_dists.json', 'w+') as f:
    json.dump(country_name_dict, f)

In [58]:
pairs['haversine'].quantile(0.991)

79.92817832085044

In [172]:
len(cdd)

210

In [167]:
with open('country_name_dists.json', 'r') as f:
    cnd = json.load(f)
cnd

{'SR': 0.5462567637505746,
 'EU': 0.001,
 'JP': 1.0,
 'JO': 1.0,
 'FR': 1.0,
 'MY': 1.0,
 'EH': 0.2022759647825345,
 'IR': 1.0,
 'PT': 1.0,
 'BS': 0.8121275764052539,
 'BY': 1.0,
 'PF': 0.7171554816657874,
 'PA': 1.0,
 'NO': 1.0,
 'BA': 0.9999949804219396,
 'LC': 0.6596960674720104,
 'PY': 1.0,
 'SI': 0.9999918097164066,
 'MN': 1.0,
 'GT': 0.6603164659645979,
 'VU': 1.0,
 'KG': 0.4999995713571906,
 'MU': 0.9999946455032991,
 'MC': 0.5477317347810394,
 'IE': 1.0,
 'QA': 1.0,
 'MT': 1.0,
 'AT': 1.0,
 'LR': 0.001,
 'PH': 1.0,
 'AD': 0.47618995410735104,
 'MM': 1.0,
 'GU': 1.0,
 'LU': 0.9999909041194629,
 'GI': 0.6885987523288479,
 'LT': 1.0,
 'AE': 1.0,
 'IT': 1.0,
 'RW': 0.6531549116767437,
 'SK': 1.0,
 'IQ': 1.0,
 'KW': 1.0,
 'SZ': 0.001,
 'XK': 0.6365081361429306,
 'GR': 1.0,
 'MD': 1.0,
 'AG': 0.9562280981356792,
 'NZ': 1.0,
 'CN': 1.0,
 'LK': 1.0,
 'PW': 0.06905066374873725,
 'RU': 1.0,
 'ZA': 1.0,
 'RE': 0.5914894001512416,
 'AN': 0.6315786842105264,
 'BJ': 0.001,
 'VC': 0.543563915

# Prepare data

## Load train dataset

In [3]:
def downcast_floats(df):
    floats = ['float32', 'float64']
    float_features = list(df.select_dtypes(include=floats).columns)
    for f in float_features:
        df[f] = df[f].astype('float16')
    return df
    
if CFG.full or CFG.folds:
    train_files = glob(os.path.join(CFG.train_path, "train_*.parquet"))
    valid_files = glob(os.path.join(CFG.train_path, "valid_*.parquet"))
    train_files = train_files + valid_files
else:
    train_files = glob(os.path.join(CFG.train_path, "valid_*.parquet"))

train = list()
for filename in tqdm(train_files):
    df = pd.read_parquet(filename)
    if CFG.debug:
        df = df.sample(n = 10000, random_state = CFG.seed)
        df = df.reset_index(drop = True)
    df = downcast_floats(df)
    train.append(df)

train = pd.concat(train, axis=0, ignore_index=True)

gc.collect()

  0%|          | 0/10 [00:00<?, ?it/s]

20

## Load validation dataset

In [None]:
if not CFG.full and not CFG.folds:
    valid_files = glob(os.path.join(CFG.train_path, "train_*.parquet"))

    valid = list()
    for filename in tqdm(valid_files):
        df = pd.read_parquet(filename)
        if CFG.debug:
            df = df.sample(n = 10000, random_state = CFG.seed)
            df = df.reset_index(drop = True)
        df = downcast_floats(df)
        valid.append(df)

    valid = pd.concat(valid, axis=0, ignore_index=True)
    
    gc.collect()

  0%|          | 0/10 [00:00<?, ?it/s]

## Increase fraction of positive targets

In [5]:
%%time

if CFG.pos_frac:
    train_pos_index = train[train['label'] == 1].index
    train_neg_index = train[train['label'] == 0].index
    train_neg_index = np.random.choice(train_neg_index, size=int(len(train_pos_index)*((1-CFG.pos_frac)/CFG.pos_frac)))
    train_pos_index = np.concatenate([train_pos_index, train_neg_index])
    np.random.shuffle(train_pos_index)
    train = train.loc[train_pos_index].reset_index(drop=True)
    del train_pos_index, train_neg_index
    gc.collect()

    if not CFG.full and not CFG.folds:
        valid_pos_index = valid[valid['label'] == 1].index
        valid_neg_index = valid[valid['label'] == 0].index
        valid_neg_index = np.random.choice(valid_neg_index, size=int(len(valid_pos_index)*((1-CFG.pos_frac)/CFG.pos_frac)))
        valid_pos_index = np.concatenate([valid_pos_index, valid_neg_index])
        np.random.shuffle(valid_pos_index)
        valid = valid.loc[valid_pos_index].reset_index(drop=True)
        del valid_pos_index, valid_neg_index
        gc.collect() 

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.72 µs


##  Drop bad features

In [6]:
# if bad_features:
#     train = train.drop(bad_features, axis=1)
#     if    not CFG.full and not CFG.folds:
#         valid = valid.drop(bad_features, axis=1)
        
# gc.collect()

## Set features

In [7]:
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
features = list(train.select_dtypes(include=numerics).columns)
features.remove(CFG.target)

## Split dataset by folds

In [8]:
if CFG.folds > 0:
    kf = StratifiedGroupKFold(n_splits=CFG.folds, shuffle=True, random_state=CFG.seed)
    for i, (trn_idx, val_idx) in tqdm(enumerate(kf.split(train, train["label"], train["id"]))):
        train.loc[val_idx, "fold"] = i

# Optimize with Optuna

In [9]:
# def objective(trial: optuna.Trial):
#     # Parameters
#     params = {
#         'objective': 'Logloss', # 'objective': trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"])
#         'eval_metric': 'AUC',
#         'task_type': 'GPU',
#         'random_seed': CFG.seed,
#         'od_type': 'Iter', # Type of overfitting detector - stop after k iteraions
#         'od_wait': CFG.es_rounds, # Overfitting detector - stop training after k iterations without metric improvement
#         'metric_period': 50, # Show metric each k iterations
#         'iterations' : trial.suggest_int('iterations', 300, 1500),                         
#         'l2_leaf_reg': trial.suggest_loguniform("l2_leaf_reg", 1e-8, 100),
#         'learning_rate' :trial.suggest_loguniform('learning_rate', 1e-3, 3e-1),             
#         'grow_policy':trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']), 
#         'depth' : trial.suggest_int('depth', 4, 12),  # Max tree depth
#         'random_strength': trial.suggest_int('random_strength', 0, 100), # The amount of randomness to use 
#                                                                          # for scoring splits when the tree structure
#                                                                          # is selected. Helps to avoid overfitting
#         'max_bin': trial.suggest_categorical('max_bin', [3,4,5,10,20,32,64,128]), # The number of splits for 
#                                                                                   # numerical features
        
#         'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 100.00), # Assigns random 
#                                                                                               # weights to objects
#         'bootstrap_type': trial.suggest_categorical("bootstrap_type", ['Bayesian', 'Bernoulli', 
#                                                                        'MVS', 'Poisson']), # Poisson for GPU only
# #         'scale_pos_weight': trial.suggest_categorical('scale_pos_weight', [0,1])
#     }
    
#     if params['grow_policy'] == 'SymmetricTree': 
#         params['boosting_type']= trial.suggest_categorical('boosting_type', ['Ordered', 'Plain'])
#     else:
#         params['boosting_type'] = 'Plain'
        
#     if params['grow_policy'] == 'Lossguide':
#         params['num_leaves']= trial.suggest_int('num_leaves', 20, 60)
    
#     # Learning
#     model = cat.CatBoostClassifier(**params)
#     pruning_callback = CatBoostPruningCallback(trial, "AUC")
    
#     model.fit(
#         X_train,
#         y_train,
#         eval_set=[(X_valid, y_valid)],
#         verbose=0,
#         callbacks=[pruning_callback],
#     )
    
#     model.fit(train_pool)
#     # Predict
#     preds = model.predict_proba(val_pool)[:,1]
#     # Evaluation
#     auc_metric = auc(y_valid, preds)
#     return auc_metric

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, timeout=3600*24) # change timeout if you want to make optimization process longer

# print("Number of finished trials: {}".format(len(study.trials)))

# print("Best trial:")
# trial = study.best_trial

# print("  Value: {}".format(trial.value))

# print("  Params: ")
# for key, value in trial.params.items():
#     print("    {}: {}".format(key, value))
    
# # Save study to dataframe
# study_df = study.trials_dataframe()
# study_df.to_csv('study_df.csv')

# Train

In [10]:
def fit_cb(model, X_train, y_train, X_val, y_val):

    train_dataset = Pool(X_train, label=y_train)
    valid_dataset = Pool(X_val, label=y_val)

    fit_model = model.fit(train_dataset,
                          eval_set=valid_dataset,
                          use_best_model=True,
                          verbose=5000,
                          plot=True
                         )

    file = f'{CFG.model_dir}/cb'
    model.save_model(file)

    return fit_model

def fit_cb_folds(model, X, y, folds):
    models = []
    
    for i in tqdm(range(CFG.folds)):
        print(f"== fold {i} ==")
        trn_idx = folds != i
        val_idx = folds == i
    
        train_dataset = Pool(X.iloc[trn_idx], label=y.iloc[trn_idx])
        valid_dataset = Pool(X.iloc[val_idx], label=y.iloc[val_idx])

        fit_model = model.fit(train_dataset,
                              eval_set=valid_dataset,
                              use_best_model=True,
                              verbose=5000,
                              plot=True
                             )

        models.append(fit_model)
    
        file = f'{CFG.model_dir}/cb_fold_{i}'
        model.save_model(file)

    return models

def predict_(model, X_val, y_val, threshold):
    pred = model.predict_proba(X_val)[:,1]
    return pred

def predict_folds(models, X, y, folds, threshold):
    oof = np.zeros((len(y)), dtype=np.float64)
    
    for i in tqdm(range(CFG.folds)):
        trn_idx = folds != i
        val_idx = folds == i
        
        pred = models[i].predict_proba(X.iloc[val_idx])[:,1]
        oof[val_idx] = pred
    
    return oof

def show_metrics(pred, threshold, y):
    y_hat = np.where(pred < threshold, 0, 1)  
    acc = (y_hat == y).mean()
    f1 = f1_score(y, y_hat)
    f2 = fbeta_score(y, y_hat, beta=2)
    return acc, f1, f2

## Set parameters

In [11]:
model = CatBoostClassifier(
                            loss_function="Logloss",
                            eval_metric="AUC",
                            task_type="GPU",
                            learning_rate=0.03,
                            n_estimators=1500,
                            reg_lambda=3,
                            depth=7,
#                             bootstrap_type
#                             bagging_temperature
#                             subsample
#                             grow_policy
#                             min_data_in_leaf=20,
                            random_seed=CFG.seed,
                            early_stopping_rounds=CFG.es_rounds,
                            boosting_type='Plain',
                            od_type="Iter",
                          )

if CFG.full:
    X_train, y_train = train[features], train[CFG.target]
else:
    X_train, y_train, X_val, y_val = train[features], train[CFG.target], valid[features], valid[CFG.target]   
    
if CFG.select_features:
    # extract a sample of the data
    train = train.sample(frac=0.01, random_state=CFG.seed)
    valid = valid.sample(frac=0.01, random_state=CFG.seed)

## LOFO importance

In [12]:
if CFG.select_features and CFG.selection_type=='lofo':
    # define the validation scheme
    cv = KFold(n_splits=2)
    train = pd.concat([train, valid], ignore_index=True)
    del valid
    gc.collect()
    # define the binary target and the features
    dataset = lofo.Dataset(df=train, target=CFG.target, features=features)
    # define the validation scheme and scorer
    lofo_imp = lofo.LOFOImportance(dataset, scoring="roc_auc", cv=cv, model=models)
    # get the mean and standard deviation of the importances in pandas format
    importance_df = lofo_imp.get_importance()
    importance_df.to_csv('importance_df.csv')
    # plot the means and standard deviations of the importances
    lofo.plot_importance(importance_df, figsize=(12, 20))

## Permutation importance

In [13]:
if CFG.select_features and CFG.selection_type=='perm':   
    # fit model
    model=lgb.LGBMClassifier(**params)
    model.fit(train[features], train[CFG.target], eval_set=(valid[features], valid[CFG.target]))
    # get permutation importance
    perm = PermutationImportance(model, random_state=CFG.seed).fit(valid[features], valid[CFG.target])
    eli5.show_weights(perm, feature_names = features)

## SHAP importance

In [14]:
if CFG.select_features and CFG.selection_type=='perm':   
    train[features] = train[features].fillna(-9999)
    # fit model
    model=lgb.LGBMClassifier(**params)
    # calculate importance
    feature_selector = BorutaShap(importance_measure='shap', classification=True)
    feature_selector.fit(X=train[features], y=train[CFG.target], n_trials=50, sample=False, train_or_test = 'test', normalize=True, verbose=True)
    feature_selector.plot(which_features='all', figsize=(16,12))

## Gain importance

In [15]:
if CFG.select_features and CFG.selection_type=='gain':   
    train[features] = train[features].fillna(-9999)
    # fit model
    model=lgb.LGBMClassifier(**params)
    # calculate importance
    feature_selector = BorutaShap(importance_measure='gini', classification=True)
    feature_selector.fit(X=train[features], y=train[CFG.target], n_trials=50, sample=False, train_or_test = 'test', normalize=True, verbose=True)
    feature_selector.plot(which_features='all', figsize=(16,12))

## Check correlation between features

In [16]:
if CFG.select_features and CFG.selection_type=='corr':
    features_corr = train.fillna(0).corr()
    # transform to low triangle matrix
    for i in range(features_corr.shape[0]):
        for j in range(features_corr.shape[1]):
            if j >= i:
                features_corr.iloc[i, j] = 0
    # unstack
    features_corr = features_corr.abs().unstack()
    features_corr = features_corr.reset_index()
    # select features with corr > 0 and sort them 
    features_corr = features_corr[features_corr[0] > 0]
    features_corr = features_corr.sort_values(0, kind="quicksort", ascending=False)
    display(features_corr.head(100))

## Train model

In [17]:
if CFG.folds and CFG.train:
    models = fit_cb_folds(model, train[features], train[CFG.target], folds=train['fold'].values)
elif CFG.full and CFG.train:
    model = fit_cb(model, train[features], train[CFG.target], 
                   train[features], train[CFG.target])
elif CFG.train:
    model = fit_cb(model, train[features], train[CFG.target], 
                   valid[features], valid[CFG.target])
elif CFG.folds:
    model_files = glob(os.path.join(CFG.model_dir, "cb*.pkl"))
    models = list()
    for model_file in model_files:
        with open(model_file, 'rb') as f:
            model = pickle.load(f)
            models.append(model)
else:
    model_file = f'{CFG.model_dir}/cb'
    model = CatBoostClassifier()      # parameters not required.
    model.load_model(model_file)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.9625193	test: 0.9591440	best: 0.9591440 (0)	total: 535ms	remaining: 13m 21s
1499:	learn: 0.9938302	test: 0.9921405	best: 0.9921405 (1499)	total: 16m 45s	remaining: 0us
bestTest = 0.9921405017
bestIteration = 1499


## Calculate metrics for the prediction

In [18]:
best_thr = 0.5
best_cv = 0

if CFG.folds:
    X = train[features]
    y = train[CFG.target]
else:
    X = valid[features]
    y = valid[CFG.target]

if CFG.folds:
    pred = predict_folds(models, X, y, train['fold'].values, best_thr)
else:
    pred = predict_(model, X, y, best_thr)
            
acc, f1, f2 = show_metrics(pred, best_thr, y)
print(f'Best threshold is {best_thr}, Accuracy is {acc:.6f}, F1 score is {f1:.6f}, F2 score is {f2:.6f}')

Best threshold is 0.5, Accuracy is 0.987935, F1 score is 0.858184, F2 score is 0.822560


## Fuctions for postprocessing and validation

In [19]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    return dict(zip(input_df['id'], input_df['point_of_interest']))

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    return input_df.groupby('point_of_interest')['id'].apply(set).to_dict()

def get_score(input_df: pd.DataFrame):
    scores = []
    id2poi = get_id2poi(input_df)
    poi2ids = get_poi2ids(input_df)
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        score = len((targets & preds)) / len((targets | preds))
        scores.append(score)
    scores = np.array(scores)
    
    return scores.mean()

def postprocess(df):
    id2match = dict(zip(df["id"].values, df["matches"].str.split()))

    for match in df["matches"].values:
        match = match.split()
        if len(match) == 1:        
            continue

        base = match[0]
        for m in match[1:]:
            if not base in id2match[m]:
                id2match[m].append(base)
    df["matches"] = df["id"].map(id2match).map(" ".join)
    
    return df 

def get_matches(df, preds):
    match_id = df["match_id"].values
    matches = []

    for df_id, pred, match_idx in tqdm(zip(df["id"], preds, match_id), total=df.shape[0]):
        idx = np.round(pred)
        if pred == 1:
            matches.append(df_id + " " + match_idx)
        else:
            matches.append(df_id)
    
    df['matches'] = matches
    df = postprocess(df)
    
    return df[['id', 'matches', 'point_of_interest']]

## Add POI column to validation dataset

In [20]:
if not CFG.full:
    data_root = 'foursquare_location_matching'
    data = pd.read_csv(os.path.join(data_root, 'train.csv'))[['id', 'point_of_interest']]

    if CFG.folds:
        valid = train.merge(data, how='left', on='id')
    else:
        valid = valid.merge(data, how='left', on='id')

    del data
    gc.collect()

##  Find threshold for the best IOU

In [21]:
%%time

if not CFG.full:
    y_hat = np.where(pred < CFG.threshold, 0, 1) 
    res = get_matches(valid, y_hat)
    res = res.drop_duplicates()
    cv = get_score(res)
    print(f'Threshold is {CFG.threshold:.3f}, score is {cv:.6f}')

  0%|          | 0/26300528 [00:00<?, ?it/s]

Threshold is 0.500, score is 0.854431
CPU times: user 2min 10s, sys: 2.3 s, total: 2min 13s
Wall time: 2min 12s


# Plot importance

In [22]:
def plot_importance(model):
    importance_df = pd.DataFrame(model.feature_importance(), 
                                 index=features, 
                                 columns=['importance'])\
                        .sort_values("importance", ascending=False)

    plt.subplots(figsize=(len(features) // 4, 5))
    plt.bar(importance_df.index, importance_df.importance)
    plt.grid()
    plt.xticks(rotation=90)
    plt.ylabel("importance")
    plt.tight_layout()
    plt.show()
    
def plot_importances(models):
    importance_df = pd.DataFrame(models[0].feature_importance(), 
                                 index=features, 
                                 columns=['importance'])\
                        .sort_values("importance", ascending=False)

    plt.subplots(figsize=(len(features) // 4, 5))
    plt.bar(importance_df.index, importance_df.importance)
    plt.grid()
    plt.xticks(rotation=90)
    plt.ylabel("importance")
    plt.tight_layout()
    plt.show()
    
if CFG.folds:
    plot_importances(models)
else:
    plot_importance(model)

AttributeError: 'CatBoostClassifier' object has no attribute 'feature_importance'

In [None]:
# Baseline
# IOU 0.860169 (5000 iter)
# LB

# return recall_simple
# IOU 0.854431 (1500 iter)
# LB 