**Reference**

This notebook is based on [@ryotayoshinobu](https://www.kaggle.com/ryotayoshinobu)'s [baseline](https://www.kaggle.com/code/ryotayoshinobu/foursquare-lightgbm-baseline).

I joined this competition in the middle of the competition period, but thanks to [@ryotayoshinobu](https://www.kaggle.com/ryotayoshinobu), I was able to work on it smoothly.

Please don't forget to upvote the original notebook!

**About this notebook**

Both training and inference can be run from this single notebook, on either Kaggle or Colab!

1. Training
    
    Set `CFG.train = True` and run.

2. Inference

    Set `CFG.train = False` and run.

In [1]:
!nvidia-smi

Sat Jul  9 21:08:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

# Libraries

In [2]:
# ====================================================
# import libraries1
# ====================================================

import warnings
warnings.filterwarnings('ignore')

import os
import sys
import math
import random
import time
import numpy as np
import pandas as pd
import gc
import json
import joblib
from tqdm import tqdm
from pathlib import Path
import itertools
import collections
from collections import Counter

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau, _LRScheduler
from torch.nn import Parameter

import datetime
from datetime import timedelta
import hashlib
import difflib
import seaborn as sns
import matplotlib.pyplot as plt
from requests import get
from PIL import Image
import pickle
from contextlib import contextmanager
import multiprocessing

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.linear_model import RidgeCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import word2vec

import lightgbm as lgb
import typing as tp

from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter

tqdm.pandas()
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'Using device: {device}')

Using device: cuda


# Config

In [3]:
# ==============================================
#  Config
# ==============================================

class CFG:
    # Notebook-wide settings, read everywhere as class attributes (CFG.<name>).
    colab = "google.colab" in sys.modules  # True when the google.colab module is loaded
    exp = "064"        # experiment tag; used to build output / model directory names
    train = False      # True: training path, False: inference path
    debug = False      # when training, keep only the first 100 rows of train.csv
    api_path = '/content/drive/My Drive/kaggle.json'  # Kaggle API credentials file (Colab only)
    seed = 42          # passed to seed_everything() and to PARAMS['random_state']
    used_fold = [0,1,2,3,4]  # folds whose models are averaged in inference()
    fold = 5           # number of folds iterated in inference()
    target = "label"   # presumably the target column name used by training code not shown here
    n_neighbors = 300  # default neighbour count per record in recall_knn()
    threshold = 0.56   # averaged probability a pair must exceed to be kept in inference()
    data_split = 10    # number of GroupKFold chunks written to df["data_split"] in recall_knn()
    name_low_bound = 20  # nearest neighbours kept unconditionally in recall_knn()
    name_th = 0.65     # Jaro-Winkler name score that admits a neighbour beyond name_low_bound
    phone_th = 0.7     # phone LCS rate that admits a neighbour beyond name_low_bound
    
    # ====================================================
    # Stage1 (fine-tuning)
    # ====================================================
    # NOTE(review): apart from `model`, the values below are not read by any code
    # visible in this part of the notebook; presumably the fine-tuning loop uses them.
    model = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    batch_size = 32
    max_length = 32
    epochs = 15
    num_workers = 8
    lr = 1e-5
    scheduler = 'CosineAnnealingLR'

# Outside Colab the model is read from a local input directory instead of by hub name.
if not CFG.colab:
    CFG.model = "../input/sbert-models/paraphrase-multilingual-mpnet-base-v2"

In [4]:
# ==============================================
#  Catboost parameters
# ==============================================

# No columns are declared categorical.
CATEGORICAL_COL = []
# Pair identifiers; inference() drops these columns before calling model.predict().
DROP_COLS = ["id", "match_id"]

# NOTE(review): PARAMS is not consumed by any code visible in this part of the
# notebook; presumably it is handed to CatBoost in the training section.
PARAMS = {
    'loss_function': 'Logloss', # ['Logloss', 'AUC']
    'learning_rate': 0.5,
    'max_depth': 7,
    'random_state': CFG.seed,
    'thread_count': 2,
    'task_type': 'GPU',
    #'scale_pos_weight': 4,
    'num_boost_round': 150000,
}

In [5]:
if CFG.colab:
    print("==============================================")
    print("This environment is Google Colab")
    print("==============================================")

    # Google Drive
    from google.colab import drive, files
    drive.mount('/content/drive')
    %cd "drive/My Drive/foursquare/"

    # Kaggle API
    f = open(CFG.api_path, 'r')
    json_data = json.load(f) 
    os.environ["KAGGLE_USERNAME"] = json_data["username"]
    os.environ["KAGGLE_KEY"] = json_data["key"]

    # Directory Setting
    if not os.path.exists(f"output/exp{CFG.exp}/"):
        os.makedirs(f"output/exp{CFG.exp}/")
    
    DATA_DIR = "input/"
    OUTPUT_DIR = f"output/exp{CFG.exp}/"
    MODEL_DIR = OUTPUT_DIR
    
    # Data Loading
    if not os.path.isfile(os.path.join(DATA_DIR, "foursquare-location-matching.zip")):
        !kaggle competitions download -c foursquare-location-matching -p $DATA_DIR

    # Libraries
    !pip install -q catboost
    !pip install -q Levenshtein
    #!pip install -q textdistance==4.2.2
    !pip install -q pylcs==0.0.6
    !pip install -q fasttext
    !pip install -q reverse_geocode
    !pip install -q transformers
    !pip install -q sentence_transformers==2.2.0

else:
    print("==============================================")
    print(" This environment is Kaggle Notebook")
    print("==============================================")

    # Directory Setting
    DATA_DIR = "../input/foursquare-location-matching/"
    OUTPUT_DIR = "./"
    MODEL_DIR = f"../input/foursquare-dataset-exp{CFG.exp}/"
    MODEL_DIR2 = f"../input/foursquare-dataset-exp094/"

    # Libraries
    !pip install /kaggle/input/reversegeocode/reverse_geocode-1.4.1-py3-none-any.whl
    #!pip install ../input/textdistance-install/textdistance-4.2.2-py3-none-any.whl
    !pip install --force-reinstall ../input/pylcs-install/pybind11-2.9.2-py2.py3-none-any.whl

    !rm -r mypip
    !mkdir mypip
    !tar -czvf mypip/pylcs-0.0.6.tar.gz -C ../input/pylcs-install/pylcs-0.0.6/pylcs-0.0.6 .
    !ls -l mypip

    !pip install --no-index mypip/pylcs-0.0.6.tar.gz
    
    sys.path.append("../input/sentencetransformersinstall/sentence-transformers-2.2.0")

# ====================================================
# import libraries2
# ====================================================

from catboost import CatBoost, Pool
import Levenshtein
import pylcs
#import textdistance
import reverse_geocode
from fasttext import load_model
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, InputExample, evaluation
from transformers import DistilBertModel, DistilBertTokenizer, AutoTokenizer, AutoModel, AutoConfig
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

 This environment is Kaggle Notebook
Processing /kaggle/input/reversegeocode/reverse_geocode-1.4.1-py3-none-any.whl
Installing collected packages: reverse-geocode
Successfully installed reverse-geocode-1.4.1
[0mProcessing /kaggle/input/pylcs-install/pybind11-2.9.2-py2.py3-none-any.whl
Installing collected packages: pybind11
  Attempting uninstall: pybind11
    Found existing installation: pybind11 2.9.2
    Uninstalling pybind11-2.9.2:
      Successfully uninstalled pybind11-2.9.2
Successfully installed pybind11-2.9.2
[0mrm: cannot remove 'mypip': No such file or directory
./
./setup.cfg
./README.md
./pylcs.egg-info/
./pylcs.egg-info/not-zip-safe
./pylcs.egg-info/dependency_links.txt
./pylcs.egg-info/SOURCES.txt
./pylcs.egg-info/requires.txt
./pylcs.egg-info/top_level.txt
./pylcs.egg-info/PKG-INFO
./src/
./src/main.cpp
./PKG-INFO
./setup.py
total 4
-rw-r--r-- 1 root root 3923 Jul  9 21:09 pylcs-0.0.6.tar.gz
Processing ./mypip/pylcs-0.0.6.tar.gz
  Preparing

# Helper Functions

In [6]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that fits.

    Integer columns are tried against int8, int16, int32 and int64 in that order;
    float columns against float16 and float32, falling back to float64. A dtype
    is accepted only when the column's min and max lie strictly inside its range.
    Columns whose dtype is not one of the listed numeric dtypes are left alone.

    Args:
        df: DataFrame to shrink; its columns are overwritten.
        verbose: when true, print the final size and the percentage saved.

    Returns:
        The same DataFrame object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type).startswith('int'):
            # First integer width whose open range contains the column wins;
            # if none does, the column keeps its current dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        saved_pct = 100 * (start_mem - end_mem) / start_mem
        print('Memory usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, saved_pct))
    return df

In [7]:
@contextmanager
def timer(name: str):
    """Context manager that prints a start line and, on normal exit, the elapsed seconds.

    Args:
        name: label placed in square brackets at the front of both messages.
    """
    started_at = time.time()
    print(f"[{name}] start")
    yield
    # Reached only when the managed block finishes without raising.
    elapsed = time.time() - started_at
    print(f"[{name}] done in {elapsed:.0f} s")

In [8]:
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Also sets the PYTHONHASHSEED and TOKENIZERS_PARALLELISM environment variables.

    Args:
        seed: integer seed applied to every generator.
    """
    # Environment variables
    os.environ["PYTHONHASHSEED"] = str(seed)
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Random number generators
    random.seed(seed)
    np.random.seed(seed)
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # cuDNN: deterministic kernels, no autotuning
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(CFG.seed)

In [9]:
def init_logger(log_file=OUTPUT_DIR+'train.log'):
    """Return this module's logger, writing bare messages to the console and to a file.

    The function is safe to call repeatedly (for example when the notebook cell is
    re-run): handlers left over from an earlier call are removed and closed first,
    so each message is emitted once per destination instead of once per call.

    Args:
        log_file: path of the log file; defaults to ``train.log`` in OUTPUT_DIR
            (evaluated when the function is defined).

    Returns:
        The configured ``logging.Logger`` at level INFO.
    """
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # getLogger() returns the same object on every call, so without this cleanup
    # each re-run would stack another StreamHandler/FileHandler pair on it.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

LOGGER = init_logger()

In [10]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map every value of the ``id`` column to the ``point_of_interest`` of the same row."""
    ids = input_df['id']
    pois = input_df['point_of_interest']
    return {record_id: poi for record_id, poi in zip(ids, pois)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map every ``point_of_interest`` value to the set of ``id`` values that share it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    id_sets = ids_by_poi.apply(set)
    return id_sets.to_dict()

def get_score(input_df: pd.DataFrame):
    """Mean Jaccard (IoU) score between predicted and true match sets.

    Reads the module-level ``id2poi`` and ``poi2ids`` lookup tables to find the
    true set of ids for each row.

    Args:
        input_df: frame with an ``id`` column and a ``matches`` column holding
            whitespace-separated predicted ids.

    Returns:
        The mean of the per-row scores.
    """
    def _row_iou(id_str, matches):
        truth = poi2ids[id2poi[id_str]]
        predicted = set(matches.split())
        return len(truth & predicted) / len(truth | predicted)

    row_ids = input_df['id'].to_numpy()
    row_matches = input_df['matches'].to_numpy()
    per_row = [_row_iou(id_str, matches) for id_str, matches in zip(row_ids, row_matches)]
    return np.array(per_row).mean()

def analysis(df):
    """Print the row count, unique-id count, unique-POI count and mean ids per POI of ``df``."""
    n_rows = len(df)
    n_ids = df['id'].nunique()
    n_pois = df['point_of_interest'].nunique()
    print(f'Num of data: {n_rows!s}')
    print(f'Num of unique id: {n_ids!s}')
    print(f'Num of unique poi: {n_pois!s}')

    # Number of (non-null) ids per point of interest, then averaged over POIs.
    ids_per_poi = df.groupby('point_of_interest')['id'].count()
    print(f'Mean num of unique poi: {ids_per_poi.mean()!s}')

In [11]:
def cos_sim(v1, v2):
    """Cosine similarity of two vectors: their dot product over the product of their L2 norms."""
    dot_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / norm_product

In [12]:
# ====================================================
#  Inference
# ====================================================

def inference(df):
    """Average the fold models of both seeds and keep the pairs above CFG.threshold.

    For every fold listed in ``CFG.used_fold`` one pickled model per seed is
    loaded, its positive-class probability is accumulated, and the sum is divided
    by the number of models. Pairs whose averaged probability exceeds
    ``CFG.threshold`` are grouped per ``id``.

    Args:
        df: candidate-pair feature frame containing ``id`` and ``match_id`` plus
            the model features. It is not modified.

    Returns:
        DataFrame with columns ``id`` and ``matches`` (space-joined match ids).
        Ids with no pair above the threshold do not appear in it.
    """
    # Two-seed averaging. The comment after each path is the seed it was trained with.
    # NOTE(review): pickle.load can execute arbitrary code; only point these at
    # model files from a trusted source.
    model_dirs = (
        "../input/foursquare-dataset-exp064",         # seed 42
        "../input/foursquare-dataset-exp064-seed31",  # seed 31
    )

    # The feature matrix is the same for every model, so build it once instead
    # of re-dropping the id columns before each predict call.
    features = df.drop(columns=DROP_COLS)

    pred = np.zeros(len(df))
    for fold in tqdm(range(CFG.fold)):
        if fold not in CFG.used_fold:
            continue
        for model_dir in model_dirs:
            with open(f"{model_dir}/model_fold{fold}.pkl", 'rb') as f:
                model = pickle.load(f)
            pred += model.predict(features, prediction_type='Probability').T[1]
            del model
            gc_clear()

    del features
    pred = pred / (len(CFG.used_fold) * len(model_dirs))

    # Explicit copy: adding a column to a column-subset view triggers pandas'
    # SettingWithCopy machinery.
    pairs = df[["id", "match_id"]].copy()
    pairs["prediction"] = pred

    kept = pairs[pairs["prediction"] > CFG.threshold]
    result = kept.groupby("id")["match_id"].apply(list).reset_index()
    result.columns = ["id", "matches"]
    result["matches"] = result["matches"].apply(" ".join)

    return result

In [13]:
# ====================================================
#  Post-processing
# ====================================================

def post_process(df):
    """Make predicted matches symmetric: if A lists B, then B lists A.

    Args:
        df: frame with an ``id`` column and a ``matches`` column of
            whitespace-separated ids. The ``matches`` column is overwritten.

    Returns:
        The same DataFrame object with the updated ``matches`` column.
    """
    id2match = dict(zip(df['id'].values, df['matches'].str.split()))

    for base, match in df[['id', 'matches']].values:
        match = match.split()
        if len(match) == 1:
            continue

        for m in match:
            partners = id2match.get(m)
            if partners is None:
                # `m` has no row of its own in df, so there is nowhere to record
                # the reverse link. Skipping avoids the KeyError a direct lookup
                # would raise.
                continue
            if base not in partners:
                partners.append(base)

    df['matches'] = df['id'].map(id2match).map(' '.join)
    return df

In [14]:
def gc_clear():
    """Run the garbage collector five times in a row."""
    passes = 5
    for _ in range(passes):
        gc.collect()

In [15]:
def flatten(x):
    """Concatenate the items of every inner iterable of ``x`` into one flat list (one level deep)."""
    return list(itertools.chain.from_iterable(x))

In [16]:
# Text columns that add_tfidf_features() vectorises with TF-IDF.
vec_columns = ['name', 'categories', 'address', 'text']
# NOTE(review): feat_columns is not read by any code visible in this part of the
# notebook; presumably the pairwise string-feature step uses it.
feat_columns = ['name', 'address', 'city', 'state', 'zip', 'country', 'url', 'phone', 'categories', 'name_lang', 'city2', 'country2', 'text'] 

# Data Loading

In [17]:
# Load train.csv when CFG.train is set, otherwise test.csv.
with timer("Data Loading"):
    if CFG.train:
        original_df = pd.read_csv(DATA_DIR + "train.csv")
        if CFG.debug:
            # Debug runs keep only the first 100 rows.
            original_df = original_df[:100]
    else:
        original_df = pd.read_csv(DATA_DIR + "test.csv")
        # Constant placeholder so that the column read by get_id2poi() /
        # get_poi2ids() also exists on the test path.
        original_df["point_of_interest"] = "match"
display(original_df)

[Data Loading] start
[Data Loading] done in 0 s


Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,point_of_interest
0,E_00001118ad0191,Jamu Petani Bagan Serai,5.012169,100.535805,,,,,MY,,,Cafés,match
1,E_000020eb6fed40,Johnny's Bar,40.434209,-80.56416,497 N 12th St,Weirton,WV,26062.0,US,,,Bars,match
2,E_00002f98667edf,QIWI,47.215134,39.686088,"Межевая улица, 60",Ростов-на-Дону,,,RU,https://qiwi.com,78003010000.0,ATMs,match
3,E_001b6bad66eb98,"Gelora Sriwijaya, Jaka Baring Sport City",-3.014675,104.794374,,,,,ID,,,Stadiums,match
4,E_0283d9f61e569d,Stadion Gelora Sriwijaya,-3.021727,104.788628,Jalan Gubernur Hasan Bastari,Palembang,South Sumatra,11480.0,ID,,,Soccer Stadiums,match


In [18]:
# Module-level lookup tables; get_score() reads both of them as globals.
id2poi = get_id2poi(original_df)
poi2ids = get_poi2ids(original_df)

In [19]:
def calc_maximum_score(original_df, df):
    """Print the score obtained if every positively labelled candidate pair were predicted.

    Each id is matched with itself plus every ``match_id`` whose ``label`` is 1 in
    ``df``; the result is scored with get_score() before and after post_process().

    Args:
        original_df: frame whose ``id`` column lists every record.
        df: candidate pairs with ``id``, ``match_id`` and ``label`` columns.
    """
    # Every record matches itself.
    self_pairs = pd.DataFrame()
    self_pairs['id'] = original_df['id'].unique().tolist()
    self_pairs['match_id'] = self_pairs['id']

    # Candidate pairs that are true matches.
    true_pairs = df.loc[df['label'] == 1, ['id', 'match_id']]

    all_pairs = pd.concat([self_pairs, true_pairs])
    eval_df = all_pairs.groupby('id')['match_id'].apply(list).reset_index()
    eval_df['matches'] = eval_df['match_id'].apply(lambda ids: ' '.join(set(ids)))

    score_before_pp = get_score(eval_df)
    print(f"maximum socre (before pp): {score_before_pp}")

    eval_df_pp = post_process(eval_df)
    score_after_pp = get_score(eval_df_pp)
    print(f"maximum socre (after pp): {score_after_pp}")

In [20]:
def categorical_similarity(A, B):
    """Overlap ratio of two ``", "``-separated category lists.

    Returns NaN when either argument equals the string ``"nan"``; otherwise the
    number of shared entries divided by the size of the smaller entry set.
    """
    if A == "nan" or B == "nan":
        return np.nan

    left = set(str(A).split(", "))
    right = set(str(B).split(", "))
    n_shared = len(left & right)

    return max(n_shared / len(left), n_shared / len(right))

def gesh(A, B):
    """difflib ``SequenceMatcher`` ratio of A and B, or NaN if either is the string ``"nan"``."""
    if A == "nan" or B == "nan":
        return np.nan
    matcher = difflib.SequenceMatcher(None, A, B)
    return matcher.ratio()

def leven(A, B):
    """``Levenshtein.distance`` of A and B, or NaN if either is the string ``"nan"``."""
    either_missing = A == "nan" or B == "nan"
    return np.nan if either_missing else Levenshtein.distance(A, B)

def jaro(A, B):
    """``Levenshtein.jaro_winkler`` score of A and B, or NaN if either is the string ``"nan"``."""
    if A == "nan" or B == "nan":
        return np.nan
    return Levenshtein.jaro_winkler(A, B)

def lcs_sequence(A, B):
    """``pylcs.lcs`` of A and B, or NaN if either is the string ``"nan"``."""
    either_missing = A == "nan" or B == "nan"
    return np.nan if either_missing else pylcs.lcs(A, B)

def lcs_string(A, B):
    """``pylcs.lcs2`` of A and B, or NaN if either is the string ``"nan"``."""
    if A == "nan" or B == "nan":
        return np.nan
    return pylcs.lcs2(A, B)

def equal(A, B):
    """1 if A equals B, 0 if not, NaN if either is the string ``"nan"``."""
    either_missing = A == "nan" or B == "nan"
    return np.nan if either_missing else int(A == B)

# Preprocess

## Divide Train Data into about 600K×2

In [21]:
# Training only: split the records into two sets (column "set", values 0/1).
# point_of_interest is the GroupKFold group key, so all records of one POI land
# in the same set.
if CFG.train:
    kf = GroupKFold(n_splits=2)
    for i_fold, (trn_idx, val_idx) in enumerate(kf.split(original_df, original_df["point_of_interest"], original_df["point_of_interest"])):
        # Each record appears in exactly one validation fold; that fold id becomes its set.
        # NOTE(review): .loc with positional indices assumes original_df has a default RangeIndex.
        original_df.loc[val_idx, "set"] = i_fold
    original_df["set"] = original_df["set"].astype("int8")
    print(original_df["set"].value_counts())

## Candidates Generation

When generating candidates, I used not only the haversine distance but also `name` similarity and `phone` similarity, so that the selected candidates give a higher theoretical maximum IoU score.

In [22]:
def recall_knn(original_df, Neighbors=CFG.n_neighbors):
    """Build the candidate-pair table (id, match_id) for every record.

    Steps, in order:
      1. Per country, find each record's nearest neighbours by haversine distance.
      2. Keep the first CFG.name_low_bound neighbours unconditionally; keep a farther
         neighbour only if its name or phone is similar enough.
      3. Drop self pairs and, at inference time, mirrored duplicate pairs.
      4. When CFG.train is set, add a 0/1 ``label`` column and print the best
         achievable score.
      5. Add a ``data_split`` column used to process the pairs in chunks.

    Side effects: rewrites ``original_df['phone']`` (empty string -> "nan") and may
    set ``CFG.data_split`` to 1.

    Args:
        original_df: one row per record; reads id, country, latitude, longitude,
            name, phone and (when training) point_of_interest.
        Neighbors: maximum neighbours searched per record (default is taken from
            CFG.n_neighbors when the function is defined).

    Returns:
        DataFrame with columns id, match_id, data_split and, when training, label.
    """

    # ==============================================
    #  Candidates Generation
    # ==============================================
    # ideas and codes from my teammate @mrt0933!

    near_ids = {}
    near_dists = {}

    # Neighbours are searched only among records of the same country.
    for country, country_df in tqdm(original_df.groupby("country")):
        country_df = country_df.reset_index(drop=True)
    
        # The regressor is used only for its kneighbors() lookup; the fitted target is the row index.
        # NOTE(review): the haversine metric presumably expects [lat, lon] in radians;
        # original_df_preprocess() converts them, so confirm it runs before this function.
        knn = KNeighborsRegressor(n_neighbors=min(len(country_df), Neighbors), 
                                    metric='haversine', n_jobs=-1)
        knn.fit(country_df[['latitude','longitude']], country_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude','longitude']], return_distance=True)

        ids = country_df['id'].values
        nids = country_df['id'].values[nears]
        for j in range(len(country_df)):
            near_ids[ids[j]] = nids[j]
            near_dists[ids[j]] = dists[j]

    def get_lcs_rate(str1, str2):
        # LCS length relative to the length of str1; NaN when str1 is "nan"
        # (lcs_sequence itself returns NaN when str2 is "nan").
        if str1!="nan":
            return lcs_sequence(str1, str2) / len(str1)
        else :
            return np.nan

    # id -> row position in original_df, used to index the names / phone arrays below.
    ids = original_df['id'].values
    id2index = {ids[i]:i for i in range(len(ids))}

    # The first CFG.name_low_bound neighbours are always kept.
    res = {idi:near_ids[idi][:CFG.name_low_bound].tolist() for idi in near_ids}
    names = original_df['name'].values.astype(str)

    # Mutates the caller's frame: empty phone strings become "nan".
    original_df['phone'] = original_df['phone'].replace("", "nan")
 
    phone = original_df['phone'].values

    for id0 in tqdm(near_ids):
        idx0 = id2index[id0]
        n0 = names[idx0]
        ph0 = phone[idx0]

        for k, id1 in enumerate(near_ids[id0]):
            idx1 = id2index[id1]
            ph1 = phone[idx1]
            n1 = names[idx1]

            # Already included in res above.
            if k < CFG.name_low_bound:
                continue

            # A NaN score compares False against either threshold, so pairs with a
            # missing name or phone pass only through the other criterion.
            jrate = jaro(n0, n1)
            lrate = get_lcs_rate(ph0, ph1)
            if jrate >= CFG.name_th or lrate >= CFG.phone_th:
                res[id0].append(id1)

    id_list = []
    match_id_list = []

    # Expand res into two parallel lists: each id repeated once per kept neighbour.
    for id0 in tqdm(res):
        num = len(res[id0])
        id_list.append([[id0]*num])
        match_id_list.append(res[id0])

    df = pd.DataFrame()
    # id_list is nested one level deeper than match_id_list, hence the double flatten.
    df["id"] = flatten(flatten(id_list))
    df["match_id"] = flatten(match_id_list)

    del id_list, match_id_list
    gc_clear()

    # ==============================================
    #  Delete unnecessary rows
    # ==============================================

    # Delete same id
    df = df[df["id"]!=df["match_id"]].reset_index(drop=True)

    # Delete duplicate pairs
    # Inference only: (a, b) and (b, a) count as the same pair; the first occurrence is kept.
    if not CFG.train:
        df = df[~pd.DataFrame(np.sort(df[['id','match_id']].values,1)).duplicated()].reset_index(drop=True)

    print(f"len: {len(df)}")

    # ==============================================
    #  Make 0/1 label
    # ==============================================

    if CFG.train:

        ids = df['id'].tolist()
        match_ids = df['match_id'].tolist()

        poi = original_df.set_index('id').loc[ids]['point_of_interest'].values
        match_poi = original_df.set_index('id').loc[match_ids]['point_of_interest'].values

        # label = 1 when both records of the pair share a point_of_interest.
        df['label'] = np.array(poi == match_poi, dtype = np.int8)

        print(df['label'].value_counts())
        
        del poi, match_poi, ids, match_ids
        gc_clear()

    # ==============================================
    #  Max score
    # ==============================================
    if CFG.train:
        calc_maximum_score(original_df, df)
    
    # ==============================================
    #  Make test fold
    # ==============================================
            
    if len(df)<10000:
        # Small inputs are handled as a single chunk; note this overwrites the global CFG.data_split.
        CFG.data_split = 1
        df["data_split"] = 0
        df["data_split"] = df["data_split"].astype("int8")
    else:
        # Grouping by id keeps all pairs of one id in the same chunk.
        kf = GroupKFold(n_splits=CFG.data_split)
        for i_fold, (trn_idx, val_idx) in enumerate(kf.split(df, df["id"], df["id"])):
            df.loc[val_idx, "data_split"] = i_fold
        df["data_split"] = df["data_split"].astype("int8")
    
    return df

## Feature Engineering

### Preprocess original_df

In [23]:
def original_df_preprocess(original_df):
    """Add derived columns to ``original_df`` in place and cast its text columns to str.

    Adds ``name_lang`` (fastText language label of the name), ``country2`` /
    ``city2`` (reverse-geocoded from the coordinates) and ``text`` (name, address,
    city, state and categories joined by spaces). Latitude and longitude are
    converted from degrees to radians. Finally every column except id, latitude
    and longitude is cast to ``str``, so missing values become the string "nan".

    Args:
        original_df: record table; it is modified in place.

    Returns:
        The same DataFrame object.
    """
    # ==============================================
    # Language identification
    # ==============================================

    if CFG.colab:
        model = load_model("models/lid.176.bin")
    else:
        model = load_model("../input/language-predictor/lid.176.bin")

    # [9:] strips a 9-character prefix from the predicted label (presumably "__label__").
    original_df["name_lang"] = original_df["name"].fillna(" ").progress_apply(lambda x: model.predict(x)[0][0][9:])
    # NOTE(review): the cast to str happens only at the end of this function, so
    # unless the caller has already stringified `name`, missing names are NaN here
    # rather than "nan" and this line changes nothing. Left as is because models
    # may have been trained on the resulting feature; confirm before changing.
    original_df.loc[original_df["name"]=="nan", "name_lang"] = "nan"

    del model
    gc_clear()

    # ==============================================
    # Reverse geocode
    # ==============================================

    def get_geo_info(coords):
        data = reverse_geocode.search(coords)
        return [v['country_code'] for v in data], [v['city'] for v in data]

    # One lookup yields both lists; searching twice doubled the cost of this step.
    country_codes, cities = get_geo_info(original_df[['latitude', 'longitude']])
    original_df['country2'] = country_codes
    original_df['city2'] = cities

    # ==============================================
    # Degree to radian
    # ==============================================
    original_df["latitude"] = original_df["latitude"] * np.pi / 180
    original_df["longitude"] = original_df["longitude"] * np.pi / 180

    # ==============================================
    # Make features
    # ==============================================
    original_df["text"] = original_df["name"].fillna("") + " " + \
                            original_df["address"].fillna("") + " " + \
                            original_df["city"].fillna("") + " " + \
                            original_df["state"].fillna("") + " " + \
                            original_df["categories"].fillna("")

    # ==============================================
    # Clean Sentences
    # ==============================================

    #def clean_text(text):
    #    try:
    #        text = text.encode('utf-8').decode("unicode_escape")
    #        text = text.encode('ascii', 'ignore').decode("unicode_escape")
    #    except:
    #        pass
    #    return text
    # 
    #original_df["name"] = original_df["name"].apply(lambda x: clean_text(x))

    #def remove_char(x):
    #    return re.sub(r'[^0-9]', '', x)

    #func = np.frompyfunc(remove_char, 1, 1)
    #original_df['phone'] = func(original_df['phone'].values)
    #original_df['phone'] = original_df['phone'].replace("", np.nan)

    # Everything except the key and the coordinates becomes a string; NaN turns
    # into "nan", which the pairwise similarity helpers treat as missing.
    for c in original_df.columns:
        if c not in ["id", "latitude", "longitude"]:
            original_df[c] = original_df[c].astype(str) #.str.lower()

    return original_df

### Distance features

In [24]:
# ==============================================
# Distance features
# ==============================================

def vectorized_haversine(lats1, lats2, longs1, longs2):
    """Element-wise haversine central angle between two sets of points.

    All four inputs are passed through ``np.radians``, i.e. they are treated as
    degrees. The result is the angle in radians; it is not multiplied by a radius.

    NOTE(review): original_df_preprocess() already rescales latitude/longitude to
    radians in place; if those columns are what arrives here, the conversion is
    applied twice. Confirm before relying on absolute distances.
    """
    sin_half_dlat = np.sin(np.radians(lats2 - lats1) / 2)
    sin_half_dlon = np.sin(np.radians(longs2 - longs1) / 2)
    cos_lat1 = np.cos(np.radians(lats1))
    cos_lat2 = np.cos(np.radians(lats2))

    a = sin_half_dlat * sin_half_dlat + cos_lat1 * cos_lat2 * sin_half_dlon * sin_half_dlon
    return 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

def add_distance_features(original_df, df):
    """Add coordinate-difference and haversine columns to the pair table ``df``.

    Adds ``latdiff`` and ``londiff`` (absolute differences, float16) and
    ``haversine`` (output of vectorized_haversine, float32).

    Args:
        original_df: record table with ``id``, ``latitude`` and ``longitude``.
            It is not modified.
        df: candidate pairs with ``id`` and ``match_id``; modified in place.

    Returns:
        The same ``df`` object.
    """
    # Only the two coordinate columns are needed. Selecting them before the
    # id-based lookups avoids copying every other column for each lookup.
    coords = original_df.set_index('id')[["latitude", "longitude"]]

    left = coords.loc[df["id"].tolist()]
    right = coords.loc[df["match_id"].tolist()]

    lat1 = left["latitude"].values
    lat2 = right["latitude"].values
    lon1 = left["longitude"].values
    lon2 = right["longitude"].values

    df['latdiff'] = abs(lat1 - lat2).astype("float16")
    df['londiff'] = abs(lon1 - lon2).astype("float16")

    df['haversine'] = vectorized_haversine(lat1, lat2, lon1, lon2).astype("float32")

    return df

### tfidf features

In [25]:
def add_tfidf_features(original_df, df):
    """Add TF-IDF similarity columns for every column in ``vec_columns``.

    Two vectorisers are used: word unigrams (``tfidf1``) and character 3-grams
    within word boundaries (``tfidf2``). For each pair the feature is the dot
    product of the two rows' TF-IDF vectors, stored as float16 in
    ``{col}_{tfidf_name}_sim``.

    Args:
        original_df: record table with ``id`` and the text columns.
            NOTE(review): index labels are used as row positions into the TF-IDF
            matrix, which assumes a default RangeIndex. Confirm with the caller.
        df: candidate pairs with ``id`` and ``match_id``; modified in place.

    Returns:
        The same ``df`` object.
    """
    id2index_d = dict(zip(original_df['id'].values, original_df.index))
    indexs = [id2index_d[i] for i in df['id']]
    match_indexs = [id2index_d[i] for i in df['match_id']]

    tfidf1 = TfidfVectorizer(analyzer="word", #["word", "char", "char_wb"]
                            #strip_accents="unicode",
                            ngram_range=(1, 1))
    tfidf2 = TfidfVectorizer(analyzer="char_wb", #["word", "char", "char_wb"]
                            #strip_accents="unicode",
                            ngram_range=(3, 3))

    tfidf_dict = {"tfidf1": tfidf1,
                  "tfidf2": tfidf2,
                  }

    # The pairs are processed in five slices to bound the size of the
    # intermediate sparse products; the slice width does not change per column.
    n_chunks = 5
    chunk = len(indexs)//n_chunks + 1

    for tfidf_name in ["tfidf1","tfidf2"]:
        for col in tqdm(vec_columns):

            tv_fit = tfidf_dict[tfidf_name].fit_transform(original_df[col])

            # Seeded with an empty float64 array so the result dtype does not
            # depend on the chunks.
            parts = [np.array([])]
            for i in range(n_chunks):
                s = i*chunk
                e = (i+1)*chunk
                # Row-wise dot product of the paired TF-IDF vectors.
                parts.append(tv_fit[indexs[s:e]].multiply(tv_fit[match_indexs[s:e]]).sum(axis = 1).A.ravel())
            output = np.concatenate(parts)

            # Stored directly as float16 (no intermediate float64 column).
            df[f'{col}_{tfidf_name}_sim'] = output.astype("float16")

            del tv_fit, output, parts
            gc_clear()

    del id2index_d, indexs, match_indexs
    gc_clear()

    return df

### Pretrained BERT features

In [26]:
def add_bert_features(original_df, df):
    """Add cosine-similarity columns from four pretrained sentence-embedding models.

    For each model and each of ``name``, ``categories`` and ``name_categories``
    (name + "[SEP]" + categories) every record is embedded once, and each pair
    gets the cosine similarity of its two embeddings as float16 in
    ``{col}_{model_name}_sim``.

    ``original_df`` temporarily receives the helper columns ``name_categories`` and
    ``index``; both are removed again before returning, including when an
    exception occurs.

    Args:
        original_df: record table with ``id``, ``name`` and ``categories``.
            NOTE(review): index labels are used as positions into the embedding
            matrix, which assumes a default RangeIndex. Confirm with the caller.
        df: candidate pairs with ``id`` and ``match_id``; modified in place.

    Returns:
        The same ``df`` object.
    """
    if CFG.colab:
        model_dict = {"mpnet": "paraphrase-multilingual-mpnet-base-v2",
                      "para_xlm": "paraphrase-xlm-r-multilingual-v1",
                      "xlm": "xlm-roberta-base",
                      "MiniLM": "paraphrase-multilingual-MiniLM-L12-v2",
                      }
    else:
        model_dict = {"mpnet": "../input/sbert-models/paraphrase-multilingual-mpnet-base-v2",
                      "para_xlm": "../input/sbert-models/paraphrase-xlm-r-multilingual-v1",
                      "xlm": "../input/sbert-models/xlm-roberta-base",
                      "MiniLM": "../input/sbert-models/paraphrase-multilingual-MiniLM-L12-v2",
                    }

    # The helper columns and the pair -> row lookups are the same for every
    # model, so they are built once up front instead of once per model.
    original_df["name_categories"] = original_df["name"] + "[SEP]" + original_df["categories"]
    original_df["index"] = original_df.index
    original_df["index"] = original_df["index"].astype("int32")

    try:
        # Selecting the single "index" column first avoids re-indexing all columns.
        index_by_id = original_df.set_index("id")["index"]
        id_indexes = index_by_id.loc[df["id"]].values
        match_id_indexes = index_by_id.loc[df["match_id"]].values
        del index_by_id

        for model_name in model_dict:

            model = SentenceTransformer(model_dict[model_name], device=device)

            for col in ["name", "categories", "name_categories"]:
                vec = model.encode(original_df[col], device=device)

                output = [
                    cos_sim(vec[idx1], vec[idx2])
                    for idx1, idx2 in tqdm(zip(id_indexes, match_id_indexes), total=len(id_indexes))
                ]

                df[f"{col}_{model_name}_sim"] = output
                df[f"{col}_{model_name}_sim"] = df[f"{col}_{model_name}_sim"].astype("float16")

                del vec, output
                gc_clear()

            # Release the model (and its GPU memory) before loading the next one.
            del model
            torch.cuda.empty_cache()
            gc_clear()

        del id_indexes, match_id_indexes
    finally:
        del original_df["name_categories"], original_df["index"]
        gc_clear()

    return df

### Fine-tuned BERT feature (Stage1)

Reference: [My other notebook](https://www.kaggle.com/code/shkanda/unsupervised-baseline-curricularface?scriptVersionId=100429606)

In [27]:
# ====================================================
#  CurricularFace
# ====================================================   

def l2_norm(input, axis = 1):
    """Divide ``input`` by its L2 norm taken along ``axis`` (the norm keeps that dimension for broadcasting)."""
    length = torch.norm(input, p=2, dim=axis, keepdim=True)
    return input / length

class CurricularFace(nn.Module):
    """Cosine-margin classification head with a running-statistic weight for hard negatives.

    Holds a learnable ``kernel`` of shape (in_features, out_features) and produces
    scaled cosine logits between L2-normalised embeddings and the L2-normalised
    kernel columns, with an angular margin ``m`` applied to the target class.
    """
    def __init__(self, in_features, out_features, s = 5, m = 0.050):
        """
        Args:
            in_features: embedding dimension.
            out_features: number of classes (columns of the kernel).
            s: factor the final logits are multiplied by.
            m: angular margin added to the target-class angle.
        """
        super(CurricularFace, self).__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.m = m
        self.s = s
        # Constants for cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m).
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Targets with cos(theta) at or below this threshold get the linear fallback (target_logit - mm).
        self.threshold = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m
        # torch.Tensor(...) is uninitialised; the values are set by normal_ below.
        self.kernel = nn.Parameter(torch.Tensor(in_features, out_features))
        # Running average of the mean target cosine; updated in forward().
        self.register_buffer('t', torch.zeros(1))
        nn.init.normal_(self.kernel, std=0.01)

    def forward(self, embbedings, label):
        """Return the margin-adjusted logits, scaled by ``s``.

        Args:
            embbedings: batch of embeddings, one row per sample (rows are
                normalised along axis 1 and multiplied with the kernel).
            label: class index per sample, used to index and scatter along dim 1.

        Note: every call updates the buffer ``t``.
        """
        embbedings = l2_norm(embbedings, axis = 1)
        kernel_norm = l2_norm(self.kernel, axis = 0)
        # Cosine between each embedding and each class column.
        cos_theta = torch.mm(embbedings, kernel_norm)
        cos_theta = cos_theta.clamp(-1, 1)
        with torch.no_grad():
            # NOTE(review): origin_cos is not used afterwards.
            origin_cos = cos_theta.clone()
        # Cosine to the true class, shaped (batch, 1).
        target_logit = cos_theta[torch.arange(0, embbedings.size(0)), label].view(-1, 1)

        sin_theta = torch.sqrt(1.0 - torch.pow(target_logit, 2))
        cos_theta_m = target_logit * self.cos_m - sin_theta * self.sin_m
        # Entries whose cosine exceeds the margin-shifted target cosine of their row.
        mask = cos_theta > cos_theta_m
        final_target_logit = torch.where(target_logit > self.threshold, cos_theta_m, target_logit - self.mm)

        hard_example = cos_theta[mask]
        with torch.no_grad():
            # Exponential moving average with weight 0.01 on the current batch.
            self.t = target_logit.mean() * 0.01 + (1 - 0.01) * self.t
        # Masked entries are rescaled by (t + cos); done in place on cos_theta.
        cos_theta[mask] = hard_example * (self.t + hard_example)
        # Overwrite the target-class entry of each row with its margin-adjusted logit.
        cos_theta.scatter_(1, label.view(-1, 1).long(), final_target_logit)
        output = cos_theta * self.s
        return output

In [28]:
# ====================================================
#  Fine-tuned Bert Model
# ==================================================== 

class CustomModel(nn.Module):
    """Pretrained transformer encoder + projection head + CurricularFace classifier.

    Args:
        model_name: Name or path passed to `AutoConfig` / `AutoModel.from_pretrained`.
        n_classes: Number of output classes of the CurricularFace head.
        embedding_size: Width of the embedding produced by `extract`.
    """
    def __init__(self, model_name, n_classes=369985, embedding_size=128):
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, config=self.config)
        #self.fc = ArcMarginProduct(embedding_size, CFG.n_classes)
        self.fc = CurricularFace(embedding_size, n_classes)
        # +2: latitude and longitude are concatenated to the encoder vector in extract()
        projection_in = self.config.hidden_size + 2
        self.head = nn.Sequential(
            nn.Linear(projection_in, embedding_size),
            nn.BatchNorm1d(embedding_size),
        )

    def forward(self, ids, mask, lat, lon, labels):
        """Return the CurricularFace logits for a labelled batch."""
        return self.fc(self.extract(ids=ids, mask=mask, lat=lat, lon=lon), labels)

    def extract(self, ids, mask, lat, lon):
        """Return the `embedding_size`-wide embedding for each sample (no labels needed)."""
        # Make the coordinates column vectors so they can be concatenated feature-wise
        lat_col = lat.view(-1, 1)
        lon_col = lon.view(-1, 1)
        encoded = self.model(input_ids=ids, attention_mask=mask)
        first_token = encoded[0][:, 0, :]  # first-position (CLS) vector of the last hidden state
        features = torch.cat([first_token, lat_col, lon_col], axis=1)
        return self.head(features)
    
# Instantiate the model once (this loads the pretrained encoder) and print its module tree.
print(CustomModel(CFG.model))

CustomModel(
  (model): XLMRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((76

In [29]:
# ====================================================
#  Dataset
# ====================================================

class FoursquareDataset(Dataset):
    """Tokenised text plus coordinates (and optionally labels) for each row of `df`.

    All texts are tokenised once in `__init__`, padded/truncated to
    `CFG.max_length`, and kept in memory.

    Args:
        df: DataFrame with columns 'text', 'latitude', 'longitude' and, when
            `include_labels` is True, 'point_of_interest'.
        include_labels: If True, `__getitem__` also returns the label.
    """
    def __init__(self, df, include_labels=True):
        tokenizer = AutoTokenizer.from_pretrained(CFG.model)

        self.df = df
        self.include_labels = include_labels

        self.text = df['text'].tolist()
        self.lat = df['latitude'].values
        self.lon = df['longitude'].values
        # Read the label column only when it is needed, so unlabeled (test) frames
        # without 'point_of_interest' do not raise KeyError.
        self.labels = df['point_of_interest'].values if include_labels else None

        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding = 'max_length',
            max_length = CFG.max_length,
            truncation = True,
            return_attention_mask=True
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return (input_ids, attention_mask, lat, lon[, label]) tensors for row `idx`."""
        input_ids = torch.tensor(self.encoded['input_ids'][idx], dtype=torch.long)
        attention_mask = torch.tensor(self.encoded['attention_mask'][idx], dtype=torch.long)
        lat = torch.tensor(self.lat[idx], dtype=torch.float)
        lon = torch.tensor(self.lon[idx], dtype=torch.float)

        if self.include_labels:
            label = torch.tensor(self.labels[idx], dtype=torch.long)
            return input_ids, attention_mask, lat, lon, label

        return input_ids, attention_mask, lat, lon

In [30]:
def inference_fn(test, i_set):
    """Embed every row of `test` with a fine-tuned `CustomModel` checkpoint.

    Args:
        test: DataFrame accepted by `FoursquareDataset` (labels are not required).
        i_set: 0 or 1. Selects the classification-head size and the checkpoint
            file; note the checkpoint loaded is the one named for set `1 - i_set`.

    Returns:
        numpy array with one embedding row per row of `test`, in input order.

    Raises:
        ValueError: If `i_set` is not 0 or 1.
    """
    # Head size per set; it has to match the checkpoint for load_state_dict to succeed.
    n_classes_by_set = {0: 369987, 1: 369985}
    set_idx = int(i_set)
    if set_idx not in n_classes_by_set:
        raise ValueError(f"i_set must be 0 or 1, got {i_set!r}")
    n_classes = n_classes_by_set[set_idx]

    test_dataset = FoursquareDataset(test, include_labels=False)

    test_loader = DataLoader(
        test_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
        worker_init_fn=seed_worker,
        generator=g
    )

    model = CustomModel(CFG.model, n_classes=n_classes)
    path = MODEL_DIR2 + f"set{1-int(i_set)}_bert_epoch15.pth"
    # Load on CPU first, then move the whole model to the target device.
    state = torch.load(path, map_location=torch.device('cpu'))
    model.load_state_dict(state)
    model.to(device)
    model.eval()

    preds = []
    for input_ids, attention_mask, lat, lon in tqdm(test_loader, total=len(test_loader)):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        lat = lat.to(device)
        lon = lon.to(device)

        with torch.no_grad():
            pred = model.extract(input_ids, attention_mask, lat, lon)
        preds.append(pred.cpu().numpy())
    preds = np.concatenate(preds)

    # Free the model's GPU memory before returning.
    del model
    torch.cuda.empty_cache()
    gc_clear()

    return preds

In [31]:
def add_finetuned_bert_features(original_df, df):
    """Add `finetuned_mpnet_sim`: cosine similarity of fine-tuned embeddings per pair.

    Args:
        original_df: One row per place, with an 'id' column (and 'set' when
            training). Its index values are used as row positions into the
            embedding matrix, so it is assumed to be 0..n-1 — TODO confirm at call sites.
        df: Candidate pairs with 'id' and 'match_id' columns.

    Returns:
        `df` with a float16 `finetuned_mpnet_sim` column added.

    When `CFG.train` is true, only the model for `original_df`'s set is used;
    otherwise the similarities of both models are averaged.
    """
    # id -> row position lookup, built without adding a temporary column to the caller's frame.
    position_by_id = pd.Series(
        original_df.index.values.astype("int32"), index=original_df["id"].values
    )
    id_indexes = position_by_id.loc[df["id"]].values
    match_id_indexes = position_by_id.loc[df["match_id"]].values

    def _pair_similarity(embedding):
        # Cosine similarity between the two embeddings of every (id, match_id) pair.
        return [
            cos_sim(embedding[idx1], embedding[idx2])
            for idx1, idx2 in tqdm(zip(id_indexes, match_id_indexes), total=len(id_indexes))
        ]

    if CFG.train:

        i_set = original_df["set"][0]
        embedding = inference_fn(original_df, i_set)
        output = _pair_similarity(embedding)

        del embedding
        gc_clear()

    else:

        # Process one model at a time so only one embedding matrix is alive at once.
        embedding0 = inference_fn(original_df, 0)
        output0 = _pair_similarity(embedding0)
        del embedding0
        gc_clear()

        embedding1 = inference_fn(original_df, 1)
        output1 = _pair_similarity(embedding1)
        del embedding1
        gc_clear()

        output = (np.array(output0) + np.array(output1))/2
        del output0, output1

    df["finetuned_mpnet_sim"] = output
    df["finetuned_mpnet_sim"] = df["finetuned_mpnet_sim"].astype("float16")

    del output
    gc_clear()

    return df

### Other features

`multiprocessing` is used to speed up feature generation.

Since `multiprocessing` requires a lot of memory, I reduced memory usage by creating features and making predictions separately for each segment of the data (each `data_split`).

In [32]:
def _add_other_features(args):
    """Worker: add string-similarity features for one chunk of candidate pairs.

    Args:
        args: `((group_key, df), original_df)`. `group_key` is ignored; `df`
            holds pairs with 'id' and 'match_id'; `original_df` holds one row
            per place with an 'id' column and the columns in `feat_columns`.

    Returns:
        `df` with per-column similarity/length features and their row-wise
        means added. Expects the `*_tfidf1_sim` / `*_tfidf2_sim` columns for
        `vec_columns` to be present in `df` already.
    """
    (_, df), original_df = args

    # Build the id-indexed lookup once; it does not depend on the column being processed.
    indexed_df = original_df.set_index('id')
    # Columns compared only for equality (no edit-distance style features).
    equality_only_cols = ['country', 'name_lang', 'city2', 'country2']
    # Columns that additionally get no length-normalised features.
    no_length_cols = equality_only_cols + ['phone', 'zip']

    for col in tqdm(feat_columns):

        # Select just the needed column instead of materialising every column per pair.
        col_values = indexed_df.loc[df['id'], col].values.astype(str)
        matcol_values = indexed_df.loc[df['match_id'], col].values.astype(str)

        if col not in equality_only_cols:
            df[f'{col}_gesh'] = [gesh(s1, s2) for s1, s2 in zip(col_values, matcol_values)]
            df[f'{col}_leven'] = [leven(s1, s2) for s1, s2 in zip(col_values, matcol_values)]
            df[f'{col}_jaro'] = [jaro(s1, s2) for s1, s2 in zip(col_values, matcol_values)]
            df[f'{col}_lcs_sequence'] = [lcs_sequence(s1, s2) for s1, s2 in zip(col_values, matcol_values)]
            df[f'{col}_lcs_string'] = [lcs_string(s1, s2) for s1, s2 in zip(col_values, matcol_values)]

            # Measure against memory limitation
            df[f'{col}_gesh'] = df[f'{col}_gesh'].astype("float16")
            df[f'{col}_jaro'] = df[f'{col}_jaro'].astype("float16")

        if col not in no_length_cols:
            df[f'{col}_len'] = list(map(len, col_values))
            df[f'match_{col}_len'] = list(map(len, matcol_values))
            df[f'{col}_len_diff'] = (np.abs(df[f'{col}_len'] - df[f'match_{col}_len'])).astype("int16")
            # Normalise by the geometric mean of the two string lengths
            df[f'{col}_nleven'] = (df[f'{col}_leven'] / np.sqrt(df[f'{col}_len']*df[f'match_{col}_len'])).astype("float16")
            df[f'{col}_nlcs_sequence'] = (df[f'{col}_lcs_sequence'] / np.sqrt(df[f'{col}_len']*df[f'match_{col}_len'])).astype("float16")
            df[f'{col}_nlcs_string'] = (df[f'{col}_lcs_string'] / np.sqrt(df[f'{col}_len']*df[f'match_{col}_len'])).astype("float16")

            # The raw lengths are only intermediates
            del df[f'match_{col}_len'], df[f'{col}_len']
            gc_clear()

        if col in ['categories']:
            df[f'{col}_similarity'] = [categorical_similarity(s1, s2) for s1, s2 in zip(col_values, matcol_values)]
            df[f'{col}_similarity'] = df[f'{col}_similarity'].astype("float16")

        if col in equality_only_cols:
            df[f'{col}_equal'] = [equal(s1, s2) for s1, s2 in zip(col_values, matcol_values)]
            df[f'{col}_equal'] = df[f'{col}_equal'].fillna(-1).astype('int8')

        del col_values, matcol_values
        gc_clear()

    # ====================================================
    # Mean features
    # ====================================================
    feat_columns2 = ['name', 'address', 'city', 'state', 'url', 'categories']

    df["gesh_mean"] = df[[f"{col}_gesh" for col in feat_columns2]].mean(axis=1).astype("float16")
    df["tfidf1_mean"] = df[[f"{col}_tfidf1_sim" for col in vec_columns]].mean(axis=1).astype("float16")
    df["tfidf2_mean"] = df[[f"{col}_tfidf2_sim" for col in vec_columns]].mean(axis=1).astype("float16")
    df["leven_mean"] = df[[f"{col}_leven" for col in feat_columns2]].mean(axis=1).astype("float16")
    df["jaro_mean"] = df[[f"{col}_jaro" for col in feat_columns2]].mean(axis=1).astype("float16")
    df["nlcs_sequence_mean"] = df[[f"{col}_nlcs_sequence" for col in feat_columns2]].mean(axis=1).astype("float16")
    df["nlcs_string_mean"] = df[[f"{col}_nlcs_string" for col in feat_columns2]].mean(axis=1).astype("float16")

    # Measure against memory limitation: downcast the raw distances only after
    # the means above have been computed from them.
    for col in feat_columns:
        if col not in equality_only_cols:
            df[f'{col}_leven'] = df[f'{col}_leven'].fillna(99).astype("int16")
            df[f'{col}_lcs_sequence'] = df[f'{col}_lcs_sequence'].fillna(99).astype("int16")
            df[f'{col}_lcs_string'] = df[f'{col}_lcs_string'].fillna(99).astype("int16")

    return df

def add_other_features(original_df, df):
    """Add the `_add_other_features` columns to `df`, in parallel when worthwhile.

    Args:
        original_df: One row per place; passed unchanged to every worker.
        df: Candidate pairs; an `idx_group` column is added to it for chunking
            on the parallel path.

    Returns:
        DataFrame with the extra feature columns, rows in the same order as `df`.

    With more than 5 rows in `original_df`, `df` is split into one chunk per CPU
    and processed in a process pool; otherwise it is processed inline.
    """
    if len(original_df)>5:
        processes = multiprocessing.cpu_count()
        with multiprocessing.Pool(processes=processes) as pool:
            # Contiguous, roughly equal index ranges: one chunk per process
            df["idx_group"] = df.index // (len(df) / processes)
            grouped = df.groupby('idx_group')
            len_df_gby = len(grouped)
            # imap (not imap_unordered) keeps the chunks in submission order, so the
            # concatenated row order, and everything derived from it, is reproducible.
            dfs = pool.imap(_add_other_features, zip(grouped, [original_df] * len_df_gby))
            dfs = tqdm(dfs, total=len_df_gby)
            dfs = list(dfs)
        df = pd.concat(dfs)
        df.drop(columns="idx_group", axis=1, inplace=True)
        del dfs
        return df
    else:
        # The worker ignores the group key, so pass None instead of relying on
        # IPython's `_` (last output) being defined.
        df = _add_other_features(((None, df), original_df))
        return df

### Run

In [33]:
def preprocess(original_df):
    """Run the full candidate-generation and feature pipeline on `original_df`.

    Candidates come from `recall_knn`; distance, TF-IDF, BERT and string
    features are then added. The string features (and, when `CFG.train` is
    false, the predictions via `inference`) are computed one `data_split`
    at a time to limit memory use.

    Args:
        original_df: Raw place records.

    Returns:
        DataFrame with the rows of all `CFG.data_split` segments concatenated
        and a fresh 0..n-1 index.
    """
    original_df = original_df_preprocess(original_df)
    df = recall_knn(original_df, Neighbors=min(CFG.n_neighbors, len(original_df)))
    df = add_distance_features(original_df, df)
    df = add_tfidf_features(original_df, df)
    df = add_bert_features(original_df, df)
    # df = add_finetuned_bert_features(original_df, df)
    # ====================================================
    #  preprocessing (and prediction when testing)
    # ====================================================
    # Collect the segments and concatenate once: DataFrame.append copied the
    # accumulated frame on every iteration and no longer exists in pandas >= 2.0.
    split_frames = []

    for i in range(CFG.data_split):
        df_temp = df[df["data_split"]==i].reset_index(drop=True)
        df_temp = add_other_features(original_df, df_temp)

        # Prediction
        if not CFG.train:
            df_temp = inference(df_temp)

        split_frames.append(df_temp)

        del df_temp
        gc_clear()

    del df
    gc_clear()

    # pd.concat raises on an empty list; keep returning an empty frame in that case.
    new_df = pd.concat(split_frames, ignore_index=True) if split_frames else pd.DataFrame()
    del split_frames
    gc_clear()

    display(new_df)

    return new_df

In [34]:
# Training: preprocess each "set" (0, then 1) on its own and stack the results.
# Inference: preprocess the whole frame in one go.
if not CFG.train:
    df = preprocess(original_df)
else:
    df = pd.concat(
        [
            preprocess(original_df[original_df["set"]==set_id].reset_index(drop=True))
            for set_id in (0, 1)
        ]
    ).reset_index(drop=True)
display(df)

100%|██████████| 5/5 [00:00<00:00, 1279.53it/s]
100%|██████████| 4/4 [00:00<00:00,  9.32it/s]
100%|██████████| 5/5 [00:00<00:00, 36345.79it/s]
100%|██████████| 5/5 [00:00<00:00, 65128.94it/s]


len: 1


100%|██████████| 4/4 [00:04<00:00,  1.01s/it]
100%|██████████| 4/4 [00:04<00:00,  1.01s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 4206.92it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 5236.33it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 3483.64it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 3134.76it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 3160.74it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 2357.68it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 3945.72it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 3751.61it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 4510.00it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 3209.11it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 3912.60it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 3650.40it/s]
100%|██████████| 13/13 [00:21<00:00,  1.62s/it]
100%|██████████| 5/5 [00:32<00:00,  6.52s/it]


Unnamed: 0,id,matches
0,E_001b6bad66eb98,E_0283d9f61e569d


Unnamed: 0,id,matches
0,E_001b6bad66eb98,E_0283d9f61e569d


In [35]:
# Training only: cache the feature frame on disk, keyed by experiment number.
if CFG.train:
    with open(DATA_DIR + f"df-exp{CFG.exp}.pkl", mode="wb") as f:
        pickle.dump(df, f, protocol=4)

In [36]:
# Training only: reload the cached feature frame written by the previous cell.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
if CFG.train:
    with open(DATA_DIR + f'df-exp{CFG.exp}.pkl', 'rb') as f:
        df = pickle.load(f)
    display(df)

# Model

In [37]:
def make_fold(df):
    """Assign a validation fold to every row, keeping all rows of an `id` together.

    The unique ids are split with a shuffled `KFold` (`CFG.fold` splits, seeded
    with `CFG.seed`); every row gets the fold in which its id is a validation id.

    Args:
        df: DataFrame with an 'id' column.

    Returns:
        Integer numpy array of length `len(df)`; element k is the fold of the
        k-th row of `df` (positional, independent of `df.index`).
    """
    unique_id = df["id"].unique()
    fold = np.zeros(len(df), dtype=int)
    kf = KFold(n_splits=CFG.fold, shuffle=True, random_state=CFG.seed)
    for i_fold, (_, va_group_idx) in enumerate(kf.split(unique_id)):
        va_groups = unique_id[va_group_idx]
        # Boolean mask by position: index labels would only be valid positions
        # when df.index happens to be 0..n-1.
        is_va = df["id"].isin(va_groups).to_numpy()
        fold[is_va] = i_fold
    return fold

In [38]:
def calc_score(X_val, val_pred, threshold=CFG.threshold, pp=False):
    """Turn pair predictions into per-id match strings and score them.

    Args:
        X_val: Validation pairs with 'id' and 'match_id' columns.
        val_pred: Predicted score per row of `X_val`.
        threshold: Pairs with `val_pred >= threshold` count as matches.
        pp: If True, run `post_process` on the match table before scoring.

    Returns:
        Whatever `get_score` returns for the resulting id/matches table.
    """
    # One row per id, including ids that end up with no predicted match
    all_ids = pd.DataFrame({"id": X_val["id"].unique()})

    accepted = X_val[val_pred >= threshold]
    matched = accepted.groupby("id")["match_id"].apply(list).reset_index()
    matched.columns = ["id", "matches"]
    matched["matches"] = matched["matches"].apply(" ".join)

    result = all_ids.merge(matched, on="id", how="left")
    # Every id is listed as a match of itself, followed by its predicted matches
    result["matches"] = result["id"] + " " + result["matches"].fillna("")

    return get_score(post_process(result) if pp else result)

In [39]:
def run_catboost(param, df):
    
    oof_pred = np.zeros(len(df))
    feature_importance_df = pd.DataFrame()
    score_list = []
    
    folds_idx = make_fold(df)
    oof_pred = np.zeros(len(df))

    for fold in range(CFG.fold):

        if fold in CFG.used_fold:

            LOGGER.info(f"==============================================")
            LOGGER.info(f"▶︎ Start fold{fold} Training")
            LOGGER.info(f"==============================================")

            tr_idx = np.argwhere(folds_idx != fold).reshape(-1)
            va_idx = np.argwhere(folds_idx == fold).reshape(-1)
            X_trn, X_val = df.loc[tr_idx].reset_index(drop=True), df.loc[va_idx].reset_index(drop=True)
            y_trn, y_val = df.loc[tr_idx, CFG.target].reset_index(drop=True), df.loc[va_idx, CFG.target].reset_index(drop=True)

            LOGGER.info(f"train_shape: {X_trn.shape}, val_shape: {X_val.shape}")

            train = Pool(X_trn.drop(columns=[CFG.target]+DROP_COLS), y_trn, cat_features=CATEGORICAL_COL)
            valid = Pool(X_val.drop(columns=[CFG.target]+DROP_COLS), y_val, cat_features=CATEGORICAL_COL)

            model = CatBoost(param)
            model = model.fit(
                        train,
                        eval_set=valid,
                        use_best_model=True,
                        early_stopping_rounds=100,
                        verbose_eval=200
                        )
            
            # ==============================================
            # Feature Importances
            # ==============================================

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = model.feature_names_
            fold_importance_df["importance"] = model.feature_importances_
            fold_importance_df["fold"] = fold
            feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

            # ==============================================
            # Calculate Score
            # ==============================================
            val_pred = model.predict(X_val.drop(columns=[CFG.target]+DROP_COLS), prediction_type='Probability').T[1]
            score = calc_score(X_val, val_pred)

            LOGGER.info(f"fold{fold} score: {score:.6f}")

            oof_pred[va_idx] = val_pred
            score_list.append([fold, score])

            # ==============================================
            # Save model
            # ==============================================
            pickle.dump(model, open(OUTPUT_DIR + f'model_fold{fold}.pkl', 'wb'))
            
            del model, X_trn, X_val, y_trn, y_val, train, valid
            gc_clear()
    
    score_df = pd.DataFrame(
        score_list, columns=["fold", "IoU"])
    
    # ==============================================
    # Create Kaggle Dataset
    # ==============================================

    !kaggle datasets init -p $OUTPUT_DIR

    metadata = {"id": f"shkanda/foursquare-dataset-exp{CFG.exp}",
                    "title": f"foursquare-dataset-exp{CFG.exp}",
                    "licenses": [{"name": "CC0-1.0"}]}

    with open(OUTPUT_DIR+'dataset-metadata.json', 'w') as fp:
        json.dump(metadata, fp)

    !kaggle datasets create -p $OUTPUT_DIR
 
    return oof_pred, score_df, feature_importance_df

def show_feature_importance(feature_importance_df):
    """Plot per-feature importances as a bar chart and save it to `OUTPUT_DIR`.

    Args:
        feature_importance_df: Frame with 'feature', 'importance' and 'fold'
            columns (one row per feature per fold).

    Features are ordered by their mean importance, largest first; the figure
    is written to `OUTPUT_DIR + 'feature_importance.png'`.
    """
    mean_by_feature = feature_importance_df.groupby("feature").mean()
    ranked = mean_by_feature.sort_values("importance", ascending=False)
    order = ranked.index.tolist()

    plt.figure(figsize=(10, 15))
    sns.barplot(data=feature_importance_df, x="importance", y="feature", order=order)
    plt.title("feature importance")
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR+'feature_importance.png')

In [40]:
# Training only: fit the per-fold CatBoost models and show the per-fold scores.
if CFG.train:
    oof_pred, score_df, feature_importance_df = run_catboost(PARAMS, df)
    display(score_df)

In [41]:
# Training only: plot and save the feature importances collected during training.
if CFG.train:
    show_feature_importance(feature_importance_df)

In [42]:
# Training only: attach the out-of-fold predictions and persist the frame for later analysis.
if CFG.train:
    df["pred"] = oof_pred
    with open(DATA_DIR + f"oof_df-exp{CFG.exp}.pkl", mode="wb") as f:
        pickle.dump(df, f, protocol=4)

# Submit

In [43]:
# Inference only: build submission.csv with one row per id in original_df.
if not CFG.train:
    sub = original_df[["id"]].merge(df, on="id", how="left")
    # Each id is its own first match; strip the trailing space left when there are no others.
    sub["matches"] = (sub["id"] + " " + sub["matches"].fillna("")).apply(str.strip)
    sub = post_process(sub)
    sub.to_csv("submission.csv", index=False)
    display(sub)

Unnamed: 0,id,matches
0,E_00001118ad0191,E_00001118ad0191
1,E_000020eb6fed40,E_000020eb6fed40
2,E_00002f98667edf,E_00002f98667edf
3,E_001b6bad66eb98,E_001b6bad66eb98 E_0283d9f61e569d
4,E_0283d9f61e569d,E_0283d9f61e569d E_001b6bad66eb98
