In [1]:
import os
import gc
import re
import sys
import math
import json
import time
import random
import joblib
import pickle
import warnings
import difflib
import Levenshtein
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from glob import glob
from pathlib import Path
from unidecode import unidecode
import multiprocessing
from tqdm.auto import tqdm
from argparse import Namespace
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold, StratifiedGroupKFold

plt.rcParams["font.size"] = 13
warnings.filterwarnings('ignore')

sns.set_style("darkgrid")

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 600)

# Config

In [2]:
CFG = Namespace(
    kaggle = False,
    select_features = False,
    seed = 42,
    debug = False,
    validate = False,
    target = "label",
    n_neighbors = 20,
    n_splits = 5,
    train_path = 'train_dataset'
)

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
bad_features = ['name_len_diff', 'state_jaro', 'state_encoded_lcs',
                'address_encoded_gesh', 'state_gesh', 'city_encoded_nlcsk',
                'categories_nlcsk', 'city_encoded_sim', 'state_leven',
                'address_encoded_nlcsk', 'name_nleven', 'city_encoded_gesh',
                'state_encoded_leven', 'city_encoded_jaro', 'country_lcs',
                'city_encoded_len_diff', 'state_nlcs', 'url_jaro']
    
seed_everything(CFG.seed)

# Load train dataset

In [3]:
train_files = glob(os.path.join(CFG.train_path, "train_*.parquet"))

train = list()
for filename in tqdm(train_files):
    df = pd.read_parquet(filename)
    train.append(df)

train = pd.concat(train, axis=0, ignore_index=True)

  0%|          | 0/5 [00:00<?, ?it/s]

# Load validation dataset

In [4]:
valid_files = glob(os.path.join(CFG.train_path, "valid_*.parquet"))

valid = list()
for filename in tqdm(valid_files):
    df = pd.read_parquet(filename)
    valid.append(df)

valid = pd.concat(valid, axis=0, ignore_index=True)

  0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
if bad_features:
    train = train.drop(bad_features, axis=1)
    valid = valid.drop(bad_features, axis=1)

# Set features

In [6]:
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
features = list(train.select_dtypes(include=numerics).columns)
features.remove(CFG.target)

# Train

In [7]:
def fit_lgbm(X_train, y_train, X_val, y_val, params=None, es_rounds=50, model_dir=None):
    train_dataset = lgb.Dataset(X_train, y_train)
    valid_dataset = lgb.Dataset(X_val, y_val)

    if model_dir is None:
        model = lgb.train(
            params,
            train_set = train_dataset, 
            valid_sets = [train_dataset, valid_dataset],
            callbacks = [lgb.log_evaluation(50), 
                         lgb.early_stopping(stopping_rounds=es_rounds)],
        )
    else:
        with open(f'{model_dir}/lgbm.pkl', 'rb') as f:
            model = pickle.load(f)

    file = f'fsq_lgbm_models_train_test/lgbm.pkl'
    pickle.dump(model, open(file, 'wb'))

    return model

def predict(model, threshold, X_val, y_val):
    pred = model.predict(X_val)
    cv = ((pred > threshold) == y_val).mean()
    print(f"CV-accuracy: {cv}")
    return pred

def inference_lgbm(model, feat_df):
    pred = np.array(model.predict(feat_df))
    return pred

def inference_lgbm_fold(models, feat_df):
    pred = np.array([model.predict(feat_df) for model in models])
    pred = np.mean(pred, axis=0)
    return pred

## Split dataset for train

## Train model

In [8]:
warnings.filterwarnings("ignore", module="lightgbm")

params = {
    'seed': CFG.seed,
#     'device': 'gpu',
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.2,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
#     'max_bin': 200,
    'max_depth': 7,   
    'num_leaves': 35, 
#     'min_data_in_leaf': 40,
    'n_estimators': 500, 
    'colsample_bytree': 0.9,
    'verbose': -1
}

if CFG.select_features:
    import lofo
    from lofo import LOFOImportance, Dataset, plot_importance
    # extract a sample of the data
    train = train.sample(frac=0.1, random_state=CFG.seed)
    valid = valid.sample(frac=0.1, random_state=CFG.seed)
    train = pd.concat([train, valid], ignore_index=True)
    del valid
    gc.collect()
    # define the validation scheme
    cv = KFold(n_splits=2)
    # define the binary target and the features
    dataset = Dataset(df=train, target=CFG.target, features=features)
    # define the validation scheme and scorer
    lofo_imp = LOFOImportance(dataset, scoring="roc_auc", cv=cv, model=lgb.LGBMClassifier(**params))
    # get the mean and standard deviation of the importances in pandas format
    importance_df = lofo_imp.get_importance()
    # plot the means and standard deviations of the importances
    plot_importance(importance_df, figsize=(12, 20))
else:
    model = fit_lgbm(train[features], train[CFG.target], 
                   valid[features], valid[CFG.target], 
                   params=params, es_rounds=50)
    pred = predict(model, threshold=0.5, valid[features], valid[CFG.target])

Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.993609	valid_1's auc: 0.99344
[100]	training's auc: 0.994307	valid_1's auc: 0.993899
[150]	training's auc: 0.994688	valid_1's auc: 0.994081
[200]	training's auc: 0.99494	valid_1's auc: 0.994158
[250]	training's auc: 0.995168	valid_1's auc: 0.994204
[300]	training's auc: 0.995353	valid_1's auc: 0.994225
[350]	training's auc: 0.995537	valid_1's auc: 0.994276
[400]	training's auc: 0.9957	valid_1's auc: 0.994319
[450]	training's auc: 0.995835	valid_1's auc: 0.994346
[500]	training's auc: 0.995947	valid_1's auc: 0.994347
Did not meet early stopping. Best iteration is:
[500]	training's auc: 0.995947	valid_1's auc: 0.994347

CV-accuracy: 0.9865615976114233


In [9]:
# bad_features = importance_df.loc[importance_df.val_imp_0 < 0.00002, 'feature'].values
# bad_features

## Fuctions for postprocessing and validation

In [10]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    return dict(zip(input_df['id'], input_df['point_of_interest']))

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    return input_df.groupby('point_of_interest')['id'].apply(set).to_dict()

def get_score(input_df: pd.DataFrame):
    scores = []
    id2poi = get_id2poi(input_df)
    poi2ids = get_poi2ids(input_df)
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        score = len((targets & preds)) / len((targets | preds))
        scores.append(score)
    scores = np.array(scores)
    
    return scores.mean()

def postprocess(df):
    id2match = dict(zip(df["id"].values, df["matches"].str.split()))

    for match in df["matches"].values:
        match = match.split()
        if len(match) == 1:        
            continue

        base = match[0]
        for m in match[1:]:
            if not base in id2match[m]:
                id2match[m].append(base)
    df["matches"] = df["id"].map(id2match).map(" ".join)
    
    return df 

def get_matches(df, preds):
    match_id = df["match_id"].values
    matches = []

    for df_id, pred, match_idx in tqdm(zip(df["id"], preds, match_id), total=df.shape[0]):
        idx = np.round(pred)
        if idx == 1:
            matches.append(df_id + " " + match_idx)
        else:
            matches.append(df_id)
    
    df['matches'] = matches
    df = postprocess(df)
    
    return df[['id', 'matches', 'point_of_interest']]

## Add POI column to validation dataset

In [11]:
if CFG.kaggle:
    data_root = '../input/foursquare-location-matching'
else:
    data_root = 'foursquare_location_matching'
data = pd.read_csv(os.path.join(data_root, 'train.csv'))[['id', 'point_of_interest']]

valid = valid.merge(data, how='left', on='id')

del data
gc.collect()

55

##  Predict matches and postprocess them

In [12]:
%%time

res = get_matches(valid, pred)
res = res.drop_duplicates()
print(f"CV: {get_score(res):.6f}")

gc.collect()

  0%|          | 0/13917577 [00:00<?, ?it/s]

CV: 0.835467
CPU times: user 1min 32s, sys: 776 ms, total: 1min 33s
Wall time: 1min 32s


17

In [13]:
# Baseline:
# AUC: 0.986
# CV: 0.831

# Add 'is_unbalance' parameter
# acc: 0.967
# CV: 0.749

# Sort categories
# acc: 0.9863
# CV: 0.8325

# Drop bad features
# acc: 0.98656
# CV: 0.835467

# Further ideas

- add ntlk.edit_distance to your features
- add new KNN from this notebook: https://www.kaggle.com/code/ragnar123/flm-xlmroberta-inference-baseline
- add manhattan distance and euqlidian distance
- increase number of nearest neighbours to a very high value (like 50-100-200), so you will be able to find more matches
- create text column, use BERT to categorize https://www.kaggle.com/code/lunapandachan/foursquare-s-bert-labo-note#V3-Labeled-train

- how to handle missing data https://www.kaggle.com/code/parulpandey/a-guide-to-handling-missing-values-in-python
- mean/median/std encode features

- use Cat2Vec to calculate categories similarity https://www.kaggle.com/code/aerdem4/foursquare-cat2vec/notebook


- Optuna!


- try XLMRoberta




- change min_data_in_leaf and try to retrain with CPU 