In [1]:
import os
import gc
import re
import sys
import math
import json
import time
import random
import joblib
import pickle
import warnings
import difflib
import Levenshtein
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from glob import glob
from pathlib import Path
from unidecode import unidecode
import multiprocessing
from tqdm.auto import tqdm
from argparse import Namespace
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold, StratifiedGroupKFold

plt.rcParams["font.size"] = 13
warnings.filterwarnings('ignore')

sns.set_style("darkgrid")

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 600)

# Config

In [2]:
CFG = Namespace(
    train = True,
    full = False,
    select_features = False,
    find_thresh = False,
    seed = 42,
    debug = False,
    validate = False,
    target = "label",
    n_neighbors = 20,
    n_splits = 5,
    threshold = 0.5,
    train_path = 'train_dataset'
)

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
new_features = ['category_venn',
                'euclidean',
                'haversine',
                'kdist_diff',
                'kneighbors_mean',
                'manhattan',
                'kdist_name_country',
                'kneighbors_name_country',
                'kdist_address_country',
                'kneighbors_address_country',
               ]
bad_featuers = ['name_clean', 'address_clean', 'phone_clean']
    
]
    
seed_everything(CFG.seed)

# Load train dataset

In [3]:
def downcast_floats(df):
    floats = ['float32', 'float64']
    float_features = list(df.select_dtypes(include=floats).columns)
    for f in float_features:
        df[f] = df[f].astype('float16')
    return df
    
if CFG.full:
    train_files = glob(os.path.join(CFG.train_path, "train_*.parquet"))
    valid_files = glob(os.path.join(CFG.train_path, "valid_*.parquet"))
    train_files = train_files + valid_files
else:
    train_files = glob(os.path.join(CFG.train_path, "train_*.parquet"))

train = list()
for filename in tqdm(train_files):
    df = pd.read_parquet(filename)
    df = downcast_floats(df)
    train.append(df)

train = pd.concat(train, axis=0, ignore_index=True)

gc.collect()

  0%|          | 0/10 [00:00<?, ?it/s]

0

# Load validation dataset

In [4]:
if not CFG.full:
    valid_files = glob(os.path.join(CFG.train_path, "valid_*.parquet"))

    valid = list()
    for filename in tqdm(valid_files):
        df = pd.read_parquet(filename)
        df = downcast_floats(df)
        valid.append(df)

    valid = pd.concat(valid, axis=0, ignore_index=True)
    
    gc.collect()

  0%|          | 0/10 [00:00<?, ?it/s]

In [5]:
# if bad_features:
#     train = train.drop(bad_features, axis=1)
#     if not CFG.full:
#         valid = valid.drop(bad_features, axis=1)
        
# gc.collect()

# Set features

In [6]:
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
features = list(train.select_dtypes(include=numerics).columns)
features.remove(CFG.target)

# Train

In [7]:
def fit_lgbm(X_train, y_train, X_val, y_val, params=None, es_rounds=50, model_dir=None):
    train_dataset = lgb.Dataset(X_train, y_train)
    valid_dataset = lgb.Dataset(X_val, y_val)

    model = lgb.train(
        params,
        train_set = train_dataset, 
        valid_sets = [train_dataset, valid_dataset],
        callbacks = [lgb.log_evaluation(50), 
                     lgb.early_stopping(stopping_rounds=es_rounds),
#                      init_model = gbm_init
                    ]
        )

    file = f'fsq_lgbm_models_train_test/lgbm.pkl'
    pickle.dump(model, open(file, 'wb'))

    return model

def predict(model, threshold, X_val, y_val):
    pred = model.predict(X_val)
    cv = ((pred > threshold) == y_val).mean()
    print(f"ROC AUC: {cv}")
    return pred, cv

def inference_lgbm(model, feat_df):
    pred = np.array(model.predict(feat_df))
    return pred

def inference_lgbm_fold(models, feat_df):
    pred = np.array([model.predict(feat_df) for model in models])
    pred = np.mean(pred, axis=0)
    return pred

## Train model

In [8]:
warnings.filterwarnings("ignore", module="lightgbm")

params = {
    'seed': CFG.seed,
#     'device': 'gpu',
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.2,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
#     'max_bin': 200,
    'max_depth': 7,   
    'num_leaves': 35, 
#     'min_data_in_leaf': 40,
    'n_estimators': 1500, 
    'colsample_bytree': 0.9,
    'snapshot_freq': 50,
    'verbose': -1
}

if CFG.select_features:
    import lofo
    from lofo import LOFOImportance, Dataset, plot_importance
    # extract a sample of the data
    train = train.sample(frac=0.15, random_state=CFG.seed)
    valid = valid.sample(frac=0.15, random_state=CFG.seed)
    train = pd.concat([train, valid], ignore_index=True)
    del valid
    gc.collect()
    # define the validation scheme
    cv = KFold(n_splits=2)
    # define the binary target and the features
    dataset = Dataset(df=train, target=CFG.target, features=new_features)
    # define the validation scheme and scorer
    lofo_imp = LOFOImportance(dataset, scoring="roc_auc", cv=cv, model=lgb.LGBMClassifier(**params))
    # get the mean and standard deviation of the importances in pandas format
    importance_df = lofo_imp.get_importance()
    importance_df.to_csv('importance_df.csv')
    # plot the means and standard deviations of the importances
    plot_importance(importance_df, figsize=(12, 20))
elif CFG.full and CFG.train:
    model = fit_lgbm(train[features], train[CFG.target], 
                     train[features], train[CFG.target], 
                     params=params, es_rounds=50)
elif CFG.train:
    model = fit_lgbm(train[features], train[CFG.target], 
                     valid[features], valid[CFG.target], 
                     params=params, es_rounds=50)
else:
    model_file = 'fsq_lgbm_models_train_test/lgbm1.pkl'
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.993762	valid_1's auc: 0.992778
[100]	training's auc: 0.994574	valid_1's auc: 0.993535
[150]	training's auc: 0.994958	valid_1's auc: 0.99365
[200]	training's auc: 0.99526	valid_1's auc: 0.993786
[250]	training's auc: 0.995463	valid_1's auc: 0.993856
[300]	training's auc: 0.995645	valid_1's auc: 0.99394
[350]	training's auc: 0.995803	valid_1's auc: 0.99401
[400]	training's auc: 0.99596	valid_1's auc: 0.994064
[450]	training's auc: 0.996081	valid_1's auc: 0.994097
[500]	training's auc: 0.996195	valid_1's auc: 0.994138
[550]	training's auc: 0.996302	valid_1's auc: 0.994169
[600]	training's auc: 0.996393	valid_1's auc: 0.994181
[650]	training's auc: 0.996481	valid_1's auc: 0.994192
[700]	training's auc: 0.996564	valid_1's auc: 0.994209
[750]	training's auc: 0.996638	valid_1's auc: 0.994214
[800]	training's auc: 0.99672	valid_1's auc: 0.994238
[850]	training's auc: 0.996802	valid_1's auc: 0.994262
[900]	train

## Find optimal threshold

In [9]:
best_thr = 0
best_cv = 0

if CFG.find_thresh:
    for thr in tqdm(np.arange(0.47, 0.54, 0.01)):
        print(f'Threshold is {thr}')
        _, cv = predict(model, thr, valid[features], valid[CFG.target])
        if cv > best_cv:
            best_cv = cv
            best_thr = thr
            
print(f'Best threshold is {best_thr}')

Best threshold is 0


## Fuctions for postprocessing and validation

In [10]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    return dict(zip(input_df['id'], input_df['point_of_interest']))

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    return input_df.groupby('point_of_interest')['id'].apply(set).to_dict()

def get_score(input_df: pd.DataFrame):
    scores = []
    id2poi = get_id2poi(input_df)
    poi2ids = get_poi2ids(input_df)
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        score = len((targets & preds)) / len((targets | preds))
        scores.append(score)
    scores = np.array(scores)
    
    return scores.mean()

def postprocess(df):
    id2match = dict(zip(df["id"].values, df["matches"].str.split()))

    for match in df["matches"].values:
        match = match.split()
        if len(match) == 1:        
            continue

        base = match[0]
        for m in match[1:]:
            if not base in id2match[m]:
                id2match[m].append(base)
    df["matches"] = df["id"].map(id2match).map(" ".join)
    
    return df 

def get_matches(df, preds):
    match_id = df["match_id"].values
    matches = []

    for df_id, pred, match_idx in tqdm(zip(df["id"], preds, match_id), total=df.shape[0]):
        idx = np.round(pred)
        if idx == 1:
            matches.append(df_id + " " + match_idx)
        else:
            matches.append(df_id)
    
    df['matches'] = matches
    df = postprocess(df)
    
    return df[['id', 'matches', 'point_of_interest']]

## Add POI column to validation dataset

In [11]:
if not CFG.full:
    data_root = 'foursquare_location_matching'
    data = pd.read_csv(os.path.join(data_root, 'train.csv'))[['id', 'point_of_interest']]

    valid = valid.merge(data, how='left', on='id')

    del data
    gc.collect()

##  Predict matches and postprocess them

In [12]:
%%time

if CFG.find_thresh:
    pred, cv = predict(model, best_thr, valid[features], valid[CFG.target])
else:
    pred, cv = predict(model, CFG.threshold, valid[features], valid[CFG.target])

if not CFG.full:
    res = get_matches(valid, pred)
    res = res.drop_duplicates()
    print(f"IOU: {get_score(res):.6f}")

    gc.collect()

ROC AUC: 0.9869706449667187


  0%|          | 0/26677299 [00:00<?, ?it/s]

IOU: 0.859534
CPU times: user 33min, sys: 4.92 s, total: 33min 5s
Wall time: 5min 47s


In [13]:
# Baseline:
# ROC AUC: 0.986
# IOU: 0.831

# Add 'is_unbalance' parameter
# ROC AUC: 0.967
# IOU: 0.749

# Sort categories
# ROC AUC: 0.9863
# IOU: 0.8325

# Drop bad features
# ROC AUC: 0.98656
# IOU: 0.835467

# Add main_category and closest_city
# ROC AUC: 0.9868/0.987
# IOU: 0.83869/0.838849

# Clean name
# ROC AUC: 0.98686/0.98697
# IOU: 0.8392/0.8383

# Add some many distance and text features
# ROC AUC: 0.9871/
# IOU: 0.839483/

# Remove useless features
# ROC AUC: 0.98717
# IOU: 0.839719

# Add KNN by name
# ROC AUC: 0.985847
# IOU: 0.857884

# Add KNN by name and address
# ROC AUC: 0.98697
# IOU: 0.859534