In [1]:
import os
import gc
import re
import sys
import math
import json
import time
import random
import joblib
import pickle
import warnings
import difflib
import Levenshtein
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from glob import glob
from pathlib import Path
from unidecode import unidecode
import multiprocessing
from tqdm.auto import tqdm
from argparse import Namespace
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold, StratifiedGroupKFold

plt.rcParams["font.size"] = 13
warnings.filterwarnings('ignore')

sns.set_style("darkgrid")

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 600)

# Config

In [2]:
CFG = Namespace(
    train = True,
    full = False,
    select_features = False,
    find_thresh = False,
    seed = 42,
    debug = False,
    validate = False,
    target = "label",
    n_neighbors = 20,
    n_splits = 5,
    threshold = 0.5,
    train_path = 'train_dataset'
)

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
new_features = ['category_venn',
                'euclidean',
                'haversine',
                'kdist_diff',
                'kneighbors_mean',
                'manhattan',
                'text_gesh',
                'text_jaro',
                'text_lcs',
                'text_len_diff',
                'text_leven',
                'text_nlcs',
                'text_nlcsk',
                'text_nleven'
               ]  

bad_features = ['city_gesh', 'text_leven', 'address_encoded_len_diff',
               'name_encoded_nleven', 'name_lcs', 'city_leven', 'city_sim',
               'closest_city_leven', 'main_categories_leven', 'city_lcs',
               'url_leven', 'address_nlcsk', 'address_len_diff', 'url_lcs',
               'closest_city_nlcs', 'url_nlcsk', 'kneighbors', 'city_encoded_lcs',
               'city_jaro', 'address_encoded_lcs', 'main_categories_nlcsk',
               'address_encoded_jaro', 'url_nleven', 'url_len_diff',
               'closest_city_len_diff']
    
seed_everything(CFG.seed)

# Load train dataset

In [3]:
if CFG.full:
    train_files = glob(os.path.join(CFG.train_path, "train_*.parquet"))
    valid_files = glob(os.path.join(CFG.train_path, "valid_*.parquet"))
    train_files = train_files + valid_files
else:
    train_files = glob(os.path.join(CFG.train_path, "train_*.parquet"))

train = list()
for filename in tqdm(train_files):
    df = pd.read_parquet(filename)
    train.append(df)

train = pd.concat(train, axis=0, ignore_index=True)

gc.collect()

  0%|          | 0/10 [00:00<?, ?it/s]

0

# Load validation dataset

In [4]:
if not CFG.full:
    valid_files = glob(os.path.join(CFG.train_path, "valid_*.parquet"))

    valid = list()
    for filename in tqdm(valid_files):
        df = pd.read_parquet(filename)
        valid.append(df)

    valid = pd.concat(valid, axis=0, ignore_index=True)
    
    gc.collect()

  0%|          | 0/10 [00:00<?, ?it/s]

## Add haversine distance

In [10]:
data_root = 'foursquare_location_matching'
data = pd.read_csv(os.path.join(data_root, 'train.csv'))
data = data[['id', 'latitude', 'longitude']]


def vectorized_haversine(lats1, lats2, longs1, longs2):
    radius = 6371
    dlat=np.radians(lats2 - lats1)
    dlon=np.radians(longs2 - longs1)
    a = np.sin(dlat/2) * np.sin(dlat/2) + np.cos(np.radians(lats1)) \
        * np.cos(np.radians(lats2)) * np.sin(dlon/2) * np.sin(dlon/2)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = radius * c
    return d

def add_haversine(df):
    df = df.merge(data, how='left', on='id')
    df.rename({'latitude': 'lat1', 'longitude': 'lon1'}, axis=1, inplace=True)
    df = df.merge(data, how='left', left_on='match_id', right_on='id')
    df.rename({'latitude': 'lat2', 'longitude': 'lon2', 'id_x': 'id'}, axis=1, inplace=True)
    df['haversine'] = vectorized_haversine(df['lat1'], df['lon1'], df['lat2'], df['lon2'])
    df.drop(['lat1', 'lon1', 'lat2', 'lon2', 'id_y'], axis=1, inplace=True)
    return df

train = add_haversine(train)
valid = add_haversine(valid)

In [6]:
if bad_features:
    train = train.drop(bad_features, axis=1)
    valid = valid.drop(bad_features, axis=1)

# Set features

In [7]:
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
features = list(train.select_dtypes(include=numerics).columns)
features.remove(CFG.target)

# Train

In [8]:
def fit_lgbm(X_train, y_train, X_val, y_val, params=None, es_rounds=50, model_dir=None):
    train_dataset = lgb.Dataset(X_train, y_train)
    valid_dataset = lgb.Dataset(X_val, y_val)

    model = lgb.train(
        params,
        train_set = train_dataset, 
        valid_sets = [train_dataset, valid_dataset],
        callbacks = [lgb.log_evaluation(50), 
                     lgb.early_stopping(stopping_rounds=es_rounds)]
        )

    file = f'fsq_lgbm_models_train_test/lgbm.pkl'
    pickle.dump(model, open(file, 'wb'))

    return model

def predict(model, threshold, X_val, y_val):
    pred = model.predict(X_val)
    cv = ((pred > threshold) == y_val).mean()
    print(f"ROC AUC: {cv}")
    return pred, cv

def inference_lgbm(model, feat_df):
    pred = np.array(model.predict(feat_df))
    return pred

def inference_lgbm_fold(models, feat_df):
    pred = np.array([model.predict(feat_df) for model in models])
    pred = np.mean(pred, axis=0)
    return pred

## Split dataset for train

## Train model

In [11]:
warnings.filterwarnings("ignore", module="lightgbm")

params = {
    'seed': CFG.seed,
#     'device': 'gpu',
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.2,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
#     'max_bin': 200,
    'max_depth': 7,   
    'num_leaves': 35, 
#     'min_data_in_leaf': 40,
    'n_estimators': 1500, 
    'colsample_bytree': 0.9,
    'verbose': -1
}

if CFG.select_features:
    import lofo
    from lofo import LOFOImportance, Dataset, plot_importance
    # extract a sample of the data
    train = train.sample(frac=0.1, random_state=CFG.seed)
    valid = valid.sample(frac=0.1, random_state=CFG.seed)
    train = pd.concat([train, valid], ignore_index=True)
    del valid
    gc.collect()
    # define the validation scheme
    cv = KFold(n_splits=2)
    # define the binary target and the features
    dataset = Dataset(df=train, target=CFG.target, features=new_features)
    # define the validation scheme and scorer
    lofo_imp = LOFOImportance(dataset, scoring="roc_auc", cv=cv, model=lgb.LGBMClassifier(**params))
    # get the mean and standard deviation of the importances in pandas format
    importance_df = lofo_imp.get_importance()
    importance_df.to_csv('importance_df.csv')
    # plot the means and standard deviations of the importances
    plot_importance(importance_df, figsize=(12, 20))
elif CFG.full and CFG.train:
    model = fit_lgbm(train[features], train[CFG.target], 
                     train[features], train[CFG.target], 
                     params=params, es_rounds=50)
elif CFG.train:
    model = fit_lgbm(train[features], train[CFG.target], 
                     valid[features], valid[CFG.target], 
                     params=params, es_rounds=50)
else:
    model_file = 'fsq_lgbm_models_train_test/lgbm1.pkl'
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.995025	valid_1's auc: 0.994005
[100]	training's auc: 0.995768	valid_1's auc: 0.994497
[150]	training's auc: 0.996089	valid_1's auc: 0.994665
[200]	training's auc: 0.996386	valid_1's auc: 0.994763
[250]	training's auc: 0.996597	valid_1's auc: 0.994834
[300]	training's auc: 0.996794	valid_1's auc: 0.994896
[350]	training's auc: 0.996959	valid_1's auc: 0.994924
[400]	training's auc: 0.997089	valid_1's auc: 0.99496
[450]	training's auc: 0.997199	valid_1's auc: 0.994977
[500]	training's auc: 0.997307	valid_1's auc: 0.995019
[550]	training's auc: 0.997406	valid_1's auc: 0.995024
[600]	training's auc: 0.997495	valid_1's auc: 0.99504
[650]	training's auc: 0.99759	valid_1's auc: 0.995032
Early stopping, best iteration is:
[620]	training's auc: 0.997529	valid_1's auc: 0.995043


In [None]:
# [50]	training's auc: 0.99466	valid_1's auc: 0.993636
# [100]	training's auc: 0.995336	valid_1's auc: 0.99416
# [150]	training's auc: 0.995729	valid_1's auc: 0.99439
# [200]	training's auc: 0.995985	valid_1's auc: 0.994485
# [250]	training's auc: 0.996176	valid_1's auc: 0.994543
# [300]	training's auc: 0.996344	valid_1's auc: 0.994573
# [350]	training's auc: 0.996497	valid_1's auc: 0.994609
# [400]	training's auc: 0.996644	valid_1's auc: 0.99465
# [450]	training's auc: 0.996772	valid_1's auc: 0.994685
# [500]	training's auc: 0.996888	valid_1's auc: 0.994706
# [550]	training's auc: 0.996997	valid_1's auc: 0.994726
# [600]	training's auc: 0.997098	valid_1's auc: 0.994742
# [650]	training's auc: 0.99719	valid_1's auc: 0.994745
# [700]	training's auc: 0.997271	valid_1's auc: 0.994765
# [750]	training's auc: 0.997345	valid_1's auc: 0.994767
# Early stopping, best iteration is:
# [710]	training's auc: 0.997286	valid_1's auc: 0.99477

In [None]:
bad_features = importance_df.loc[(importance_df.importance_mean < 0) & 
                                 (importance_df.val_imp_0 < 0) & 
                                 (importance_df.val_imp_1 < 0), 'feature'].values
bad_features

## Find optimal threshold

## Fuctions for postprocessing and validation

In [12]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    return dict(zip(input_df['id'], input_df['point_of_interest']))

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    return input_df.groupby('point_of_interest')['id'].apply(set).to_dict()

def get_score(input_df: pd.DataFrame):
    scores = []
    id2poi = get_id2poi(input_df)
    poi2ids = get_poi2ids(input_df)
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        score = len((targets & preds)) / len((targets | preds))
        scores.append(score)
    scores = np.array(scores)
    
    return scores.mean()

def postprocess(df):
    id2match = dict(zip(df["id"].values, df["matches"].str.split()))

    for match in df["matches"].values:
        match = match.split()
        if len(match) == 1:        
            continue

        base = match[0]
        for m in match[1:]:
            if not base in id2match[m]:
                id2match[m].append(base)
    df["matches"] = df["id"].map(id2match).map(" ".join)
    
    return df 

def get_matches(df, preds):
    match_id = df["match_id"].values
    matches = []

    for df_id, pred, match_idx in tqdm(zip(df["id"], preds, match_id), total=df.shape[0]):
        idx = np.round(pred)
        if idx == 1:
            matches.append(df_id + " " + match_idx)
        else:
            matches.append(df_id)
    
    df['matches'] = matches
    df = postprocess(df)
    
    return df[['id', 'matches', 'point_of_interest']]

## Add POI column to validation dataset

In [13]:
if not CFG.full:
    data_root = 'foursquare_location_matching'
    data = pd.read_csv(os.path.join(data_root, 'train.csv'))[['id', 'point_of_interest']]

    valid = valid.merge(data, how='left', on='id')

    del data
    gc.collect()

##  Predict matches and postprocess them

In [14]:
%%time

if CFG.find_thresh:
    pred, cv = predict(model, best_thr, valid[features], valid[CFG.target])
else:
    pred, cv = predict(model, CFG.threshold, valid[features], valid[CFG.target])

if not CFG.full:
    res = get_matches(valid, pred)
    res = res.drop_duplicates()
    print(f"IOU: {get_score(res):.6f}")

    gc.collect()

ROC AUC: 0.987181950977739


  0%|          | 0/13919513 [00:00<?, ?it/s]

IOU: 0.839483
CPU times: user 10min 4s, sys: 2.9 s, total: 10min 7s
Wall time: 2min 14s


In [None]:
# Baseline:
# ROC AUC: 0.986
# IOU: 0.831

# Add 'is_unbalance' parameter
# ROC AUC: 0.967
# IOU: 0.749

# Sort categories
# ROC AUC: 0.9863
# IOU: 0.8325

# Drop bad features
# ROC AUC: 0.98656
# IOU: 0.835467

# Add main_category and closest_city
# ROC AUC: 0.9868/0.987
# IOU: 0.83869/0.838849

# Clean name
# ROC AUC: 0.98686/0.98697
# IOU: 0.8392/0.8383

# Add some many different distances and text features
# ROC AUC: 0.9871/
# IOU: 0.839483/