In [1]:
import os
import gc
import re
import sys
import math
import json
import time
import random
import joblib
import pickle
import warnings
import difflib
import Levenshtein
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from glob import glob
from pathlib import Path
from unidecode import unidecode
import multiprocessing
from tqdm.auto import tqdm
from argparse import Namespace
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, fbeta_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold, StratifiedGroupKFold

# Plot appearance: slightly larger default font, seaborn dark-grid theme.
plt.rc("font", size=13)
# NOTE: this silences every warning for the whole notebook session.
warnings.filterwarnings('ignore')

sns.set_style("darkgrid")

# Let wide / long frames render without truncation.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 600

# Config

In [2]:
# Run configuration shared by every cell below.
CFG = Namespace(**{
    "train": True,             # fit models; when False, previously pickled models are loaded
    "full": False,             # fit on train + valid files together (no hold-out frame)
    "debug": False,            # sample 10000 rows per parquet file for quick runs
    "select_features": False,  # run LOFO feature importance instead of training
    "folds": 0,                # number of CV folds; 0 means a single train/valid split
    "seed": 42,
    "pos_frac": 0.2,           # only referenced by the commented-out down-sampling cell
    "target": "label",         # name of the binary target column
    "threshold": 0.5,
    "train_path": 'train_dataset',
    "model_dir": 'fsq_lgbm_models_train_test',
    "es_rounds": 50,           # early-stopping patience handed to LightGBM
})

def seed_everything(seed):
    """Seed the global Python and NumPy RNGs and export PYTHONHASHSEED.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in the ``PYTHONHASHSEED`` environment
            variable.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))
    
# Columns excluded from modelling; a later cell drops them from the loaded frames.
bad_features = ['cat_cluster_same_cluster']
    
# Seed the global RNGs once, before any data is loaded or sampled.
seed_everything(CFG.seed)

# Load train dataset

In [4]:
def downcast_floats(df):
    """Convert every float32/float64 column of ``df`` to float16, in place.

    Args:
        df: Frame whose wide float columns should be shrunk.

    Returns:
        The same ``df`` object, with the affected columns replaced.
    """
    wide_float_cols = df.select_dtypes(include=['float32', 'float64']).columns
    for col in list(wide_float_cols):
        df[col] = df[col].astype('float16')
    return df
    
# glob() returns files in arbitrary, OS-dependent order; sorting keeps the
# concatenated row order (and everything seeded on top of it) reproducible.
train_files = sorted(glob(os.path.join(CFG.train_path, "train_*.parquet")))
if CFG.full or CFG.folds:
    # In full / fold mode the validation files are folded into the training set.
    valid_files = sorted(glob(os.path.join(CFG.train_path, "valid_*.parquet")))
    train_files = train_files + valid_files

if not train_files:
    raise FileNotFoundError(f"No parquet files found under '{CFG.train_path}'")

train = list()
for filename in tqdm(train_files):
    df = pd.read_parquet(filename)
    if CFG.debug:
        # Cap the sample size so files with fewer than 10000 rows do not raise.
        df = df.sample(n = min(10000, len(df)), random_state = CFG.seed)
        df = df.reset_index(drop = True)
    df = downcast_floats(df)
    train.append(df)

train = pd.concat(train, axis=0, ignore_index=True)

gc.collect()

  0%|          | 0/10 [00:00<?, ?it/s]

20

# Load validation dataset

In [4]:
if not CFG.full and not CFG.folds:
    # A separate hold-out frame is only needed for the single train/valid split.
    # Sorted because glob() order is arbitrary and would make row order vary.
    valid_files = sorted(glob(os.path.join(CFG.train_path, "valid_*.parquet")))
    if not valid_files:
        raise FileNotFoundError(f"No validation parquet files found under '{CFG.train_path}'")

    valid = list()
    for filename in tqdm(valid_files):
        df = pd.read_parquet(filename)
        if CFG.debug:
            # Cap the sample size so files with fewer than 10000 rows do not raise.
            df = df.sample(n = min(10000, len(df)), random_state = CFG.seed)
            df = df.reset_index(drop = True)
        df = downcast_floats(df)
        valid.append(df)

    valid = pd.concat(valid, axis=0, ignore_index=True)

    gc.collect()

  0%|          | 0/10 [00:00<?, ?it/s]

# Increase fraction of positive targets

In [5]:
# %%time

# train_pos_index = train[train['label'] == 1].index
# train_neg_index = train[train['label'] == 0].index
# train_neg_index = np.random.choice(train_neg_index, size=int(len(train_pos_index)*((1-CFG.pos_frac)/CFG.pos_frac)))
# train_pos_index = np.concatenate([train_pos_index, train_neg_index])
# np.random.shuffle(train_pos_index)
# train = train.loc[train_pos_index].reset_index(drop=True)
# del train_pos_index, train_neg_index
# gc.collect()

# if not CFG.full and not CFG.folds:
#     valid_pos_index = valid[valid['label'] == 1].index
#     valid_neg_index = valid[valid['label'] == 0].index
#     valid_neg_index = np.random.choice(valid_neg_index, size=int(len(valid_pos_index)*((1-CFG.pos_frac)/CFG.pos_frac)))
#     valid_pos_index = np.concatenate([valid_pos_index, valid_neg_index])
#     np.random.shuffle(valid_pos_index)
#     valid = valid.loc[valid_pos_index].reset_index(drop=True)
#     del valid_pos_index, valid_neg_index
#     gc.collect() 

# Drop bad features

In [6]:
# Remove the excluded columns from every frame that was loaded.
if bad_features:
    train = train.drop(columns=bad_features)
    if not (CFG.full or CFG.folds):
        # `valid` only exists for the single train/valid split.
        valid = valid.drop(columns=bad_features)

gc.collect()

0

# Set features

In [7]:
# Every numeric column except the target is used as a model feature.
numerics = [f'int{bits}' for bits in (8, 16, 32, 64)] + [f'float{bits}' for bits in (16, 32, 64)]
features = train.select_dtypes(include=numerics).columns.tolist()
features.remove(CFG.target)

## Split dataset by folds

In [8]:
if CFG.folds > 0:
    # Stratify on the configured target so each fold keeps the positive rate.
    kf = StratifiedKFold(n_splits=CFG.folds, shuffle=True, random_state=CFG.seed)
    # split() yields positional indices, so collect the fold ids in a positional
    # array instead of writing through label-based .loc; this also keeps the
    # column integer-typed rather than float.
    fold_ids = np.full(len(train), -1, dtype=np.int32)
    for i, (_, val_idx) in tqdm(enumerate(kf.split(train, train[CFG.target])), total=CFG.folds):
        fold_ids[val_idx] = i
    train["fold"] = fold_ids

# Train

In [9]:
def lgb_f1_score(y_hat, data):
    """Custom LightGBM evaluation function: F1 at a fixed 0.5 cut-off.

    Args:
        y_hat: Predicted scores for the evaluated dataset.
        data: Object exposing ``get_label()`` with the true labels.

    Returns:
        Tuple ``('f1', score, True)``; the final flag marks higher as better.
    """
    truth = data.get_label()
    hard_labels = np.where(y_hat < 0.5, 0, 1)
    score = f1_score(truth, hard_labels)
    return 'f1', score, True

def lgb_f2_score(y_hat, data):
    """Custom LightGBM evaluation function: F-beta (beta=2) at a fixed 0.5 cut-off.

    Args:
        y_hat: Predicted scores for the evaluated dataset.
        data: Object exposing ``get_label()`` with the true labels.

    Returns:
        Tuple ``('f2', score, True)``; the final flag marks higher as better.
    """
    truth = data.get_label()
    hard_labels = np.where(y_hat < 0.5, 0, 1)
    score = fbeta_score(truth, hard_labels, beta=2)
    return 'f2', score, True

def fit_lgbm(X_train, y_train, X_val, y_val, init_model=None,
             params=None, es_rounds=50, num_iter=0):
    """Train one LightGBM booster and pickle it to ``CFG.model_dir``.

    Args:
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.
        init_model: Optional model to continue training from.
        params: LightGBM parameter dict; ``None`` is treated as an empty dict.
        es_rounds: ``stopping_rounds`` passed to the early-stopping callback.
        num_iter: Unused; kept so existing callers keep working.

    Returns:
        The trained booster, also written to ``<CFG.model_dir>/lgbm.pkl``.
    """
    if params is None:
        # lgb.train() cannot handle params=None, which is this function's default.
        params = {}

    train_dataset = lgb.Dataset(X_train, y_train)
    valid_dataset = lgb.Dataset(X_val, y_val)

    model = lgb.train(
        params,
        train_set = train_dataset,
        valid_sets = [train_dataset, valid_dataset],
        init_model = init_model,
#         feval=lgb_f1_score,
        callbacks = [lgb.log_evaluation(10),
                     lgb.early_stopping(stopping_rounds=es_rounds),
                    ]
        )

    # Create the output directory on first use and close the file handle
    # deterministically instead of leaking it.
    os.makedirs(CFG.model_dir, exist_ok=True)
    file = f'{CFG.model_dir}/lgbm.pkl'
    with open(file, 'wb') as fh:
        pickle.dump(model, fh)

    return model

def fit_lgbm_folds(X, y, folds, init_model=None, params=None, es_rounds=50, num_iter=0):
    """Train one LightGBM booster per fold and pickle each to ``CFG.model_dir``.

    Args:
        X: Feature frame, indexed positionally via ``.iloc``.
        y: Target series aligned with ``X``.
        folds: Array-like of fold ids, one per row; rows whose id equals ``i``
            form the validation part of fold ``i``.
        init_model: Optional model to continue training from.
        params: LightGBM parameter dict; ``None`` is treated as an empty dict.
        es_rounds: ``stopping_rounds`` passed to the early-stopping callback.
        num_iter: Unused; kept so existing callers keep working.

    Returns:
        List of trained boosters, ordered by fold id (``CFG.folds`` entries).
    """
    if params is None:
        # lgb.train() cannot handle params=None, which is this function's default.
        params = {}

    # Make sure the output directory exists before the first model is written.
    os.makedirs(CFG.model_dir, exist_ok=True)

    models = []

    for i in tqdm(range(CFG.folds)):
        print(f"== fold {i} ==")
        trn_idx = folds != i
        val_idx = folds == i

        train_dataset = lgb.Dataset(X.iloc[trn_idx], y.iloc[trn_idx])
        valid_dataset = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

        model = lgb.train(
            params,
            train_set = train_dataset,
            valid_sets = [train_dataset, valid_dataset],
            init_model = init_model,
#             feval=lgb_f1_score,
            callbacks = [lgb.log_evaluation(10),
                         lgb.early_stopping(stopping_rounds=es_rounds),
                        ]
            )

        models.append(model)

        # Close the file handle deterministically instead of leaking it.
        file = f'{CFG.model_dir}/lgbm_fold_{i}.pkl'
        with open(file, 'wb') as fh:
            pickle.dump(model, fh)

    return models

def predict_(model, X_val, y_val, threshold):
    """Return ``model.predict(X_val)`` unchanged.

    ``y_val`` and ``threshold`` are accepted but not used.
    """
    return model.predict(X_val)

def predict_folds(models, X, y, folds, threshold):
    """Build out-of-fold predictions from one model per fold.

    Args:
        models: Sequence of fitted models, where ``models[i]`` belongs to fold ``i``.
        X: Feature frame, indexed positionally via ``.iloc``.
        y: Target; only its length is used, to size the output.
        folds: Array of fold ids, one per row of ``X``.
        threshold: Accepted but not used.

    Returns:
        float64 array of length ``len(y)`` with each row's prediction from the
        model of the fold that row belongs to.
    """
    oof = np.zeros(len(y), dtype=np.float64)

    for fold_no in tqdm(range(CFG.folds)):
        held_out = folds == fold_no
        oof[held_out] = models[fold_no].predict(X.iloc[held_out])

    return oof

def show_metrics(pred, threshold, y):
    """Compute accuracy, F1 and F2 for scores binarised at ``threshold``.

    Args:
        pred: Predicted scores.
        threshold: Scores below this value become 0, all others 1.
        y: True binary labels.

    Returns:
        Tuple ``(accuracy, f1, f2)``.
    """
    hard_labels = np.where(pred < threshold, 0, 1)
    return (
        (hard_labels == y).mean(),
        f1_score(y, hard_labels),
        fbeta_score(y, hard_labels, beta=2),
    )

## Train model

In [None]:
warnings.filterwarnings("ignore", module="lightgbm")

params = {
    'seed': CFG.seed,
#     'device': 'gpu',
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'objective': 'binary',
#     'first_metric_only': True,
    'metric': 'auc',
    'learning_rate': 0.2,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
#     'max_bin': 200,
    'max_depth': 7,   
    'num_leaves': 35, 
#     'min_data_in_leaf': 40,
    'n_estimators': 1500, 
    'colsample_bytree': 0.9,
    'verbose': -1
}


if CFG.select_features:
    # lofo is an optional dependency, only needed for this branch.
    import lofo
    from lofo import LOFOImportance, Dataset, plot_importance
    # extract a sample of the data
    train = train.sample(frac=0.15, random_state=CFG.seed)
    # `valid` is only loaded for the single train/valid split; in full / fold
    # mode its rows are already part of `train`.
    if not CFG.full and not CFG.folds:
        valid = valid.sample(frac=0.15, random_state=CFG.seed)
        train = pd.concat([train, valid], ignore_index=True)
        del valid
    else:
        train = train.reset_index(drop=True)
    gc.collect()
    # define the validation scheme
    cv = KFold(n_splits=2)
    # define the binary target and the features
    # (the feature list built in the "Set features" cell; `new_features` was never defined)
    dataset = Dataset(df=train, target=CFG.target, features=features)
    # define the validation scheme and scorer
    lofo_imp = LOFOImportance(dataset, scoring="roc_auc", cv=cv, model=lgb.LGBMClassifier(**params))
    # get the mean and standard deviation of the importances in pandas format
    importance_df = lofo_imp.get_importance()
    importance_df.to_csv('importance_df.csv')
    # plot the means and standard deviations of the importances
    plot_importance(importance_df, figsize=(12, 20))
elif CFG.folds and CFG.train:
    models = fit_lgbm_folds(train[features], train[CFG.target], folds=train['fold'].values,
                            params=params, es_rounds=CFG.es_rounds)
elif CFG.full and CFG.train:
    # No hold-out data in full mode: the training set doubles as validation set.
    model = fit_lgbm(train[features], train[CFG.target], 
                     train[features], train[CFG.target], 
                     params=params, es_rounds=CFG.es_rounds)
elif CFG.train:
    model = fit_lgbm(train[features], train[CFG.target], 
                     valid[features], valid[CFG.target], 
                     params=params, es_rounds=CFG.es_rounds)
elif CFG.folds:
    # Load fold models by explicit index so models[i] is always the model of
    # fold i; an unsorted "lgbm*.pkl" glob gives no such guarantee and would
    # also pick up the single-split lgbm.pkl.
    # NOTE: pickle.load executes arbitrary code; only load trusted model files.
    models = list()
    for i in range(CFG.folds):
        model_file = f'{CFG.model_dir}/lgbm_fold_{i}.pkl'
        with open(model_file, 'rb') as f:
            model = pickle.load(f)
            models.append(model)
else:
    # NOTE: pickle.load executes arbitrary code; only load trusted model files.
    model_file = f'{CFG.model_dir}/lgbm.pkl'
    with open(model_file, 'rb') as f:
        model = pickle.load(f)

## Find optimal threshold

In [None]:
best_thr = 0.5
best_cv = 0

if CFG.folds:
    # Out-of-fold predictions over the whole training frame.
    X = train[features]
    y = train[CFG.target]
    pred = predict_folds(models, X, y, train['fold'].values, best_thr)
elif not CFG.full:
    # Single split: score the separate hold-out frame.
    X = valid[features]
    y = valid[CFG.target]
    pred = predict_(model, X, y, best_thr)
else:
    # Full mode trains on everything and never loads `valid`, so there is
    # nothing to score; an empty array keeps the following cells runnable.
    X = y = None
    pred = np.zeros(0, dtype=np.float64)

# for thr in tqdm(np.arange(0.3, 0.45, 0.005)):
#     y_hat = np.where(pred < thr, 0, 1)  
#     cv = f1_score(y, y_hat) # F1
#     print(f'Threshold is {thr}, score is {cv}')
#     if cv > best_cv:
#         best_cv = cv
#         best_thr = thr
            
# acc, f1, f2 = show_metrics(pred, best_thr, y)
# print(f'Best threshold is {best_thr}, Accuracy is {acc}, F1 score is {f1}, F2 score is {f2}')

## Functions for postprocessing and validation

In [None]:
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map each value of the ``id`` column to its ``point_of_interest`` value.

    When an id occurs more than once, the last row wins.
    """
    return {
        row_id: poi
        for row_id, poi in zip(input_df['id'], input_df['point_of_interest'])
    }

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map each ``point_of_interest`` value to the set of ``id`` values sharing it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    return {poi: set(ids) for poi, ids in ids_by_poi}

def get_score(input_df: pd.DataFrame):
    """Mean intersection-over-union between predicted and true match sets.

    For every row, the target set is all ids sharing that row's
    ``point_of_interest`` and the predicted set is the whitespace-separated
    ids in ``matches``.

    Args:
        input_df: Frame with ``id``, ``matches`` and ``point_of_interest`` columns.

    Returns:
        Mean of the per-row ``|targets & preds| / |targets | preds|`` ratios.
    """
    id2poi = get_id2poi(input_df)
    poi2ids = get_poi2ids(input_df)

    def _iou(row_id, match_str):
        truth = poi2ids[id2poi[row_id]]
        guess = set(match_str.split())
        return len(truth & guess) / len(truth | guess)

    rows = zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy())
    per_row = np.array([_iou(row_id, match_str) for row_id, match_str in rows])

    return per_row.mean()

def postprocess(df):
    """Merge each id's matches across rows and make them symmetric.

    ``df`` may contain several rows per id (one per candidate pair). All
    matches listed on any row of an id are collected, and whenever a row
    starting with id A lists B, A is also added to B's matches. The
    ``matches`` column is overwritten in place with the space-joined result.

    Args:
        df: Frame with an ``id`` column and a ``matches`` column of
            whitespace-separated ids whose first entry is the row's own id.

    Returns:
        The same ``df`` object with ``matches`` replaced.
    """
    split_matches = df["matches"].str.split()

    # Dicts act as insertion-ordered sets. Accumulating over all rows of an id
    # keeps matches from every row, not only those of the id's last row.
    id2match = {}
    for row_id, found in zip(df["id"].values, split_matches):
        bucket = id2match.setdefault(row_id, {})
        for m in found:
            bucket[m] = None

    # Symmetry pass: the first entry of each row is the base id.
    for found in split_matches:
        if len(found) < 2:
            continue

        base = found[0]
        for m in found[1:]:
            # setdefault tolerates a matched id that never appears in `id`.
            id2match.setdefault(m, {})[base] = None

    joined = {row_id: " ".join(bucket) for row_id, bucket in id2match.items()}
    df["matches"] = df["id"].map(joined)

    return df

def get_matches(df, preds):
    """Turn per-pair binary predictions into a post-processed ``matches`` column.

    Each row is a candidate pair (``id``, ``match_id``). A row predicted as 1
    gets ``"<id> <match_id>"``, any other row just ``"<id>"``; the result is
    then passed through ``postprocess``. ``df`` receives a ``matches`` column
    in place.

    Args:
        df: Frame with ``id``, ``match_id`` and ``point_of_interest`` columns.
        preds: Iterable of predictions aligned with the rows of ``df``; only
            values equal to 1 count as a match.

    Returns:
        Frame with the columns ``id``, ``matches`` and ``point_of_interest``.
    """
    match_id = df["match_id"].values
    matches = []

    # The original called np.round(pred) on every row and never used the result.
    for df_id, pred, match_idx in tqdm(zip(df["id"], preds, match_id), total=df.shape[0]):
        if pred == 1:
            matches.append(df_id + " " + match_idx)
        else:
            matches.append(df_id)

    df['matches'] = matches
    df = postprocess(df)

    return df[['id', 'matches', 'point_of_interest']]

## Add POI column to validation dataset

In [None]:
if not CFG.full:
    # Attach the ground-truth POI of every `id` so the predicted matches can be scored.
    data_root = 'foursquare_location_matching'
    data = pd.read_csv(os.path.join(data_root, 'train.csv'))
    data = data[['id', 'point_of_interest']]

    # In fold mode the out-of-fold predictions cover `train`, so that frame is scored.
    valid = (train if CFG.folds else valid).merge(data, how='left', on='id')

    del data
    gc.collect()

##  Predict matches and postprocess them

In [None]:
%%time

# Binarise the scores at the chosen threshold.
y_hat = np.where(pred < best_thr, 0, 1)

if not CFG.full:
    # Build the match strings, drop repeated rows, then report the mean IoU.
    res = get_matches(valid, y_hat).drop_duplicates()
    print(f"IOU: {get_score(res):.6f}")

    gc.collect()

# Plot importance

In [None]:
def plot_importances(model):
    """Show a bar chart of the model's feature importances, largest first.

    Args:
        model: Fitted model exposing ``feature_importance()``; its values are
            labelled with the module-level ``features`` list.
    """
    ranked = pd.DataFrame(
        model.feature_importance(),
        index=features,
        columns=['importance'],
    ).sort_values("importance", ascending=False)

    # Figure width grows with the number of features.
    fig_width = len(features) // 4
    plt.subplots(figsize=(fig_width, 5))
    plt.bar(ranked.index, ranked["importance"])
    plt.grid()
    plt.xticks(rotation=90)
    plt.ylabel("importance")
    plt.tight_layout()
    plt.show()
    
# In fold mode the fitted boosters live in `models` (fold training never binds
# `model`), so draw one chart per fold; otherwise plot the single model.
if CFG.folds:
    for fold_model in models:
        plot_importances(fold_model)
else:
    plot_importances(model)

In [None]:
# 5-folds, auc, 1000 iter, thresh 0.5
# ACC: 0.9830761855878474
# F1: 0.9144342837012177
# F2 0.9083372734752316
# IOU: 0.9
# LB: ?

