In [1]:
import os
import sys
import re
import random
import string
import pickle
import math
import gc
from glob import glob
import multiprocessing
import numpy as np
import pandas as pd
from unidecode import unidecode
from argparse import Namespace
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold, StratifiedGroupKFold
from haversine import haversine

from tqdm.auto import tqdm

pd.set_option('mode.chained_assignment', None) # Suppress annoying warnings

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 600)

## Config

In [2]:
CFG = Namespace(
    seed = 42,
    train = True,
    debug = False,
    inference = False,
    target = "point_of_interest",
    n_neighbors = 20,
    n_splits = 10,
    threshold = 0.5,
    train_path = 'train_dataset',
    model_dir = '../input/fsquarecode/saved/',
    encode = False
)

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
seed_everything(CFG.seed)

## Load and preprocess data

In [3]:
# Loading, preprocessing
if CFG.train:
    df = pd.read_csv("foursquare_location_matching/train.csv")
elif CFG.debug:
    df = pd.read_csv("foursquare_location_matching/train.csv", nrows=3000)
else:
    
    df = pd.read_csv("../input/foursquare-location-matching/test.csv")

if len(df) < 20:
    df = pd.read_csv('../input/foursquare-location-matching/train.csv', nrows=3000)
    df = df.drop('point_of_interest', axis=1)

## Add main category

In [4]:
stop_words = ['/', '&', 'or', 'High', 'Miscellaneous', 'Fast', 'Other', 'Asian', 'Chinese', 'Event', 
              'Great', 'Noodle', 'Burger', 'Seafood', 'Breakfast', 'Ice', 'Diners', 'Cream', 'Indonesian', 
              'Thai', "Women's", 'Fried', 'Snack', 'Tea', 'Mexican', 'Nail', 'Sushi', 'Middle', 'Korean', 
              'Gift', 'Drink', 'Pet', 'Turkish', "Men's", 'Indian', 'Malay', 'Cocktail', 'Donut', 'Box', 
              'Condos)', 'Residential', 'Convenience', 'Gas', 'General', 'Bus', 'Pizza', 'Spaces', 'Mobile',
              'Phone', 'Academic', 'Japanese', 'Business', 'Shoe', 'Italian', 'American', 'Home', 'Auto', 
              'Furniture', 'Cosmetics', 'Sandwich', 'Dessert', 'Car', 'Arts', 'Financial', 'Legal', 'BBQ',
              'Hardware', 'Video', 'Music', 'Art', 'Student', 'Jewelry', 'Historic', 'Travel', 'Washes',
              'Beer', 'Arcades', 'Bike', 'Lookouts', 'Scenic', 'Rental', 'Accessories', 'Repairs', 'Discount', 
              'Optical', 'Bodegas', 'Big', 'Assisted', 'Living', 'Athletics', 'Agencies', 'Locations', 'Trails', 
              'Bed', 'Breakfasts', 'Wine', 'Real', 'Elementary', 'Theme', 'Golf', 'Rest',  'Photography', 
              'Nightlife', 'Courses', 'Convention', 'Eastern', 'Concert', 'Conference', 'Startups', 'Tech', 
              'Meeting', 'French', 'Supplies', 'Events', 'Sake', 'Dog', 'Ramen', 'City', 'Juice', 'Science',
              'Liquor', 'Lawyers', 'Insurance', 'Flower', 'Toy', 'Rentals', 'Paper', 'Flea', 'Bases', 'Baseball', 
              'Karaoke', 'Kids', 'Design', 'Farmers', 'Repair', 'Technology', 'Wards', 'Water', 'Supply', 
              'Filipino', 'Piers', 'Salad', 'Mattress', 'Print', 'Wings', 'Engineering', 'Non-Profits', 
              'Gastropubs', 'Bistros', 'Hot', 'Vietnamese', 'Hookah', 'Candy', 'Coffee', 'Electronics',
              'Department', 'Clothing', 'Trucks', 'Chicken', 'Movie', 'Health', 'Soccer', 'Crafts', 
              'Game', 'Community', 'Food', 'College', 'Sporting', 'Beauty', 'Ferries', 'Soup', 'Veterinarians', 
              'Basketball', 'Light', 'Rail', 'Taco', 'Classrooms', 'Shopping', 'Developments', 'Train', 'Performing',
              'Administrative', 'Lingerie', 'Dive', 'Storage', 'Office', 'Landscaping', 'Residence', 'Sports',
              'Goods', 'Dealerships', 'Grocery', 'Workshops', 'History'
             ]


def get_categories(category):
    if category == 'Auto':
        return 'Automotive'
    if category == 'Hotel' or category == 'Motels' or category == 'Hostels':
        return 'Hotels'
    if category == 'Courthouses':
        return 'Court'
    if category == 'College':
        return 'Colleges'
    if category == 'Cafés':
        return 'Cafes'
    if category == "Doctor's" or category == "Dentist's" or category == "Doctors":
        return 'Medical'
    if category == '(Apartments':
        return 'Apartments'
    return category

cat_freq = pd.read_csv('foursquare_main_categories/cat_freq.csv', index_col='Unnamed: 0')
cat_freq_dict = dict(zip(cat_freq['category'], cat_freq['frequence']))

def get_main_category(category):
    if category == category:
        category_list = re.split(', | ', category)
        most_freq_cat = np.nan
        freq = 0
        
        for c in category_list:
            if c in stop_words or c[-2:] == 'an':
                continue
            c = get_categories(c)
            f = cat_freq_dict.get(c, 0)
            if f > freq:
                freq = f
                most_freq_cat = c
        
        return most_freq_cat
            
    return np.nan

df['main_category'] = df['categories'].apply(get_main_category)
df["main_category"] = df["main_category"].fillna("None")

## Add closest city

In [5]:
# states = pd.read_csv('states.csv', index_col='Unnamed: 0')
cities = pd.read_csv('additional_data/cities.csv', encoding = "ISO-8859-1")
cities = cities[['asciiname', 'latitude', 'longitude', 'country code']]
cities.rename({'asciiname': 'city', 'country code': 'country'}, axis=1, inplace=True)

geoname_dict = {'city': cities}

def fill_the_missing_data(args):#, df_dists):
    country, country_df = args
    dfs = []
    columns = list(geoname_dict.keys())
    for c in tqdm(columns):

        geoname_df = geoname_dict[c]
        geoname_df = geoname_df[geoname_df['country'] == country]

        if len(country_df) == 0 or len(geoname_df) == 0:
            continue

        knn = KNeighborsRegressor(n_neighbors=min(len(geoname_df), 2), metric='haversine')
        knn.fit(geoname_df[['latitude','longitude']], geoname_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude','longitude']], return_distance=True)

        if nears.shape[1] < 2:
            continue

        nears[:,1] = nears[:,0]
        nears[:,0] = country_df.index

        for n in nears:
            t_idx = n[0]
            c_idx = n[1]
            country_df.loc[t_idx, f"closest_{c}"] = geoname_df.iloc[c_idx][c]
        
    return country_df
    
    
df['country'] = df['country'].fillna('NA')
num_countries = df['country'].nunique()
    
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    dfs = pool.imap_unordered(fill_the_missing_data, df.groupby('country', sort=False))
    dfs = tqdm(dfs, total=num_countries)
    dfs = list(dfs)
    
df = pd.concat(dfs).reset_index(drop=True)
df['closest_city'] = df['closest_city'].fillna("None")

del cities

gc.collect()

  0%|          | 0/222 [00:00<?, ?it/s]

0

## Create vectors from text columns with multilingual encoder

In [6]:
# model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')# paraphrase-MiniLM-L3-v2')# all-MiniLM-L6-v2')
# vectors = model.encode(data['full_address'].values, batch_size=128, show_progress_bar=True)

# with open('additional_data/text_vectors.npy', 'wb') as f:
#     np.save(f, vectors)

# with open('additional_data/text_vectors.npy', 'rb') as f:
#     vectors = np.load(f)

## Preprocessing utils

In [7]:
def pickle_save(obj, filename):
    pickle.dump(obj, open(filename, 'wb'))

def pickle_load(filename):
    return pickle.load(open(filename, 'rb'))

def apply_notnull(df, column, target_column, function):
    df.loc[df[column].notnull(), target_column] = \
        df.loc[df[column].notnull(), column].apply(function)
    return df

def pair_func(func, x1, x2):
    if type(x1) == float and type(x2) == float:
        return -1
    elif type(x1) == float or type(x2) == float:
        return -0.5
    try:
        return func(x1, x2)
    except:
        return -1
    
def clean_string(df, column, target_column):
    # Unidecode
    df = apply_notnull(df, column, target_column, lambda x: unidecode(x))

    # Replace AND, AT
    df = apply_notnull(df, target_column, target_column, lambda x: x.translate(
        str.maketrans({"@": "at", "&": "and"})))

    # Strip punctuation
    df = apply_notnull(df, target_column, target_column, lambda x: x.translate(
        str.maketrans('', '', string.punctuation)))

    # To lowercase
    df = apply_notnull(df, target_column, target_column, lambda x: x.lower())

    # Remove leading spaces
    df = apply_notnull(df, target_column, target_column, lambda x: x.strip())

    return df


def get_shingles(df, column, shingle_k):
    for k in shingle_k:
        sh = ShingleBased(k=k)
        df = apply_notnull(df, column,
                           f"{column}_shingles_{k}", lambda x: sh.get_profile(x))

    return df

_SPACE_PATTERN = re.compile("\\s+")


class ShingleBased:

    def __init__(self, k=3):
        self.k = k

    def get_k(self):
        return self.k

    def get_profile(self, string):
        shingles = dict()
        no_space_str = _SPACE_PATTERN.sub(" ", string)
        for i in range(len(no_space_str) - self.k + 1):
            shingle = no_space_str[i:i + self.k]
            old = shingles.get(shingle)
            if old:
                shingles[str(shingle)] = int(old + 1)
            else:
                shingles[str(shingle)] = 1
        return shingles

def preprocessing(df):
    df = df.set_index("id", drop=False)

    # Name cleaning
    df = clean_string(df, "name", "name_cleaned")

    # Name shingles
    df = get_shingles(df, "name_cleaned", (2, 3))

    # Closest city shingles
    df = get_shingles(df, "closest_city", (3,))

    # Full address
    df["full_address"] = df["address"].fillna("") +\
        " " + df["city"].fillna("") +\
        " " + df["state"].fillna("")

    df.loc[df["full_address"] == "  ", "full_address"] = np.NaN
    df = clean_string(df, "full_address", "full_address_cleaned")
    df = get_shingles(df, "full_address_cleaned", (3,))

    # Numbers in name/address
    df = apply_notnull(df, "name_cleaned", "numbers_in_name", get_numbers_from_name)
    df.loc[df["numbers_in_name"] == "", "numbers_in_name"] = np.NaN

    df = apply_notnull(df, "full_address_cleaned", "numbers_in_full_address", get_numbers_from_name)
    df.loc[df["numbers_in_full_address"] == "", "numbers_in_full_address"] = np.NaN

    df = get_shingles(df, "numbers_in_name", (1, 2))
    df = get_shingles(df, "numbers_in_full_address", (1, 2))
    
    # Catogories shingles
    df = get_shingles(df, "categories", (3,))

    # Categories to frozenset
    df["categories"] = df["categories"].fillna("None")
    df["categories"] = df["categories"].apply(lambda x: x.split(", "))
    df["categories"] = df["categories"].apply(frozenset)
    
    # Main catogory shingles
    df = get_shingles(df, "main_category", (3,))

    # Encode categorical columns
    if CFG.encode:
        # No encoders provided, create and save
        encoder_params = {"dtype": np.int32,
                          "handle_unknown": "use_encoded_value",
                          "unknown_value": -1}

        ordinal_encoder = OrdinalEncoder(**encoder_params)

        ordinal_encoder = ordinal_encoder.fit(df[["country", "categories", 
                                                  "main_category", "closest_city"]])
        pickle_save(ordinal_encoder, "additional_data/ordinal_encoder.pkl")
        encoder = ordinal_encoder
    else:
        encoder = pickle_load("additional_data/ordinal_encoder.pkl")

    df[["country_enc", "categories_enc", 
        "main_category_enc", "closest_city_enc"]] = encoder.transform(df[["country", "categories",  
                                                                          "main_category", "closest_city"]])
    
    return df

def pickle_load(filename):
    return pickle.load(open(filename, 'rb'))

def get_numbers_from_name(name):
    return "".join(re.findall("[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?", name))

## Preprocess data

In [8]:
df = preprocessing(df)
df.index.rename("index", inplace = True)

## TF-IDF vectorization

In [9]:
def tf_idf_vectorize(df):
    # set dict for corresponding ids and index
    id2index_d = dict(zip(df['id'].values, df.index))

    # make TF-IDF features
    tfidf_d = {}
    for col in ["categories", "name_cleaned", "full_address_cleaned"]:
        if col == "categories":
            tfidf = TfidfVectorizer(use_idf=False)
        else:
            tfidf = TfidfVectorizer(ngram_range=(3, 3), analyzer="char_wb", use_idf=False, stop_words=['unknown'])
        tv_fit = tfidf.fit_transform(df[col].astype(str).values)
        tfidf_d[col] = tv_fit
        
    return id2index_d, tfidf_d, tv_fit

df = df.reset_index()

id2index_d, tfidf_d, tv_fit = tf_idf_vectorize(df)

df = df.set_index('index')

## Split data on train and test

In [10]:
def group_split(df):
    gkf = GroupKFold(n_splits=2)
    splits = list(gkf.split(
        df, groups=df["point_of_interest"]))

    return df.iloc[splits[0][1]], df.iloc[splits[1][1]]

df['country'] = df['country'].fillna('NA')
df['name_cleaned'] = df['name_cleaned'].fillna('')

if CFG.train or CFG.debug:
    fold0_df, fold1_df = group_split(df)

## Candidate search utils

In [11]:
def overlap(profile0, profile1):
    union = set()
    for k in profile0.keys():
        union.add(k)
    for k in profile1.keys():
        union.add(k)
    inter = int(len(profile0.keys()) + len(profile1.keys()) - len(union))
    return inter / min(len(profile0), len(profile1))

def country_closest_k(train_df, country, candidate_k):
    country_df = train_df[train_df["country"] == country]

    # Coordinates
    country_np = np.deg2rad(country_df[["latitude", "longitude"]].to_numpy())

    # To 3d
    country_np = np.vstack([(np.cos(country_np[:, 0]) * np.cos(country_np[:, 1])),
                            (np.cos(country_np[:, 0]) *
                             np.sin(country_np[:, 1])),
                            (np.sin(country_np[:, 0]))]).T

    neigh = NearestNeighbors(n_jobs=-1).fit(country_np)
    try:
        distances, neighbors_indices = neigh.kneighbors(
            country_np, n_neighbors=candidate_k, return_distance=True)
    except:
        # Handle Expected n_neighbors <= n_samples error
        # Add all but exclude itself
        neighbors_indices = [
            [i for i in range(len(country_df)) if i != j] for j in range(len(country_df))]
        neighbors_indices = np.array(neighbors_indices, dtype=int)

    # Convert indices to id
    ids = country_df["id"].to_numpy()
    neighbors_ids = pd.Series(list(neighbors_indices), index=country_df.index).apply(
        lambda candidate_indices: ids[candidate_indices])

    return neighbors_ids


def candidate_selection(train_df, candidate_k):
    train_df["k_candidates"] = pd.Series(dtype='object')
    uq_countries = train_df["country"].value_counts().index

    for country in tqdm(uq_countries):
        train_df.loc[train_df["country"] == country, "k_candidates"] = \
            country_closest_k(train_df, country, candidate_k)

    # Empty candidates
    for row in train_df.loc[train_df["k_candidates"].isnull(), "k_candidates"].index:
        train_df.at[row, "k_candidates"] = []

    return train_df


def forming_pairs_filtering(train_df, th):
    pairs = []
    dict_ = train_df["name_cleaned_shingles_3"].to_dict()

    for p1_idx in tqdm(train_df.index):
        for p2_idx in train_df.loc[p1_idx, "k_candidates"]:
            if p1_idx == p2_idx:  # Skip
                continue

            try:
                sim = overlap(dict_[p1_idx], dict_[p2_idx])
                if sim >= th:
                    pairs.append([p1_idx, p2_idx])
            except:
                pass

    return pd.DataFrame(pairs, columns=["p1", "p2"])


def recall_knn(df, n_neighbors, label):
    print(80*'=')
    print('Start KNN grouped by country')
    train_df_country = []
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop = True)

        # distance KNN
        neighbors = min(len(country_df), n_neighbors)
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                  metric = 'haversine',
                                  n_jobs = -1)
        knn.fit(country_df[['latitude','longitude']], country_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude', 'longitude']], 
                                      return_distance = True)

        # name KNN
        tfidf = TfidfVectorizer(ngram_range=(3, 3), analyzer="char_wb", use_idf=False, stop_words=['unknown'])
        x_name = country_df['name_cleaned'].values # CHANGED: name_cleaned
        x_name = tfidf.fit_transform(x_name)
        
        knn_name = NearestNeighbors(n_neighbors = neighbors,
                                    metric = 'cosine',
                                    n_jobs = -1)
        knn_name.fit(x_name)
        dists_name, nears_name = knn_name.kneighbors(x_name)
        
        del tfidf, knn, knn_name, x_name
        gc.collect()
        
        # join distance and name KNNs
        for k in range(neighbors):            
            cur_df = country_df[['id']]
            cur_df['match_id'] = country_df['id'].values[nears[:, k]]
            cur_df['kdist_country'] = dists[:, k]
            cur_df['kneighbors_country'] = k
            
            cur_df_name = country_df[['id']]
            cur_df_name['match_id'] = country_df['id'].values[nears_name[:, k]]
            cur_df_name['kdist_name_country'] = dists_name[:, k]
            cur_df_name['kneighbors_name_country'] = k
            cur_df = cur_df.merge(cur_df_name, on = ['id', 'match_id'], how = 'outer')
            
            train_df_country.append(cur_df)
    
    train_df_country = pd.concat(train_df_country)
    train_df_country = train_df_country.drop_duplicates(subset=['id', 'match_id'])
    
    print('Start KNN for the whole dataset')
    train_df = []
    knn = NearestNeighbors(n_neighbors = n_neighbors,
                           n_jobs = -1)
    knn.fit(df[['latitude','longitude']], df.index)
    dists, nears = knn.kneighbors(df[['latitude','longitude']])
    
    for k in range(n_neighbors):            
        cur_df = df[['id']]
        cur_df['match_id'] = df['id'].values[nears[:, k]]
        cur_df['kdist'] = dists[:, k]
        cur_df['kneighbors'] = k
        train_df.append(cur_df)
    
    train_df = pd.concat(train_df)
    train_df = train_df.merge(train_df_country, on = ['id', 'match_id'], how = 'outer')
    
    del train_df_country
    gc.collect()
    
    return train_df, label

## Candidate search

In [12]:
# Candidate selection, pairs forming
#df = candidate_selection(df, 320)
#pairs = forming_pairs_filtering(df, 0.2)

if CFG.train or CFG.debug:
    fold0_df_pairs = recall_knn(fold0_df, 20, 'train')[0][["id", "match_id"]]
    fold0_df_pairs.rename(columns={"id": "p1", "match_id": "p2"}, inplace=True)

    fold1_df_pairs = recall_knn(fold1_df, 20, 'train')[0][["id", "match_id"]]
    fold1_df_pairs.rename(columns={"id": "p1", "match_id": "p2"}, inplace=True)
else:
    pairs = recall_knn(df, 20, 'train')[0][["id", "match_id"]]
    pairs.rename(columns={"id": "p1", "match_id": "p2"}, inplace=True)

Start KNN grouped by country


  0%|          | 0/210 [00:00<?, ?it/s]

Start KNN for the whole dataset
Start KNN grouped by country


  0%|          | 0/211 [00:00<?, ?it/s]

Start KNN for the whole dataset


In [13]:
fold0_df_pairs[fold0_df_pairs[['p1', 'p2']].duplicated()]

Unnamed: 0,p1,p2


In [14]:
fold1_df_pairs[fold1_df_pairs[['p1', 'p2']].duplicated()]

Unnamed: 0,p1,p2


## Create train target feature

In [15]:
%%time

if CFG.train or CFG.debug:
    df = df.set_index('id')

    ids = fold0_df_pairs['p1'].tolist()
    match_ids = fold0_df_pairs['p2'].tolist()
    poi = df.loc[ids]['point_of_interest'].values
    match_poi = df.loc[match_ids]['point_of_interest'].values
    fold0_df_pairs['match'] = np.array(poi == match_poi, dtype = np.int8)

    ids = fold1_df_pairs['p1'].tolist()
    match_ids = fold1_df_pairs['p2'].tolist()
    poi = df.loc[ids]['point_of_interest'].values 
    match_poi = df.loc[match_ids]['point_of_interest'].values
    fold1_df_pairs['match'] = np.array(poi == match_poi, dtype = np.int8)

    del poi, match_poi, ids, match_ids
    gc.collect()

    print('Num of unique train id: %s' % fold0_df_pairs['p1'].nunique())
    print('Num of train data: %s' % len(fold0_df_pairs))
    print('Pos rate: %s' % fold0_df_pairs['match'].mean())
    print('')
    print('Num of unique valid id: %s' % fold1_df_pairs['p1'].nunique())
    print('Num of valid data: %s' % len(fold1_df_pairs))
    print('Pos rate: %s' % fold1_df_pairs['match'].mean())

    df = df.reset_index('id')

Num of unique train id: 569406
Num of train data: 24175388
Pos rate: 0.04927110166753063

Num of unique valid id: 569406
Num of valid data: 24182861
Pos rate: 0.049252071539426207
CPU times: user 1min 34s, sys: 5.47 s, total: 1min 40s
Wall time: 1min 40s


## Group generation

In [16]:
def get_poi2id(input_df: pd.DataFrame) -> dict:
    return input_df.groupby('p2')['p1'].apply(set).to_dict()

def create_poi_groups(poi2id):
    # merge POIs by groups
    for p1, values in tqdm(poi2id.items()):
        for p2 in values:
            if p2 != p1:
                poi2id[p1] = poi2id[p1].union(poi2id[p2])
                poi2id[p2] = set()
    # remove empty groups
    poi2id = {k: v for k, v in poi2id.items() if v}
    return poi2id

def merge_poi_groups(poi2id, poi_length, divider=20):
    l = len(poi2id)
    keys = list(poi2id.keys())
    for i in tqdm(range(l)):
        j = i + 1
        while 0 < len(poi2id[keys[i]]) < int(poi_length/divider) and j < l:
            poi2id[keys[i]] = poi2id[keys[i]].union(poi2id[keys[j]])
            poi2id[keys[j]] = set()
            j += 1
    # remove empty groups
    poi2id = {k: v for k, v in poi2id.items() if v}
    return poi2id

def clean_poi(poi2id):
    # clean POIs
    values_to_remove = set() 
    for p1, values in tqdm(poi2id.items()):
        values = values.difference(values_to_remove)
        values_to_remove = values_to_remove.union(values)
        poi2id[p1] = values
    # remove empty groups again
    poi2id = {k: v for k, v in poi2id.items() if v}
    return poi2id

def group_generation(pairs, n_splits=10):
    # get POI-ID dictionary
    poi2id = get_poi2id(pairs)
    poi_length = len(poi2id)

    # merge poi by groups
    poi2id = create_poi_groups(poi2id)

    # clean poi
    poi2id = clean_poi(poi2id)

    # decrease number of group by divider number to increase performance of further processes
    poi2id = merge_poi_groups(poi2id, poi_length, 20)

    # set groups for pairs
    pairs = pairs.set_index('p1')
    for idx, values in tqdm(enumerate(poi2id.values()), total=len(poi2id)):
        pairs.loc[list(values), 'set'] = idx
    pairs = pairs.reset_index()

    kf = GroupKFold(n_splits=n_splits)
    for i, (trn_idx, val_idx) in enumerate(kf.split(pairs, 
                                                    pairs['set'], 
                                                    pairs['set'])):
        pairs.loc[val_idx, 'group'] = i

    pairs['group'] = pairs['group'].astype('int8')
    pairs = pairs.drop('set', axis=1)
    
    return pairs

if CFG.train or CFG.debug:
    fold0_df_pairs = group_generation(fold0_df_pairs, CFG.n_splits)
    fold1_df_pairs = group_generation(fold1_df_pairs, CFG.n_splits)
else:
    pairs = group_generation(pairs, CFG.n_splits)

  0%|          | 0/569406 [00:00<?, ?it/s]

  0%|          | 0/18111 [00:00<?, ?it/s]

  0%|          | 0/4364 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/569406 [00:00<?, ?it/s]

  0%|          | 0/18193 [00:00<?, ?it/s]

  0%|          | 0/4369 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

## Feature Engeneering utils

In [17]:
def feature_engineering(train_df, pairs):
     # Candidates count
    if "count1" not in pairs.columns:
        pairs["count1"] = pairs.groupby("p1")["p1"].transform("count")
        pairs["count1"] = pairs["count1"].astype(np.int32)

    if "count2" not in pairs.columns:
        pairs["count2"] = pairs.groupby("p2")["p2"].transform("count")
        pairs["count2"] = pairs["count2"].astype(np.int32)

    # Distance metrics
    lat1 = train_df.loc[pairs["p1"], "latitude"].values
    lon1 = train_df.loc[pairs["p1"], "longitude"].values
    lat2 = train_df.loc[pairs["p2"], "latitude"].values
    lon2 = train_df.loc[pairs["p2"], "longitude"].values
    diff_lat = np.abs(lat2-lat1)
    diff_lon = np.abs(lon2-lon1)
    
    # Haversine
    if "haversine" not in pairs.columns:
        pairs["haversine"] = haversine_vec(lat1, lon1, lat2, lon2)
        pairs["haversine"] = pairs["haversine"].astype(np.float32)
    # Manhattan
    if "manhattan" not in pairs.columns:
        pairs['manhattan'] = diff_lat + diff_lon
        pairs["manhattan"] = pairs["manhattan"].astype(np.float32)
    # Euclidian
    if "euclidian" not in pairs.columns:   
        pairs['euclidian'] = np.sqrt(np.square(diff_lat) + np.square(diff_lon))
        pairs["euclidian"] = pairs["euclidian"].astype(np.float32)
    
    # Name similarity
    for name in ["jaccard", "overlap", "cosine"]:
        for k in tqdm([2, 3]):
            feature_name = f"name_cleaned_{name}_{k}"
            if feature_name not in pairs.columns:
                similarity = get_shingle_similarity(name)
                pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"name_cleaned_shingles_{k}"],
                                                 train_df.loc[pairs["p2"], f"name_cleaned_shingles_{k}"])
                pairs[feature_name] = pairs[feature_name].astype(np.float16)

    # Full address similarity
    for name in ["jaccard", "overlap"]:
        for k in tqdm([3]):
            feature_name = f"full_address_{name}_{k}"
            if feature_name not in pairs.columns:
                similarity = get_shingle_similarity(name)
                pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"full_address_cleaned_shingles_{k}"],
                                                 train_df.loc[pairs["p2"], f"full_address_cleaned_shingles_{k}"])
                pairs[feature_name] = pairs[feature_name].astype(np.float16)

    # Name-address similarity
    for name in ["overlap"]:
        for k in tqdm([3]):
            feature_name = f"name_address_{name}_{k}"
            if feature_name not in pairs.columns:
                similarity = get_shingle_similarity(name)
                pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"name_cleaned_shingles_{k}"],
                                                 train_df.loc[pairs["p2"], f"full_address_cleaned_shingles_{k}"])

                pairs[feature_name] += similarity(train_df.loc[pairs["p1"], f"full_address_cleaned_shingles_{k}"],
                                                  train_df.loc[pairs["p2"], f"name_cleaned_shingles_{k}"])
                pairs[feature_name] = pairs[feature_name] / 2

                pairs[feature_name] = pairs[feature_name].astype(np.float16)

    # Numbers in name similarity
    for name in ["overlap"]:
        feature_name = f"numbers_in_name_{name}"
        if feature_name not in pairs.columns:
            similarity = get_shingle_similarity(name)
            pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"numbers_in_name_shingles_1"], train_df.loc[pairs["p2"], f"numbers_in_name_shingles_1"])
            pairs[feature_name] += similarity(train_df.loc[pairs["p1"], f"numbers_in_name_shingles_2"], train_df.loc[pairs["p2"], f"numbers_in_name_shingles_2"])
            pairs[feature_name] = pairs[feature_name] / 2

            pairs[feature_name] = pairs[feature_name].astype(np.float16)

    # Numbers in address similarity
    for name in ["overlap"]:
        feature_name = f"numbers_in_address_{name}"
        if feature_name not in pairs.columns:
            similarity = get_shingle_similarity(name)
            pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"numbers_in_full_address_shingles_1"],
                                             train_df.loc[pairs["p2"], f"numbers_in_full_address_shingles_1"])
            pairs[feature_name] += similarity(train_df.loc[pairs["p1"], f"numbers_in_full_address_shingles_2"],
                                              train_df.loc[pairs["p2"], f"numbers_in_full_address_shingles_2"])
            pairs[feature_name] = pairs[feature_name] / 2

            pairs[feature_name] = pairs[feature_name].astype(np.float16)

    # Numbers in name-address similarity
    for name in ["overlap"]:
        feature_name = f"numbers_in_name_address_{name}"
        if feature_name not in pairs.columns:
            similarity = get_shingle_similarity(name)
            pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"numbers_in_name_shingles_1"],
                                             train_df.loc[pairs["p2"], f"numbers_in_full_address_shingles_1"])
            pairs[feature_name] += similarity(train_df.loc[pairs["p1"], f"numbers_in_full_address_shingles_1"], 
                                              train_df.loc[pairs["p2"], f"numbers_in_name_shingles_1"])
            pairs[feature_name] += similarity(train_df.loc[pairs["p1"], f"numbers_in_name_shingles_2"],
                                              train_df.loc[pairs["p2"], f"numbers_in_full_address_shingles_2"])
            pairs[feature_name] += similarity(train_df.loc[pairs["p1"], f"numbers_in_full_address_shingles_2"],
                                              train_df.loc[pairs["p2"], f"numbers_in_name_shingles_2"])
            pairs[feature_name] = pairs[feature_name] / 4

            pairs[feature_name] = pairs[feature_name].astype(np.float16)

    # Category
    if "categories1" not in pairs.columns:
        pairs["categories1"] = train_df.loc[pairs["p1"], "categories_enc"].to_numpy()
        pairs["categories1"] = pairs["categories1"].astype(np.int32)

    if "categories2" not in pairs.columns:
        pairs["categories2"] = train_df.loc[pairs["p2"], "categories_enc"].to_numpy()
        pairs["categories2"] = pairs["categories2"].astype(np.int32)
        
    # Categories text similarity
    for name in ["overlap", "jaccard"]:
        feature_name = f"categories_{name}"
        if feature_name not in pairs.columns:
            similarity = get_shingle_similarity(name)
            pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"categories_shingles_3"],
                                             train_df.loc[pairs["p2"], f"categories_shingles_3"])
            pairs[feature_name] = pairs[feature_name].astype(np.float16)
            
    # Main categories text similarity
    for name in ["jaccard", "overlap", "cosine"]:
        feature_name = f"main_category_{name}"
        if feature_name not in pairs.columns:
            similarity = get_shingle_similarity(name)
            pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"main_category_shingles_3"],
                                             train_df.loc[pairs["p2"], f"main_category_shingles_3"])
            pairs[feature_name] = pairs[feature_name].astype(np.float16) 

    # Country (same for every pair)
    if "country" not in pairs.columns:
        pairs["country1"] = train_df.loc[pairs["p1"], "country_enc"].to_numpy()
        pairs["country1"] = pairs["country1"].astype(np.int32)
        
        pairs["country2"] = train_df.loc[pairs["p2"], "country_enc"].to_numpy()
        pairs["country2"] = pairs["country2"].astype(np.int32)

    # Main category
    if "main_category1" not in pairs.columns:
        pairs["main_category1"] = train_df.loc[pairs["p1"], "main_category_enc"].to_numpy()
        pairs["main_category1"] = pairs["main_category1"].astype(np.int32)

    if "main_category2" not in pairs.columns:
        pairs["main_category2"] = train_df.loc[pairs["p2"], "main_category_enc"].to_numpy()
        pairs["main_category2"] = pairs["main_category2"].astype(np.int32)
        
    # Closest city
    if "closest_city1" not in pairs.columns:
        pairs["closest_city1"] = train_df.loc[pairs["p1"], "closest_city_enc"].to_numpy()
        pairs["closest_city1"] = pairs["closest_city1"].astype(np.int32)

    if "closest_city2" not in pairs.columns:
        pairs["closest_city2"] = train_df.loc[pairs["p2"], "closest_city_enc"].to_numpy()
        pairs["closest_city2"] = pairs["closest_city2"].astype(np.int32)
    
    # TF-IDF features
    for column in ["name_cleaned", "full_address_cleaned"]:
        # for each id and match_id add corresponding TF-IDF vector
        # than multiply them elementwise to get similarity
        tv_fit = tfidf_d[column]
        indexs = [id2index_d[i] for i in pairs['p1']]
        match_indexs = [id2index_d[i] for i in pairs['p2']]                    
        pairs[f"tfidf_trigram_{column}"] = tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1).A.ravel()
            
    # Group-by features
    # Haversine features
    groupby_p1 = pairs.groupby('p1')["haversine"]
    groupby_p2 = pairs.groupby('p2')["haversine"]
    pairs[f"p1_haversine_mean"] = groupby_p1.transform(
        np.mean).astype(np.float32)
    pairs[f"p2_haversine_mean"] = groupby_p2.transform(
        np.mean).astype(np.float32)
    pairs[f"p1_haversine_min"] = groupby_p1.transform(
        np.min).astype(np.float32)
    pairs[f"p2_haversine_min"] = groupby_p2.transform(
        np.min).astype(np.float32)
    pairs[f"p1_haversine_max"] = groupby_p1.transform(
        np.max).astype(np.float32)
    pairs[f"p2_haversine_max"] = groupby_p2.transform(
        np.max).astype(np.float32)

    pairs[f"p1_haversine_rank"] = ((groupby_p1.transform(
        "rank", method="min") - 1) / pairs["count1"]).astype(np.float16)
    pairs[f"p2_haversine_rank"] = ((groupby_p2.transform(
        "rank", method="min") - 1) / pairs["count2"]).astype(np.float16)
    
    # Manhattan features
    groupby_p1 = pairs.groupby('p1')["manhattan"]
    groupby_p2 = pairs.groupby('p2')["manhattan"]
    pairs[f"p1_manhattan_mean"] = groupby_p1.transform(
        np.mean).astype(np.float32)
    pairs[f"p2_manhattan_mean"] = groupby_p2.transform(
        np.mean).astype(np.float32)
    pairs[f"p1_manhattan_min"] = groupby_p1.transform(
        np.min).astype(np.float32)
    pairs[f"p2_manhattan_min"] = groupby_p2.transform(
        np.min).astype(np.float32)
    pairs[f"p1_manhattan_max"] = groupby_p1.transform(
        np.max).astype(np.float32)
    pairs[f"p2_manhattan_max"] = groupby_p2.transform(
        np.max).astype(np.float32)

    pairs[f"p1_manhattan_rank"] = ((groupby_p1.transform(
        "rank", method="min") - 1) / pairs["count1"]).astype(np.float16)
    pairs[f"p2_manhattan_rank"] = ((groupby_p2.transform(
        "rank", method="min") - 1) / pairs["count2"]).astype(np.float16)
    
    # Euclidian features
    groupby_p1 = pairs.groupby('p1')["euclidian"]
    groupby_p2 = pairs.groupby('p2')["euclidian"]
    pairs[f"p1_euclidian_mean"] = groupby_p1.transform(
        np.mean).astype(np.float32)
    pairs[f"p2_euclidian_mean"] = groupby_p2.transform(
        np.mean).astype(np.float32)
    pairs[f"p1_euclidian_min"] = groupby_p1.transform(
        np.min).astype(np.float32)
    pairs[f"p2_euclidian_min"] = groupby_p2.transform(
        np.min).astype(np.float32)
    pairs[f"p1_euclidian_max"] = groupby_p1.transform(
        np.max).astype(np.float32)
    pairs[f"p2_euclidian_max"] = groupby_p2.transform(
        np.max).astype(np.float32)

    pairs[f"p1_euclidian_rank"] = ((groupby_p1.transform(
        "rank", method="min") - 1) / pairs["count1"]).astype(np.float16)
    pairs[f"p2_euclidian_rank"] = ((groupby_p2.transform(
        "rank", method="min") - 1) / pairs["count2"]).astype(np.float16)

    # Name features
    for feature in ["name_cleaned_overlap_3"]:
        groupby_p1 = pairs.groupby('p1')[feature]
        groupby_p2 = pairs.groupby('p2')[feature]
        pairs[f"p1_{feature}_mean"] = groupby_p1.transform(
            np.mean).astype(np.float16)
        pairs[f"p2_{feature}_mean"] = groupby_p2.transform(
            np.mean).astype(np.float16)
        pairs[f"p1_{feature}_max"] = groupby_p1.transform(
            np.max).astype(np.float16)
        pairs[f"p2_{feature}_max"] = groupby_p2.transform(
            np.max).astype(np.float16)

        pairs[f"p1_{feature}_rank"] = ((groupby_p1.transform(
            "rank", method="min") - 1) / pairs["count1"]).astype(np.float16)
        pairs[f"p2_{feature}_rank"] = ((groupby_p2.transform(
            "rank", method="min") - 1) / pairs["count2"]).astype(np.float16)

    # Address/numbers features: only mean
    for feature in ["full_address_overlap_3", "numbers_in_name_overlap",
                    "numbers_in_address_overlap", "numbers_in_name_address_overlap",
                    "categories_overlap", "categories_jaccard"]:
        groupby_p1 = pairs.groupby('p1')[feature]
        groupby_p2 = pairs.groupby('p2')[feature]
        pairs[f"p1_{feature}_mean"] = groupby_p1.transform(
            np.mean).astype(np.float16)
        pairs[f"p2_{feature}_mean"] = groupby_p2.transform(
            np.mean).astype(np.float16)

    return pairs

def overlap(profile0, profile1):
    union = set()
    for k in profile0.keys():
        union.add(k)
    for k in profile1.keys():
        union.add(k)
    inter = int(len(profile0.keys()) + len(profile1.keys()) - len(union))
    return inter / min(len(profile0), len(profile1))


def jaccard(profile0, profile1):
    union = set()
    for ite in profile0.keys():
        union.add(ite)
    for ite in profile1.keys():
        union.add(ite)
    inter = int(len(profile0.keys()) + len(profile1.keys()) - len(union))
    return 1.0 * inter / len(union)


def cosine(profile0, profile1):
    small = profile1
    large = profile0
    if len(profile0) < len(profile1):
        small = profile0
        large = profile1
    agg = 0.0
    for k, v in small.items():
        i = large.get(k)
        if not i:
            continue
        agg += 1.0 * v * i
    dot_product = agg

    agg = 0.0
    for k, v in profile0.items():
        agg += 1.0 * v * v
    profile0_norm = math.sqrt(agg)

    agg = 0.0
    for k, v in profile1.items():
        agg += 1.0 * v * v
    profile1_norm = math.sqrt(agg)

    return dot_product / (profile0_norm * profile1_norm)


def get_shingle_similarity(name):
    if name == "cosine":
        func = cosine
    elif name == "jaccard":
        func = jaccard
    elif name == "overlap":
        func = overlap

    func_ = np.vectorize(
        lambda x1, x2: pair_func(func, x1, x2))
    return func_

def seq_match_distance(str1, str2):
    if str1 == 'unknown' or str2 == 'unknown':
        return np.nan
    return Levenshtein.ratio(str1, str2)

def lev_distance(str1, str2):
    if str1 == 'unknown' or str2 == 'unknown':
        return np.nan
    return Levenshtein.distance(str1, str2)

def jw_distance(str1, str2):
    if str1 == 'unknown' or str2 == 'unknown':
        return np.nan
    return Levenshtein.jaro_winkler(str1, str2)

def lcs_distance(str1, str2):
    if str1 == 'unknown' or str2 == 'unknown':
        return np.nan
    return LCS(str(str1), str(str2))

def haversine_vec(lat1, lon1, lat2, lon2):
    def h(la1, lo1, la2, lo2):
        return haversine((la1, lo1), (la2, lo2), unit='m')
    return np.vectorize(h)(lat1, lon1, lat2, lon2)

## Generate dataset

In [18]:
def generate_dataset_by_chunks(df, pairs, n_splits, label='train'):
    count = 0
    
    for k in tqdm(range(n_splits)):
        # split dataset by chunks
        print(f'Current split: {k+1}')
        cur_data = pairs[pairs['group'] == k]

        # add features & model prediction
        cur_data = feature_engineering(df, cur_data)

        count += len(cur_data)
        
        # save dataset
        cur_data = cur_data.drop('group', axis=1)
        if not CFG.debug:
            cur_data.to_pickle(f'{CFG.train_path}/{label}_data_{k+1}.pkl')    
        
        del cur_data
        gc.collect()
    
    print(f'Total len is {count}')
    return count

if CFG.train or CFG.debug:
    with multiprocessing.Pool(processes=2) as pool:
        pool.starmap(generate_dataset_by_chunks, [(fold0_df, fold0_df_pairs, CFG.n_splits, 'train'), 
                                                  (fold1_df, fold1_df_pairs, CFG.n_splits, 'valid')])
#     generate_dataset_by_chunks(fold0_df, fold0_df_pairs, CFG.n_splits, 'train')

Current split: 1


  0%|          | 0/2 [00:00<?, ?it/s]

Current split: 1


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Current split: 2


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Current split: 2


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Current split: 3


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Current split: 3


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Current split: 4


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Current split: 4


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Current split: 5


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Current split: 5


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Current split: 6


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Current split: 6


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Current split: 7


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Current split: 7


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Current split: 8


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Current split: 8


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Current split: 9


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Current split: 9


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Current split: 10


  0%|          | 0/2 [00:00<?, ?it/s]

Current split: 10


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Total len is 24175388
Total len is 24182861


# Further ideas

- ordinal encode main_category and closest_city, set them as categorical features for LGBM
- levenstein, jaro, lcs
- include multilingual encoder for the full_address

- Optuna
- stacking of the best Optuna solutions

- add Catboost