In [1]:
import os
import sys
import re
import random
import string
import pickle
import math
import gc
from glob import glob
import multiprocessing
import numpy as np
import pandas as pd
from unidecode import unidecode
from argparse import Namespace
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold, StratifiedGroupKFold
from haversine import haversine

from tqdm.auto import tqdm

pd.set_option('mode.chained_assignment', None) # Suppress annoying warnings

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 600)

## Config

In [2]:
CFG = Namespace(
    seed = 42,
    train = False,
    debug = True,
    inference = False,
    target = "point_of_interest",
    n_neighbors = 20,
    n_splits = 10,
    threshold = 0.5,
    train_path = 'train_dataset',
    model_dir = '../input/fsquarecode/saved/',
    encode = False
)

def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
seed_everything(CFG.seed)

## Load and preprocess data

In [3]:
# Loading, preprocessing
if CFG.train:
    df = pd.read_csv("foursquare_location_matching/train.csv")
elif CFG.debug:
    df = pd.read_csv("foursquare_location_matching/train.csv", nrows=3000)
else:
    
    df = pd.read_csv("../input/foursquare-location-matching/test.csv")

if len(df) < 20:
    df = pd.read_csv('../input/foursquare-location-matching/train.csv', nrows=3000)
    df = df.drop('point_of_interest', axis=1)

## Add main category

In [4]:
stop_words = ['/', '&', 'or', 'High', 'Miscellaneous', 'Fast', 'Other', 'Asian', 'Chinese', 'Event', 
              'Great', 'Noodle', 'Burger', 'Seafood', 'Breakfast', 'Ice', 'Diners', 'Cream', 'Indonesian', 
              'Thai', "Women's", 'Fried', 'Snack', 'Tea', 'Mexican', 'Nail', 'Sushi', 'Middle', 'Korean', 
              'Gift', 'Drink', 'Pet', 'Turkish', "Men's", 'Indian', 'Malay', 'Cocktail', 'Donut', 'Box', 
              'Condos)', 'Residential', 'Convenience', 'Gas', 'General', 'Bus', 'Pizza', 'Spaces', 'Mobile',
              'Phone', 'Academic', 'Japanese', 'Business', 'Shoe', 'Italian', 'American', 'Home', 'Auto', 
              'Furniture', 'Cosmetics', 'Sandwich', 'Dessert', 'Car', 'Arts', 'Financial', 'Legal', 'BBQ',
              'Hardware', 'Video', 'Music', 'Art', 'Student', 'Jewelry', 'Historic', 'Travel', 'Washes',
              'Beer', 'Arcades', 'Bike', 'Lookouts', 'Scenic', 'Rental', 'Accessories', 'Repairs', 'Discount', 
              'Optical', 'Bodegas', 'Big', 'Assisted', 'Living', 'Athletics', 'Agencies', 'Locations', 'Trails', 
              'Bed', 'Breakfasts', 'Wine', 'Real', 'Elementary', 'Theme', 'Golf', 'Rest',  'Photography', 
              'Nightlife', 'Courses', 'Convention', 'Eastern', 'Concert', 'Conference', 'Startups', 'Tech', 
              'Meeting', 'French', 'Supplies', 'Events', 'Sake', 'Dog', 'Ramen', 'City', 'Juice', 'Science',
              'Liquor', 'Lawyers', 'Insurance', 'Flower', 'Toy', 'Rentals', 'Paper', 'Flea', 'Bases', 'Baseball', 
              'Karaoke', 'Kids', 'Design', 'Farmers', 'Repair', 'Technology', 'Wards', 'Water', 'Supply', 
              'Filipino', 'Piers', 'Salad', 'Mattress', 'Print', 'Wings', 'Engineering', 'Non-Profits', 
              'Gastropubs', 'Bistros', 'Hot', 'Vietnamese', 'Hookah', 'Candy', 'Coffee', 'Electronics',
              'Department', 'Clothing', 'Trucks', 'Chicken', 'Movie', 'Health', 'Soccer', 'Crafts', 
              'Game', 'Community', 'Food', 'College', 'Sporting', 'Beauty', 'Ferries', 'Soup', 'Veterinarians', 
              'Basketball', 'Light', 'Rail', 'Taco', 'Classrooms', 'Shopping', 'Developments', 'Train', 'Performing',
              'Administrative', 'Lingerie', 'Dive', 'Storage', 'Office', 'Landscaping', 'Residence', 'Sports',
              'Goods', 'Dealerships', 'Grocery', 'Workshops', 'History'
             ]


def get_categories(category):
    if category == 'Auto':
        return 'Automotive'
    if category == 'Hotel' or category == 'Motels' or category == 'Hostels':
        return 'Hotels'
    if category == 'Courthouses':
        return 'Court'
    if category == 'College':
        return 'Colleges'
    if category == 'Cafés':
        return 'Cafes'
    if category == "Doctor's" or category == "Dentist's" or category == "Doctors":
        return 'Medical'
    if category == '(Apartments':
        return 'Apartments'
    return category

cat_freq = pd.read_csv('foursquare_main_categories/cat_freq.csv', index_col='Unnamed: 0')
cat_freq_dict = dict(zip(cat_freq['category'], cat_freq['frequence']))

def get_main_category(category):
    if category == category:
        category_list = re.split(', | ', category)
        most_freq_cat = np.nan
        freq = 0
        
        for c in category_list:
            if c in stop_words or c[-2:] == 'an':
                continue
            c = get_categories(c)
            f = cat_freq_dict.get(c, 0)
            if f > freq:
                freq = f
                most_freq_cat = c
        
        return most_freq_cat
            
    return np.nan

df['main_category'] = df['categories'].apply(get_main_category)

## Add closest city

In [5]:
# states = pd.read_csv('states.csv', index_col='Unnamed: 0')
cities = pd.read_csv('additional_data/cities.csv', encoding = "ISO-8859-1")
cities = cities[['asciiname', 'latitude', 'longitude', 'country code']]
cities.rename({'asciiname': 'city', 'country code': 'country'}, axis=1, inplace=True)

geoname_dict = {'city': cities}

def fill_the_missing_data(args):#, df_dists):
    country, country_df = args
    dfs = []
    columns = list(geoname_dict.keys())
    for c in tqdm(columns):

            geoname_df = geoname_dict[c]
            geoname_df = geoname_df[geoname_df['country'] == country]
                
            if len(country_df) == 0 or len(geoname_df) == 0:
                continue
            
            knn = KNeighborsRegressor(n_neighbors=min(len(geoname_df), 2), metric='haversine')
            knn.fit(geoname_df[['latitude','longitude']], geoname_df.index)
            dists, nears = knn.kneighbors(country_df[['latitude','longitude']], return_distance=True)
            
            if nears.shape[1] < 2:
                continue
            
            nears[:,1] = nears[:,0]
            nears[:,0] = country_df.index
            
            for n in nears:
                t_idx = n[0]
                c_idx = n[1]
                country_df.loc[t_idx, f"closest_{c}"] = geoname_df.iloc[c_idx][c]
                       
    return country_df
    
    
df['country'] = df['country'].fillna('NA')
num_countries = df['country'].nunique()
    
processes = multiprocessing.cpu_count()
with multiprocessing.Pool(processes=processes) as pool:
    dfs = pool.imap_unordered(fill_the_missing_data, df.groupby('country', sort=False))
    dfs = tqdm(dfs, total=num_countries)
    dfs = list(dfs)
    
df = pd.concat(dfs).reset_index(drop=True)

del cities

gc.collect()

  0%|          | 0/96 [00:00<?, ?it/s]

27

## Create vectors from text columns with multilingual encoder

In [6]:
# model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')# paraphrase-MiniLM-L3-v2')# all-MiniLM-L6-v2')
# vectors = model.encode(data['full_address'].values, batch_size=128, show_progress_bar=True)

# with open('additional_data/text_vectors.npy', 'wb') as f:
#     np.save(f, vectors)

# with open('additional_data/text_vectors.npy', 'rb') as f:
#     vectors = np.load(f)

## Preprocessing utils

In [7]:
def pickle_save(obj, filename):
    pickle.dump(obj, open(filename, 'wb'))

def pickle_load(filename):
    return pickle.load(open(filename, 'rb'))

def apply_notnull(df, column, target_column, function):
    df.loc[df[column].notnull(), target_column] = \
        df.loc[df[column].notnull(), column].apply(function)
    return df

def pair_func(func, x1, x2):
    if type(x1) == float and type(x2) == float:
        return -1
    elif type(x1) == float or type(x2) == float:
        return -0.5
    try:
        return func(x1, x2)
    except:
        return -1
    
def clean_string(df, column, target_column):
    # Unidecode
    df = apply_notnull(df, column, target_column, lambda x: unidecode(x))

    # Replace AND, AT
    df = apply_notnull(df, target_column, target_column, lambda x: x.translate(
        str.maketrans({"@": "at", "&": "and"})))

    # Strip punctuation
    df = apply_notnull(df, target_column, target_column, lambda x: x.translate(
        str.maketrans('', '', string.punctuation)))

    # To lowercase
    df = apply_notnull(df, target_column, target_column, lambda x: x.lower())

    # Remove leading spaces
    df = apply_notnull(df, target_column, target_column, lambda x: x.strip())

    return df


def get_shingles(df, column, shingle_k):
    for k in shingle_k:
        sh = ShingleBased(k=k)
        df = apply_notnull(df, column,
                           f"{column}_shingles_{k}", lambda x: sh.get_profile(x))

    return df

_SPACE_PATTERN = re.compile("\\s+")


class ShingleBased:

    def __init__(self, k=3):
        self.k = k

    def get_k(self):
        return self.k

    def get_profile(self, string):
        shingles = dict()
        no_space_str = _SPACE_PATTERN.sub(" ", string)
        for i in range(len(no_space_str) - self.k + 1):
            shingle = no_space_str[i:i + self.k]
            old = shingles.get(shingle)
            if old:
                shingles[str(shingle)] = int(old + 1)
            else:
                shingles[str(shingle)] = 1
        return shingles

def preprocessing(df):
    df = df.set_index("id", drop=False)

    # Name cleaning
    df = clean_string(df, "name", "name_cleaned")

    # Name shingles
    df = get_shingles(df, "name_cleaned", (2, 3))

    # Closest city shingles
    df = get_shingles(df, "closest_city", (3,))

    # Full address
    df["full_address"] = df["address"].fillna("") +\
        " " + df["city"].fillna("") +\
        " " + df["state"].fillna("")

    df.loc[df["full_address"] == "  ", "full_address"] = np.NaN
    df = clean_string(df, "full_address", "full_address_cleaned")
    df = get_shingles(df, "full_address_cleaned", (3,))

    # Numbers in name/address
    df = apply_notnull(
        df, "name_cleaned", "numbers_in_name", get_numbers_from_name)
    df.loc[df["numbers_in_name"] == "", "numbers_in_name"] = np.NaN

    df = apply_notnull(
        df, "full_address_cleaned", "numbers_in_full_address", get_numbers_from_name)
    df.loc[df["numbers_in_full_address"]
           == "", "numbers_in_full_address"] = np.NaN

    df = get_shingles(df, "numbers_in_name", (1, 2))
    df = get_shingles(df, "numbers_in_full_address", (1, 2))
    
    # Catogories shingles
    df = get_shingles(df, "categories", (3,))

    # Categories to frozenset
    df["categories"] = df["categories"].fillna("None")
    df["categories"] = df["categories"].apply(lambda x: x.split(", "))
    df["categories"] = df["categories"].apply(frozenset)
    
    # Main catogory shingles
    df = get_shingles(df, "main_category", (3,))

    # Encode categorical columns
    if CFG.encode:
        # No encoders provided, create and save
        encoder_params = {"dtype": np.int32,
                          "handle_unknown": "use_encoded_value",
                          "unknown_value": -1}

        ordinal_encoder = OrdinalEncoder(**encoder_params)

        ordinal_encoder = ordinal_encoder.fit(df[["country", "categories"]]) # main_category, closest_city ?
        pickle_save(ordinal_encoder, "saved/ordinal_encoder.pkl")
        encoder = ordinal_encoder
    else:
        encoder = pickle_load("additional_data/ordinal_encoder.pkl")

    df[["country_enc", "categories_enc"]] = encoder.transform(df[["country", "categories"]])
    
    return df

def pickle_load(filename):
    return pickle.load(open(filename, 'rb'))

def get_numbers_from_name(name):
    return "".join(re.findall("[-+]?[.]?[\d]+(?:,\d\d\d)*[\.]?\d*(?:[eE][-+]?\d+)?", name))

## Preprocess data

In [8]:
df = preprocessing(df)
df.index.rename("index", inplace = True)

## TF-IDF vectorization

In [9]:
def tf_idf_vectorize(df):
    # set dict for corresponding ids and index
    id2index_d = dict(zip(df['id'].values, df.index))

    # make TF-IDF features
    tfidf_d = {}
    for col in ["categories", "name_cleaned", "full_address_cleaned"]:
        if col == "categories":
            tfidf = TfidfVectorizer(use_idf=False)
        else:
            tfidf = TfidfVectorizer(ngram_range=(3, 3), analyzer="char_wb", use_idf=False, stop_words=['unknown'])
        tv_fit = tfidf.fit_transform(df[col].astype(str).values)
        tfidf_d[col] = tv_fit
        
    return id2index_d, tfidf_d, tv_fit

df = df.reset_index()

id2index_d, tfidf_d, tv_fit = tf_idf_vectorize(df)

df = df.set_index('index')

## Split data on train and test

In [10]:
def group_split(df):
    gkf = GroupKFold(n_splits=2)
    splits = list(gkf.split(
        df, groups=df["point_of_interest"]))

    return df.iloc[splits[0][1]], df.iloc[splits[1][1]]

df['country'] = df['country'].fillna('NA')
df['name_cleaned'] = df['name_cleaned'].fillna('')

if CFG.train or CFG.debug:
    fold0_df, fold1_df = group_split(df)

## Candidate search utils

In [11]:
def overlap(profile0, profile1):
    union = set()
    for k in profile0.keys():
        union.add(k)
    for k in profile1.keys():
        union.add(k)
    inter = int(len(profile0.keys()) + len(profile1.keys()) - len(union))
    return inter / min(len(profile0), len(profile1))

def country_closest_k(train_df, country, candidate_k):
    country_df = train_df[train_df["country"] == country]

    # Coordinates
    country_np = np.deg2rad(country_df[["latitude", "longitude"]].to_numpy())

    # To 3d
    country_np = np.vstack([(np.cos(country_np[:, 0]) * np.cos(country_np[:, 1])),
                            (np.cos(country_np[:, 0]) *
                             np.sin(country_np[:, 1])),
                            (np.sin(country_np[:, 0]))]).T

    neigh = NearestNeighbors(n_jobs=-1).fit(country_np)
    try:
        distances, neighbors_indices = neigh.kneighbors(
            country_np, n_neighbors=candidate_k, return_distance=True)
    except:
        # Handle Expected n_neighbors <= n_samples error
        # Add all but exclude itself
        neighbors_indices = [
            [i for i in range(len(country_df)) if i != j] for j in range(len(country_df))]
        neighbors_indices = np.array(neighbors_indices, dtype=int)

    # Convert indices to id
    ids = country_df["id"].to_numpy()
    neighbors_ids = pd.Series(list(neighbors_indices), index=country_df.index).apply(
        lambda candidate_indices: ids[candidate_indices])

    return neighbors_ids


def candidate_selection(train_df, candidate_k):
    train_df["k_candidates"] = pd.Series(dtype='object')
    uq_countries = train_df["country"].value_counts().index

    for country in tqdm(uq_countries):
        train_df.loc[train_df["country"] == country, "k_candidates"] = \
            country_closest_k(train_df, country, candidate_k)

    # Empty candidates
    for row in train_df.loc[train_df["k_candidates"].isnull(), "k_candidates"].index:
        train_df.at[row, "k_candidates"] = []

    return train_df


def forming_pairs_filtering(train_df, th):
    pairs = []
    dict_ = train_df["name_cleaned_shingles_3"].to_dict()

    for p1_idx in tqdm(train_df.index):
        for p2_idx in train_df.loc[p1_idx, "k_candidates"]:
            if p1_idx == p2_idx:  # Skip
                continue

            try:
                sim = overlap(dict_[p1_idx], dict_[p2_idx])
                if sim >= th:
                    pairs.append([p1_idx, p2_idx])
            except:
                pass

    return pd.DataFrame(pairs, columns=["p1", "p2"])


def recall_knn(df, n_neighbors, label):
    print(80*'=')
    print('Start KNN grouped by country')
    train_df_country = []
    for country, country_df in tqdm(df.groupby('country')):
        country_df = country_df.reset_index(drop = True)

        # distance KNN
        neighbors = min(len(country_df), n_neighbors)
        knn = KNeighborsRegressor(n_neighbors = neighbors,
                                  metric = 'haversine',
                                  n_jobs = -1)
        knn.fit(country_df[['latitude','longitude']], country_df.index)
        dists, nears = knn.kneighbors(country_df[['latitude', 'longitude']], 
                                      return_distance = True)

        # name KNN
        tfidf = TfidfVectorizer(ngram_range=(3, 3), analyzer="char_wb", use_idf=False, stop_words=['unknown'])
        x_name = country_df['name_cleaned'].values # CHANGED: name_cleaned
        x_name = tfidf.fit_transform(x_name)
        
        knn_name = NearestNeighbors(n_neighbors = neighbors,
                                    metric = 'cosine',
                                    n_jobs = -1)
        knn_name.fit(x_name)
        dists_name, nears_name = knn_name.kneighbors(x_name)
        
        del tfidf, knn, knn_name, x_name
        gc.collect()
        
        # join distance and name KNNs
        for k in range(neighbors):            
            cur_df = country_df[['id']]
            cur_df['match_id'] = country_df['id'].values[nears[:, k]]
            cur_df['kdist_country'] = dists[:, k]
            cur_df['kneighbors_country'] = k
            
            cur_df_name = country_df[['id']]
            cur_df_name['match_id'] = country_df['id'].values[nears_name[:, k]]
            cur_df_name['kdist_name_country'] = dists_name[:, k]
            cur_df_name['kneighbors_name_country'] = k
            cur_df = cur_df.merge(cur_df_name, on = ['id', 'match_id'], how = 'outer')
            
            train_df_country.append(cur_df)
    
    train_df_country = pd.concat(train_df_country)
    train_df_country = train_df_country.drop_duplicates(subset=['id', 'match_id'])
    
    print('Start KNN for the whole dataset')
    train_df = []
    knn = NearestNeighbors(n_neighbors = n_neighbors,
                           n_jobs = -1)
    knn.fit(df[['latitude','longitude']], df.index)
    dists, nears = knn.kneighbors(df[['latitude','longitude']])
    
    for k in range(n_neighbors):            
        cur_df = df[['id']]
        cur_df['match_id'] = df['id'].values[nears[:, k]]
        cur_df['kdist'] = dists[:, k]
        cur_df['kneighbors'] = k
        train_df.append(cur_df)
    
    train_df = pd.concat(train_df)
    train_df = train_df.merge(train_df_country, on = ['id', 'match_id'], how = 'outer')
    
    del train_df_country
    gc.collect()
    
    return train_df, label

## Candidate search

In [12]:
# Candidate selection, pairs forming
#df = candidate_selection(df, 320)
#pairs = forming_pairs_filtering(df, 0.2)

if CFG.train or CFG.debug:
    fold0_df_pairs = recall_knn(fold0_df, 20, 'train')[0][["id", "match_id"]]
    fold0_df_pairs.rename(columns={"id": "p1", "match_id": "p2"}, inplace=True)

    fold1_df_pairs = recall_knn(fold1_df, 20, 'train')[0][["id", "match_id"]]
    fold1_df_pairs.rename(columns={"id": "p1", "match_id": "p2"}, inplace=True)
else:
    pairs = recall_knn(df, 20, 'train')[0][["id", "match_id"]]
    pairs.rename(columns={"id": "p1", "match_id": "p2"}, inplace=True)

Start KNN grouped by country


  0%|          | 0/81 [00:00<?, ?it/s]

Start KNN for the whole dataset
Start KNN grouped by country


  0%|          | 0/76 [00:00<?, ?it/s]

Start KNN for the whole dataset


## Create train target feature

In [13]:
%%time

if CFG.train or CFG.debug:
    df = df.set_index('id')

    ids = fold0_df_pairs['p1'].tolist()
    match_ids = fold0_df_pairs['p2'].tolist()
    poi = df.loc[ids]['point_of_interest'].values
    match_poi = df.loc[match_ids]['point_of_interest'].values
    fold0_df_pairs['match'] = np.array(poi == match_poi, dtype = np.int8)

    ids = fold1_df_pairs['p1'].tolist()
    match_ids = fold1_df_pairs['p2'].tolist()
    poi = df.loc[ids]['point_of_interest'].values 
    match_poi = df.loc[match_ids]['point_of_interest'].values
    fold1_df_pairs['match'] = np.array(poi == match_poi, dtype = np.int8)

    del poi, match_poi, ids, match_ids
    gc.collect()

    print('Num of unique train id: %s' % fold0_df_pairs['p1'].nunique())
    print('Num of train data: %s' % len(fold0_df_pairs))
    print('Pos rate: %s' % fold0_df_pairs['match'].mean())
    print('')
    print('Num of unique valid id: %s' % fold1_df_pairs['p1'].nunique())
    print('Num of valid data: %s' % len(fold1_df_pairs))
    print('Pos rate: %s' % fold1_df_pairs['match'].mean())

    df = df.reset_index('id')

Num of unique train id: 1500
Num of train data: 56965
Pos rate: 0.02640217677521285

Num of unique valid id: 1500
Num of valid data: 57780
Pos rate: 0.026029768085842852
CPU times: user 138 ms, sys: 11.7 ms, total: 150 ms
Wall time: 149 ms


## Group generation

In [14]:
def get_poi2id(input_df: pd.DataFrame) -> dict:
    return input_df.groupby('p2')['p1'].apply(set).to_dict()

def create_poi_groups(poi2id):
    # merge POIs by groups
    for p1, values in tqdm(poi2id.items()):
        for p2 in values:
            if p2 != p1:
                poi2id[p1] = poi2id[p1].union(poi2id[p2])
                poi2id[p2] = set()
    # remove empty groups
    poi2id = {k: v for k, v in poi2id.items() if v}
    return poi2id

def merge_poi_groups(poi2id, poi_length, divider=20):
    l = len(poi2id)
    keys = list(poi2id.keys())
    for i in tqdm(range(l)):
        j = i + 1
        while 0 < len(poi2id[keys[i]]) < int(poi_length/divider) and j < l:
            poi2id[keys[i]] = poi2id[keys[i]].union(poi2id[keys[j]])
            poi2id[keys[j]] = set()
            j += 1
    # remove empty groups
    poi2id = {k: v for k, v in poi2id.items() if v}
    return poi2id

def clean_poi(poi2id):
    # clean POIs
    values_to_remove = set() 
    for p1, values in tqdm(poi2id.items()):
        values = values.difference(values_to_remove)
        values_to_remove = values_to_remove.union(values)
        poi2id[p1] = values
    # remove empty groups again
    poi2id = {k: v for k, v in poi2id.items() if v}
    return poi2id

def group_generation(pairs, n_splits=10):
    # get POI-ID dictionary
    poi2id = get_poi2id(pairs)
    poi_length = len(poi2id)

    # merge poi by groups
    poi2id = create_poi_groups(poi2id)

    # clean poi
    poi2id = clean_poi(poi2id)

    # decrease number of group by divider number to increase performance of further processes
    poi2id = merge_poi_groups(poi2id, poi_length, 20)

    # set groups for pairs
    pairs = pairs.set_index('p1')
    for idx, values in tqdm(enumerate(poi2id.values()), total=len(poi2id)):
        pairs.loc[list(values), 'set'] = idx
    pairs = pairs.reset_index()

    kf = GroupKFold(n_splits=n_splits)
    for i, (trn_idx, val_idx) in enumerate(kf.split(pairs, 
                                                    pairs['set'], 
                                                    pairs['set'])):
        pairs.loc[val_idx, 'group'] = i

    pairs['group'] = pairs['group'].astype('int8')
    pairs = pairs.drop('set', axis=1)
    
    return pairs

if CFG.train or CFG.debug:
    fold0_df_pairs = group_generation(fold0_df_pairs, CFG.n_splits)
    fold1_df_pairs = group_generation(fold1_df_pairs, CFG.n_splits)
else:
    pairs = group_generation(pairs, CFG.n_splits)

  0%|          | 0/1500 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/1500 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

## Feature Engeneering utils

In [15]:
def feature_engineering(train_df, pairs):

    # Candidates count
    if "count1" not in pairs.columns:
        pairs["count1"] = pairs.groupby("p1")["p1"].transform("count")
        pairs["count1"] = pairs["count1"].astype(np.int32)

    if "count2" not in pairs.columns:
        pairs["count2"] = pairs.groupby("p2")["p2"].transform("count")
        pairs["count2"] = pairs["count2"].astype(np.int32)

    # Distance metrics
    lat1 = train_df.loc[pairs["p1"], "latitude"].values
    lon1 = train_df.loc[pairs["p1"], "longitude"].values
    lat2 = train_df.loc[pairs["p2"], "latitude"].values
    lon2 = train_df.loc[pairs["p2"], "longitude"].values
    diff_lat = np.abs(lat2-lat1)
    diff_lon = np.abs(lon2-lon1)
    
    # Haversine
    if "haversine" not in pairs.columns:
        pairs["haversine"] = haversine_vec(lat1, lon1, lat2, lon2)
        pairs["haversine"] = pairs["haversine"].astype(np.float32)
    # Manhattan
    if "manhattan" not in pairs.columns:
        pairs['manhattan'] = diff_lat + diff_lon
        pairs["manhattan"] = pairs["manhattan"].astype(np.float32)
    # Euclidian
    if "euclidian" not in pairs.columns:   
        pairs['euclidian'] = np.sqrt(np.square(diff_lat) + np.square(diff_lon))
        pairs["euclidian"] = pairs["euclidian"].astype(np.float32)
    
    # Name similarity
    for name in ["jaccard", "overlap", "cosine"]:
        for k in tqdm([2, 3]):
            feature_name = f"name_cleaned_{name}_{k}"
            if feature_name not in pairs.columns:
                similarity = get_shingle_similarity(name)
                pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"name_cleaned_shingles_{k}"],
                                                 train_df.loc[pairs["p2"], f"name_cleaned_shingles_{k}"])
                pairs[feature_name] = pairs[feature_name].astype(np.float16)

    # Full address similarity
    for name in ["jaccard", "overlap"]:
        for k in tqdm([3]):
            feature_name = f"full_address_{name}_{k}"
            if feature_name not in pairs.columns:
                similarity = get_shingle_similarity(name)
                pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"full_address_cleaned_shingles_{k}"],
                                                 train_df.loc[pairs["p2"], f"full_address_cleaned_shingles_{k}"])
                pairs[feature_name] = pairs[feature_name].astype(np.float16)

    # Name-address similarity
    for name in ["overlap"]:
        for k in tqdm([3]):
            feature_name = f"name_address_{name}_{k}"
            if feature_name not in pairs.columns:
                similarity = get_shingle_similarity(name)
                pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"name_cleaned_shingles_{k}"],
                                                 train_df.loc[pairs["p2"], f"full_address_cleaned_shingles_{k}"])

                pairs[feature_name] += similarity(train_df.loc[pairs["p1"], f"full_address_cleaned_shingles_{k}"],
                                                  train_df.loc[pairs["p2"], f"name_cleaned_shingles_{k}"])
                pairs[feature_name] = pairs[feature_name] / 2

                pairs[feature_name] = pairs[feature_name].astype(np.float16)

    # Numbers in name similarity
    for name in ["overlap"]:
        feature_name = f"numbers_in_name_{name}"
        if feature_name not in pairs.columns:
            similarity = get_shingle_similarity(name)
            pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"numbers_in_name_shingles_1"], train_df.loc[pairs["p2"], f"numbers_in_name_shingles_1"])
            pairs[feature_name] += similarity(train_df.loc[pairs["p1"], f"numbers_in_name_shingles_2"], train_df.loc[pairs["p2"], f"numbers_in_name_shingles_2"])
            pairs[feature_name] = pairs[feature_name] / 2

            pairs[feature_name] = pairs[feature_name].astype(np.float16)

    # Numbers in address similarity
    for name in ["overlap"]:
        feature_name = f"numbers_in_address_{name}"
        if feature_name not in pairs.columns:
            similarity = get_shingle_similarity(name)
            pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"numbers_in_full_address_shingles_1"],
                                             train_df.loc[pairs["p2"], f"numbers_in_full_address_shingles_1"])
            pairs[feature_name] += similarity(train_df.loc[pairs["p1"], f"numbers_in_full_address_shingles_2"],
                                              train_df.loc[pairs["p2"], f"numbers_in_full_address_shingles_2"])
            pairs[feature_name] = pairs[feature_name] / 2

            pairs[feature_name] = pairs[feature_name].astype(np.float16)

    # Numbers in name-address similarity
    for name in ["overlap"]:
        feature_name = f"numbers_in_name_address_{name}"
        if feature_name not in pairs.columns:
            similarity = get_shingle_similarity(name)
            pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"numbers_in_name_shingles_1"],
                                             train_df.loc[pairs["p2"], f"numbers_in_full_address_shingles_1"])
            pairs[feature_name] += similarity(train_df.loc[pairs["p1"], f"numbers_in_full_address_shingles_1"], train_df.loc[pairs["p2"], f"numbers_in_name_shingles_1"])
            pairs[feature_name] += similarity(train_df.loc[pairs["p1"], f"numbers_in_name_shingles_2"],
                                              train_df.loc[pairs["p2"], f"numbers_in_full_address_shingles_2"])
            pairs[feature_name] += similarity(train_df.loc[pairs["p1"], f"numbers_in_full_address_shingles_2"],
                                              train_df.loc[pairs["p2"], f"numbers_in_name_shingles_2"])
            pairs[feature_name] = pairs[feature_name] / 4

            pairs[feature_name] = pairs[feature_name].astype(np.float16)

    # Category
    if "categories1" not in pairs.columns:
        pairs["categories1"] = train_df.loc[pairs["p1"],
                                            "categories_enc"].astype(np.int32).to_numpy()
        pairs["categories1"] = pairs["categories1"].astype(np.int32)

    if "categories2" not in pairs.columns:
        pairs["categories2"] = train_df.loc[pairs["p2"],
                                            "categories_enc"].to_numpy()
        pairs["categories2"] = pairs["categories2"].astype(np.int32)
        
    # Categories text similarity
    for name in ["overlap", "jaccard"]:
        feature_name = f"categories_{name}"
        if feature_name not in pairs.columns:
            similarity = get_shingle_similarity(name)
            pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"categories_shingles_3"],
                                             train_df.loc[pairs["p2"], f"categories_shingles_3"])
            pairs[feature_name] = pairs[feature_name].astype(np.float16)
            
    # Main categories text similarity
    for name in ["jaccard", "overlap", "cosine"]:
        feature_name = f"main_category_{name}"
        if feature_name not in pairs.columns:
            similarity = get_shingle_similarity(name)
            pairs[feature_name] = similarity(train_df.loc[pairs["p1"], f"main_category_shingles_3"],
                                             train_df.loc[pairs["p2"], f"main_category_shingles_3"])
            pairs[feature_name] = pairs[feature_name].astype(np.float16) 

    # Country (same for every pair)
    if "country" not in pairs.columns:
        pairs["country1"] = train_df.loc[pairs["p1"], "country_enc"].to_numpy()
        pairs["country1"] = pairs["country1"].astype(np.int32)
        
        pairs["country2"] = train_df.loc[pairs["p2"], "country_enc"].to_numpy()
        pairs["country2"] = pairs["country2"].astype(np.int32)

    # TF-IDF features
    for column in ["name_cleaned", "full_address_cleaned"]:
        # for each id and match_id add corresponding TF-IDF vector
        # than multiply them elementwise to get similarity
        tv_fit = tfidf_d[column]
        indexs = [id2index_d[i] for i in pairs['p1']]
        match_indexs = [id2index_d[i] for i in pairs['p2']]                    
        pairs[f"tfidf_trigram_{column}"] = tv_fit[indexs].multiply(tv_fit[match_indexs]).sum(axis = 1).A.ravel()
    
    
    # Country-based TF-IDF features
#     for country in tqdm(train_df["country_enc"].unique()):
#         country_df = train_df[train_df["country_enc"] == country]
#         country_df["country_index"] = np.arange(len(country_df)).astype(int)

#         country_pairs = pairs[pairs["country"] == country]
#         country_pairs["country_index1"] = country_df.loc[country_pairs["p1"],
#                                                          "country_index"].to_numpy()
#         country_pairs["country_index2"] = country_df.loc[country_pairs["p2"],
#                                                          "country_index"].to_numpy()

#         index1 = country_pairs["country_index1"].to_numpy()
#         index2 = country_pairs["country_index2"].to_numpy()

#         for column in ["name_cleaned", "full_address_cleaned"]:
#             try:
#                 vectorizer_words = TfidfVectorizer()
#                 vectorizer_trigrams = TfidfVectorizer(
#                     analyzer="char_wb", ngram_range=(3, 3))
#                 words_matrix = vectorizer_words.fit_transform(
#                     country_df[column].fillna(""))
#                 trigrams_matrix = vectorizer_trigrams.fit_transform(
#                     country_df[column].fillna(""))
#             except:
#                 continue

#             pairs.loc[pairs["country"] == country, f"tfidf_trigram_{column}"] = \
#                 np.sum(trigrams_matrix[index1].multiply(
#                     trigrams_matrix[index2]), axis=1)

#             pairs.loc[pairs["country"] == country, f"tfidf_words_{column}"] = \
#                 np.sum(words_matrix[index1].multiply(
#                     words_matrix[index2]), axis=1)

#             pairs[f"tfidf_trigram_{column}"] = \
#                 pairs[f"tfidf_trigram_{column}"].astype(np.float16)

#             pairs[f"tfidf_words_{column}"] = \
#                 pairs[f"tfidf_words_{column}"].astype(np.float16)
            
    # Group-by features
    # Haversine features
    groupby_p1 = pairs.groupby('p1')["haversine"]
    groupby_p2 = pairs.groupby('p2')["haversine"]
    pairs[f"p1_haversine_mean"] = groupby_p1.transform(
        np.mean).astype(np.float32)
    pairs[f"p2_haversine_mean"] = groupby_p2.transform(
        np.mean).astype(np.float32)
    pairs[f"p1_haversine_min"] = groupby_p1.transform(
        np.min).astype(np.float32)
    pairs[f"p2_haversine_min"] = groupby_p2.transform(
        np.min).astype(np.float32)
    pairs[f"p1_haversine_max"] = groupby_p1.transform(
        np.max).astype(np.float32)
    pairs[f"p2_haversine_max"] = groupby_p2.transform(
        np.max).astype(np.float32)

    pairs[f"p1_haversine_rank"] = ((groupby_p1.transform(
        "rank", method="min") - 1) / pairs["count1"]).astype(np.float16)
    pairs[f"p2_haversine_rank"] = ((groupby_p2.transform(
        "rank", method="min") - 1) / pairs["count2"]).astype(np.float16)
    
    # Manhattan features
    groupby_p1 = pairs.groupby('p1')["manhattan"]
    groupby_p2 = pairs.groupby('p2')["manhattan"]
    pairs[f"p1_manhattan_mean"] = groupby_p1.transform(
        np.mean).astype(np.float32)
    pairs[f"p2_manhattan_mean"] = groupby_p2.transform(
        np.mean).astype(np.float32)
    pairs[f"p1_manhattan_min"] = groupby_p1.transform(
        np.min).astype(np.float32)
    pairs[f"p2_manhattan_min"] = groupby_p2.transform(
        np.min).astype(np.float32)
    pairs[f"p1_manhattan_max"] = groupby_p1.transform(
        np.max).astype(np.float32)
    pairs[f"p2_manhattan_max"] = groupby_p2.transform(
        np.max).astype(np.float32)

    pairs[f"p1_manhattan_rank"] = ((groupby_p1.transform(
        "rank", method="min") - 1) / pairs["count1"]).astype(np.float16)
    pairs[f"p2_manhattan_rank"] = ((groupby_p2.transform(
        "rank", method="min") - 1) / pairs["count2"]).astype(np.float16)
    
    # Euclidian features
    groupby_p1 = pairs.groupby('p1')["euclidian"]
    groupby_p2 = pairs.groupby('p2')["euclidian"]
    pairs[f"p1_euclidian_mean"] = groupby_p1.transform(
        np.mean).astype(np.float32)
    pairs[f"p2_euclidian_mean"] = groupby_p2.transform(
        np.mean).astype(np.float32)
    pairs[f"p1_euclidian_min"] = groupby_p1.transform(
        np.min).astype(np.float32)
    pairs[f"p2_euclidian_min"] = groupby_p2.transform(
        np.min).astype(np.float32)
    pairs[f"p1_euclidian_max"] = groupby_p1.transform(
        np.max).astype(np.float32)
    pairs[f"p2_euclidian_max"] = groupby_p2.transform(
        np.max).astype(np.float32)

    pairs[f"p1_euclidian_rank"] = ((groupby_p1.transform(
        "rank", method="min") - 1) / pairs["count1"]).astype(np.float16)
    pairs[f"p2_euclidian_rank"] = ((groupby_p2.transform(
        "rank", method="min") - 1) / pairs["count2"]).astype(np.float16)

    # Name features
    for feature in ["name_cleaned_overlap_3"]:
        groupby_p1 = pairs.groupby('p1')[feature]
        groupby_p2 = pairs.groupby('p2')[feature]
        pairs[f"p1_{feature}_mean"] = groupby_p1.transform(
            np.mean).astype(np.float16)
        pairs[f"p2_{feature}_mean"] = groupby_p2.transform(
            np.mean).astype(np.float16)
        pairs[f"p1_{feature}_max"] = groupby_p1.transform(
            np.max).astype(np.float16)
        pairs[f"p2_{feature}_max"] = groupby_p2.transform(
            np.max).astype(np.float16)

        pairs[f"p1_{feature}_rank"] = ((groupby_p1.transform(
            "rank", method="min") - 1) / pairs["count1"]).astype(np.float16)
        pairs[f"p2_{feature}_rank"] = ((groupby_p2.transform(
            "rank", method="min") - 1) / pairs["count2"]).astype(np.float16)

    # Address/numbers features: only mean
    for feature in ["full_address_overlap_3", "numbers_in_name_overlap",
                    "numbers_in_address_overlap", "numbers_in_name_address_overlap",
                    "categories_overlap", "categories_jaccard"]:
        groupby_p1 = pairs.groupby('p1')[feature]
        groupby_p2 = pairs.groupby('p2')[feature]
        pairs[f"p1_{feature}_mean"] = groupby_p1.transform(
            np.mean).astype(np.float16)
        pairs[f"p2_{feature}_mean"] = groupby_p2.transform(
            np.mean).astype(np.float16)

    return pairs

def overlap(profile0, profile1):
    union = set()
    for k in profile0.keys():
        union.add(k)
    for k in profile1.keys():
        union.add(k)
    inter = int(len(profile0.keys()) + len(profile1.keys()) - len(union))
    return inter / min(len(profile0), len(profile1))


def jaccard(profile0, profile1):
    union = set()
    for ite in profile0.keys():
        union.add(ite)
    for ite in profile1.keys():
        union.add(ite)
    inter = int(len(profile0.keys()) + len(profile1.keys()) - len(union))
    return 1.0 * inter / len(union)


def cosine(profile0, profile1):
    small = profile1
    large = profile0
    if len(profile0) < len(profile1):
        small = profile0
        large = profile1
    agg = 0.0
    for k, v in small.items():
        i = large.get(k)
        if not i:
            continue
        agg += 1.0 * v * i
    dot_product = agg

    agg = 0.0
    for k, v in profile0.items():
        agg += 1.0 * v * v
    profile0_norm = math.sqrt(agg)

    agg = 0.0
    for k, v in profile1.items():
        agg += 1.0 * v * v
    profile1_norm = math.sqrt(agg)

    return dot_product / (profile0_norm * profile1_norm)


def get_shingle_similarity(name):
    if name == "cosine":
        func = cosine
    elif name == "jaccard":
        func = jaccard
    elif name == "overlap":
        func = overlap

    func_ = np.vectorize(
        lambda x1, x2: pair_func(func, x1, x2))
    return func_

def seq_match_distance(str1, str2):
    if str1 == 'unknown' or str2 == 'unknown':
        return np.nan
    return Levenshtein.ratio(str1, str2)

def lev_distance(str1, str2):
    if str1 == 'unknown' or str2 == 'unknown':
        return np.nan
    return Levenshtein.distance(str1, str2)

def jw_distance(str1, str2):
    if str1 == 'unknown' or str2 == 'unknown':
        return np.nan
    return Levenshtein.jaro_winkler(str1, str2)

def lcs_distance(str1, str2):
    if str1 == 'unknown' or str2 == 'unknown':
        return np.nan
    return LCS(str(str1), str(str2))

def haversine_vec(lat1, lon1, lat2, lon2):
    def h(la1, lo1, la2, lo2):
        return haversine((la1, lo1), (la2, lo2), unit='m')
    return np.vectorize(h)(lat1, lon1, lat2, lon2)

## Generate dataset

In [21]:
def generate_dataset_by_chunks(df, pairs, n_splits, label='train'):
    count = 0
    
    for k in tqdm(range(n_splits)):
        # split dataset by chunks
        print(f'Current split: {k+1}')
        cur_data = pairs[pairs['group'] == k]

        # add features & model prediction
        cur_data = feature_engineering(df, cur_data)

        count += len(cur_data)
        
        # save dataset
        cur_data = cur_data.drop('group', axis=1)
        if not CFG.debug:
            cur_data.to_pickle(f'{CFG.train_path}/{label}_data_{k+1}.pkl')    
        
        del cur_data
        gc.collect()
    
    print(f'Total len is {count}')
    return count

if CFG.train or CFG.debug:
#     with multiprocessing.Pool(processes=2) as pool:
#         pool.starmap(generate_dataset_by_chunks, [(fold0_df, fold0_df_pairs, CFG.n_splits, 'train'), 
#                                                   (fold1_df, fold1_df_pairs, CFG.n_splits, 'valid')])
    generate_dataset_by_chunks(fold0_df, fold0_df_pairs, CFG.n_splits, 'train')

  0%|          | 0/10 [00:00<?, ?it/s]

Current split: 1


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,p1,p2,match,count1,count2,haversine,manhattan,euclidian,name_cleaned_jaccard_2,name_cleaned_jaccard_3,name_cleaned_overlap_2,name_cleaned_overlap_3,name_cleaned_cosine_2,name_cleaned_cosine_3,full_address_jaccard_3,full_address_overlap_3,name_address_overlap_3,numbers_in_name_overlap,numbers_in_address_overlap,numbers_in_name_address_overlap,categories1,categories2,categories_overlap,categories_jaccard,main_category_jaccard,main_category_overlap,main_category_cosine,country1,country2,tfidf_trigram_name_cleaned,tfidf_trigram_full_address_cleaned,p1_haversine_mean,p2_haversine_mean,p1_haversine_min,p2_haversine_min,p1_haversine_max,p2_haversine_max,p1_haversine_rank,p2_haversine_rank,p1_manhattan_mean,p2_manhattan_mean,p1_manhattan_min,p2_manhattan_min,p1_manhattan_max,p2_manhattan_max,p1_manhattan_rank,p2_manhattan_rank,p1_euclidian_mean,p2_euclidian_mean,p1_euclidian_min,p2_euclidian_min,p1_euclidian_max,p2_euclidian_max,p1_euclidian_rank,p2_euclidian_rank,p1_name_cleaned_overlap_3_mean,p2_name_cleaned_overlap_3_mean,p1_name_cleaned_overlap_3_max,p2_name_cleaned_overlap_3_max,p1_name_cleaned_overlap_3_rank,p2_name_cleaned_overlap_3_rank,p1_full_address_overlap_3_mean,p2_full_address_overlap_3_mean,p1_numbers_in_name_overlap_mean,p2_numbers_in_name_overlap_mean,p1_numbers_in_address_overlap_mean,p2_numbers_in_address_overlap_mean,p1_numbers_in_name_address_overlap_mean,p2_numbers_in_name_address_overlap_mean,p1_categories_overlap_mean,p2_categories_overlap_mean,p1_categories_jaccard_mean,p2_categories_jaccard_mean
56955,E_00ad494d262351,E_009746541353af,0,48,42,1052952.0,15.404252,11.123825,0.055542,0.0,0.142822,0.0,0.109131,0.0,0.0,0.0,0.0,0.0,-1.0,-0.75,4420,2362,0.0,0.0,0.0,0.0,0.0,203,203,0.109109,0.0,787980.1,962190.1,0.0,0.0,2917479.75,3857547.5,0.645996,0.619141,10.325733,12.783199,0.0,0.0,38.19865,45.698338,0.75,0.643066,8.363436,10.140498,0.0,0.0,34.089703,43.662174,0.645996,0.619141,0.076355,0.068787,1.0,1.0,0.0,0.0,0.069458,0.047607,0.03125,-0.856934,-0.666504,-0.761719,-0.492188,-0.845215,0.022308,0.054688,-0.011871,0.023911
56956,E_00ae3e9f057471,E_00ac4b34fdb91a,0,52,62,765315.8,10.810604,7.741186,0.119995,0.037048,0.272705,0.090881,0.179321,0.07312,0.017548,0.037048,0.0,-1.0,0.0,-0.5,15658,9682,0.0,0.0,0.0,0.0,0.0,203,203,0.072169,0.0,715499.4,1179377.0,0.0,0.0,3865656.0,3200141.25,0.672852,0.45166,9.091191,15.479317,0.0,0.0,52.578552,45.934872,0.672852,0.483887,7.20481,12.87838,0.0,0.0,43.470119,37.471951,0.672852,0.435547,0.06131,0.099426,1.0,1.0,0.653809,0.54834,0.056854,0.079712,-0.942383,-0.967773,0.01442,-0.002024,-0.557617,-0.566406,0.033142,0.091003,0.013641,0.045288
56957,E_00ae711cd1fd2f,E_00a8fa1aef729d,0,40,50,2623427.0,31.841925,31.185188,0.107117,0.033325,0.214233,0.076904,0.157227,0.060516,0.0,0.0,0.0,-1.0,-0.5,-0.75,8942,28421,-0.5,-0.5,-0.5,-0.5,-0.5,203,203,0.062017,0.0,1006678.0,1225865.0,0.0,0.0,4135626.5,2854045.0,0.850098,0.939941,12.857091,16.111586,0.0,0.0,52.283134,37.26643,0.825195,0.899902,11.394874,13.644604,0.0,0.0,48.858555,34.479118,0.850098,0.939941,0.075684,0.103516,1.0,1.0,0.600098,0.5,0.165161,0.041748,-0.975098,-0.97998,-0.049988,-0.640137,-0.575195,-0.814941,-0.549805,0.031494,-0.549805,0.013489
56958,E_00af3117566d04,E_0046352142627a,0,50,35,4439382.0,57.509979,45.260227,0.146362,0.063843,0.260986,0.125,0.291016,0.120117,0.016953,0.033325,0.0,-1.0,0.666504,-0.5,36853,24839,0.0,0.0,0.0,0.0,0.0,203,203,0.125,0.035737,1541868.0,6792191.0,0.0,0.0,4439382.5,8056841.0,0.97998,0.057129,19.729143,83.452156,0.0,0.0,57.509979,107.888672,0.97998,0.057129,17.058279,70.824364,0.0,0.0,45.260227,87.891052,0.97998,0.057129,0.115295,0.120483,1.0,1.0,0.580078,0.571289,0.117859,0.033813,-0.899902,-0.885742,0.046661,0.164185,-0.553223,-0.524902,0.055359,0.130249,0.030197,0.063965
56959,E_00afb5f1386239,E_002c8a558d29a7,0,54,53,1184769.0,12.532712,10.725442,0.133301,0.06665,0.285645,0.166626,0.209717,0.12915,0.0,0.0,0.083313,-1.0,0.166626,-0.5,19217,16913,0.099976,0.045441,0.0,0.0,0.0,203,203,0.1066,0.0,1116492.0,732753.4,0.0,0.0,6734385.5,2701736.0,0.592773,0.716797,13.832149,9.73243,0.0,0.0,78.953354,38.808781,0.574219,0.660156,11.486208,8.108853,0.0,0.0,68.788025,34.520855,0.574219,0.679199,0.09259,0.040863,1.0,1.0,0.777832,0.868164,0.049225,0.058655,-0.944336,-0.868164,0.080994,-0.062927,-0.553223,-0.57959,0.070984,0.024673,0.040802,0.002167
56960,E_00b07713470949,E_0028326dbe9a92,0,48,82,3323457.0,41.473591,36.59367,0.042542,0.020401,0.111084,0.058838,0.074524,0.042206,0.0,0.0,0.018524,-1.0,0.25,-0.5,32973,4031,0.0,0.0,0.0,0.0,0.0,203,203,0.085749,0.0,1739746.0,1019386.0,0.0,0.0,4170302.75,3458767.75,0.708496,0.963379,21.574541,13.296354,0.0,0.0,54.24152,51.204525,0.708496,0.951172,19.127478,10.408286,0.0,0.0,47.319004,39.142399,0.666504,0.963379,0.081726,0.143433,1.0,1.0,0.458252,0.402344,0.089722,0.006069,-0.9375,-0.938965,0.061096,0.08728,-0.546875,-0.557617,0.004452,0.004883,-0.01915,-0.002289
56961,E_00b17a1f85012a,E_008e3afc8bf638,0,50,72,2222964.0,27.547958,23.416464,0.024384,0.0,0.083313,0.0,0.046234,0.0,0.0,0.0,0.0,-1.0,0.25,-0.5,18033,27303,0.058838,0.014084,0.0,0.0,0.0,203,203,0.051848,0.0,1661215.0,1218812.0,0.0,0.0,3923133.75,3002114.5,0.600098,0.888672,21.009987,16.198492,0.0,0.0,50.691757,42.970654,0.580078,0.805664,18.346455,12.31106,0.0,0.0,44.612965,32.250042,0.600098,0.888672,0.055756,0.095398,1.0,1.0,0.0,0.0,0.062012,0.074585,-0.879883,-0.930664,0.075012,0.12384,-0.527344,-0.536621,0.128296,0.062622,0.074585,0.002865
56962,E_00b1d529cf4c2e,E_0077b19fdc5323,0,54,50,2218938.0,29.18738,22.65111,0.045441,0.0,0.090881,0.0,0.077148,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,20193,30525,0.083313,0.034485,0.0,0.0,0.0,203,203,0.077152,0.0,1543960.0,1559717.0,0.0,0.0,3245728.0,4182150.75,0.722168,0.560059,20.164446,19.992996,0.0,0.0,46.63858,55.629944,0.685059,0.580078,15.179702,17.271589,0.0,0.0,33.077007,48.066502,0.740723,0.540039,0.066101,0.062805,1.0,1.0,0.0,0.0,0.020828,0.083313,-0.907227,-0.899902,-0.638672,-0.660156,-0.796387,-0.805176,0.066772,0.054108,0.028961,0.037201
56963,E_00b2bcd6f9d6c2,E_00798f54783d83,0,56,51,1031121.0,15.234106,11.244662,0.0,0.0,0.0,0.0,0.0,0.0,0.150024,0.346191,0.0,-1.0,0.833496,-0.5,6951,8021,0.0,0.0,0.0,0.0,0.0,203,203,0.0,0.30443,980561.0,1017378.0,0.0,0.0,2695718.25,3243434.5,0.606934,0.60791,13.204518,13.751632,0.0,0.0,39.439117,45.51281,0.643066,0.666504,10.795245,11.246842,0.0,0.0,31.554468,39.98122,0.606934,0.627441,0.017853,0.051544,1.0,1.0,0.0,0.0,0.048187,0.085022,-0.893066,-0.921387,0.017853,0.03772,-0.544434,-0.555664,0.012161,0.131592,0.003963,0.074402
56964,E_00b2e19319858f,E_0034856174f0a2,0,54,52,371441.2,4.987554,3.969028,0.088257,0.027771,0.199951,0.0625,0.186157,0.054565,-0.5,-0.5,-0.25,-1.0,-0.5,-0.75,340,18755,0.0,0.0,0.0,0.0,0.0,203,203,0.108465,0.0,1036016.0,619349.9,0.0,0.0,3677585.25,3352989.25,0.370361,0.461426,13.211926,7.92697,0.0,0.0,53.983692,40.14436,0.370361,0.480713,11.053811,6.328839,0.0,0.0,43.332111,37.190163,0.370361,0.519043,0.082642,0.082947,1.0,1.0,0.574219,0.538574,-0.518555,0.055023,-0.852051,-0.942383,-0.657227,0.073364,-0.791504,-0.560059,0.018524,-0.004128,0.018524,-0.017334


Current split: 2


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,p1,p2,match,count1,count2,haversine,manhattan,euclidian,name_cleaned_jaccard_2,name_cleaned_jaccard_3,name_cleaned_overlap_2,name_cleaned_overlap_3,name_cleaned_cosine_2,name_cleaned_cosine_3,full_address_jaccard_3,full_address_overlap_3,name_address_overlap_3,numbers_in_name_overlap,numbers_in_address_overlap,numbers_in_name_address_overlap,categories1,categories2,categories_overlap,categories_jaccard,main_category_jaccard,main_category_overlap,main_category_cosine,country1,country2,tfidf_trigram_name_cleaned,tfidf_trigram_full_address_cleaned,p1_haversine_mean,p2_haversine_mean,p1_haversine_min,p2_haversine_min,p1_haversine_max,p2_haversine_max,p1_haversine_rank,p2_haversine_rank,p1_manhattan_mean,p2_manhattan_mean,p1_manhattan_min,p2_manhattan_min,p1_manhattan_max,p2_manhattan_max,p1_manhattan_rank,p2_manhattan_rank,p1_euclidian_mean,p2_euclidian_mean,p1_euclidian_min,p2_euclidian_min,p1_euclidian_max,p2_euclidian_max,p1_euclidian_rank,p2_euclidian_rank,p1_name_cleaned_overlap_3_mean,p2_name_cleaned_overlap_3_mean,p1_name_cleaned_overlap_3_max,p2_name_cleaned_overlap_3_max,p1_name_cleaned_overlap_3_rank,p2_name_cleaned_overlap_3_rank,p1_full_address_overlap_3_mean,p2_full_address_overlap_3_mean,p1_numbers_in_name_overlap_mean,p2_numbers_in_name_overlap_mean,p1_numbers_in_address_overlap_mean,p2_numbers_in_address_overlap_mean,p1_numbers_in_name_address_overlap_mean,p2_numbers_in_name_address_overlap_mean,p1_categories_overlap_mean,p2_categories_overlap_mean,p1_categories_jaccard_mean,p2_categories_jaccard_mean
40182,E_003c9b24ce2973,E_0000c362229d93,0,22,36,970143.0,12.07519,8.769527,0.0,0.0,0.0,0.0,0.0,0.0,0.127319,0.25,0.099976,-1.0,-1.0,-1.0,13989,400,0.058838,0.017853,0.0,0.0,0.0,154,154,0.0,0.201319,222508.03125,800733.75,0.0,0.0,970143.0,1988267.375,0.95459,0.638672,2.756463,9.334436,0.0,0.0,12.07519,24.74824,0.95459,0.75,2.014139,7.227987,0.0,0.0,8.769527,17.905766,0.95459,0.638672,0.054291,0.052094,1.0,1.0,0.0,0.0,0.065063,-0.063538,-0.95459,-0.944336,-0.931641,-0.9375,-0.96582,-0.965332,0.105774,0.047211,0.065247,-0.002655
40183,E_0052155ba319e3,E_0000c362229d93,0,22,36,969911.8,12.058362,8.766851,0.0,0.0,0.0,0.0,0.0,0.0,0.07019,0.142822,0.0,-1.0,-0.25,-0.75,14542,400,0.058838,0.025635,0.0,0.0,0.0,154,154,0.0,0.111499,221515.90625,800733.75,0.0,0.0,969911.8,1988267.375,0.95459,0.583496,2.738453,9.334436,0.0,0.0,12.058362,24.74824,0.95459,0.666504,2.004774,7.227987,0.0,0.0,8.766851,17.905766,0.95459,0.583496,0.045441,0.052094,1.0,1.0,0.0,0.0,0.150757,-0.063538,-0.95459,-0.944336,-0.227295,-0.9375,-0.727051,-0.965332,0.09729,0.047211,0.069275,-0.002655
40184,E_008f3d80b3e684,E_0000c362229d93,0,22,36,977043.3,12.12818,8.830776,0.0,0.0,0.0,0.0,0.0,0.0,0.07019,0.142822,0.0,-1.0,-1.0,-1.0,34304,400,0.058838,0.019608,0.0,0.0,0.0,154,154,0.0,0.172559,224813.515625,800733.75,0.0,0.0,977043.3,1988267.375,0.95459,0.805664,2.770115,9.334436,0.0,0.0,12.12818,24.74824,0.95459,0.861328,2.034374,7.227987,0.0,0.0,8.830776,17.905766,0.95459,0.805664,0.059174,0.052094,1.0,1.0,0.0,0.0,0.091187,-0.063538,-0.95459,-0.944336,-0.931641,-0.9375,-0.96582,-0.965332,0.139526,0.047211,0.084839,-0.002655
40185,E_00ac077d3229b5,E_0000c362229d93,0,22,36,976338.8,12.104132,8.823775,0.071411,0.0,0.222168,0.0,0.116028,0.0,0.072449,0.178589,0.0,-1.0,-1.0,-1.0,28492,400,0.176514,0.063843,0.0,0.0,0.0,154,154,0.0,0.206157,225872.78125,800733.75,0.0,0.0,976338.8,1988267.375,0.95459,0.777832,2.784235,9.334436,0.0,0.0,12.104132,24.74824,0.95459,0.805664,2.043971,7.227987,0.0,0.0,8.823775,17.905766,0.95459,0.777832,0.058167,0.052094,1.0,1.0,0.0,0.0,0.121765,-0.063538,-0.95459,-0.944336,-0.931641,-0.9375,-0.96582,-0.965332,0.180908,0.047211,0.108704,-0.002655
40186,E_0000c362229d93,E_002aca3e798da9,0,32,33,1053237.0,13.11724,9.523973,0.0,0.0,0.0,0.0,0.0,0.0,0.114258,0.363525,0.0,-1.0,-1.0,-1.0,400,13088,0.0,0.0,0.0,0.0,0.0,154,154,0.0,0.193122,739323.5,551163.5625,0.0,0.0,1082590.0,1169185.875,0.9375,0.757812,8.728539,6.642144,0.0,0.0,13.420397,15.248392,0.9375,0.757812,6.672533,5.02071,0.0,0.0,9.787804,10.790494,0.9375,0.666504,0.058594,0.047028,1.0,1.0,0.0,0.0,-0.040222,0.081848,-0.9375,-0.969727,-0.953125,-0.79541,-0.96875,-0.924316,0.047913,0.195801,-0.005302,0.135498
40187,E_003c9b24ce2973,E_0064cb9782f4ee,0,22,37,963295.3,11.514804,8.68973,0.040009,0.0,0.166626,0.0,0.085144,0.0,-0.5,-0.5,-0.25,-1.0,-1.0,-1.0,13989,15645,0.0,0.0,0.0,0.0,0.0,154,154,0.0,0.0,222508.03125,784456.9375,0.0,0.0,970143.0,2058430.375,0.90918,0.648438,2.756463,8.757782,0.0,0.0,12.07519,25.308624,0.90918,0.756836,2.014139,7.072774,0.0,0.0,8.769527,18.535679,0.90918,0.648438,0.054291,0.027023,1.0,1.0,0.0,0.0,0.065063,-0.675781,-0.95459,-0.945801,-0.931641,-0.938965,-0.96582,-0.966309,0.105774,0.0,0.065247,0.0
40188,E_0000c362229d93,E_0033ab0914c63f,0,32,33,1082590.0,13.420397,9.787804,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,-0.5,-0.220581,-1.0,-1.0,-1.0,400,7866,0.0,0.0,0.0,0.0,0.0,154,154,0.0,0.0,739323.5,559262.0,0.0,0.0,1082590.0,1143369.125,0.96875,0.90918,8.728539,6.721249,0.0,0.0,13.420397,14.945235,0.96875,0.878906,6.672533,5.094511,0.0,0.0,9.787804,10.569527,0.96875,0.939453,0.058594,0.055878,1.0,1.0,0.0,0.0,-0.040222,-0.605957,-0.9375,-0.969727,-0.953125,-0.79541,-0.96875,-0.924316,0.047913,0.09314,-0.005302,0.057648
40189,E_0009fe536d869a,E_0033ab0914c63f,0,31,33,899263.2,10.90461,8.123076,0.034485,0.0,0.083313,0.0,0.068054,0.0,-0.5,-0.5,-0.25,-1.0,-1.0,-1.0,28062,7866,0.0,0.0,0.0,0.0,0.0,154,154,0.0,0.093659,666244.0625,559262.0,0.0,0.0,899263.2,1143369.125,0.967773,0.575684,7.532057,6.721249,0.0,0.0,10.90461,14.945235,0.967773,0.696777,6.007747,5.094511,0.0,0.0,8.123076,10.569527,0.967773,0.575684,0.041046,0.055878,1.0,1.0,0.0,0.0,-5.1e-05,-0.605957,-0.935547,-0.969727,-0.95166,-0.79541,-0.967773,-0.924316,0.001461,0.09314,-0.010986,0.057648
40190,E_002aca3e798da9,E_0000c362229d93,0,22,36,1053237.0,13.11724,9.523973,0.0,0.0,0.0,0.0,0.0,0.0,0.114258,0.363525,0.0,-1.0,-1.0,-1.0,13088,400,0.0,0.0,0.0,0.0,0.0,154,154,0.0,0.193122,285813.0625,800733.75,0.0,0.0,1053237.0,1988267.375,0.95459,0.833496,3.561,9.334436,0.0,0.0,13.11724,24.74824,0.95459,0.916504,2.589874,7.227987,0.0,0.0,9.523973,17.905766,0.95459,0.833496,0.063538,0.052094,1.0,1.0,0.0,0.0,0.190918,-0.063538,-0.95459,-0.944336,-0.931641,-0.9375,-0.96582,-0.965332,0.15271,0.047211,0.10675,-0.002655
40191,E_0033ab0914c63f,E_0000c362229d93,0,22,36,1082590.0,13.420397,9.787804,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,-0.5,-0.220581,-1.0,-1.0,-1.0,7866,400,0.0,0.0,0.0,0.0,0.0,154,154,0.0,0.0,312659.21875,800733.75,0.0,0.0,1082590.0,1988267.375,0.95459,0.944336,3.836597,9.334436,0.0,0.0,13.420397,24.74824,0.95459,0.944336,2.829438,7.227987,0.0,0.0,9.787804,17.905766,0.95459,0.944336,0.06665,0.052094,1.0,1.0,0.0,0.0,-0.59082,-0.063538,-0.95459,-0.944336,-0.931641,-0.9375,-0.96582,-0.965332,0.086182,0.047211,0.06543,-0.002655


Current split: 3


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,p1,p2,match,count1,count2,haversine,manhattan,euclidian,name_cleaned_jaccard_2,name_cleaned_jaccard_3,name_cleaned_overlap_2,name_cleaned_overlap_3,name_cleaned_cosine_2,name_cleaned_cosine_3,full_address_jaccard_3,full_address_overlap_3,name_address_overlap_3,numbers_in_name_overlap,numbers_in_address_overlap,numbers_in_name_address_overlap,categories1,categories2,categories_overlap,categories_jaccard,main_category_jaccard,main_category_overlap,main_category_cosine,country1,country2,tfidf_trigram_name_cleaned,tfidf_trigram_full_address_cleaned,p1_haversine_mean,p2_haversine_mean,p1_haversine_min,p2_haversine_min,p1_haversine_max,p2_haversine_max,p1_haversine_rank,p2_haversine_rank,p1_manhattan_mean,p2_manhattan_mean,p1_manhattan_min,p2_manhattan_min,p1_manhattan_max,p2_manhattan_max,p1_manhattan_rank,p2_manhattan_rank,p1_euclidian_mean,p2_euclidian_mean,p1_euclidian_min,p2_euclidian_min,p1_euclidian_max,p2_euclidian_max,p1_euclidian_rank,p2_euclidian_rank,p1_name_cleaned_overlap_3_mean,p2_name_cleaned_overlap_3_mean,p1_name_cleaned_overlap_3_max,p2_name_cleaned_overlap_3_max,p1_name_cleaned_overlap_3_rank,p2_name_cleaned_overlap_3_rank,p1_full_address_overlap_3_mean,p2_full_address_overlap_3_mean,p1_numbers_in_name_overlap_mean,p2_numbers_in_name_overlap_mean,p1_numbers_in_address_overlap_mean,p2_numbers_in_address_overlap_mean,p1_numbers_in_name_address_overlap_mean,p2_numbers_in_name_address_overlap_mean,p1_categories_overlap_mean,p2_categories_overlap_mean,p1_categories_jaccard_mean,p2_categories_jaccard_mean
46605,E_00ae7bdcdb8492,E_008b2c82919464,0,42,60,525292.3125,6.299531,4.855895,0.033325,0.0,0.199951,0.0,0.087708,0.0,-0.5,-0.5,-0.22998,-1.0,-1.0,-1.0,28965,21673,0.0,0.0,0.0,0.0,0.0,197,197,0.083333,0.0,359657.84375,558111.25,0.0,0.0,725965.9,915235.8,0.571289,0.383301,4.753778,7.562459,0.0,0.0,10.618898,13.35422,0.547852,0.31665,3.816942,6.163883,0.0,0.0,7.677285,10.035568,0.547852,0.31665,0.10321,0.037506,1.0,1.0,0.0,0.0,-0.761719,-0.113281,-0.952148,-0.916504,-0.833496,-0.783203,-0.893066,-0.850098,0.0,0.02037,-0.008507,0.006096
46606,E_00aedd095d1fa6,E_0033c17c1e2697,0,45,61,473673.3125,6.794219,5.300899,0.074097,0.0,0.181763,0.0,0.197388,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,8942,12601,-0.5,-0.5,-1.0,-1.0,-1.0,197,197,0.071067,0.0,250766.828125,324791.84375,0.0,0.0,1192485.0,634681.9,0.844238,0.688477,3.225279,4.291173,0.0,0.0,14.459317,7.98646,0.844238,0.787109,2.700088,3.50322,0.0,0.0,13.998858,7.293589,0.844238,0.688477,0.086365,0.056396,1.0,1.0,0.0,0.0,-0.142334,-0.122925,-0.888672,-0.950684,-0.799805,-0.787109,-0.844238,-0.868652,-0.544434,-0.004917,-0.544434,-0.007332
46607,E_00b1c7ef7a7dcb,E_008cb5903d4983,0,43,80,331531.0,4.166296,3.891135,0.043488,0.020004,0.25,0.142822,0.086365,0.050507,-0.5,-0.5,-0.25,0.0,-1.0,-0.5,33707,24273,0.0,0.0,0.0,0.0,0.0,197,197,0.044947,0.0,233535.890625,344509.15625,0.0,0.0,1047420.0,871711.8,0.697754,0.387451,2.992367,4.669042,0.0,0.0,12.655982,10.456594,0.697754,0.3125,2.458076,3.789592,0.0,0.0,12.317155,10.213594,0.744141,0.487549,0.09967,0.060577,1.0,1.0,0.581543,0.825195,-0.213623,-0.693848,-0.813965,-0.03125,-0.906738,-0.774902,-0.860352,-0.490723,0.072083,0.059784,0.059113,0.009682
46608,E_00b2226147b836,E_00a026cc58982a,0,47,65,382565.84375,5.495873,3.910493,0.045441,0.0,0.199951,0.0,0.101257,0.0,-0.5,-0.5,-0.25,-1.0,-1.0,-1.0,13989,37827,0.0,0.0,0.0,0.0,0.0,197,197,0.054233,0.0,409669.75,682285.375,0.0,0.0,920665.9,1088486.0,0.468018,0.215332,5.435914,8.700117,0.0,0.0,13.437477,13.601101,0.510742,0.215332,4.349356,7.772122,0.0,0.0,9.902598,12.518088,0.425537,0.199951,0.068542,0.077209,1.0,1.0,0.0,0.0,-0.712891,-0.174194,-0.978516,-0.89209,-0.808594,-0.784668,-0.893555,-0.838379,0.197754,0.007858,0.147827,-0.012749
46609,E_00b274de809ffe,E_008fc872dc83f4,0,47,71,238437.203125,3.544512,2.597475,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,-0.5,-0.25,-1.0,-1.0,-1.0,13989,10902,0.0,0.0,0.0,0.0,0.0,197,197,0.0,0.0,367766.15625,319876.6875,0.0,0.0,1421394.0,1110289.0,0.510742,0.408447,4.99066,4.272648,0.0,0.0,19.495302,13.195008,0.531738,0.422607,3.990152,3.461342,0.0,0.0,16.305475,13.009437,0.510742,0.366211,0.068542,0.041779,1.0,1.0,0.0,0.0,-0.154297,-0.683105,-0.893555,-0.915527,-0.808594,-0.732422,-0.851074,-0.82373,0.156494,0.035217,0.114258,0.012405
46612,E_005fe2d7753f16,E_008cb00209a061,0,25,1,667504.5,9.080613,6.563042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-0.5,0.0,10902,33402,0.0,0.0,-0.5,-0.5,-0.5,201,201,0.0,0.0,552962.25,667504.5,0.0,667504.5,983085.5,667504.5,0.879883,0.0,6.890294,9.080613,0.0,9.080613,15.429142,9.080613,0.879883,0.0,5.451627,6.563042,0.0,6.563042,12.336339,6.563042,0.879883,0.0,0.057129,0.0,1.0,0.0,0.0,0.0,-0.134766,0.0,-0.919922,-1.0,-0.140015,-0.5,-0.360107,0.0,0.180054,0.0,0.109375,0.0
46614,E_005fe2d7753f16,E_00037fb71d569d,0,25,1,827325.5,13.051941,10.242278,0.105286,0.0,0.25,0.0,0.176758,0.0,0.0,0.0,0.0,0.0,0.0,-0.75,10902,17367,0.0,0.0,-0.5,-0.5,-0.5,201,201,0.0,0.0,552962.25,827325.5,0.0,827325.5,983085.5,827325.5,0.919922,0.0,6.890294,13.051941,0.0,13.051941,15.429142,13.051941,0.919922,0.0,5.451627,10.242278,0.0,10.242278,12.336339,10.242278,0.919922,0.0,0.057129,0.0,1.0,0.0,0.0,0.0,-0.134766,0.0,-0.919922,0.0,-0.140015,0.0,-0.360107,-0.75,0.180054,0.0,0.109375,0.0
46616,E_005fe2d7753f16,E_00888b74fcf94c,0,25,1,664448.5625,9.050233,6.537619,0.0,0.0,0.0,0.0,0.0,0.0,0.022217,0.06665,0.0,-1.0,-0.5,0.0,10902,8021,0.0,0.0,0.0,0.0,0.0,201,201,0.0,0.045644,552962.25,664448.5625,0.0,664448.5625,983085.5,664448.6,0.839844,0.0,6.890294,9.050233,0.0,9.050233,15.429142,9.050233,0.839844,0.0,5.451627,6.537619,0.0,6.537619,12.336339,6.537619,0.839844,0.0,0.057129,0.0,1.0,0.0,0.0,0.0,-0.134766,0.06665,-0.919922,-1.0,-0.140015,-0.5,-0.360107,0.0,0.180054,0.0,0.109375,0.0
46619,E_005fe2d7753f16,E_0029eb064621fa,0,25,1,656187.4375,8.874269,6.429547,0.090881,0.0,0.25,0.0,0.162231,0.0,0.0,0.0,0.03125,-1.0,0.0,-0.5,10902,33383,1.0,0.090881,0.0,0.0,0.0,201,201,0.0,0.0,552962.25,656187.4375,0.0,656187.4375,983085.5,656187.4,0.799805,0.0,6.890294,8.874269,0.0,8.874269,15.429142,8.874269,0.799805,0.0,5.451627,6.429547,0.0,6.429547,12.336339,6.429547,0.799805,0.0,0.057129,0.0,1.0,0.0,0.0,0.0,-0.134766,0.0,-0.919922,-1.0,-0.140015,0.0,-0.360107,-0.5,0.180054,1.0,0.109375,0.090881
46629,E_005fe2d7753f16,E_008bbf6d445cda,0,25,1,983085.5,15.429142,12.336339,0.0,0.0,0.0,0.0,0.0,0.0,0.014496,0.032257,0.0,-1.0,-0.5,0.0,10902,25637,0.0,0.0,0.0,0.0,0.0,201,201,0.0,0.030429,552962.25,983085.5,0.0,983085.5,983085.5,983085.5,0.959961,0.0,6.890294,15.429142,0.0,15.429142,15.429142,15.429142,0.959961,0.0,5.451627,12.336339,0.0,12.336339,12.336339,12.336339,0.959961,0.0,0.057129,0.0,1.0,0.0,0.0,0.0,-0.134766,0.032257,-0.919922,-1.0,-0.140015,-0.5,-0.360107,0.0,0.180054,0.0,0.109375,0.0


Current split: 4


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,p1,p2,match,count1,count2,haversine,manhattan,euclidian,name_cleaned_jaccard_2,name_cleaned_jaccard_3,name_cleaned_overlap_2,name_cleaned_overlap_3,name_cleaned_cosine_2,name_cleaned_cosine_3,full_address_jaccard_3,full_address_overlap_3,name_address_overlap_3,numbers_in_name_overlap,numbers_in_address_overlap,numbers_in_name_address_overlap,categories1,categories2,categories_overlap,categories_jaccard,main_category_jaccard,main_category_overlap,main_category_cosine,country1,country2,tfidf_trigram_name_cleaned,tfidf_trigram_full_address_cleaned,p1_haversine_mean,p2_haversine_mean,p1_haversine_min,p2_haversine_min,p1_haversine_max,p2_haversine_max,p1_haversine_rank,p2_haversine_rank,p1_manhattan_mean,p2_manhattan_mean,p1_manhattan_min,p2_manhattan_min,p1_manhattan_max,p2_manhattan_max,p1_manhattan_rank,p2_manhattan_rank,p1_euclidian_mean,p2_euclidian_mean,p1_euclidian_min,p2_euclidian_min,p1_euclidian_max,p2_euclidian_max,p1_euclidian_rank,p2_euclidian_rank,p1_name_cleaned_overlap_3_mean,p2_name_cleaned_overlap_3_mean,p1_name_cleaned_overlap_3_max,p2_name_cleaned_overlap_3_max,p1_name_cleaned_overlap_3_rank,p2_name_cleaned_overlap_3_rank,p1_full_address_overlap_3_mean,p2_full_address_overlap_3_mean,p1_numbers_in_name_overlap_mean,p2_numbers_in_name_overlap_mean,p1_numbers_in_address_overlap_mean,p2_numbers_in_address_overlap_mean,p1_numbers_in_name_address_overlap_mean,p2_numbers_in_name_address_overlap_mean,p1_categories_overlap_mean,p2_categories_overlap_mean,p1_categories_jaccard_mean,p2_categories_jaccard_mean
38633,E_00168320fa2d3c,E_00898564f58a4e,0,34,2,324317.9375,4.469913,3.167872,0.135132,0.022217,0.357178,0.0625,0.333984,0.045654,0.071411,0.230713,0.132202,-1.0,-0.5,-0.75,21031,28062,0.0,0.0,0.0,0.0,0.0,108,108,0.043033,0.130946,280610.5625,321104.6875,0.0,317891.40625,349626.46875,324317.9375,0.764648,0.5,3.852715,4.424686,0.0,4.379459,4.872205,4.469913,0.706055,0.5,2.748104,3.136021,0.0,3.104169,3.446246,3.167872,0.706055,0.5,0.080383,0.0979,1.0,0.133301,0.558594,0.0,-0.010208,0.189453,-1.0,-1.0,-0.706055,-0.25,-0.853027,-0.625,-0.014709,0.045441,-0.014709,0.019226
38634,E_003b64b99aef7e,E_009b53a90a7537,0,35,2,322128.25,4.449486,3.151862,0.130493,0.0,0.272705,0.0,0.28418,0.0,0.065552,0.148193,0.083313,-1.0,0.25,-0.5,13673,9346,1.0,0.592773,1.0,1.0,1.0,108,108,0.0,0.155043,278664.90625,325344.4375,0.0,322128.25,343177.53125,328560.625,0.885742,0.0,3.82595,4.494712,0.0,4.449486,4.781752,4.539939,0.885742,0.0,2.728407,3.18374,0.0,3.151862,3.382347,3.215618,0.885742,0.0,0.061066,0.0,1.0,0.0,0.0,0.0,-0.064758,0.112549,-1.0,-1.0,-0.049988,-0.125,-0.600098,-0.625,0.063293,0.5,0.032532,0.296387
38639,E_003b64b99aef7e,E_0097337df549cd,0,35,1,316637.6875,4.363177,3.092434,0.038452,0.0,0.083313,0.0,0.060852,0.0,0.065552,0.148193,0.06665,-1.0,0.25,-0.5,13673,33707,0.0,0.0,0.0,0.0,0.0,108,108,0.0,0.140083,278664.90625,316637.6875,0.0,316637.6875,343177.53125,316637.6875,0.742676,0.0,3.82595,4.363177,0.0,4.363177,4.781752,4.363177,0.657227,0.0,2.728407,3.092434,0.0,3.092434,3.382347,3.092434,0.685547,0.0,0.061066,0.0,1.0,0.0,0.0,0.0,-0.064758,0.148193,-1.0,-1.0,-0.049988,0.25,-0.600098,-0.5,0.063293,0.0,0.032532,0.0
38641,E_003b64b99aef7e,E_001293c29ebe40,0,35,3,322425.5,4.456722,3.156497,0.114258,0.0,0.266602,0.0,0.226807,0.0,0.048401,0.111084,0.033325,-1.0,0.25,-0.5,13673,2869,0.625,0.333252,1.0,1.0,1.0,108,108,0.0,0.12555,278664.90625,503719.28125,0.0,322425.5,343177.53125,859872.875,0.914062,0.0,3.82595,6.928285,0.0,4.456722,4.781752,11.780957,0.914062,0.0,2.728407,4.907486,0.0,3.156497,3.382347,8.345695,0.914062,0.0,0.061066,0.020828,1.0,0.0625,0.0,0.0,-0.064758,0.071411,-1.0,-1.0,-0.049988,-0.041656,-0.600098,-0.583496,0.063293,0.25,0.032532,0.121826
38644,E_00168320fa2d3c,E_00aa5d03f59f7d,0,34,3,327847.8125,4.533175,3.210342,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,-0.5,-0.25,-1.0,-1.0,-1.0,21031,1876,0.0,0.0,0.0,0.0,0.0,108,108,0.0,0.0,280610.5625,502941.59375,0.0,321413.8125,349626.46875,859563.1875,0.82373,0.333252,3.852715,6.918611,0.0,4.442722,4.872205,11.779936,0.82373,0.333252,2.748104,4.900432,0.0,3.146572,3.446246,8.344381,0.82373,0.333252,0.080383,0.0,1.0,0.0,0.0,0.0,-0.010208,-0.5,-1.0,-1.0,-0.706055,-0.666504,-0.853027,-0.833496,-0.014709,0.291748,-0.014709,0.204346
38645,E_003b64b99aef7e,E_00aa5d03f59f7d,0,35,3,321413.8125,4.442722,3.146572,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,-0.5,-0.25,-1.0,-0.5,-0.75,13673,1876,0.75,0.571289,1.0,1.0,1.0,108,108,0.0,0.113228,278664.90625,502941.59375,0.0,321413.8125,343177.53125,859563.1875,0.828613,0.0,3.82595,6.918611,0.0,4.442722,4.781752,11.779936,0.828613,0.0,2.728407,4.900432,0.0,3.146572,3.382347,8.344381,0.828613,0.0,0.061066,0.0,1.0,0.0,0.0,0.0,-0.064758,-0.5,-1.0,-1.0,-0.049988,-0.666504,-0.600098,-0.833496,0.063293,0.291748,0.032532,0.204346
38648,E_00168320fa2d3c,E_009b53a90a7537,0,34,2,328560.625,4.539939,3.215618,0.041656,0.0,0.090881,0.0,0.125732,0.0,0.020004,0.076904,0.175049,-1.0,-0.5,-0.75,21031,9346,0.0,0.0,0.0,0.0,0.0,108,108,0.0,0.074536,280610.5625,325344.4375,0.0,322128.25,349626.46875,328560.625,0.882324,0.5,3.852715,4.494712,0.0,4.449486,4.872205,4.539939,0.882324,0.5,2.748104,3.18374,0.0,3.151862,3.446246,3.215618,0.882324,0.5,0.080383,0.0,1.0,0.0,0.0,0.0,-0.010208,0.112549,-1.0,-1.0,-0.706055,-0.125,-0.853027,-0.625,-0.014709,0.5,-0.014709,0.296387
38649,E_00168320fa2d3c,E_009cba12e4ec83,0,34,2,320990.03125,4.419371,3.132855,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.307617,0.125,-1.0,-0.5,-0.75,21031,33707,0.0,0.0,0.0,0.0,0.0,108,108,0.0,0.247594,280610.5625,317777.9375,0.0,314565.84375,349626.46875,320990.03125,0.646973,0.5,3.852715,4.374145,0.0,4.328918,4.872205,4.419371,0.646973,0.5,2.748104,3.101016,0.0,3.069176,3.446246,3.132855,0.646973,0.5,0.080383,0.0,1.0,0.0,0.0,0.0,-0.010208,0.209351,-1.0,-1.0,-0.706055,-0.125,-0.853027,-0.625,-0.014709,0.0,-0.014709,0.0
38661,E_00168320fa2d3c,E_0009f564553b2c,0,34,2,318597.21875,4.39926,3.11634,0.043488,0.0,0.099976,0.0,0.065918,0.0,0.024994,0.076904,0.09375,-1.0,-0.5,-0.75,21031,35474,0.0,0.0,0.0,0.0,0.0,108,108,0.0,0.125988,280610.5625,315381.5,0.0,312165.75,349626.46875,318597.21875,0.588379,0.5,3.852715,4.354033,0.0,4.308807,4.872205,4.39926,0.588379,0.5,2.748104,3.084467,0.0,3.052595,3.446246,3.11634,0.588379,0.5,0.080383,0.0,1.0,0.0,0.0,0.0,-0.010208,0.112549,-1.0,-1.0,-0.706055,-0.125,-0.853027,-0.625,-0.014709,0.0,-0.014709,0.0
38667,E_003b64b99aef7e,E_009cba12e4ec83,0,35,2,314565.84375,4.328918,3.069176,0.0,0.0,0.0,0.0,0.0,0.0,0.037964,0.111084,0.06665,-1.0,0.25,-0.5,13673,33707,0.0,0.0,0.0,0.0,0.0,108,108,0.0,0.077254,278664.90625,317777.9375,0.0,314565.84375,343177.53125,320990.03125,0.628418,0.0,3.82595,4.374145,0.0,4.328918,4.781752,4.419371,0.628418,0.0,2.728407,3.101016,0.0,3.069176,3.382347,3.132855,0.628418,0.0,0.061066,0.0,1.0,0.0,0.0,0.0,-0.064758,0.209351,-1.0,-1.0,-0.049988,-0.125,-0.600098,-0.625,0.063293,0.0,0.032532,0.0


Current split: 5


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,p1,p2,match,count1,count2,haversine,manhattan,euclidian,name_cleaned_jaccard_2,name_cleaned_jaccard_3,name_cleaned_overlap_2,name_cleaned_overlap_3,name_cleaned_cosine_2,name_cleaned_cosine_3,full_address_jaccard_3,full_address_overlap_3,name_address_overlap_3,numbers_in_name_overlap,numbers_in_address_overlap,numbers_in_name_address_overlap,categories1,categories2,categories_overlap,categories_jaccard,main_category_jaccard,main_category_overlap,main_category_cosine,country1,country2,tfidf_trigram_name_cleaned,tfidf_trigram_full_address_cleaned,p1_haversine_mean,p2_haversine_mean,p1_haversine_min,p2_haversine_min,p1_haversine_max,p2_haversine_max,p1_haversine_rank,p2_haversine_rank,p1_manhattan_mean,p2_manhattan_mean,p1_manhattan_min,p2_manhattan_min,p1_manhattan_max,p2_manhattan_max,p1_manhattan_rank,p2_manhattan_rank,p1_euclidian_mean,p2_euclidian_mean,p1_euclidian_min,p2_euclidian_min,p1_euclidian_max,p2_euclidian_max,p1_euclidian_rank,p2_euclidian_rank,p1_name_cleaned_overlap_3_mean,p2_name_cleaned_overlap_3_mean,p1_name_cleaned_overlap_3_max,p2_name_cleaned_overlap_3_max,p1_name_cleaned_overlap_3_rank,p2_name_cleaned_overlap_3_rank,p1_full_address_overlap_3_mean,p2_full_address_overlap_3_mean,p1_numbers_in_name_overlap_mean,p2_numbers_in_name_overlap_mean,p1_numbers_in_address_overlap_mean,p2_numbers_in_address_overlap_mean,p1_numbers_in_name_address_overlap_mean,p2_numbers_in_name_address_overlap_mean,p1_categories_overlap_mean,p2_categories_overlap_mean,p1_categories_jaccard_mean,p2_categories_jaccard_mean
41874,E_00557e6ea7ef10,E_009571bd2af109,0,25,29,20579.771484,0.225122,0.185125,0.060608,0.0,0.181763,0.0,0.102844,0.0,0.166626,0.3479,0.0,0.0,0.0,0.0,15722,8942,-0.5,-0.5,-0.5,-0.5,-0.5,173,173,0.0,0.485424,11454.974609,14904.625,0.0,0.0,29364.408203,49341.417969,0.799805,0.724121,0.121555,0.166131,0.0,0.0,0.26958,0.615399,0.879883,0.758789,0.103042,0.134069,0.0,0.0,0.264153,0.443833,0.799805,0.724121,0.065735,0.053284,1.0,1.0,0.0,0.0,0.0625,0.044495,0.099976,-0.482666,0.080017,-0.620605,0.099976,-0.551758,-0.039368,-0.603516,-0.067749,-0.603516
41875,E_005d9ce17402e9,E_007e9bf7c37076,0,25,29,29359.105469,0.273671,0.264105,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-0.5,-0.5,-1.0,0.0,8942,22564,-0.5,-0.5,-1.0,-1.0,-1.0,173,173,0.0,1.0,11579.149414,20246.490234,0.0,0.0,29359.105469,39051.28125,0.959961,0.758789,0.124467,0.209521,0.0,0.0,0.273671,0.490438,0.959961,0.792969,0.104158,0.182126,0.0,0.0,0.264105,0.35127,0.959961,0.758789,0.077881,0.034485,1.0,1.0,0.0,0.0,-0.680176,-0.672363,-0.140015,-0.017242,-0.560059,-0.620605,-0.26001,-0.241333,-0.620117,0.034485,-0.620117,0.034485
41876,E_0012f150817ad6,E_00b3492b7d97fd,0,25,30,20856.595703,0.213676,0.187616,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,-0.5,-0.223633,0.0,0.0,-0.75,8942,15645,-0.5,-0.5,-0.5,-0.5,-0.5,173,173,0.0,0.0,10366.357422,16040.045898,0.0,0.0,27461.892578,45108.53125,0.879883,0.666504,0.11061,0.17116,0.0,0.0,0.282896,0.559657,0.839844,0.700195,0.093248,0.144285,0.0,0.0,0.247035,0.405762,0.879883,0.666504,0.060699,0.033325,1.0,1.0,0.0,0.0,-0.680176,-0.097839,-0.199951,-0.5,-0.560059,0.133301,-0.27002,-0.258301,-0.620117,-0.083313,-0.620117,-0.083313
41877,E_004575fae36c26,E_00b3492b7d97fd,0,25,30,18633.369141,0.192428,0.167617,0.0,0.0,0.0,0.0,0.0,0.0,0.049988,0.105286,0.026321,0.0,0.5,0.25,37661,15645,0.0,0.0,0.0,0.0,0.0,173,173,0.0,0.100504,9190.194336,16040.045898,0.0,0.0,25240.115234,45108.53125,0.879883,0.633301,0.101223,0.17116,0.0,0.0,0.261648,0.559657,0.839844,0.633301,0.082668,0.144285,0.0,0.0,0.227048,0.405762,0.879883,0.633301,0.058289,0.033325,1.0,1.0,0.0,0.0,0.067322,-0.097839,0.140015,-0.5,0.059998,0.133301,0.090027,-0.258301,-0.048004,-0.083313,-0.07135,-0.083313
41878,E_0089f521a38b30,E_004128aeb5bccc,0,25,25,14065.469727,0.158606,0.126524,0.0,0.0,0.0,0.0,0.0,0.0,0.275879,0.444336,0.0,-1.0,0.0,-0.5,8942,30525,-0.5,-0.5,-0.5,-0.5,-0.5,173,173,0.0,0.527046,8073.390137,12099.527344,0.0,0.0,17354.734375,30238.373047,0.879883,0.680176,0.089407,0.126817,0.0,0.0,0.203804,0.272612,0.839844,0.680176,0.072621,0.108839,0.0,0.0,0.156114,0.272015,0.879883,0.680176,0.040009,0.072021,1.0,1.0,0.0,0.0,0.107239,0.086243,-0.399902,-0.439941,-0.560059,0.059998,-0.47998,-0.290039,-0.620117,-0.020676,-0.620117,-0.044556
41879,E_008cb1db4871e7,E_00b3492b7d97fd,0,25,30,23904.330078,0.216368,0.215035,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,-0.5,-0.144775,0.0,0.0,-0.25,22564,15645,0.0,0.0,-0.5,-0.5,-0.5,173,173,0.0,0.0,12058.317383,16040.045898,0.0,0.0,30420.363281,45108.53125,0.879883,0.866699,0.12298,0.17116,0.0,0.0,0.282913,0.559657,0.799805,0.733398,0.10847,0.144285,0.0,0.0,0.273651,0.405762,0.879883,0.866699,0.062439,0.033325,1.0,1.0,0.0,0.0,-0.680176,-0.097839,0.080017,-0.5,-0.560059,0.133301,-0.180054,-0.258301,0.040009,-0.083313,0.040009,-0.083313
41880,E_00b3492b7d97fd,E_004128aeb5bccc,0,27,25,23766.751953,0.223719,0.213797,0.0,0.0,0.0,0.0,0.0,0.0,0.055542,0.105286,0.083313,-1.0,0.0,0.0,15645,30525,0.0,0.0,0.0,0.0,0.0,173,173,0.0,0.105409,13768.893555,12099.527344,0.0,0.0,25127.029297,30238.373047,0.888672,0.879883,0.140881,0.126817,0.0,0.0,0.268917,0.272612,0.888672,0.799805,0.123856,0.108839,0.0,0.0,0.226033,0.272015,0.888672,0.879883,0.037048,0.072021,1.0,1.0,0.0,0.0,-0.092102,0.086243,-0.444336,-0.439941,0.111084,0.059998,-0.25,-0.290039,-0.074097,-0.020676,-0.074097,-0.044556
41881,E_0012f150817ad6,E_0014138afddd16,0,25,30,20630.845703,0.223488,0.185584,0.061218,0.0,0.150024,0.0,0.127808,0.0,-1.0,-1.0,-0.5,0.0,-1.0,0.0,8942,11830,-0.5,-0.5,-0.5,-0.5,-0.5,173,173,0.0,1.0,10366.357422,15737.03418,0.0,0.0,27461.892578,44729.105469,0.839844,0.733398,0.11061,0.170314,0.0,0.0,0.282896,0.549845,0.879883,0.799805,0.093248,0.141559,0.0,0.0,0.247035,0.402354,0.839844,0.733398,0.060699,0.075317,1.0,1.0,0.0,0.0,-0.680176,-0.666504,-0.199951,0.18335,-0.560059,-0.600098,-0.27002,-0.19165,-0.620117,-0.083313,-0.620117,-0.083313
46611,E_0037667995bfcd,E_00037fb71d569d,0,22,1,664901.75,9.766632,9.030518,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,-0.5,-0.25,0.0,-1.0,-0.5,21706,17367,0.0,0.0,-0.5,-0.5,-0.5,201,201,0.0,0.0,577095.625,664901.75,0.0,664901.75,814368.875,664901.75,0.5,0.0,8.303601,9.766632,0.0,9.766632,11.628505,9.766632,0.54541,0.0,6.333509,9.030518,0.0,9.030518,11.129009,9.030518,0.90918,0.0,0.045441,0.0,1.0,0.0,0.0,0.0,-0.59082,-0.5,-0.86377,0.0,-0.54541,-1.0,-0.70459,-0.5,0.07373,0.0,0.061462,0.0
46615,E_0037667995bfcd,E_008bbf6d445cda,0,22,2,814368.875,11.628505,11.129009,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,-0.5,-0.25,-1.0,0.0,-0.5,21706,25637,0.0,0.0,0.0,0.0,0.0,201,201,0.0,0.0,577095.625,749823.3125,0.0,685277.75,814368.875,814368.875,0.95459,0.5,8.303601,9.74469,0.0,7.860875,11.628505,11.628505,0.95459,0.5,6.333509,8.72594,0.0,6.32287,11.129009,11.129009,0.95459,0.5,0.045441,0.0,1.0,0.0,0.0,0.0,-0.59082,-0.25,-0.86377,-1.0,-0.54541,0.0,-0.70459,-0.25,0.07373,0.0,0.061462,0.0


Current split: 6


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,p1,p2,match,count1,count2,haversine,manhattan,euclidian,name_cleaned_jaccard_2,name_cleaned_jaccard_3,name_cleaned_overlap_2,name_cleaned_overlap_3,name_cleaned_cosine_2,name_cleaned_cosine_3,full_address_jaccard_3,full_address_overlap_3,name_address_overlap_3,numbers_in_name_overlap,numbers_in_address_overlap,numbers_in_name_address_overlap,categories1,categories2,categories_overlap,categories_jaccard,main_category_jaccard,main_category_overlap,main_category_cosine,country1,country2,tfidf_trigram_name_cleaned,tfidf_trigram_full_address_cleaned,p1_haversine_mean,p2_haversine_mean,p1_haversine_min,p2_haversine_min,p1_haversine_max,p2_haversine_max,p1_haversine_rank,p2_haversine_rank,p1_manhattan_mean,p2_manhattan_mean,p1_manhattan_min,p2_manhattan_min,p1_manhattan_max,p2_manhattan_max,p1_manhattan_rank,p2_manhattan_rank,p1_euclidian_mean,p2_euclidian_mean,p1_euclidian_min,p2_euclidian_min,p1_euclidian_max,p2_euclidian_max,p1_euclidian_rank,p2_euclidian_rank,p1_name_cleaned_overlap_3_mean,p2_name_cleaned_overlap_3_mean,p1_name_cleaned_overlap_3_max,p2_name_cleaned_overlap_3_max,p1_name_cleaned_overlap_3_rank,p2_name_cleaned_overlap_3_rank,p1_full_address_overlap_3_mean,p2_full_address_overlap_3_mean,p1_numbers_in_name_overlap_mean,p2_numbers_in_name_overlap_mean,p1_numbers_in_address_overlap_mean,p2_numbers_in_address_overlap_mean,p1_numbers_in_name_address_overlap_mean,p2_numbers_in_name_address_overlap_mean,p1_categories_overlap_mean,p2_categories_overlap_mean,p1_categories_jaccard_mean,p2_categories_jaccard_mean
31630,E_00773ae537183b,E_006fd5b826213b,0,44,48,1211036.75,14.922325,11.222834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-1.0,0.0,-0.5,19609,8582,0.0,0.0,0.0,0.0,0.0,30,30,0.0,0.0,870352.6,1319439.0,0.0,0.0,2049278.5,6213744.5,0.70459,0.541504,10.05331,16.1569,0.0,0.0,26.052198,78.216347,0.79541,0.604004,7.911678,12.137357,0.0,0.0,18.565105,61.078056,0.70459,0.583496,0.056824,0.047241,1.0,1.0,0.0,0.0,-0.25,0.041656,-0.88623,-0.854004,-0.65918,0.052094,-0.772949,-0.34375,0.053528,0.079163,0.034424,0.050232
31631,E_007c673bc8ef39,E_006d62bd14636c,0,42,47,1658756.625,16.089287,14.920179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158325,-1.0,0.0,-0.5,2651,28965,0.0,0.0,0.0,0.0,0.0,30,30,0.0,0.171561,923689.2,1442937.0,0.0,0.0,2551810.25,2915190.25,0.856934,0.531738,11.153478,16.159693,0.0,0.0,32.818417,33.408234,0.69043,0.510742,8.423987,13.017974,0.0,0.0,23.214211,26.365543,0.80957,0.531738,0.029755,0.027298,1.0,1.0,0.0,0.0,0.023804,0.021271,-0.952148,-0.893555,-0.761719,0.021271,-0.856934,-0.345703,0.078003,0.067017,0.04834,0.039062
31632,E_00834995ee06d1,E_006ebb62cefa4b,0,44,49,2690397.0,33.795509,24.433205,0.057129,0.028564,0.117676,0.0625,0.151733,0.055908,0.0,0.0,0.0,-1.0,0.0,0.0,5265,12161,0.0,0.0,-0.5,-0.5,-0.5,30,30,0.055902,0.193952,2059367.0,749897.4,0.0,0.0,3133159.25,2690397.0,0.772949,0.979492,24.339844,8.772562,0.0,0.0,36.511467,33.795509,0.88623,0.979492,18.641087,6.864287,0.0,0.0,28.317358,24.433205,0.772949,0.979492,0.085876,0.03656,1.0,1.0,0.54541,0.897949,0.02272,0.040802,-0.86377,-0.856934,0.068176,0.040802,-0.318115,-0.341797,0.154297,0.050873,0.065613,0.031799
31633,E_008adfc4b1ee75,E_008b225d13ab87,0,41,42,2471906.5,23.953718,22.234737,0.133301,0.0,0.285645,0.0,0.209717,0.0,0.0,0.0,0.099976,0.0,0.0,-0.5,22564,22216,0.0,0.0,-0.5,-0.5,-0.5,30,30,0.0,0.08,1625371.0,732210.3,0.0,0.0,2908586.25,2670730.5,0.829102,0.952148,17.052734,8.570797,0.0,0.0,26.987789,28.01556,0.731934,0.856934,14.63365,6.716927,0.0,0.0,26.157518,24.041189,0.829102,0.952148,0.061768,0.038086,1.0,1.0,0.0,0.0,0.024384,0.047607,0.012192,-0.928711,-0.70752,0.035706,-0.457275,-0.351074,0.092651,0.128784,0.0849,0.077087
31634,E_00990a7376e84e,E_000e3bce5bd214,0,47,48,2860164.5,34.639954,25.818716,0.115356,0.015381,0.214233,0.03125,0.215942,0.027954,0.0,0.0,-0.21875,-1.0,0.0,-0.5,37661,6504,0.0,0.0,0.0,0.0,0.0,30,30,0.057354,0.0,2332661.0,1150720.0,0.0,0.0,3794142.75,2860164.5,0.638184,0.979004,27.809656,13.404631,0.0,0.0,42.975521,34.639954,0.638184,0.979004,21.054262,10.504854,0.0,0.0,34.237179,25.818716,0.638184,0.979004,0.115662,0.091003,1.0,1.0,0.319092,0.395752,-0.297852,0.020828,-0.851074,-0.854004,-0.659668,0.010414,-0.755371,-0.369873,0.080872,0.032104,0.068115,0.025391
31635,E_009d214190ae28,E_005b1eb01674eb,0,36,54,2010388.625,25.644926,18.344233,0.083313,0.0,0.181763,0.0,0.14209,0.0,0.0,0.0,0.033325,-1.0,-1.0,-1.0,9121,21673,0.0,0.0,0.0,0.0,0.0,30,30,0.079057,0.0,1288915.0,875414.0,0.0,0.0,2832086.0,2714250.75,0.722168,0.777832,15.940246,10.25232,0.0,0.0,32.644432,34.363121,0.75,0.907227,11.717061,8.012418,0.0,0.0,25.605904,24.670473,0.75,0.796387,0.105835,0.056854,1.0,1.0,0.0,0.0,0.027771,0.018524,-0.916504,-0.888672,-0.777832,-0.703613,-0.847168,-0.796387,0.096558,0.054932,0.053528,0.035889
31636,E_009fc26a534890,E_004dd6aa7e9790,0,38,37,1579978.0,20.277359,14.373832,0.206909,0.026672,0.5,0.083313,0.391113,0.056091,0.0,0.0,0.094604,0.0,-1.0,-0.5,17106,37661,0.0,0.0,0.0,0.0,0.0,30,30,0.057735,0.0,1368650.0,926280.8,0.0,0.0,3012479.0,2510806.0,0.526367,0.811035,16.43351,10.787704,0.0,0.0,37.731876,31.508152,0.552734,0.89209,12.412054,8.421719,0.0,0.0,27.513666,22.660578,0.526367,0.811035,0.113464,0.112366,1.0,1.0,0.579102,0.594727,0.026321,0.027023,-0.868652,-0.013512,-0.763184,-0.648438,-0.815918,-0.5,0.049774,0.124329,0.035797,0.093079
31637,E_00a340587e8bd2,E_00990a7376e84e,0,39,49,3222686.25,39.418728,29.138929,0.073181,0.02272,0.214233,0.076904,0.171143,0.049042,-1.0,-1.0,-0.5,-1.0,-1.0,-1.0,19216,37661,0.0,0.0,-0.5,-0.5,-0.5,30,30,0.049029,1.0,917341.9,2617228.0,0.0,0.0,3222686.25,3794142.75,0.974121,0.652832,10.872716,31.739626,0.0,0.0,39.418728,43.295055,0.974121,0.652832,8.379376,23.635357,0.0,0.0,29.138929,34.237179,0.974121,0.652832,0.108521,0.096191,1.0,1.0,0.487061,0.67334,-0.358887,-0.346924,-0.94873,-0.877441,-0.692383,-0.67334,-0.820312,-0.775391,0.044861,0.103394,0.031769,0.075134
31638,E_00a42887211196,E_009fc26a534890,0,44,45,1570215.25,20.146055,14.284484,0.033325,0.014923,0.125,0.06665,0.089661,0.035461,0.0,0.0,0.285645,-1.0,-1.0,-1.0,12768,17106,0.0,0.0,0.0,0.0,0.0,30,30,0.0,0.0,983868.1,1570096.0,0.0,0.0,2504048.0,6494533.5,0.75,0.444336,11.495926,19.08403,0.0,0.0,31.444286,83.356476,0.84082,0.466553,8.926616,14.320255,0.0,0.0,22.599377,62.341553,0.772949,0.444336,0.030304,0.101624,1.0,1.0,0.86377,0.51123,0.045441,0.022217,-0.86377,-0.866699,-0.70459,-0.710938,-0.78418,-0.789062,0.15332,0.050934,0.063354,0.033844
31639,E_00b141aee4f50d,E_007f9152ecd3d2,0,42,46,2459535.25,26.774593,22.154762,0.102539,0.023804,0.285645,0.076904,0.180664,0.050629,-1.0,-1.0,-0.5,-1.0,-1.0,-1.0,19639,13989,0.0,0.0,0.0,0.0,0.0,30,30,0.049629,1.0,1627134.0,829634.2,0.0,0.0,2912896.75,2690083.0,0.833496,0.956543,17.194658,9.872655,0.0,0.0,27.030556,30.793667,0.928711,0.956543,14.65064,7.681225,0.0,0.0,26.196281,24.26071,0.833496,0.956543,0.09491,0.048645,1.0,1.0,0.571289,0.804199,-0.357178,-0.304443,-0.856934,-0.826172,-0.761719,-0.695801,-0.80957,-0.760742,0.047058,0.107239,0.032379,0.061096


Current split: 7


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,p1,p2,match,count1,count2,haversine,manhattan,euclidian,name_cleaned_jaccard_2,name_cleaned_jaccard_3,name_cleaned_overlap_2,name_cleaned_overlap_3,name_cleaned_cosine_2,name_cleaned_cosine_3,full_address_jaccard_3,full_address_overlap_3,name_address_overlap_3,numbers_in_name_overlap,numbers_in_address_overlap,numbers_in_name_address_overlap,categories1,categories2,categories_overlap,categories_jaccard,main_category_jaccard,main_category_overlap,main_category_cosine,country1,country2,tfidf_trigram_name_cleaned,tfidf_trigram_full_address_cleaned,p1_haversine_mean,p2_haversine_mean,p1_haversine_min,p2_haversine_min,p1_haversine_max,p2_haversine_max,p1_haversine_rank,p2_haversine_rank,p1_manhattan_mean,p2_manhattan_mean,p1_manhattan_min,p2_manhattan_min,p1_manhattan_max,p2_manhattan_max,p1_manhattan_rank,p2_manhattan_rank,p1_euclidian_mean,p2_euclidian_mean,p1_euclidian_min,p2_euclidian_min,p1_euclidian_max,p2_euclidian_max,p1_euclidian_rank,p2_euclidian_rank,p1_name_cleaned_overlap_3_mean,p2_name_cleaned_overlap_3_mean,p1_name_cleaned_overlap_3_max,p2_name_cleaned_overlap_3_max,p1_name_cleaned_overlap_3_rank,p2_name_cleaned_overlap_3_rank,p1_full_address_overlap_3_mean,p2_full_address_overlap_3_mean,p1_numbers_in_name_overlap_mean,p2_numbers_in_name_overlap_mean,p1_numbers_in_address_overlap_mean,p2_numbers_in_address_overlap_mean,p1_numbers_in_name_address_overlap_mean,p2_numbers_in_name_address_overlap_mean,p1_categories_overlap_mean,p2_categories_overlap_mean,p1_categories_jaccard_mean,p2_categories_jaccard_mean
43044,E_0088f557cfe672,E_006c70d4e042ae,0,46,50,693092.0,7.026344,6.237963,0.02272,0.0,0.045441,0.0,0.033417,0.0,0.076904,0.153809,0.021744,-1.0,0.0,-0.5,33707,33433,0.0,0.0,0.0,0.0,0.0,191,191,0.0,0.284944,566015.125,224821.84375,0.0,0.0,1346991.0,741107.1,0.913086,0.819824,5.891934,2.566077,0.0,0.0,13.49348,9.0058,0.847656,0.839844,5.108138,2.041821,0.0,0.0,12.11853,6.762383,0.913086,0.819824,0.051025,0.037506,1.0,1.0,0.0,0.0,-0.037659,-0.026077,-0.869629,-0.779785,-0.782715,0.029999,-0.826172,-0.360107,0.095642,-0.019577,0.051483,-0.025146
43045,E_0090d00400318e,E_00a6a714bfe684,0,34,55,131111.9375,1.693726,1.197928,0.037048,0.0,0.125,0.0,0.063477,0.0,-1.0,-1.0,-0.5,-1.0,-1.0,-1.0,33707,7885,0.0,0.0,0.0,0.0,0.0,191,191,0.0,1.0,84432.976562,315594.125,0.0,0.0,703630.3,843919.9,0.853027,0.345459,0.955689,3.840642,0.0,0.0,8.040334,9.73406,0.853027,0.345459,0.762294,2.869403,0.0,0.0,6.342681,7.591526,0.853027,0.345459,0.052948,0.08667,1.0,1.0,0.0,0.0,-0.632324,-0.700195,-0.911621,-0.836426,-0.617676,-0.781738,-0.764648,-0.809082,0.058807,0.001806,0.016739,-0.033356
43046,E_00924edce3a2f9,E_0069dc15956f4d,0,37,43,89435.796875,0.91202,0.804756,0.0,0.0,0.0,0.0,0.0,0.0,0.159058,0.291748,0.0,-1.0,-1.0,-1.0,8942,33402,-0.5,-0.5,-1.0,-1.0,-1.0,191,191,0.0,0.336336,142582.765625,287777.375,0.0,0.0,710530.8,799359.4,0.756836,0.116272,1.581908,3.300458,0.0,0.0,8.281528,8.143489,0.756836,0.093018,1.289774,2.610206,0.0,0.0,6.409837,7.194756,0.756836,0.116272,0.122559,0.02829,1.0,1.0,0.0,0.0,0.035919,-0.051666,-0.89209,-0.813965,-0.621582,-0.744141,-0.756836,-0.779297,-0.581055,-0.025589,-0.581055,-0.031006
43047,E_00932112b5fac4,E_00a6a714bfe684,0,34,55,128561.21875,1.651339,1.17105,0.040009,0.0,0.166626,0.0,0.073303,0.0,-1.0,-1.0,-0.5,-1.0,-1.0,-1.0,8942,7885,-0.5,-0.5,-0.5,-0.5,-0.5,191,191,0.0,1.0,91545.5,315594.125,0.0,0.0,712651.6,843919.9,0.853027,0.272705,1.046387,3.840642,0.0,0.0,8.082722,9.73406,0.853027,0.272705,0.827287,2.869403,0.0,0.0,6.425651,7.591526,0.853027,0.254639,0.052948,0.08667,1.0,1.0,0.0,0.0,-0.632324,-0.700195,-0.911621,-0.836426,-0.646973,-0.781738,-0.779297,-0.809082,-0.588379,0.001806,-0.588379,-0.033356
43048,E_009ee98d202fef,E_00303f4a53f763,0,42,47,219395.015625,2.825094,2.011505,0.133301,0.027771,0.25,0.058838,0.2771,0.05423,-1.0,-1.0,-0.5,-1.0,-1.0,-1.0,2619,8685,0.0,0.0,0.0,0.0,0.0,191,191,0.048912,1.0,343203.46875,147777.953125,0.0,0.0,900451.1,744969.9,0.571289,0.766113,4.282383,1.616814,0.0,0.0,10.911137,8.004834,0.595215,0.766113,3.129833,1.334089,0.0,0.0,8.103917,6.699667,0.571289,0.766113,0.087769,0.081421,1.0,1.0,0.523926,0.468018,-0.666504,-0.67041,-0.833496,-0.893555,-0.833496,-0.702148,-0.833496,-0.797852,0.027573,-0.042542,-0.009514,-0.042542
43049,E_00a1e981be1fa2,E_00a3ec7823ea51,0,44,25,505027.15625,6.525192,4.635255,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,-0.5,-0.25,-1.0,-1.0,-1.0,8942,8212,-0.5,-0.5,-0.5,-0.5,-0.5,191,191,0.0,0.0,416927.34375,465951.40625,0.0,0.0,1081248.0,799359.4,0.88623,0.399902,5.21312,4.902887,0.0,0.0,13.074011,8.143489,0.88623,0.399902,3.811157,4.209038,0.0,0.0,9.736143,7.194756,0.88623,0.399902,0.03183,0.065002,1.0,1.0,0.0,0.0,-0.67041,-0.054993,-0.84082,-0.839844,-0.86377,-0.839844,-0.852051,-0.839844,-0.57959,-0.011002,-0.57959,-0.016541
43050,E_00a3ec7823ea51,E_00730121cd75ee,0,49,51,704495.8125,6.968579,6.338663,0.0,0.0,0.0,0.0,0.0,0.0,0.022995,0.058838,0.0,-1.0,0.0,-0.5,8212,8942,-0.5,-0.5,-0.5,-0.5,-0.5,191,191,0.0,0.074635,582857.3125,113639.820312,0.0,0.0,1368671.0,747839.3,0.714355,0.960938,5.975802,1.234332,0.0,0.0,13.700948,8.143347,0.734863,0.960938,5.257656,1.025717,0.0,0.0,12.31424,6.725477,0.714355,0.960938,0.042084,0.03244,1.0,1.0,0.0,0.0,-0.022186,-0.030365,-0.836914,-0.862793,-0.795898,-0.009804,-0.816406,-0.348145,-0.036224,-0.578613,-0.039062,-0.578613
43051,E_00a6a714bfe684,E_00583fc121ab65,0,39,39,132431.34375,1.710591,1.211013,0.121948,0.043488,0.25,0.090881,0.166748,0.083618,-0.5,-0.5,-0.25,-1.0,-1.0,-1.0,7885,30343,0.0,0.0,0.0,0.0,0.0,191,191,0.078567,0.0,258565.453125,83792.023438,0.0,0.0,843919.9,747637.1,0.615234,0.871582,3.117274,0.862208,0.0,0.0,9.815269,8.02347,0.615234,0.871582,2.347239,0.755742,0.0,0.0,7.591526,6.723652,0.615234,0.871582,0.112,0.039551,1.0,1.0,0.410156,0.922852,-0.641113,0.15564,-0.871582,-0.871582,-0.794922,-0.641113,-0.833496,-0.756348,0.020508,-0.025635,-0.013832,-0.025635
43052,E_00aab5345c00dc,E_00a3ec7823ea51,0,37,25,690712.8125,6.804265,6.214396,0.03125,0.0,0.0625,0.0,0.058258,0.0,0.054047,0.117676,0.085144,-1.0,-1.0,-1.0,25540,8212,0.099976,0.027771,0.0,0.0,0.0,191,191,0.0,0.116866,109731.945312,465951.40625,0.0,0.0,709765.6,799359.4,0.945801,0.600098,1.198149,4.902887,0.0,0.0,8.138627,8.143489,0.918945,0.560059,0.989701,4.209038,0.0,0.0,6.395585,7.194756,0.945801,0.600098,0.035583,0.065002,1.0,1.0,0.0,0.0,0.036377,-0.054993,-0.864746,-0.839844,-0.702637,-0.839844,-0.783691,-0.839844,0.121033,-0.011002,0.030624,-0.016541
43053,E_00ac4fc5c9346e,E_0004404e151b90,0,36,33,575102.375,6.486526,5.190011,0.076904,0.032257,0.285645,0.142822,0.488037,0.075623,-0.5,-0.5,-0.25,-1.0,-1.0,-1.0,8942,6504,-0.5,-0.5,-0.5,-0.5,-0.5,191,191,0.065372,0.0,154379.0,487813.75,0.0,0.0,715284.8,1323593.0,0.916504,0.515137,1.770367,5.351763,0.0,0.0,8.260026,13.241417,0.833496,0.45459,1.396502,4.414705,0.0,0.0,6.446415,11.907904,0.916504,0.54541,0.124329,0.140137,1.0,1.0,0.5,0.605957,-0.638672,-0.166626,-0.861328,-0.878906,-0.75,-0.848633,-0.805664,-0.86377,-0.583496,-0.02652,-0.583496,-0.028519


Current split: 8


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,p1,p2,match,count1,count2,haversine,manhattan,euclidian,name_cleaned_jaccard_2,name_cleaned_jaccard_3,name_cleaned_overlap_2,name_cleaned_overlap_3,name_cleaned_cosine_2,name_cleaned_cosine_3,full_address_jaccard_3,full_address_overlap_3,name_address_overlap_3,numbers_in_name_overlap,numbers_in_address_overlap,numbers_in_name_address_overlap,categories1,categories2,categories_overlap,categories_jaccard,main_category_jaccard,main_category_overlap,main_category_cosine,country1,country2,tfidf_trigram_name_cleaned,tfidf_trigram_full_address_cleaned,p1_haversine_mean,p2_haversine_mean,p1_haversine_min,p2_haversine_min,p1_haversine_max,p2_haversine_max,p1_haversine_rank,p2_haversine_rank,p1_manhattan_mean,p2_manhattan_mean,p1_manhattan_min,p2_manhattan_min,p1_manhattan_max,p2_manhattan_max,p1_manhattan_rank,p2_manhattan_rank,p1_euclidian_mean,p2_euclidian_mean,p1_euclidian_min,p2_euclidian_min,p1_euclidian_max,p2_euclidian_max,p1_euclidian_rank,p2_euclidian_rank,p1_name_cleaned_overlap_3_mean,p2_name_cleaned_overlap_3_mean,p1_name_cleaned_overlap_3_max,p2_name_cleaned_overlap_3_max,p1_name_cleaned_overlap_3_rank,p2_name_cleaned_overlap_3_rank,p1_full_address_overlap_3_mean,p2_full_address_overlap_3_mean,p1_numbers_in_name_overlap_mean,p2_numbers_in_name_overlap_mean,p1_numbers_in_address_overlap_mean,p2_numbers_in_address_overlap_mean,p1_numbers_in_name_address_overlap_mean,p2_numbers_in_name_address_overlap_mean,p1_categories_overlap_mean,p2_categories_overlap_mean,p1_categories_jaccard_mean,p2_categories_jaccard_mean
51834,E_006a042f7402a9,E_0050e220c92d87,0,38,1,2273417.0,31.702286,24.887804,0.300049,0.111084,0.473633,0.210571,0.459961,0.200195,0.0,0.0,0.023804,-1.0,0.5,0.0,18178,24487,0.0,0.0,-0.5,-0.5,-0.5,203,203,0.205196,0.0,920776.25,2273417.0,0.0,2273417.0,3959556.75,2273417.0,0.815918,0.0,12.175799,31.702286,0.0,31.702286,51.364243,31.702286,0.815918,0.0,10.359089,24.887804,0.0,24.887804,45.066116,24.887804,0.815918,0.0,0.134888,0.210571,1.0,0.210571,0.736816,0.0,0.069092,0.0,-0.920898,-1.0,0.065796,0.5,-0.125,0.0,0.043854,0.0,0.006069,0.0
52398,E_006a042f7402a9,E_008709a7744b3f,0,38,1,3593972.0,45.994022,41.526745,0.194458,0.071411,0.350098,0.142822,0.327881,0.133667,0.0,0.0,0.0,-1.0,0.0,-0.5,18178,36597,0.0,0.0,0.0,0.0,0.0,203,203,0.186501,0.0,920776.25,3593972.0,0.0,3593972.0,3959556.75,3593972.0,0.920898,0.0,12.175799,45.994022,0.0,45.994022,51.364243,45.994022,0.920898,0.0,10.359089,41.526745,0.0,41.526745,45.066116,41.526745,0.920898,0.0,0.134888,0.142822,1.0,0.142822,0.526367,0.0,0.069092,0.0,-0.920898,-1.0,0.065796,0.0,-0.125,-0.5,0.043854,0.0,0.006069,0.0
52949,E_006a042f7402a9,E_00981b909d425a,0,38,1,999337.9,12.628399,11.797956,0.133301,0.0625,0.285645,0.153809,0.209717,0.121033,0.040009,0.111084,0.047607,-1.0,0.0,0.0,18178,26981,0.0,0.0,0.0,0.0,0.0,203,203,0.179284,0.074901,920776.25,999337.9,0.0,999337.9,3959556.75,999337.9,0.631348,0.0,12.175799,12.628399,0.0,12.628399,51.364243,12.628399,0.631348,0.0,10.359089,11.797956,0.0,11.797956,45.066116,11.797956,0.631348,0.0,0.134888,0.153809,1.0,0.153809,0.579102,0.0,0.069092,0.111084,-0.920898,-1.0,0.065796,0.0,-0.125,0.0,0.043854,0.0,0.006069,0.0
53498,E_006a042f7402a9,E_006eb0335148e1,0,38,1,2634928.0,32.95034,30.993942,0.134644,0.049988,0.350098,0.142822,0.274658,0.101013,0.0,0.0,0.029419,-1.0,0.0,-0.5,18178,15675,0.0,0.0,-0.5,-0.5,-0.5,203,203,0.179029,0.0,920776.25,2634928.0,0.0,2634928.0,3959556.75,2634928.0,0.868652,0.0,12.175799,32.95034,0.0,32.95034,51.364243,32.95034,0.868652,0.0,10.359089,30.993942,0.0,30.993942,45.066116,30.993942,0.868652,0.0,0.134888,0.142822,1.0,0.142822,0.526367,0.0,0.069092,0.0,-0.920898,-1.0,0.065796,0.0,-0.125,-0.5,0.043854,0.0,0.006069,0.0
54048,E_006a042f7402a9,E_00ac4b34fdb91a,0,38,1,1258878.0,17.735668,14.12851,0.193604,0.055542,0.353027,0.117676,0.438477,0.105835,0.0,0.0,0.023804,-1.0,0.0,0.0,18178,9682,0.0,0.0,0.0,0.0,0.0,203,203,0.167705,0.0,920776.25,1258878.0,0.0,1258878.0,3959556.75,1258878.0,0.710449,0.0,12.175799,17.735668,0.0,17.735668,51.364243,17.735668,0.710449,0.0,10.359089,14.12851,0.0,14.12851,45.066116,14.12851,0.710449,0.0,0.134888,0.117676,1.0,0.117676,0.5,0.0,0.069092,0.0,-0.920898,-1.0,0.065796,0.0,-0.125,0.0,0.043854,0.0,0.006069,0.0
54595,E_006a042f7402a9,E_0020969232556d,0,38,1,709548.0,10.342316,7.815037,0.193604,0.088257,0.353027,0.1875,0.380615,0.163696,0.0,0.0,0.0,-1.0,0.5,0.0,18178,2667,0.0,0.0,0.0,0.0,0.0,203,203,0.167705,0.0,920776.25,709548.0,0.0,709548.0,3959556.75,709548.0,0.579102,0.0,12.175799,10.342316,0.0,10.342316,51.364243,10.342316,0.579102,0.0,10.359089,7.815037,0.0,7.815037,45.066116,7.815037,0.579102,0.0,0.134888,0.1875,1.0,0.1875,0.631348,0.0,0.069092,0.0,-0.920898,-1.0,0.065796,0.5,-0.125,0.0,0.043854,0.0,0.006069,0.0
55139,E_006a042f7402a9,E_0008860a82ff95,0,38,1,1217478.0,17.634552,12.774158,0.193604,0.088257,0.353027,0.1875,0.380615,0.163696,0.0,0.0,0.0,-1.0,0.0,-0.5,18178,2667,0.0,0.0,0.0,0.0,0.0,203,203,0.167705,0.0,920776.25,1217478.0,0.0,1217478.0,3959556.75,1217478.0,0.684082,0.0,12.175799,17.634552,0.0,17.634552,51.364243,17.634552,0.684082,0.0,10.359089,12.774158,0.0,12.774158,45.066116,12.774158,0.684082,0.0,0.134888,0.1875,1.0,0.1875,0.631348,0.0,0.069092,0.0,-0.920898,-1.0,0.065796,0.0,-0.125,-0.5,0.043854,0.0,0.006069,0.0
55698,E_006a042f7402a9,E_0004ea391c9404,0,38,1,431517.1,5.802652,5.055639,0.166626,0.085083,0.350098,0.19043,0.265137,0.159424,0.0,0.0,0.023804,0.0,0.0,0.0,18178,8942,-0.5,-0.5,-0.5,-0.5,-0.5,203,203,0.163299,0.0,920776.25,431517.1,0.0,431517.1,3959556.75,431517.1,0.552734,0.0,12.175799,5.802652,0.0,5.802652,51.364243,5.802652,0.552734,0.0,10.359089,5.055639,0.0,5.055639,45.066116,5.055639,0.552734,0.0,0.134888,0.19043,1.0,0.19043,0.684082,0.0,0.069092,0.0,-0.920898,0.0,0.065796,0.0,-0.125,0.0,0.043854,-0.5,0.006069,-0.5
56263,E_006a042f7402a9,E_00240142c2c355,0,38,1,1338278.0,18.554869,16.098696,0.151489,0.083313,0.277832,0.166626,0.299561,0.154297,0.0,0.0,0.0,-1.0,0.0,0.0,18178,37661,0.0,0.0,0.0,0.0,0.0,203,203,0.158114,0.0,920776.25,1338278.0,0.0,1338278.0,3959556.75,1338278.0,0.736816,0.0,12.175799,18.554869,0.0,18.554869,51.364243,18.554869,0.736816,0.0,10.359089,16.098696,0.0,16.098696,45.066116,16.098696,0.736816,0.0,0.134888,0.166626,1.0,0.166626,0.605469,0.0,0.069092,0.0,-0.920898,-1.0,0.065796,0.0,-0.125,0.0,0.043854,0.0,0.006069,0.0
56842,E_006a042f7402a9,E_00ab117ff2f7b1,0,38,1,66173.29,0.985123,0.723416,0.205933,0.05127,0.350098,0.099976,0.342285,0.097595,0.0,0.0,0.0,-1.0,0.0,0.0,18178,19282,0.0,0.0,0.0,0.0,0.0,203,203,0.153897,0.0,920776.25,66173.29,0.0,66173.29,3959556.75,66173.29,0.526367,0.0,12.175799,0.985123,0.0,0.985123,51.364243,0.985123,0.526367,0.0,10.359089,0.723416,0.0,0.723416,45.066116,0.723416,0.526367,0.0,0.134888,0.099976,1.0,0.099976,0.473633,0.0,0.069092,0.0,-0.920898,-1.0,0.065796,0.0,-0.125,0.0,0.043854,0.0,0.006069,0.0


Current split: 9


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,p1,p2,match,count1,count2,haversine,manhattan,euclidian,name_cleaned_jaccard_2,name_cleaned_jaccard_3,name_cleaned_overlap_2,name_cleaned_overlap_3,name_cleaned_cosine_2,name_cleaned_cosine_3,full_address_jaccard_3,full_address_overlap_3,name_address_overlap_3,numbers_in_name_overlap,numbers_in_address_overlap,numbers_in_name_address_overlap,categories1,categories2,categories_overlap,categories_jaccard,main_category_jaccard,main_category_overlap,main_category_cosine,country1,country2,tfidf_trigram_name_cleaned,tfidf_trigram_full_address_cleaned,p1_haversine_mean,p2_haversine_mean,p1_haversine_min,p2_haversine_min,p1_haversine_max,p2_haversine_max,p1_haversine_rank,p2_haversine_rank,p1_manhattan_mean,p2_manhattan_mean,p1_manhattan_min,p2_manhattan_min,p1_manhattan_max,p2_manhattan_max,p1_manhattan_rank,p2_manhattan_rank,p1_euclidian_mean,p2_euclidian_mean,p1_euclidian_min,p2_euclidian_min,p1_euclidian_max,p2_euclidian_max,p1_euclidian_rank,p2_euclidian_rank,p1_name_cleaned_overlap_3_mean,p2_name_cleaned_overlap_3_mean,p1_name_cleaned_overlap_3_max,p2_name_cleaned_overlap_3_max,p1_name_cleaned_overlap_3_rank,p2_name_cleaned_overlap_3_rank,p1_full_address_overlap_3_mean,p2_full_address_overlap_3_mean,p1_numbers_in_name_overlap_mean,p2_numbers_in_name_overlap_mean,p1_numbers_in_address_overlap_mean,p2_numbers_in_address_overlap_mean,p1_numbers_in_name_address_overlap_mean,p2_numbers_in_name_address_overlap_mean,p1_categories_overlap_mean,p2_categories_overlap_mean,p1_categories_jaccard_mean,p2_categories_jaccard_mean
41740,E_00b0f257eba358,E_0093f47e279488,0,22,17,859055.8,10.811408,8.295925,0.060608,0.0,0.166626,0.0,0.120361,0.0,-1.0,-1.0,-0.5,0.0,-1.0,-0.5,20271,22526,0.0,0.0,0.0,0.0,0.0,169,169,0.0,1.0,421789.0,882728.6,0.0,0.0,910265.0,3972565.75,0.90918,0.706055,5.035472,10.551211,0.0,0.0,10.811408,40.673439,0.95459,0.529297,4.083929,8.345329,0.0,0.0,9.035267,35.740746,0.90918,0.706055,0.060944,0.067261,1.0,1.0,0.0,0.0,-0.272705,-0.294189,-0.95459,0.058838,-0.95459,-1.0,-0.95459,-0.470703,0.133911,0.124207,0.100342,0.120117
41741,E_00b31c55f68efc,E_0093f47e279488,0,22,17,858466.4,10.823874,8.288627,0.024994,0.0,0.055542,0.0,0.035248,0.0,0.0,0.0,-0.25,0.0,-1.0,-0.5,23899,22526,0.0,0.0,0.0,0.0,0.0,169,169,0.0,0.0,420692.125,882728.6,0.0,0.0,907074.8,3972565.75,0.90918,0.588379,5.025581,10.551211,0.0,0.0,10.823874,40.673439,0.95459,0.588379,4.0745,8.345329,0.0,0.0,9.006212,35.740746,0.90918,0.646973,0.045441,0.067261,1.0,1.0,0.0,0.0,0.090881,-0.294189,-0.95459,0.058838,-0.95459,-1.0,-0.95459,-0.470703,0.085876,0.124207,0.069702,0.120117
41742,E_000e5f33c85e38,E_00aeb8c0a97818,0,22,17,900939.2,10.703849,8.942701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,13088,37684,0.0,0.0,0.0,0.0,0.0,169,169,0.0,0.0,421464.71875,1024601.0,0.0,0.0,900939.2,4522376.0,0.95459,0.588379,5.032147,11.186654,0.0,0.0,10.731906,44.66951,0.90918,0.588379,4.08144,9.82814,0.0,0.0,8.942701,40.682743,0.95459,0.529297,0.056824,0.058838,1.0,1.0,0.0,0.0,0.045441,0.058838,-0.95459,-0.941406,-0.95459,-1.0,-0.95459,-0.970703,0.12793,0.070557,0.096924,0.064148
41743,E_0044bcc79a1738,E_00aeb8c0a97818,0,26,17,1171480.0,12.967003,11.709599,0.045441,0.0,0.142822,0.0,0.086731,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,35958,37684,0.0,0.0,0.0,0.0,0.0,169,169,0.0,0.0,505715.28125,1024601.0,0.0,0.0,1171480.0,4522376.0,0.961426,0.764648,5.909151,11.186654,0.0,0.0,14.29589,44.66951,0.807617,0.882324,4.94156,9.82814,0.0,0.0,11.709599,40.682743,0.961426,0.764648,0.045685,0.058838,1.0,1.0,0.0,0.0,0.038452,0.058838,-0.961426,-0.941406,-0.922852,-1.0,-0.942383,-0.970703,0.083069,0.070557,0.058319,0.064148
41744,E_0057e071c137f9,E_0093f47e279488,0,26,17,1246137.0,15.868477,12.085464,0.027771,0.0,0.071411,0.0,0.055725,0.0,0.0,0.0,-0.25,0.0,-1.0,-0.5,17640,22526,0.0,0.0,0.0,0.0,0.0,169,169,0.0,0.0,525748.1875,882728.6,0.0,0.0,1246137.0,3972565.75,0.961426,0.882324,6.50588,10.551211,0.0,0.0,15.868477,40.673439,0.961426,0.882324,5.137516,8.345329,0.0,0.0,12.242251,35.740746,0.922852,0.882324,0.041412,0.067261,1.0,1.0,0.0,0.0,0.076904,-0.294189,-0.961426,0.058838,-0.922852,-1.0,-0.942383,-0.470703,0.079895,0.124207,0.057098,0.120117
41745,E_0072ac9b496cef,E_00aeb8c0a97818,0,22,17,899291.8,10.691883,8.925521,0.071411,0.0,0.142822,0.0,0.133667,0.0,0.0,0.0,-0.25,-1.0,-1.0,-1.0,28062,37684,0.0,0.0,0.0,0.0,0.0,169,169,0.0,0.0,422042.34375,1024601.0,0.0,0.0,899291.8,4522376.0,0.95459,0.470703,5.039416,11.186654,0.0,0.0,10.705282,44.66951,0.90918,0.529297,4.086945,9.82814,0.0,0.0,8.925521,40.682743,0.95459,0.470703,0.064941,0.058838,1.0,1.0,0.0,0.0,-0.272705,0.058838,-0.95459,-0.941406,-0.95459,-1.0,-0.95459,-0.970703,0.066101,0.070557,0.053223,0.064148
41746,E_007c828173adf0,E_0093f47e279488,0,26,17,1245387.0,15.856707,12.07821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,0.0,-1.0,-0.5,21970,22526,0.0,0.0,0.0,0.0,0.0,169,169,0.0,0.0,525442.75,882728.6,0.0,0.0,1245387.0,3972565.75,0.961426,0.82373,6.499649,10.551211,0.0,0.0,15.856707,40.673439,0.961426,0.82373,5.134654,8.345329,0.0,0.0,12.237473,35.740746,0.922852,0.82373,0.038452,0.067261,1.0,1.0,0.0,0.0,0.076904,-0.294189,-0.961426,0.058838,-0.922852,-1.0,-0.942383,-0.470703,0.078979,0.124207,0.054352,0.120117
41747,E_00861d02178953,E_00aeb8c0a97818,0,22,17,900781.3,10.660132,8.946378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,33707,37684,0.0,0.0,0.0,0.0,0.0,169,169,0.0,0.0,420225.09375,1024601.0,0.0,0.0,900781.3,4522376.0,0.95459,0.529297,5.022013,11.186654,0.0,0.0,10.810537,44.66951,0.818359,0.470703,4.070853,9.82814,0.0,0.0,8.946378,40.682743,0.95459,0.588379,0.045441,0.058838,1.0,1.0,0.0,0.0,0.090881,0.058838,-0.95459,-0.941406,-0.95459,-1.0,-0.95459,-0.970703,0.115662,0.070557,0.104553,0.064148
41748,E_00b0f257eba358,E_00aeb8c0a97818,0,22,17,910265.0,10.808483,9.035267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25,-1.0,-1.0,-1.0,20271,37684,0.0,0.0,0.0,0.0,0.0,169,169,0.0,0.0,421789.0,1024601.0,0.0,0.0,910265.0,4522376.0,0.95459,0.706055,5.035472,11.186654,0.0,0.0,10.811408,44.66951,0.90918,0.706055,4.083929,9.82814,0.0,0.0,9.035267,40.682743,0.95459,0.706055,0.060944,0.058838,1.0,1.0,0.0,0.0,-0.272705,0.058838,-0.95459,-0.941406,-0.95459,-1.0,-0.95459,-0.970703,0.133911,0.070557,0.100342,0.064148
41749,E_00b31c55f68efc,E_00aeb8c0a97818,0,22,17,907074.8,10.751969,9.006212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,23899,37684,0.0,0.0,0.0,0.0,0.0,169,169,0.0,0.0,420692.125,1024601.0,0.0,0.0,907074.8,4522376.0,0.95459,0.646973,5.025581,11.186654,0.0,0.0,10.823874,44.66951,0.818359,0.646973,4.0745,9.82814,0.0,0.0,9.006212,40.682743,0.95459,0.646973,0.045441,0.058838,1.0,1.0,0.0,0.0,0.090881,0.058838,-0.95459,-0.941406,-0.95459,-1.0,-0.95459,-0.970703,0.085876,0.070557,0.069702,0.064148


Current split: 10


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,p1,p2,match,count1,count2,haversine,manhattan,euclidian,name_cleaned_jaccard_2,name_cleaned_jaccard_3,name_cleaned_overlap_2,name_cleaned_overlap_3,name_cleaned_cosine_2,name_cleaned_cosine_3,full_address_jaccard_3,full_address_overlap_3,name_address_overlap_3,numbers_in_name_overlap,numbers_in_address_overlap,numbers_in_name_address_overlap,categories1,categories2,categories_overlap,categories_jaccard,main_category_jaccard,main_category_overlap,main_category_cosine,country1,country2,tfidf_trigram_name_cleaned,tfidf_trigram_full_address_cleaned,p1_haversine_mean,p2_haversine_mean,p1_haversine_min,p2_haversine_min,p1_haversine_max,p2_haversine_max,p1_haversine_rank,p2_haversine_rank,p1_manhattan_mean,p2_manhattan_mean,p1_manhattan_min,p2_manhattan_min,p1_manhattan_max,p2_manhattan_max,p1_manhattan_rank,p2_manhattan_rank,p1_euclidian_mean,p2_euclidian_mean,p1_euclidian_min,p2_euclidian_min,p1_euclidian_max,p2_euclidian_max,p1_euclidian_rank,p2_euclidian_rank,p1_name_cleaned_overlap_3_mean,p2_name_cleaned_overlap_3_mean,p1_name_cleaned_overlap_3_max,p2_name_cleaned_overlap_3_max,p1_name_cleaned_overlap_3_rank,p2_name_cleaned_overlap_3_rank,p1_full_address_overlap_3_mean,p2_full_address_overlap_3_mean,p1_numbers_in_name_overlap_mean,p2_numbers_in_name_overlap_mean,p1_numbers_in_address_overlap_mean,p2_numbers_in_address_overlap_mean,p1_numbers_in_name_address_overlap_mean,p2_numbers_in_name_address_overlap_mean,p1_categories_overlap_mean,p2_categories_overlap_mean,p1_categories_jaccard_mean,p2_categories_jaccard_mean
46618,E_00037fb71d569d,E_0029eb064621fa,0,25,6,504117.3125,8.262639,6.546914,0.074097,0.0,0.153809,0.0,0.114685,0.0,0.0,0.0,0.0,0.0,-1.0,-0.5,17367,33383,0.0,0.0,-0.5,-0.5,-0.5,201,201,0.0,0.0,425267.21875,259825.015625,0.0,0.0,827325.5,633618.9375,0.879883,0.666504,6.156436,3.989171,0.0,0.0,13.051941,10.124512,0.879883,0.666504,4.821257,3.198532,0.0,0.0,10.242278,8.528543,0.879883,0.666504,0.043091,0.166626,1.0,1.0,0.0,0.0,0.010002,0.229126,0.040009,-0.833496,-0.399902,-0.5,-0.419922,-0.666504,0.040009,0.178589,0.040009,0.170288
46620,E_008bbf6d445cda,E_0029eb064621fa,0,25,6,633618.9375,10.124512,8.528543,0.035706,0.0,0.076904,0.0,0.127197,0.0,0.0,0.0,0.0,-1.0,0.0,-0.5,25637,33383,0.0,0.0,0.0,0.0,0.0,201,201,0.0,0.0,423576.15625,259825.015625,0.0,0.0,983085.5,633618.9375,0.879883,0.833496,6.201942,3.989171,0.0,0.0,15.429142,10.124512,0.879883,0.833496,4.951262,3.198532,0.0,0.0,12.336339,8.528543,0.879883,0.833496,0.043335,0.166626,1.0,1.0,0.0,0.0,-0.008148,0.229126,-0.919922,-0.833496,0.020004,-0.5,-0.189941,-0.666504,0.040009,0.178589,0.040009,0.170288
46621,E_0029eb064621fa,E_008bbf6d445cda,0,21,17,633618.9375,10.124512,8.528543,0.035706,0.0,0.076904,0.0,0.127197,0.0,0.0,0.0,0.0,-1.0,0.0,-0.5,33383,25637,0.0,0.0,0.0,0.0,0.0,201,201,0.0,0.0,481712.0625,342097.4375,0.0,0.0,745781.0,633618.9375,0.619141,0.941406,6.90804,4.975991,0.0,0.0,10.499531,10.124512,0.666504,0.941406,5.095216,3.94761,0.0,0.0,8.528543,8.528543,0.952148,0.941406,0.050598,0.063721,1.0,1.0,0.0,0.0,-0.053558,0.072083,-0.904785,-0.941406,-0.619141,0.058838,-0.761719,-0.088257,0.187012,0.058838,0.06958,0.058838
46622,E_00888b74fcf94c,E_008bbf6d445cda,0,21,17,625770.0,10.033777,8.407353,0.046509,0.0,0.153809,0.0,0.098083,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,8021,25637,0.0,0.0,0.0,0.0,0.0,201,201,0.0,0.0,479277.0,342097.4375,0.0,0.0,745450.875,633618.9375,0.619141,0.82373,6.835636,4.975991,0.0,0.0,10.408796,10.124512,0.666504,0.82373,5.045984,3.94761,0.0,0.0,8.407353,8.528543,0.952148,0.82373,0.05658,0.063721,1.0,1.0,0.0,0.0,-0.053986,0.072083,-0.904785,-0.941406,0.023804,0.058838,-0.285645,-0.088257,0.064514,0.058838,0.054688,0.058838
46623,E_008cb00209a061,E_008bbf6d445cda,0,21,17,626690.875,10.06403,8.413932,0.055542,0.0,0.166626,0.0,0.11322,0.0,0.035706,0.105286,0.0,-1.0,0.0,0.0,33402,25637,0.0,0.0,-0.5,-0.5,-0.5,201,201,0.0,0.072739,480059.40625,342097.4375,0.0,0.0,748427.625,633618.9375,0.619141,0.882324,6.845708,4.975991,0.0,0.0,10.439048,10.124512,0.666504,0.882324,5.052673,3.94761,0.0,0.0,8.413932,8.528543,0.952148,0.882324,0.057129,0.063721,1.0,1.0,0.0,0.0,-0.038544,0.072083,-0.904785,-0.941406,0.0,0.058838,-0.321533,-0.088257,0.047607,0.058838,0.047607,0.058838
46624,E_008bbf6d445cda,E_00888b74fcf94c,0,25,6,625770.0,10.033777,8.407353,0.046509,0.0,0.153809,0.0,0.098083,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,25637,8021,0.0,0.0,0.0,0.0,0.0,201,201,0.0,0.0,423576.15625,256140.25,0.0,0.0,983085.5,625770.0,0.799805,0.833496,6.201942,3.914476,0.0,0.0,15.429142,10.033777,0.799805,0.833496,4.951262,3.135568,0.0,0.0,12.336339,8.407353,0.799805,0.833496,0.043335,0.179443,1.0,1.0,0.0,0.0,-0.008148,0.166626,-0.919922,-0.833496,0.020004,0.166626,-0.189941,-0.166626,0.040009,0.178589,0.040009,0.170288
46625,E_00037fb71d569d,E_005fe2d7753f16,0,25,6,827325.5,13.051941,10.242278,0.105286,0.0,0.25,0.0,0.176758,0.0,0.0,0.0,0.0,0.0,0.0,-0.75,17367,10902,0.0,0.0,-0.5,-0.5,-0.5,201,201,0.0,0.0,425267.21875,710335.0625,0.0,463459.03125,827325.5,983085.5,0.959961,0.666504,6.156436,10.46017,0.0,7.274823,13.051941,15.429142,0.959961,0.666504,4.821257,7.954017,0.0,5.615276,10.242278,12.336339,0.959961,0.666504,0.043091,0.0,1.0,0.0,0.0,0.0,0.010002,0.016479,0.040009,-0.833496,-0.399902,-0.25,-0.419922,-0.291748,0.040009,0.166626,0.040009,0.015144
46626,E_008bbf6d445cda,E_005fe2d7753f16,0,25,6,983085.5,15.429142,12.336339,0.0,0.0,0.0,0.0,0.0,0.0,0.014496,0.032257,0.0,-1.0,-0.5,0.0,25637,10902,0.0,0.0,0.0,0.0,0.0,201,201,0.0,0.030429,423576.15625,710335.0625,0.0,463459.03125,983085.5,983085.5,0.959961,0.833496,6.201942,10.46017,0.0,7.274823,15.429142,15.429142,0.959961,0.833496,4.951262,7.954017,0.0,5.615276,12.336339,12.336339,0.959961,0.833496,0.043335,0.0,1.0,0.0,0.0,0.0,-0.008148,0.016479,-0.919922,-0.833496,0.020004,-0.25,-0.189941,-0.291748,0.040009,0.166626,0.040009,0.015144
46627,E_00037fb71d569d,E_008cb00209a061,0,25,6,498995.75,8.202157,6.443866,0.055542,0.0,0.166626,0.0,0.102051,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,17367,33402,0.0,0.0,-1.0,-1.0,-1.0,201,201,0.0,0.0,425267.21875,257383.15625,0.0,0.0,827325.5,626690.875,0.839844,0.666504,6.156436,3.934666,0.0,0.0,13.051941,10.06403,0.839844,0.666504,4.821257,3.144863,0.0,0.0,10.242278,8.413932,0.839844,0.666504,0.043091,0.166626,1.0,1.0,0.0,0.0,0.010002,0.255371,0.040009,-0.833496,-0.399902,0.166626,-0.419922,-0.25,0.040009,0.166626,0.040009,0.166626
46628,E_008bbf6d445cda,E_008cb00209a061,0,25,6,626690.875,10.06403,8.413932,0.055542,0.0,0.166626,0.0,0.11322,0.0,0.035706,0.105286,0.0,-1.0,0.0,0.0,25637,33402,0.0,0.0,-0.5,-0.5,-0.5,201,201,0.0,0.072739,423576.15625,257383.15625,0.0,0.0,983085.5,626690.875,0.839844,0.833496,6.201942,3.934666,0.0,0.0,15.429142,10.06403,0.839844,0.833496,4.951262,3.144863,0.0,0.0,12.336339,8.413932,0.839844,0.833496,0.043335,0.166626,1.0,1.0,0.0,0.0,-0.008148,0.255371,-0.919922,-0.833496,0.020004,0.166626,-0.189941,-0.25,0.040009,0.166626,0.040009,0.166626


Total len is 56965


## Make prediction

In [17]:
def feature_and_predict_by_chunks(df, pairs, models, n_splits):
    count = 0
    
    pred_df = pd.DataFrame()
    
    for k in tqdm(range(n_splits)):
        # split dataset by chunks
        print(f'Current split: {k+1}')
        cur_data = pairs[pairs['group'] == k]

        # add features & model prediction
        cur_data = feature_engineering(df, cur_data)

        # predict
        cur_data = cur_data.drop('group', axis=1)
        cur_data['predict_proba'] = np.zeros(len(cur_data), dtype=np.float16)

        for model in models:
            cat_features = ["country1", "country2", "categories1", "categories2"]
            num_features = [x for x in cur_data.columns
                            if x not in ['p1', 'p2', 'match'] + cat_features + ["predict_proba"]]
            
            pred = np.array([model.predict_proba(cur_data[num_features + cat_features])[:, 1] for model in models])
            pred = np.mean(pred, axis=0)

        
        pred_df = pd.concat([pred_df, cur_data[["p1", "p2", "predict_proba"]]])

        count += len(cur_data)
        
        del cur_data
        gc.collect()
    
    print(f'Total len is {count}')
    return pred_df

if CFG.inference:
    pred_df = feature_and_predict_by_chunks(df, pairs, models, CFG.n_splits)

## Submission

In [18]:
# # Submission
# prediction = pred_df[pred_df["predict_proba"] > CFG.threshold][["p1", "p2"]].groupby("p1").agg(set)
# df["prediction"] = prediction

# # Fill empty
# for row in df.loc[df["prediction"].isnull(), "prediction"].index:
#     df.at[row, "prediction"] = set()

# # Add itself
# df.apply(lambda x: x["prediction"].add(x["id"]), axis=1)

# # Forming sumbission.csv
# df["prediction_sub"] = df["prediction"].apply(lambda x: " ".join(x))
# pd.concat([df['id'], df["prediction_sub"]], axis=1, keys=['id', 'matches']).to_csv("submission.csv", index = False)

In [19]:
def post_process(df):
    id2match = dict(zip(df['id'].values, df['matches'].str.split()))

    for base, match in df[['id', 'matches']].values:
        match = match.split()
        if len(match) == 1:        
            continue

        for m in match:
            if base not in id2match[m]:
                id2match[m].append(base)
    df['matches'] = df['id'].map(id2match).map(' '.join)
    return df 

## Submission    
pred_df = pred_df[pred_df['predict_proba'] >= CFG.threshold][['p1', 'p2']]

out_df = pd.DataFrame()
df = df.reset_index()
out_df['id'] = df['id'].unique().tolist()
out_df['match_id'] = out_df['id']

out_df = pd.concat([out_df, pred_df])
out_df = out_df.groupby('id')['match_id'].apply(list).reset_index()
out_df['matches'] = out_df['match_id'].apply(lambda x: ' '.join(set(x)))
out_df = post_process(out_df)
print(f'Unique id: {len(out_df)}')
display(out_df.head())

out_df[['id', 'matches']].to_csv('submission.csv', index = False)

NameError: name 'pred_df' is not defined

# Further ideas

- ordinal encode main_category and closest_city, set them as categorical features for LGBM
- levenstein, jaro, lcs
- include multilingual encoder for the full_address

- Optuna
- stacking of the best Optuna solutions

- add Catboost