In [None]:
# Mount Google Drive (Colab-only API) at /content/drive; CFG.DATA lives below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import math
import pathlib
import unicodedata as ud

from enum import Enum
from functools import partial
from sys import getsizeof

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pyarrow.parquet as pq
import pyarrow as pa

from numpy import sin, cos, deg2rad, rad2deg
from numpy.linalg import inv
from pandas import CategoricalDtype
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from tqdm.auto import tqdm

# Use fully-qualified option keys: abbreviated keys are regex patterns, and a
# pattern that matches more than one registered option raises OptionError.
pd.set_option("display.max_colwidth", 900)
pd.set_option("display.max_rows", 100)
# Apply matplotlib's "ggplot" style sheet to any figure drawn later.
plt.style.use("ggplot")

In [None]:
class CFG:
    """Notebook-wide constants: paths, model choice, sampling sizes and column groups."""

    # H3 column name; it is also part of the prediction parquet file names.
    H3_COL = "h3_res1"
    # Data directory on the mounted Google Drive.
    DATA = pathlib.Path("/content/drive") / "My Drive" / "Competition" / "4sq" / "data"

    # Short model key -> "sentence-transformers/<key>" identifier.
    MODELS = {
        key: f"sentence-transformers/{key}"
        for key in (
            "stsb-xlm-r-multilingual",
            "all-mpnet-base-v2",
            "all-distilroberta-v1",
            "all-MiniLM-L6-v2",
            "all-MiniLM-L12-v2",
        )
    }
    MODEL_KEY = "all-MiniLM-L6-v2"
    EMBEDDING_KEYS = ["name", "categories"]
    EMBEDDING_MODEL = MODELS[MODEL_KEY]

    # Seed used for pair sampling and dataset sub-sampling.
    RANDOM_SEED = 1234
    # Passed as `n_to_prune` / `n_negatives` to make_pair_dataset.
    N_CLOSEST = 25
    N_NEGATIVES = 6
    BIDIRECTIONAL = False

    # String-valued columns that go through the text-cleaning transforms.
    TEXT_COLS = "name address city state zip country url phone categories".split()
    # Numeric columns serialized with three decimals.
    NUM_COLS = ["latitude", "longitude"]

In [None]:
def normalize(df, cols):
    """Lower-case and whitespace-normalize the given string columns.

    For every column in ``cols`` the values are lower-cased, each run of
    whitespace characters is collapsed into a single space, and a value that
    ends up as exactly one space becomes the empty string. Leading/trailing
    single spaces on longer values are kept.

    Args:
        df: Input frame; it is not modified.
        cols: Iterable of names of string columns to normalize.

    Returns:
        A copy of ``df`` with the listed columns normalized.

    Side effects:
        Prints the column list and, per column, the number of empty-string rows.
    """
    print(f"normalize: {cols}")
    df = df.copy()
    for col in cols:
        ser = df[col].str.lower()
        # Raw-string patterns: "\s" in a plain literal is an invalid escape sequence.
        # One pass over whitespace runs gives the same result as mapping each
        # whitespace character to a space and then squeezing repeated spaces.
        ser = ser.str.replace(r"\s+", " ", regex=True)
        ser = ser.str.replace(r"^ $", "", regex=True)
        df[col] = ser

        n_blank = int((ser == "").sum())
        print(f"  #blank rows in {col}: {n_blank:,}")
    return df


# Bind the text columns so the transform takes only a DataFrame, the call shape `compose` uses.
normalize_cols = partial(normalize, cols=CFG.TEXT_COLS)

In [None]:
def fill_na(df, cols):
    """Return a copy of ``df`` where missing values in ``cols`` are replaced by ""."""
    filled = df.copy()
    # Column-by-column fill; the input frame is never touched.
    for name in cols:
        filled[name] = filled[name].fillna("")
    return filled


# Bind the text columns so the transform takes only a DataFrame, the call shape `compose` uses.
fill_na_cols = partial(fill_na, cols=CFG.TEXT_COLS)

In [None]:
def fill_blank(df, cols):
    """Return a copy of ``df`` where values matching the regex ``^$`` in ``cols`` become "-"."""
    # Build every replaced column up front, then attach them to a fresh copy in one go.
    replaced = {name: df[name].str.replace("^$", "-", regex=True) for name in cols}
    return df.assign(**replaced)


# Bind the text columns so the transform takes only a DataFrame, the call shape `compose` uses.
fill_blank_cols = partial(fill_blank, cols=CFG.TEXT_COLS)

In [None]:
def compose(transforms):
    """Chain single-argument callables into one function.

    The returned function feeds its argument to the first callable, passes each
    result on to the next one in order, and returns the final result. With no
    callables the argument comes back unchanged.
    """
    def transform_func(df):
        current = df
        for step in transforms:
            current = step(current)
        return current

    return transform_func

In [None]:
# Cleaning pipeline, applied in this order: NaN -> "", lower-case/whitespace normalization, "" -> "-".
transforms = compose([fill_na_cols, normalize_cols, fill_blank_cols])

In [None]:
# Short alias so the `!` shell lines can interpolate the data directory as {DATA}.
DATA = CFG.DATA

In [None]:
# Embedding keys joined with "_" form part of the prediction file names.
ebkeys = "_".join(CFG.EMBEDDING_KEYS)
# Copy both prediction files from Drive into the working directory.
# NOTE(review): the next cell reads these files from {DATA}, not from these local copies — confirm the copy is still needed.
!cp "{DATA}/preds_dev_{CFG.H3_COL}_{ebkeys}_{CFG.MODEL_KEY}_text_seed{CFG.RANDOM_SEED}.parquet" .
!cp "{DATA}/preds_test_{CFG.H3_COL}_{ebkeys}_{CFG.MODEL_KEY}_text_seed{CFG.RANDOM_SEED}.parquet" .

In [None]:
# Load the dev/test prediction frames directly from the Drive data directory.
# make_pair_dataset reads their "id", "preds" and "distances" columns.
preds_dev_df = pd.read_parquet(f"{DATA}/preds_dev_{CFG.H3_COL}_{ebkeys}_{CFG.MODEL_KEY}_text_seed{CFG.RANDOM_SEED}.parquet")
preds_test_df = pd.read_parquet(f"{DATA}/preds_test_{CFG.H3_COL}_{ebkeys}_{CFG.MODEL_KEY}_text_seed{CFG.RANDOM_SEED}.parquet")

In [None]:
# Copy the two train splits from Drive into the working directory; the next cell reads these local copies.
!cp "{DATA}/train_dev.parquet" .
!cp "{DATA}/train_test.parquet" .

In [None]:
# Read the locally copied splits and run the text-cleaning pipeline on each.
train_dev = transforms(pd.read_parquet("train_dev.parquet"))
train_test = transforms(pd.read_parquet("train_test.parquet"))

normalize: ['name', 'address', 'city', 'state', 'zip', 'country', 'url', 'phone', 'categories']
  #blank rows in name: 0
  #blank rows in address: 82,194
  #blank rows in city: 63,135
  #blank rows in state: 84,507
  #blank rows in zip: 116,033
  #blank rows in country: 0
  #blank rows in url: 160,816
  #blank rows in phone: 145,060
  #blank rows in categories: 21,679
normalize: ['name', 'address', 'city', 'state', 'zip', 'country', 'url', 'phone', 'categories']
  #blank rows in name: 0
  #blank rows in address: 82,110
  #blank rows in city: 62,993
  #blank rows in state: 84,654
  #blank rows in zip: 116,068
  #blank rows in country: 1
  #blank rows in url: 160,564
  #blank rows in phone: 144,840
  #blank rows in categories: 21,590


In [None]:
# Preview the cleaned dev split (blank text fields now show as "-").
train_dev

Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,point_of_interest,h3_res0,h3_res1,h3_res2,h3_res3,h3_res4,h3_res5
0,E_000001272c6c5d,café stad oudenaarde,50.859975,3.634196,abdijstraat,nederename,oost-vlaanderen,9700,be,-,-,bars,P_677e840bb6fc7e,8019fffffffffff,81197ffffffffff,82194ffffffffff,83194dfffffffff,84194dbffffffff,85194daffffffff
4,E_00001d92066153,restaurante casa cofiño,43.338196,-4.326821,-,caviedes,cantabria,-,es,-,-,spanish restaurants,P_809a884d4407fb,8039fffffffffff,81187ffffffffff,82392ffffffffff,83392dfffffffff,84392d9ffffffff,85392d9bfffffff
5,E_000023d8f4be44,island spa,14.518970,121.018702,"5th flr, newport mall, resorts world manila",pasay city,metro manila,-,ph,-,-,spas,P_020de174484ec6,8069fffffffffff,81697ffffffffff,82694ffffffffff,83694efffffffff,84694edffffffff,85694ec3fffffff
7,E_0000764d65557e,mcdonald's,-7.265894,112.749382,"plaza surabaya, pemuda building",-,-,-,id,-,-,fast food restaurants,P_be89c778befb23,808dfffffffffff,818dbffffffffff,828d87fffffffff,838d80fffffffff,848d809ffffffff,858d8083fffffff
15,E_0001031ce8d446,restoran nasi kandar daun pisang,3.131100,101.683762,brickfields,kuala lumpur,federal territory of kuala lum,-,my,-,-,malay restaurants,P_1357b65801de8a,8065fffffffffff,81653ffffffffff,826507fffffffff,836505fffffffff,8465051ffffffff,8565050bfffffff
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138775,E_fffde923648247,guanaco,49.287111,-123.117324,-,-,-,-,ca,-,-,food trucks,P_e416fda64302f1,8029fffffffffff,8128fffffffffff,8228dffffffffff,8328defffffffff,8428de1ffffffff,8528de8ffffffff
1138779,E_fffe21a2201133,oasi di baggero,45.771066,9.239384,via cava marna,merone,italy,22046,it,-,-,"scenic lookouts, lakes, other great outdoors",P_fedaf3809e50f1,801ffffffffffff,811fbffffffffff,821f9ffffffffff,831f99fffffffff,841f99dffffffff,851f99d7fffffff
1138783,E_fffe5a71e05990,giant gaplek,-6.347549,106.749049,gaplek,-,-,-,id,-,-,convenience stores,P_5cf17019780002,808dfffffffffff,818c3ffffffffff,828c17fffffffff,838c10fffffffff,848c107ffffffff,858c1073fffffff
1138805,E_ffff9509490675,kelab rahman putra hills course,3.218054,101.558533,-,-,-,-,my,-,-,golf courses,P_1a01d5511f3f36,8065fffffffffff,81653ffffffffff,826507fffffffff,836505fffffffff,8465057ffffffff,85650573fffffff


# Prune predictions

In [None]:
def pruned(preds, distances, n_to_prune=10):
    """Keep the ``n_to_prune`` entries of ``preds`` with the smallest distances.

    Args:
        preds: 1-D array-like of candidates.
        distances: 1-D array-like aligned with ``preds``.
        n_to_prune: Maximum number of entries to keep.

    Returns:
        A new array with at most ``n_to_prune`` entries. Selection uses a
        partial sort, so the kept entries are NOT ordered by distance.
    """
    preds = np.asarray(preds)
    n = min(len(preds), n_to_prune)
    if n <= 0:
        # argpartition rejects kth=-1 on an empty array, so answer directly.
        return preds[:0].copy()
    idx = np.argpartition(distances, n - 1)[:n]
    # Fancy indexing already returns a fresh array; no defensive copy needed.
    return preds[idx]

In [None]:
def eliminate_gts(preds, distances, gts):
    """Drop every prediction that appears in ``gts``.

    Args:
        preds: 1-D array-like of predicted values.
        distances: 1-D array-like aligned with ``preds``.
        gts: Iterable (list, tuple, set, array) of values to remove.

    Returns:
        Tuple ``(preds_kept, distances_kept)`` of new arrays holding the
        entries of ``preds`` not found in ``gts``, in their original order,
        and the matching distances. The inputs are not modified.
    """
    preds = np.asarray(preds)
    distances = np.asarray(distances)
    # Materialize first: np.isin would treat a set as a single object.
    gts = list(gts)

    if gts:
        keep = ~np.isin(preds, gts)
    else:
        keep = np.ones(len(preds), dtype=bool)

    # Boolean-mask indexing returns copies, so callers never share memory with the inputs.
    return preds[keep], distances[keep]

In [None]:
# NOTE(review): scratch check of np.random.randint; the value is not used by any other visible cell.
np.random.randint(0, 100, 3)

array([13,  6, 10])

In [None]:
def make_pair_dataset(preds_df, train_df, n_negatives=6, n_to_prune=10, monitor_iter=100):
    """Build labelled entity pairs (one positive, several negatives) per query id.

    For every row of ``preds_df`` the function
      * looks up the ids sharing the query's ``point_of_interest`` in ``train_df``,
      * draws one of those other ids at random as a positive pair (label 1),
      * removes them from the predictions, keeps the ``n_to_prune`` closest
        remaining predictions and draws up to ``n_negatives`` distinct ones
        as negative pairs (label 0).

    Args:
        preds_df: Frame with columns "id", "preds" and "distances"; "preds" and
            "distances" hold one aligned array per row.
        train_df: Frame with columns "id" and "point_of_interest" (plus "name"
            when ``monitor_iter > 0``).
        n_negatives: Maximum number of negative pairs per query id.
        n_to_prune: Number of closest non-matching predictions to sample from.
        monitor_iter: Print example pairs every ``monitor_iter`` rows; a value
            <= 0 disables printing.

    Returns:
        DataFrame with columns "id1", "id2", "matched".

    Side effects:
        Re-seeds NumPy's global RNG with ``CFG.RANDOM_SEED``.
    """
    np.random.seed(CFG.RANDOM_SEED)

    preds_list = preds_df["preds"].to_numpy()
    distances_list = preds_df["distances"].to_numpy()
    ids_list = preds_df["id"].to_numpy()

    poi2gts = train_df.groupby("point_of_interest")["id"].agg(list).to_dict()
    indexed = train_df.set_index("id")
    id2poi = indexed["point_of_interest"].to_dict()
    # Names are only needed for the monitoring printout; a dict lookup avoids
    # scanning the whole frame for every printed example.
    id2name = indexed["name"].to_dict() if monitor_iter > 0 else {}

    results = []
    rows = zip(ids_list, preds_list, distances_list)
    for i, (entity_id, preds, distances) in enumerate(tqdm(rows, total=len(ids_list))):
        gts = poi2gts[id2poi[entity_id]]
        gts_set = set(gts)

        # extract Top-N closest false positives as negative candidates
        candidates, candidate_distances = eliminate_gts(preds, distances, gts)
        if len(candidates) > 0:
            candidates = pruned(candidates, candidate_distances, n_to_prune=n_to_prune)

        # De-duplicate with dict.fromkeys rather than set(): set iteration order
        # for strings changes between interpreter runs, which would make the
        # seeded sampling below non-reproducible.
        tp_others = list(dict.fromkeys(gt for gt in gts if gt != entity_id))
        fps = [cand for cand in dict.fromkeys(candidates) if cand not in gts_set]

        # sample positive pair (skipped when the id has no other match)
        positive = None
        if tp_others:
            positive = tp_others[np.random.randint(len(tp_others))]
            results.append((entity_id, positive, 1))

        # sample negative pairs without replacement so no pair is emitted twice
        negatives = []
        if fps:
            n_samples = min(len(fps), n_negatives)
            neg_idxs = np.random.choice(len(fps), size=n_samples, replace=False)
            negatives = [fps[j] for j in neg_idxs]
            results.extend((entity_id, negative, 0) for negative in negatives)

        if monitor_iter > 0 and i % monitor_iter == 0:
            # Show the last negative drawn for THIS id, and only if one exists.
            negative = negatives[-1] if negatives else None
            print("=" * 80)
            if positive is not None:
                print(f"positive pair: ({entity_id}, {positive})")
            if negative is not None:
                print(f"negative pair: ({entity_id}, {negative})")

            # visualize text
            base_text = id2name[entity_id]
            if positive is not None:
                print(f'positive sentences pair: ("{base_text}", "{id2name[positive]}")')
            if negative is not None:
                print(f'negative sentences pair: ("{base_text}", "{id2name[negative]}")')

    # Explicit column names keep the schema stable even when no pair was produced.
    return pd.DataFrame(results, columns=["id1", "id2", "matched"])

In [None]:
# Build labelled pairs for the dev split; example pairs are printed every 1000 ids.
results_dev_df = make_pair_dataset(preds_dev_df, train_dev, n_negatives=CFG.N_NEGATIVES, n_to_prune=CFG.N_CLOSEST, monitor_iter=1000)

  0%|          | 0/192549 [00:00<?, ?it/s]

positive pair: (E_4544b36e5c2b88, E_5f9804714e0bc9)
negative pair: (E_4544b36e5c2b88, E_a63073ab2c7675)
positive sentences pair: ("krystal height", "island glades-krystal heights")
positive sentences pair: ("krystal height", "klia reclaim baggage carousel f")
positive pair: (E_d863ef04e80ae8, E_ce8e20f64221ba)
negative pair: (E_d863ef04e80ae8, E_752eaaf69615bc)
positive sentences pair: ("บันดาหยา รีสอร์ท", "bundhaya resort")
positive sentences pair: ("บันดาหยา รีสอร์ท", "บ้านโอบฟ้า รีสอร์ท")
positive pair: (E_e543453375b402, E_c8ef4882faa979)
negative pair: (E_e543453375b402, E_14fd5dfa24d4e3)
positive sentences pair: ("school of bussiness engineering entrepreneurship (ppit)", "school of business, universiti malaysia perlis")
positive sentences pair: ("school of bussiness engineering entrepreneurship (ppit)", "smu school of economics (soe)")
positive pair: (E_26399a70715de9, E_62ef238875d773)
negative pair: (E_26399a70715de9, E_ef4597ad92c365)
positive sentences pair: ("新顺美食中心coffeesho

In [None]:
# Number of dev pairs and the share of each label.
len(results_dev_df), results_dev_df.value_counts("matched") / len(results_dev_df)

(1346336, matched
 0    0.856983
 1    0.143017
 dtype: float64)

In [None]:
# Build labelled pairs for the test split; example pairs are printed every 1000 ids.
results_test_df = make_pair_dataset(preds_test_df, train_test, n_negatives=CFG.N_NEGATIVES, n_to_prune=CFG.N_CLOSEST, monitor_iter=1000)

  0%|          | 0/192401 [00:00<?, ?it/s]

positive pair: (E_df67d07f262621, E_c76a74840909fe)
negative pair: (E_df67d07f262621, E_a94e5778d0d8aa)
positive sentences pair: ("proton bunga raya parts service center", "bunga raya part @ service sdn. bhd.")
positive sentences pair: ("proton bunga raya parts service center", "toyota thonburi center service")
positive pair: (E_5d9fc6a881adb6, E_c6e9014b9d2b86)
negative pair: (E_5d9fc6a881adb6, E_665ae273badeee)
positive sentences pair: ("知粥常乐", "知足常乐")
positive sentences pair: ("知粥常乐", "老豆猪肉粉")
positive pair: (E_50711029ea07ad, E_a9b0d74a25b8b7)
negative pair: (E_50711029ea07ad, E_3c880ba348ff50)
positive sentences pair: ("harris resort-cabanna room", "harris resort")
positive sentences pair: ("harris resort-cabanna room", "renaissance resort & spa room 142")
positive pair: (E_7eedd32c04e901, E_5c11bf1e3fd04f)
negative pair: (E_7eedd32c04e901, E_c4bdffdc196033)
positive sentences pair: ("warung bambu pak iyek - makanan laut", "warung kajol")
positive sentences pair: ("warung bambu pa

In [None]:
# Number of test pairs and the share of each label.
len(results_test_df), results_test_df.value_counts("matched") / len(results_test_df)

(1344027, matched
 0    0.856847
 1    0.143153
 dtype: float64)

# Make Ditto's input

In [None]:
def make_ditto_output(result_df, train_df, output_path="example.tsv"):
    """Serialize labelled pairs as gzip-compressed TSV lines of ``left<TAB>right<TAB>label``.

    Each side is written as ``COL <name> VAL <value>`` tokens joined by single
    spaces: first every column of ``CFG.TEXT_COLS``, then every column of
    ``CFG.NUM_COLS`` formatted with three decimals.

    Args:
        result_df: Frame with columns "id1", "id2" and "matched".
        train_df: Frame with an "id" column plus all text/numeric feature columns.
        output_path: Target path WITHOUT the ".gz" suffix; the file is written
            to ``f"{output_path}.gz"`` with no header and no index.

    Returns:
        None.

    Notes:
        Both merges are inner joins, so a pair whose id is missing from
        ``train_df`` is dropped.
        NOTE(review): ``to_csv`` uses its default quoting, so fields that
        contain a double quote are quoted/escaped in the file — confirm the
        downstream reader handles that.
    """
    feature_cols = [*CFG.TEXT_COLS, *CFG.NUM_COLS]
    features = train_df[["id", *feature_cols]]

    def _features_for(side):
        # Suffix every column with the side number: id -> id1, name -> name1, ...
        mapping = {"id": f"id{side}", **{col: f"{col}{side}" for col in feature_cols}}
        return features.rename(columns=mapping)

    pairs = result_df.merge(_features_for("1"), on="id1").merge(_features_for("2"), on="id2")

    def _serialize(side):
        # Whole-column string concatenation instead of a Python call per row.
        parts = [f"COL {col} VAL " + pairs[f"{col}{side}"].astype(str) for col in CFG.TEXT_COLS]
        parts += [f"COL {col} VAL " + pairs[f"{col}{side}"].map("{:.3f}".format) for col in CFG.NUM_COLS]
        if not parts:
            return pd.Series("", index=pairs.index)
        serialized = parts[0]
        for part in parts[1:]:
            serialized = serialized + " " + part
        return serialized

    pd.DataFrame(
        {"left": _serialize("1"), "right": _serialize("2"), "matched": pairs["matched"]}
    ).to_csv(f"{output_path}.gz", index=False, header=False, sep="\t", compression="gzip")

In [None]:
# Write the dev pairs; make_ditto_output appends ".gz" to the given path.
make_ditto_output(results_dev_df, train_dev, output_path=f"ditto_dev_n{CFG.N_CLOSEST}_seed{CFG.RANDOM_SEED}.tsv")

creating ditto labels...:   0%|          | 0/1346336 [00:00<?, ?it/s]

In [None]:
# Write the test pairs; make_ditto_output appends ".gz" to the given path.
make_ditto_output(results_test_df, train_test, output_path=f"ditto_test_n{CFG.N_CLOSEST}_seed{CFG.RANDOM_SEED}.tsv")

creating ditto labels...:   0%|          | 0/1344027 [00:00<?, ?it/s]

In [None]:
# Read the written dev file back (the file has no header, so column names are supplied here).
ditto_dev_df = pd.read_csv(f"ditto_dev_n{CFG.N_CLOSEST}_seed{CFG.RANDOM_SEED}.tsv.gz", names=["left", "right", "matched"], sep="\t", compression="gzip")

In [None]:
# Read the written test file back (the file has no header, so column names are supplied here).
ditto_test_df = pd.read_csv(f"ditto_test_n{CFG.N_CLOSEST}_seed{CFG.RANDOM_SEED}.tsv.gz", names=["left", "right", "matched"], sep="\t", compression="gzip")

In [None]:
# Label shares after the write/read round trip; compare with the shares computed from results_dev_df.
ditto_dev_df.value_counts("matched") / len(ditto_dev_df)

matched
0    0.856983
1    0.143017
dtype: float64

In [None]:
# Label shares after the write/read round trip; compare with the shares computed from results_test_df.
ditto_test_df.value_counts("matched") / len(ditto_test_df)

matched
0    0.856847
1    0.143153
dtype: float64

In [None]:
# Preview the serialized dev pairs.
ditto_dev_df

Unnamed: 0,left,rihgt,matched
0,"COL name VAL krystal height COL address VAL lintang delima 14 COL city VAL island glades COL state VAL penang COL zip VAL 11700 COL country VAL my COL url VAL - COL phone VAL - COL categories VAL residential buildings (apartments / condos), buildings COL latitude VAL 5.384 COL longitude VAL 100.299",COL name VAL island glades-krystal heights COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL my COL url VAL - COL phone VAL - COL categories VAL - COL latitude VAL 5.384 COL longitude VAL 100.298,1
1,"COL name VAL top bowl, skudai parade COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL my COL url VAL - COL phone VAL - COL categories VAL - COL latitude VAL 1.538 COL longitude VAL 103.674",COL name VAL island glades-krystal heights COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL my COL url VAL - COL phone VAL - COL categories VAL - COL latitude VAL 5.384 COL longitude VAL 100.298,0
2,COL name VAL cinenye garden COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL sg COL url VAL - COL phone VAL - COL categories VAL - COL latitude VAL 1.341 COL longitude VAL 103.734,COL name VAL island glades-krystal heights COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL my COL url VAL - COL phone VAL - COL categories VAL - COL latitude VAL 5.384 COL longitude VAL 100.298,0
3,COL name VAL luna's first home@compass heights COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL sg COL url VAL - COL phone VAL - COL categories VAL - COL latitude VAL 1.391 COL longitude VAL 103.896,COL name VAL island glades-krystal heights COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL my COL url VAL - COL phone VAL - COL categories VAL - COL latitude VAL 5.384 COL longitude VAL 100.298,0
4,COL name VAL esplanade by the bay COL address VAL esplanade COL city VAL singapore COL state VAL singapore COL zip VAL - COL country VAL sg COL url VAL - COL phone VAL - COL categories VAL - COL latitude VAL 1.290 COL longitude VAL 103.856,COL name VAL island glades-krystal heights COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL my COL url VAL - COL phone VAL - COL categories VAL - COL latitude VAL 5.384 COL longitude VAL 100.298,0
...,...,...,...
1346331,COL name VAL pullman grand hotel gym COL address VAL - COL city VAL kinşasa COL state VAL kinshasa city COL zip VAL - COL country VAL cd COL url VAL - COL phone VAL - COL categories VAL gyms COL latitude VAL -4.313 COL longitude VAL 15.273,COL name VAL pullmam gym COL address VAL - COL city VAL kinşasa COL state VAL kinshasa COL zip VAL - COL country VAL cd COL url VAL - COL phone VAL - COL categories VAL college gyms COL latitude VAL -4.312 COL longitude VAL 15.273,1
1346332,COL name VAL portobello COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL sc COL url VAL - COL phone VAL - COL categories VAL italian restaurants COL latitude VAL -4.478 COL longitude VAL 55.247,COL name VAL portobello restaraunt COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL sc COL url VAL - COL phone VAL - COL categories VAL italian restaurants COL latitude VAL -4.478 COL longitude VAL 55.247,1
1346333,COL name VAL portobello restaraunt COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL sc COL url VAL - COL phone VAL - COL categories VAL italian restaurants COL latitude VAL -4.478 COL longitude VAL 55.247,COL name VAL portobello COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL sc COL url VAL - COL phone VAL - COL categories VAL italian restaurants COL latitude VAL -4.478 COL longitude VAL 55.247,1
1346334,COL name VAL aguascalientes COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL xx COL url VAL - COL phone VAL - COL categories VAL - COL latitude VAL -4.215 COL longitude VAL -94.219,COL name VAL guayaquil COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL xx COL url VAL - COL phone VAL - COL categories VAL - COL latitude VAL -3.693 COL longitude VAL -95.757,1


In [None]:
# Preview the serialized test pairs.
ditto_test_df

Unnamed: 0,left,rihgt,matched
0,"COL name VAL proton bunga raya parts service center COL address VAL - COL city VAL melaka COL state VAL - COL zip VAL - COL country VAL my COL url VAL - COL phone VAL - COL categories VAL professional & other places, auto workshops COL latitude VAL 2.233 COL longitude VAL 102.233",COL name VAL bunga raya part @ service sdn. bhd. COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL my COL url VAL - COL phone VAL - COL categories VAL automotive shops COL latitude VAL 2.234 COL longitude VAL 102.233,1
1,COL name VAL yueka one-stop auto service COL address VAL #166 COL city VAL phnom penh COL state VAL 金边 COL zip VAL - COL country VAL kh COL url VAL - COL phone VAL - COL categories VAL auto garages COL latitude VAL 11.612 COL longitude VAL 104.928,COL name VAL bunga raya part @ service sdn. bhd. COL address VAL - COL city VAL - COL state VAL - COL zip VAL - COL country VAL my COL url VAL - COL phone VAL - COL categories VAL automotive shops COL latitude VAL 2.234 COL longitude VAL 102.233,0
2,"COL name VAL proton bunga raya parts service center COL address VAL - COL city VAL melaka COL state VAL - COL zip VAL - COL country VAL my COL url VAL - COL phone VAL - COL categories VAL professional & other places, auto workshops COL latitude VAL 2.233 COL longitude VAL 102.233",COL name VAL sin siang hin proton service dealer COL address VAL jalan gombak COL city VAL 吉隆坡 COL state VAL 吉隆坡 COL zip VAL 53000 COL country VAL my COL url VAL - COL phone VAL - COL categories VAL automotive shops COL latitude VAL 3.196 COL longitude VAL 101.704,0
3,COL name VAL sin siang hin (m) proton service centre COL address VAL jalan gombak COL city VAL kuala lumpur COL state VAL wp kuala lumpur COL zip VAL 53300 COL country VAL my COL url VAL - COL phone VAL - COL categories VAL automotive shops COL latitude VAL 3.217 COL longitude VAL 101.746,COL name VAL sin siang hin proton service dealer COL address VAL jalan gombak COL city VAL 吉隆坡 COL state VAL 吉隆坡 COL zip VAL 53000 COL country VAL my COL url VAL - COL phone VAL - COL categories VAL automotive shops COL latitude VAL 3.196 COL longitude VAL 101.704,1
4,"COL name VAL proton bunga raya parts service center COL address VAL - COL city VAL melaka COL state VAL - COL zip VAL - COL country VAL my COL url VAL - COL phone VAL - COL categories VAL professional & other places, auto workshops COL latitude VAL 2.233 COL longitude VAL 102.233",COL name VAL proton edar kepong branch COL address VAL jalan 3/34a COL city VAL kuala lumpur COL state VAL federal territory of kuala lum COL zip VAL 52100 COL country VAL my COL url VAL - COL phone VAL - COL categories VAL - COL latitude VAL 3.213 COL longitude VAL 101.646,0
...,...,...,...
1344022,COL name VAL petit anse COL address VAL chemin anse soleil COL city VAL - COL state VAL baie lazare COL zip VAL - COL country VAL sc COL url VAL - COL phone VAL - COL categories VAL beaches COL latitude VAL -4.755 COL longitude VAL 55.467,COL name VAL petite anse COL address VAL chemin anse soleil COL city VAL mahé COL state VAL baie lazare COL zip VAL - COL country VAL sc COL url VAL http://www.fourseasons.com/seychelles/ COL phone VAL 4393000 COL categories VAL beaches COL latitude VAL -4.755 COL longitude VAL 55.465,1
1344023,COL name VAL petite anse COL address VAL chemin anse soleil COL city VAL mahé COL state VAL baie lazare COL zip VAL - COL country VAL sc COL url VAL http://www.fourseasons.com/seychelles/ COL phone VAL 4393000 COL categories VAL beaches COL latitude VAL -4.755 COL longitude VAL 55.465,COL name VAL petit anse COL address VAL chemin anse soleil COL city VAL - COL state VAL baie lazare COL zip VAL - COL country VAL sc COL url VAL - COL phone VAL - COL categories VAL beaches COL latitude VAL -4.755 COL longitude VAL 55.467,1
1344024,COL name VAL broome district hospital COL address VAL - COL city VAL broome COL state VAL wa COL zip VAL - COL country VAL au COL url VAL - COL phone VAL - COL categories VAL hospitals COL latitude VAL -17.960 COL longitude VAL 122.237,COL name VAL broome hospital COL address VAL - COL city VAL broome COL state VAL western australia COL zip VAL - COL country VAL au COL url VAL - COL phone VAL - COL categories VAL hospitals COL latitude VAL -17.961 COL longitude VAL 122.237,1
1344025,COL name VAL familiprix extra - johanne giguère COL address VAL 710 montée paiement COL city VAL gatineau COL state VAL qc COL zip VAL j8r 4a3 COL country VAL ca COL url VAL - COL phone VAL - COL categories VAL pharmacies COL latitude VAL 59.029 COL longitude VAL -97.765,"COL name VAL familiprix - johanne giguère COL address VAL 710, montée paiement COL city VAL gatineau COL state VAL qc COL zip VAL j8r 4a3 COL country VAL ca COL url VAL - COL phone VAL +18196691734 COL categories VAL pharmacies COL latitude VAL 45.503 COL longitude VAL -75.677",1


# Make datasets of various sizes

In [None]:
import os


def make_various_sized_datasets(ditto_df, file_prefix):
    """Write seeded random subsets of ``ditto_df`` at five sizes.

    The sizes are the full row count and that count integer-divided by 4, 16,
    64 and 256, labelled "full", "xlarge", "large", "medium" and "small". Each
    subset is sampled with ``CFG.RANDOM_SEED`` and written, without header or
    index, to ``f"{file_prefix}_{label}.tsv.gz"`` as gzip-compressed TSV.

    Returns:
        Dict mapping each size label to its number of rows.
    """
    total_rows = len(ditto_df)
    divisors = {"full": 1, "xlarge": 4, "large": 16, "medium": 64, "small": 256}
    sizes = {label: total_rows // divisor for label, divisor in divisors.items()}

    for label, n_rows in tqdm(sizes.items()):
        subset = ditto_df.sample(n_rows, random_state=CFG.RANDOM_SEED)
        target = f"{file_prefix}_{label}.tsv.gz"
        subset.to_csv(target, index=False, header=False, sep="\t", compression="gzip")

    return sizes

In [None]:
# Write the five dev subsets; the returned dict (label -> row count) is shown as the cell output.
make_various_sized_datasets(ditto_dev_df, f"ditto_dev_feat_all_n{CFG.N_CLOSEST}_seed{CFG.RANDOM_SEED}")

  0%|          | 0/5 [00:00<?, ?it/s]

cp: cannot stat '{out_filepath}.gz': No such file or directory
cp: cannot stat '{out_filepath}.gz': No such file or directory
cp: cannot stat '{out_filepath}.gz': No such file or directory
cp: cannot stat '{out_filepath}.gz': No such file or directory
cp: cannot stat '{out_filepath}.gz': No such file or directory


{'full': 1346336,
 'large': 84146,
 'medium': 21036,
 'small': 5259,
 'xlarge': 336584}

In [None]:
# Write the five test subsets; the returned dict (label -> row count) is shown as the cell output.
make_various_sized_datasets(ditto_test_df, f"ditto_test_feat_all_n{CFG.N_CLOSEST}_seed{CFG.RANDOM_SEED}")

  0%|          | 0/5 [00:00<?, ?it/s]

{'full': 1344027,
 'large': 84001,
 'medium': 21000,
 'small': 5250,
 'xlarge': 336006}

In [None]:
# Copy every generated ditto_*.tsv.gz file from the working directory back to the Drive data directory.
! cp ditto_*.tsv.gz "{DATA}"/.