In [1]:
%%file requirements.txt
datasets==1.16.1
pycountry==20.7.3
wandb==0.12.7
sentence-transformers==2.1.0
rapidfuzz==1.8.3

Writing requirements.txt


In [2]:
pip install -qr requirements.txt

[K     |████████████████████████████████| 298 kB 8.7 MB/s 
[K     |████████████████████████████████| 10.1 MB 46.5 MB/s 
[K     |████████████████████████████████| 1.7 MB 45.7 MB/s 
[K     |████████████████████████████████| 78 kB 6.7 MB/s 
[K     |████████████████████████████████| 854 kB 70.9 MB/s 
[K     |████████████████████████████████| 243 kB 44.6 MB/s 
[K     |████████████████████████████████| 132 kB 37.8 MB/s 
[K     |████████████████████████████████| 61 kB 452 kB/s 
[K     |████████████████████████████████| 1.1 MB 54.0 MB/s 
[K     |████████████████████████████████| 180 kB 73.8 MB/s 
[K     |████████████████████████████████| 140 kB 71.2 MB/s 
[K     |████████████████████████████████| 97 kB 6.3 MB/s 
[K     |████████████████████████████████| 3.1 MB 51.0 MB/s 
[K     |████████████████████████████████| 3.3 MB 39.5 MB/s 
[K     |████████████████████████████████| 1.2 MB 54.9 MB/s 
[K     |████████████████████████████████| 63 kB 1.6 MB/s 
[K     |███████████████████████

In [3]:
import traceback
import warnings
from typing import List

import numpy as np
import pandas as pd
import pycountry
import torch
import wandb
from datasets import load_dataset
from tqdm.auto import tqdm
from sentence_transformers import util

# Embedding

In [4]:
# initialize

In [5]:
from rapidfuzz import process

In [6]:
# embed


# Execution

In [7]:
# Settings for this run are kept as attributes on wandb's config object.
config = wandb.config

# Label for the similarity method; it is embedded in the names derived below.
config.embedder = "levenshtein"
config.name = "tapaco-{}".format(config.embedder)

In [8]:
# Log in to Weights & Biases so that later cells can create a run and upload
# artifacts. The recorded output shows the API key being appended to ~/.netrc.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [9]:
# Start the W&B run under which the dataset artifact is logged.
run_settings = dict(
    entity="eenlp",
    project="paraphrase_detection",
    job_type="create-dataset",
    name="create-tapaco-{}".format(config.embedder),
)
wandb.init(**run_settings)

[34m[1mwandb[0m: Currently logged in as: [33mmatts[0m (use `wandb login --relogin` to force relogin)


In [10]:
# Build paraphrase / non-paraphrase sentence pairs from TaPaCo and log them as
# a W&B artifact.
#
# Steps:
#   1. Load the "all_languages" TaPaCo config and keep only the listed languages.
#   2. Drop rows whose paraphrase_set_id is 0.
#   3. For every sentence in a paraphrase set with at least two members, emit
#      two positive pairs, one "hard" negative (closest fuzzy match outside the
#      set) and one random negative.
#   4. Wrap the pairs in a DataFrame and upload it as a wandb.Table artifact.

# Only the first N rows of each language are used (an earlier run used 20_000).
DATASET_ONLY_FIRST_N = 10_000

config.DATASET_ONLY_FIRST_N = DATASET_ONLY_FIRST_N

dataset_name = "tapaco"

dataset = load_dataset("tapaco", "all_languages")
df = dataset["train"].to_pandas()

df["paraphrase_set_id"] = df["paraphrase_set_id"].astype(int)

languages_to_keep = [
    "be",  # Belarusian
    "bg",  # Bulgarian
    "cs",  # Czech
    "en",  # English
    "et",  # Estonian
    "hr",  # Croatian
    "hu",  # Hungarian
    "hy",  # Armenian
    "lt",  # Lithuanian
    "mk",  # Macedonian
    "pl",  # Polish
    "ro",  # Romanian
    "ru",  # Russian
    "sl",  # Slovenian
    "sr",  # Serbian
    "uk",  # Ukrainian
]

df = df[df["language"].isin(languages_to_keep)]
assert len(languages_to_keep) == len(
    df["language"].unique()
), "Count of filtered languages doesn't match, probably a typo in 'languages_to_keep'?"

# paraphrase_set_id = 0 seems to mean "unassigned".
df = df[df["paraphrase_set_id"] != 0].copy()


def _make_example(sentence1, sentence2, label, lang):
    """Return one output record with the columns used by the final table."""
    return {
        "sentence1": sentence1,
        "sentence2": sentence2,
        "label": label,
        "lang": lang,
    }


# We need to generate example pairs somehow.
# As of the current version of the code, 50% of the generated examples are true paraphrases (label = 1),
# 25% are chosen from the most similar non-paraphrases (label = 0),
# and 25% is random negative examples (label = 0).

records = []
np.random.seed(0)  # fixed seed so the sampled pairs are reproducible
for language, df_language in tqdm(df.groupby("language"), "language", position=0):
    language_name = pycountry.languages.get(alpha_2=language).name

    df_language = df_language[:DATASET_ONLY_FIRST_N]

    for paraphrase_set_id, df_set in tqdm(
        df_language.groupby("paraphrase_set_id"),
        "paraphrase_set_id",
        position=1,
    ):
        # A set with a single sentence cannot yield a positive pair.
        if len(df_set) <= 1:
            continue

        # Every sentence of this language that is NOT in the current set;
        # computed once per set and shared by both negative samplers.
        negative_sentences = df_language.loc[
            df_language["paraphrase_set_id"] != paraphrase_set_id, "paraphrase"
        ]
        if negative_sentences.empty:
            # Without this guard both negative samplers below would raise
            # (IndexError on the empty match list / ValueError in np.random.choice).
            warnings.warn(
                f"No negative candidates for paraphrase set {paraphrase_set_id} "
                f"in language {language_name!r}; skipping this set."
            )
            continue

        for row in df_set.itertuples():
            # The other members of the same set are the positive candidates.
            partners = df_set.loc[
                df_set.index != row.Index, "paraphrase"
            ].to_numpy()

            # Draw two *different* partner rows whenever the set allows it, so
            # the same positive pair is not emitted twice. With exactly one
            # partner the repeat is unavoidable and is kept, which preserves
            # the 50/25/25 label balance described above.
            for partner in np.random.choice(
                partners, size=2, replace=len(partners) < 2
            ):
                records.append(
                    _make_example(row.paraphrase, partner, 1, language_name)
                )

            # similar negative: best fuzzy match among sentences outside the set
            top_results = process.extract(
                row.paraphrase,
                negative_sentences,
                limit=1,
            )
            records.append(
                _make_example(row.paraphrase, top_results[0][0], 0, language_name)
            )

            # random negative
            records.append(
                _make_example(
                    row.paraphrase,
                    np.random.choice(negative_sentences),
                    0,
                    language_name,
                )
            )

result = pd.DataFrame(records)
result["source"] = dataset_name
result["split"] = "train"

# Upload the generated pairs as a table inside a dataset artifact.
artifact = wandb.Artifact(name=config.name, type="tapaco-dataset")
artifact.add(wandb.Table(dataframe=result), "data")
wandb.run.log_artifact(artifact)

Downloading:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.69k [00:00<?, ?B/s]

Downloading and preparing dataset tapaco/all_languages (download: 30.72 MiB, generated: 155.26 MiB, post-processed: Unknown size, total: 185.98 MiB) to /root/.cache/huggingface/datasets/tapaco/all_languages/1.0.0/71d200534b520a174927a8f0479c06220a0a6fb5201a84ebfce19006c6354698...


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/32.2M [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

Dataset tapaco downloaded and prepared to /root/.cache/huggingface/datasets/tapaco/all_languages/1.0.0/71d200534b520a174927a8f0479c06220a0a6fb5201a84ebfce19006c6354698. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

language:   0%|          | 0/16 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/645 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/2638 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/2831 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/2256 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/112 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/230 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/2605 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/240 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/3465 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/3934 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/3547 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/940 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/1942 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/309 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/3236 [00:00<?, ?it/s]

paraphrase_set_id:   0%|          | 0/2699 [00:00<?, ?it/s]



<wandb.sdk.wandb_artifacts.Artifact at 0x7f4ad4006410>

In [11]:
# Close the active W&B run. The recorded output shows the remaining artifact
# upload progress being reported by this call.
wandb.run.finish()

VBox(children=(Label(value=' 30.38MB of 30.38MB uploaded (0.23MB deduped)\r'), FloatProgress(value=1.0, max=1.…