In [None]:
import traceback
import warnings
from typing import List

import numpy as np
import pandas as pd
import pycountry
import torch
import wandb
from datasets import load_dataset
from sentence_transformers import util
from tqdm.auto import tqdm

# Embedding

In [None]:
# initialize

In [None]:
# embed

def embed(sentences: List[str]) -> np.ndarray:
    """Embed a batch of sentences (template placeholder, to be implemented).

    Args:
        sentences: The sentences to embed.

    Returns:
        The embeddings of ``sentences``. The dataset-generation code stores
        one entry per input sentence, so presumably the result has shape
        ``(len(sentences), dim)`` -- confirm when implementing.

    Raises:
        NotImplementedError: Always, until an embedding model is plugged in.
    """
    # Fail loudly instead of silently returning None, which would violate the
    # annotated return type and only surface later as an unrelated error.
    raise NotImplementedError(
        "embed() is a placeholder: initialize an embedding model and implement it."
    )

# Execution

In [None]:
# Run configuration, stored on the W&B config object.
# NOTE(review): `wandb.config` is used here before `wandb.init()` is called
# further down -- confirm the installed wandb version allows setting config
# values before a run exists.
config = wandb.config

# Identifier of the embedder; left empty in this template.
config.embedder = ""
# Name used for the logged dataset artifact.
config.name = "tapaco-{}".format(config.embedder)

In [None]:
# Log in to Weights & Biases before the run is started.
wandb.login()

In [None]:
# Start the W&B run that records the dataset creation; the run name embeds
# the embedder identifier taken from `config`.
wandb.init(
    entity="eenlp",
    project="paraphrase_detection",
    job_type="create-dataset",  # plain literal: no placeholders, so no f-prefix
    name=f"create-tapaco-{config.embedder}",
)

In [None]:
# Cap on the number of rows kept per language (commented-out alternative: 20_000).
DATASET_ONLY_FIRST_N = 10_000
config.DATASET_ONLY_FIRST_N = DATASET_ONLY_FIRST_N

dataset_name = "tapaco"
dataset = load_dataset("tapaco", "all_languages")

# Two-letter codes of the languages that are retained.
languages_to_keep = [
    "be",  # Belarusian
    "bg",  # Bulgarian
    "cs",  # Czech
    "en",  # English
    "et",  # Estonian
    "hr",  # Croatian
    "hu",  # Hungarian
    "hy",  # Armenian
    "lt",  # Lithuanian
    "mk",  # Macedonian
    "pl",  # Polish
    "ro",  # Romanian
    "ru",  # Russian
    "sl",  # Slovenian
    "sr",  # Serbian
    "uk",  # Ukrainian
]

# Train split as a DataFrame, with integer set ids, restricted to the kept languages.
df = (
    dataset["train"]
    .to_pandas()
    .astype({"paraphrase_set_id": int})
    .loc[lambda frame: frame["language"].isin(languages_to_keep)]
)

# Every requested language must actually occur in the data.
n_languages_found = df["language"].nunique(dropna=False)
assert (
    len(languages_to_keep) == n_languages_found
), "Count of filtered languages doesn't match, probably a typo in 'languages_to_keep'?"

# paraphrase_set_id = 0 seems to mean "unassigned", so those rows are dropped.
df = df.loc[df["paraphrase_set_id"].ne(0)].copy()

# Build labelled sentence pairs, language by language.
# For every sentence that belongs to a paraphrase set with at least two members,
# four pairs are emitted:
#   * two true paraphrases drawn at random from the same set (label = 1),
#   * the most similar sentence from a different set, ranked by cosine
#     similarity of the embeddings (label = 0),
#   * one random sentence from a different set (label = 0).
# So 50% of the generated examples are positives, 25% are "hard" negatives
# and 25% are random negatives.

result = []
np.random.seed(0)
for language, df_language in tqdm(df.groupby("language"), "language", position=0):
    try:
        language_name = pycountry.languages.get(alpha_2=language).name

        # Copy the slice so the new column is written to an independent frame
        # rather than to a view of `df` (avoids SettingWithCopyWarning).
        df_language = df_language[:DATASET_ONLY_FIRST_N].copy()

        # `embed` is annotated List[str] -> np.ndarray, so hand it a list and
        # store the result as one vector per row (a 2-D array cannot be
        # assigned to a single column directly).
        embeddings = np.asarray(embed(df_language["paraphrase"].tolist()))
        if len(embeddings) != len(df_language):
            raise ValueError(
                f"embed() returned {len(embeddings)} embeddings "
                f"for {len(df_language)} sentences"
            )
        df_language["embeddings"] = list(embeddings)

        for paraphrase_set_id, df_set in tqdm(
            df_language.groupby("paraphrase_set_id"),
            "paraphrase_set_id",
            position=1,
        ):
            if len(df_set) <= 1:
                continue

            df_negatives = df_language[
                df_language["paraphrase_set_id"] != paraphrase_set_id
            ]
            if df_negatives.empty:
                # All sentences of this language are in the current set, so
                # there is nothing to sample negatives from.
                continue

            # Identical for every row of this set, so build the
            # (n_negatives, dim) matrix once instead of once per row.
            corpus_embeddings = torch.as_tensor(
                np.stack(df_negatives["embeddings"].to_numpy())
            )

            for row in df_set.itertuples():
                # All other members of the same paraphrase set.
                positive_candidates = df_set[df_set.index != row.Index]["paraphrase"]

                # TODO this can sample the same pair twice, consider fixing it
                for _ in range(2):
                    result.append(
                        {
                            "sentence1": row.paraphrase,
                            "sentence2": np.random.choice(positive_candidates),
                            "label": 1,
                            "lang": language_name,
                        }
                    )

                # similar negative: the closest sentence outside the set
                query_embedding = torch.as_tensor(row.embeddings)
                cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
                top_results = torch.topk(cos_scores, k=1)
                # Convert the 0-d index tensor to a plain int for `.iloc`.
                most_similar_position = top_results.indices[0].item()

                result.append(
                    {
                        "sentence1": row.paraphrase,
                        "sentence2": df_negatives.iloc[most_similar_position]["paraphrase"],
                        "label": 0,
                        "lang": language_name,
                    }
                )

                # random negative
                result.append(
                    {
                        "sentence1": row.paraphrase,
                        "sentence2": np.random.choice(df_negatives["paraphrase"]),
                        "label": 0,
                        "lang": language_name,
                    }
                )
    except Exception:
        # Best effort per language: report the failure and carry on with the
        # next language. Pairs appended before the failure stay in `result`.
        # KeyboardInterrupt / SystemExit are not `Exception`s and propagate.
        warnings.warn(traceback.format_exc())

# Turn the collected pair records into a table and tag every row with the
# originating dataset and the split name.
result = pd.DataFrame(result).assign(source=dataset_name, split="train")

# Attach the table to a dataset artifact under the key "data" and log it
# to the active W&B run.
artifact = wandb.Artifact(type="tapaco-dataset", name=config.name)
artifact.add(wandb.Table(dataframe=result), "data")
wandb.run.log_artifact(artifact)

In [None]:
# Close the active W&B run now that the artifact has been logged.
wandb.run.finish()