In [None]:
from datasets import load_dataset, Dataset, concatenate_datasets, DatasetDict
import random
import numpy as np
import torch
from huggingface_hub import notebook_login
import pandas as pd
import re
import os

# Setting the random seed for reproducibility.
# Each library keeps its own generator, so each one is seeded explicitly.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# SECURITY: never store a Hugging Face access token in the notebook source.
# A token that was previously committed here must be treated as leaked:
# revoke it in the Hugging Face account settings and create a new one.
# Credentials are entered interactively below instead; for non-interactive
# runs, supply the token through the environment outside of this file
# (see the huggingface_hub environment-variable documentation).

# Log in to Hugging Face (prompts for the token; nothing is written to the notebook)
notebook_login()

In [2]:
# Number of worker processes for parallel dataset processing.
# NOTE(review): presumably meant to be passed as `num_proc` to `Dataset.map`;
# confirm that the processing cell actually reads this constant.
NUM_PROC = 5

In [3]:
# Template filling function
def fill_template(templates, values):
    """Pick a random template and substitute its numbered placeholders.

    Placeholders are written ``${1}``, ``${2}``, ...; ``${k}`` is replaced
    by ``values[k - 1]``.

    Args:
        templates: Non-empty sequence of template strings to choose from.
        values: Sequence of replacement strings; index 0 fills ``${1}``.

    Returns:
        The chosen template with every placeholder that has a corresponding
        entry in ``values`` filled in. Placeholders without a corresponding
        value are left unchanged.

    Raises:
        ValueError: If ``templates`` is empty.
    """
    if len(templates) == 0:
        raise ValueError("templates must contain at least one template")
    chosen = random.choice(templates)

    def _lookup(match):
        position = int(match.group(1)) - 1
        # Keep placeholders that have no corresponding value untouched.
        return values[position] if position < len(values) else match.group(0)

    # Substitute in a single pass over the template. Replacing placeholders
    # one after another would re-scan text that was just inserted, so a value
    # that itself contains e.g. "${2}" would be rewritten by the next step.
    return re.sub(r"\$\{([1-9][0-9]*)\}", _lookup, chosen)

In [4]:
# Load datasets
# One PAWS-X configuration is loaded per language code and stored under that
# code, so `dataset[lang]` holds whatever `load_dataset` returns for `lang`.
languages = ['de', 'en', 'es', 'fr', 'ja', 'ko', 'zh']
dataset = dict(
    (lang, load_dataset("google-research-datasets/paws-x", lang))
    for lang in languages
)

In [18]:
# Peek at the first German training example to check the raw schema
# (the output below shows the fields id, sentence1, sentence2 and label).
dataset['de']['train'][0]

{'id': 1,
 'sentence1': 'Im Oktober 1560 traf er sich in Paris heimlich mit dem englischen Botschafter Nicolas Throckmorton und bat ihn um einen Pass, um durch Schottland nach England zurückzukehren.',
 'sentence2': 'Im Oktober 1560 traf er sich heimlich mit dem englischen Botschafter Nicolas Throckmorton in Paris und bat ihn um einen Pass, um durch England nach Schottland zurückzukehren.',
 'label': 0}

In [5]:
# Define templates
# "${1}" and "${2}" are placeholders for the first and second sentence.

# Statements asserting that the two sentences say the same thing.
pos_templates = [
    "\"${1}\" is a duplicate of \"${2}\"",
    "\"${1}\" duplicates \"${2}\"",
    "\"${1}\" is the same as \"${2}\"",
    "\"${1}\" can be stated as \"${2}\"",
    "\"${1}\" is a paraphrase of \"${2}\"",
]

# Statements asserting that the two sentences do NOT say the same thing.
negative_templates = [
    "\"${1}\" is not a duplicate of \"${2}\"",
    "\"${1}\" does not duplicate \"${2}\"",
    "\"${1}\" doesn't duplicate \"${2}\"",
    "\"${1}\" is not the same as \"${2}\"",
    "\"${1}\" is unrelated to \"${2}\"",
    "\"${1}\" can't be stated as \"${2}\"",
    "\"${1}\" can not be stated as \"${2}\"",
    "\"${1}\" is not a paraphrase of \"${2}\"",
    "\"${1}\" isn't a paraphrase of \"${2}\"",
]

# Function to process each row
def process_row(row):
    """Convert one sentence-pair row into a statement plus a truth label.

    A coin flip decides whether the generated statement should be true.
    Rows with ``label == 1`` are treated as paraphrase pairs: for them a
    "paraphrase" template yields a true statement and a "not a paraphrase"
    template a false one; for every other label it is the other way round.

    Args:
        row: Mapping with the keys ``'label'``, ``'sentence1'`` and
            ``'sentence2'``.

    Returns:
        dict with ``'statement'`` (the filled-in template) and ``'is_true'``
        (1 if the statement was generated to be true, otherwise 0).
    """
    # Decide up front whether this example should be a true statement.
    positive = random.choice([True, False])
    is_paraphrase = row['label'] == 1

    # The affirmative wording is needed exactly when "is a paraphrase" and
    # "statement should be true" agree; otherwise use the negated wording.
    if is_paraphrase == positive:
        candidates = pos_templates
    else:
        candidates = negative_templates
    template = random.choice(candidates)

    sentences = [row['sentence1'], row['sentence2']]
    return {
        'statement': fill_template([template], sentences),
        'is_true': int(positive),
    }

In [6]:
# Process datasets
# Map every split of every language through `process_row`, producing a
# nested {language: {split: processed dataset}} dictionary.
# The worker count is taken from the NUM_PROC configuration constant rather
# than a separate hard-coded number, so there is one place to tune it.
# NOTE(review): `process_row` draws from the global `random` module inside the
# worker processes; whether the seed set in the parent process makes the
# result reproducible depends on how the workers are started - verify.
new_data = {}
for lang, splits in dataset.items():
    new_data[lang] = {
        split: split_data.map(process_row, num_proc=NUM_PROC)
        for split, split_data in splits.items()
    }

Map (num_proc=6):   0%|          | 0/49401 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/49401 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/49401 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/49401 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/49401 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/49401 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/49401 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [9]:
# Wrap each language's {split: dataset} mapping in a DatasetDict.
dataset_dict = {lang: DatasetDict(splits) for lang, splits in new_data.items()}

In [11]:
# Drop the raw input columns so only the remaining fields are published.
raw_columns = ['sentence1', 'sentence2', 'label']
for lang in dataset_dict:
    for split in dataset_dict[lang]:
        split_data = dataset_dict[lang][split]
        # Only drop columns that are still present: this keeps the cell
        # idempotent, so running it a second time is a no-op instead of an
        # error about columns that were already removed.
        stale = [name for name in raw_columns if name in split_data.column_names]
        if stale:
            dataset_dict[lang][split] = split_data.remove_columns(stale)

In [16]:
# Upload each language as its own configuration of one Hub dataset repository.
# The configuration name is passed by keyword: in older `datasets` releases the
# second positional parameter of `push_to_hub` was `private`, so a positional
# language code could silently be interpreted as that flag.
for lang in dataset_dict:
    dataset_dict[lang].push_to_hub("mbzuai-ugrip-statement-tuning/paws-x", config_name=lang)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/568 [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.18k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.72k [00:00<?, ?B/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/50 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.25k [00:00<?, ?B/s]