# Creating Statement Data

Import necessary libraries

In [1]:
from datasets import load_dataset, Dataset
import random
import numpy as np
import torch
from sklearn.utils import resample
import pandas as pd
import re

Log in to the Hugging Face Hub (needed later to push the dataset), then set the random seeds for reproducibility

In [23]:
# Interactive Hugging Face Hub login; renders a widget in the notebook.
# Presumably only required for the final `push_to_hub` call -- confirm before removing.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# Single seed shared by every RNG seeded below and by the `resample` calls
# (passed as `random_state`) in the dataset-building functions.
SEED = 42
# Worker count passed as `num_proc` to the dataset-building calls.
NUM_PROC=5
# Seed Python's `random`, NumPy and PyTorch (default generator and CUDA devices).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

Setting the number of statements per corpus/dataset

In [3]:
# Statements Per Corpus: upper bound on how many statements each corpus keeps
# after the final stratified resampling step of the dataset-building functions.
SPC = 1000

In [4]:
def fill_template(templates, values):
    """Pick one template at random and substitute its numbered placeholders.

    Placeholders are written ``${1}``, ``${2}``, ... and are 1-indexed into
    ``values``. Substitution happens in a single pass over the template, so
    text inserted for one placeholder is never re-scanned: a value that
    happens to contain the literal ``${2}`` is kept verbatim instead of being
    overwritten by the second value.

    Args:
        templates: Non-empty sequence of template strings to choose from.
        values: Sequence of strings; ``values[i]`` replaces ``${i+1}``.

    Returns:
        The chosen template with every placeholder that has a matching value
        replaced. Placeholders without a matching value are left untouched.

    Raises:
        ValueError: If ``templates`` is empty (raised by ``random.sample``).
    """
    # Keep `random.sample` (rather than `random.choice`) so seeded runs draw
    # from the RNG exactly as before.
    template = random.sample(templates, 1)[0]
    substitutions = {
        "${" + str(position) + "}": value
        for position, value in enumerate(values, start=1)
    }
    return re.sub(
        r"\$\{\d+\}",
        lambda match: substitutions.get(match.group(0), match.group(0)),
        template,
    )

In [5]:
def create_statement_dataset_sent_comparsion(dataset, templates, columns, label_column, num_statements=10000, prop_negative=0.5, negative_templates=None, splits=["train"], num_proc=5, cache_dir="/scratch/afz225/.cache", statements_per_corpus=None):
    """Turn a binary-labelled sentence-comparison dataset into true/false statements.

    Every sampled example is rendered once with each entry of ``templates``
    (statement truth value = the example's label) and once with each entry of
    ``negative_templates`` (truth value = ``1 - label``). The resulting
    statements are then down-sampled, stratified on ``is_true``.

    Args:
        dataset: Positional arguments unpacked into ``load_dataset``
            (e.g. ``["SetFit/qqp"]``).
        templates: Affirmative template strings using ``${1}``, ``${2}``, ...
        columns: Column names whose values fill ``${1}``, ``${2}``, ... in order.
        label_column: Name of the label column; labels must be 0/1 because
            the negated label is computed as ``1 - label``.
        num_statements: Number of source examples sampled from each split.
        prop_negative: Currently unused; kept for interface compatibility.
        negative_templates: Negated template strings. ``None`` (the default)
            is treated as "no negated templates".
        splits: List of split names passed to ``load_dataset``.
        num_proc: Number of worker processes for ``Dataset.map``.
        cache_dir: Cache directory passed to ``load_dataset``.
        statements_per_corpus: Maximum number of statements kept per split.
            ``None`` falls back to the notebook-level ``SPC`` constant.

    Returns:
        A list with one ``Dataset`` per requested split, each holding the
        columns ``statement`` and ``is_true``.
    """
    # The original default of None crashed when iterated; treat it as empty.
    negative_templates = negative_templates or []
    if statements_per_corpus is None:
        statements_per_corpus = SPC

    raw_splits = load_dataset(*dataset, split=splits, cache_dir=cache_dir)

    sampled_splits = []
    for raw_split in raw_splits:
        frame = pd.DataFrame(raw_split)
        # Large enough split: sample without replacement. Otherwise bootstrap
        # (with replacement) up to `num_statements` rows, as before.
        with_replacement = num_statements >= len(frame)
        sampled = resample(frame, n_samples=num_statements, replace=with_replacement, random_state=SEED)
        sampled_splits.append(Dataset.from_pandas(sampled))

    def create_statements_labels(batch):
        """Render one batch: all affirmative statements first, then all negated ones."""
        labels = batch[label_column]
        statements, truth = [], []
        for active_templates, negate in ((templates, False), (negative_templates, True)):
            for row, label in enumerate(labels):
                values = [batch[column][row] for column in columns]
                for template in active_templates:
                    statements.append(fill_template([template], values))
                    truth.append(1 - label if negate else label)
        return {"statement": statements, "is_true": truth}

    statement_frames = [
        # Drop each split's own source columns so only the generated ones remain.
        split.map(create_statements_labels, batched=True, remove_columns=split.column_names, num_proc=num_proc).to_pandas()
        for split in sampled_splits
    ]
    return [
        Dataset.from_dict(
            resample(
                frame,
                n_samples=min(statements_per_corpus, len(frame)),
                replace=False,
                random_state=SEED,
                stratify=frame['is_true'],
            )
        )
        for frame in statement_frames
    ]

In [6]:
def create_statement_dataset_multiple_choice(dataset, templates, question, answers, label_column, label_offset=0, num_statements=10000,splits=["train"], num_proc=5, replace=False, cache_dir="/scratch/afz225/.cache", statements_per_corpus=None):
    """Turn a multiple-choice dataset into true/false statements.

    For every sampled example one candidate answer is picked at random and
    combined with the question through each template. The statement is true
    when the picked candidate is the labelled answer. The resulting statements
    are then down-sampled, stratified on ``is_true``.

    Args:
        dataset: Positional arguments unpacked into ``load_dataset``
            (e.g. ``["winogrande", "winogrande_xl"]``).
        templates: Template strings; ``${1}`` is the question and ``${2}``
            the picked candidate answer.
        question: Name of the question column.
        answers: Names of the candidate-answer columns, in label order.
        label_column: Name of the column holding the correct answer; its
            values must be convertible with ``int``.
        label_offset: Value subtracted from the label so that it becomes a
            0-based index into ``answers`` (e.g. ``1`` for 1-based labels).
        num_statements: Number of source examples sampled from each split.
        splits: List of split names passed to ``load_dataset``.
        num_proc: Number of worker processes for ``Dataset.map``.
        replace: If true, additionally emit one statement per example in
            which every ``"_"`` in the question is replaced by the candidate.
        cache_dir: Cache directory passed to ``load_dataset``.
        statements_per_corpus: Maximum number of statements kept per split.
            ``None`` falls back to the notebook-level ``SPC`` constant.

    Returns:
        A list with one ``Dataset`` per requested split, each holding the
        columns ``statement`` and ``is_true``.
    """
    if statements_per_corpus is None:
        statements_per_corpus = SPC

    raw_splits = load_dataset(*dataset, split=splits, cache_dir=cache_dir)

    sampled_splits = []
    for raw_split in raw_splits:
        frame = pd.DataFrame(raw_split)
        # Large enough split: sample without replacement. Otherwise bootstrap
        # (with replacement) up to `num_statements` rows, as before.
        with_replacement = num_statements >= len(frame)
        sampled = resample(frame, n_samples=num_statements, replace=with_replacement, random_state=SEED)
        sampled_splits.append(Dataset.from_pandas(sampled))

    statements_per_example = len(templates) + (1 if replace else 0)

    def create_statements_labels(batch):
        """Render one batch; all statements of an example share one candidate answer."""
        labels = batch[label_column]
        # NOTE: only the first len(labels) draws are used. The surplus draws
        # are kept so that seeded runs consume the RNG exactly as before.
        answer_choice = random.choices(range(len(answers)), k=len(labels) * len(templates))
        statements, truth = [], []
        for row, label in enumerate(labels):
            choice = answer_choice[row]
            candidate = batch[answers[choice]][row]
            for template in templates:
                statements.append(fill_template([template], [batch[question][row], candidate]))
            if replace:
                statements.append(batch[question][row].replace("_", candidate))
            # True iff the randomly picked candidate is the labelled answer.
            is_true = int(int(label) - label_offset == choice)
            truth.extend([is_true] * statements_per_example)
        return {"statement": statements, "is_true": truth}

    statement_frames = [
        # Drop each split's own source columns so only the generated ones remain.
        split.map(create_statements_labels, batched=True, remove_columns=split.column_names, num_proc=num_proc).to_pandas()
        for split in sampled_splits
    ]
    return [
        Dataset.from_dict(
            resample(
                frame,
                n_samples=min(statements_per_corpus, len(frame)),
                replace=False,
                random_state=SEED,
                stratify=frame['is_true'],
            )
        )
        for frame in statement_frames
    ]

In [7]:
# QQP (question pairs): affirmative templates inherit the dataset label,
# negated templates receive the flipped label.
dataset = ["SetFit/qqp"]
templates = [
    "\"${1}\" is a duplicate of \"${2}\"",
    "\"${1}\" duplicates \"${2}\"",
    "\"${1}\" is the same as \"${2}\"",
    "\"${1}\" can be stated as \"${2}\"",
    "\"${1}\" is a paraphrase of \"${2}\"",
]
negative_templates = [
    "\"${1}\" is not a duplicate of \"${2}\"",
    "\"${1}\" does not duplicate \"${2}\"",
    "\"${1}\" doesn't duplicate \"${2}\"",
    "\"${1}\" is not the same as \"${2}\"",
    "\"${1}\" is unrelated to \"${2}\"",
    "\"${1}\" can't be stated as \"${2}\"",
    "\"${1}\" can not be stated as \"${2}\"",
    "\"${1}\" is not a paraphrase of \"${2}\"",
    "\"${1}\" isn't a paraphrase of \"${2}\"",
]
columns = ['text1', 'text2']
label_column = 'label'

# Pass NUM_PROC explicitly, like the other corpora, so the worker count is
# controlled from the configuration cell. [0] selects the single "train" split.
qqp_statements = create_statement_dataset_sent_comparsion(
    dataset,
    templates,
    columns,
    label_column,
    negative_templates=negative_templates,
    num_proc=NUM_PROC,
)[0]



Map (num_proc=5):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [9]:
# Spot-check the first five generated QQP statements and their truth labels.
qqp_statements[:5]

{'statement': ['"How do I prepare for the IAS exam at home?" is unrelated to "How should one best prepare for IAS examination?"',
  '"How do I improve will power?" is a duplicate of "How do I strengthen my will power?"',
  '"Can you use celebrity photos in an app?" is not a duplicate of "Can you use publicly available celebrity photos for a mobile app?"',
  '"What are the emergency powers of the President of India?" is not a duplicate of "On whose advice can the president of india declare national emergency?"',
  '"Why did Myanmar change its name from Burma?" is the same as "Why did Burma change the name of the Arakan state to Rakhine?"'],
 'is_true': [0, 1, 0, 1, 0]}

In [10]:
# Winogrande (XL configuration): fill-in-the-blank sentences with two options.
dataset = ["winogrande", 'winogrande_xl']
templates = [
    "In \"${1}\", _ is: ${2}",
    "Q: \"${1}\", A: ${2}",
    "The missing word in: \"${1}\" is ${2}",
    "_ in: \"${1}\" is ${2}",
    "\"${1}\", _ is: ${2}",
]
question = 'sentence'
answers = ['option1', 'option2']
label_column = 'answer'

# label_offset=1: the `answer` value has 1 subtracted before it is compared
#   with the 0-based index into `answers`.
# replace=True: also emit the sentence itself with "_" replaced by the option.
# [0] selects the single "train" split.
winogrande_statements = create_statement_dataset_multiple_choice(
    dataset=dataset,
    templates=templates,
    question=question,
    answers=answers,
    label_column=label_column,
    label_offset=1,
    num_proc=NUM_PROC,
    replace=True,
)[0]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.97k [00:00<?, ?B/s]

Map (num_proc=5):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [11]:
# Check the true/false balance of the sampled Winogrande statements.
pd.Series(winogrande_statements['is_true']).value_counts()

1    502
0    498
Name: count, dtype: int64

In [12]:
# PIQA: a goal paired with two candidate solutions.
dataset = ["piqa"]
templates = [
    "${1} ${2}",
    "Goal:${1}, Solution: ${2}",
    "If the goal is: ${1}, then the solution is: ${2}",
    "Problem: ${1}, Solution: ${2} ",
]
question = 'goal'
answers = ['sol1', 'sol2']
label_column = 'label'

# The default label_offset=0 is used, so `label` is compared directly with the
# index into `answers`. [0] selects the single "train" split.
piqa_statements = create_statement_dataset_multiple_choice(
    dataset=dataset,
    templates=templates,
    question=question,
    answers=answers,
    label_column=label_column,
    num_proc=NUM_PROC,
)[0]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.41k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/815k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16113 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3084 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1838 [00:00<?, ? examples/s]

Map (num_proc=5):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [13]:
# Check the true/false balance of the sampled PIQA statements.
pd.Series(piqa_statements['is_true']).value_counts()

1    505
0    495
Name: count, dtype: int64

In [14]:
# Maps a split name (e.g. 'train') to the combined statement dataset for that split.
statement_data = {}

In [15]:
# Stack the per-corpus statement datasets into a single training set.
from datasets import concatenate_datasets
train_datasets = [qqp_statements, winogrande_statements,piqa_statements]
statement_data['train'] = concatenate_datasets(train_datasets)

In [16]:
# Check the true/false balance of the combined training set.
pd.Series(statement_data['train']['is_true']).value_counts()

1    1544
0    1456
Name: count, dtype: int64

In [17]:
!mkdir STTS_trial

In [18]:
# Write the combined training statements to a CSV file inside STTS_trial/.
statement_data['train'].to_csv('STTS_trial/statement_train.csv')

Creating CSV from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

443163

In [25]:
# Upload the training set to the Hub dataset repo "ashabrawy/STTS_trial".
# Presumably relies on the credentials from the `notebook_login()` cell -- confirm.
statement_data['train'].push_to_hub("ashabrawy/STTS_trial")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ashabrawy/STTS_trial/commit/a876e19e2d9982ca735c10e84ac038bf65d67a15', commit_message='Upload dataset', commit_description='', oid='a876e19e2d9982ca735c10e84ac038bf65d67a15', pr_url=None, pr_revision=None, pr_num=None)