In [3]:
from datasets import load_dataset, load_from_disk
from promptsource import templates

CACHE_DIR = "/share/edc/home/antonis/datasets/huggingface"
import os
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR

# # Get a list of all supported datasets
# datasets = templates.get_dataset_names()
# print(datasets)

In [9]:
# import sys
# sys.path.append("/share/edc/home/antonis/LLM-Incidental-Supervision/incidental-supervision/src")
from src.dataset_configs import DatasetConfig

config_instance = DatasetConfig()

# Changing P_QA and updating 'p' values in dataset_configs
config_instance.update_p_values(new_value=0.5)

# Now, the 'p' values inside config_instance.dataset_configs are updated
print(config_instance.dataset_configs)

[{'dataset_type': 'QA', 'dataset_name': 'common_gen', 'dataset_config_name': 'common_gen', 'p': 0.5, 'train_split': 'train'}, {'dataset_type': 'QA', 'dataset_name': 'e2e_nlg', 'dataset_config_name': 'e2e_nlg', 'p': 0.5, 'train_split': 'train'}, {'dataset_type': 'QA', 'dataset_name': 'dart', 'dataset_config_name': 'dart', 'p': 0.5, 'validation_split': 'validation'}, {'dataset_type': 'QA', 'dataset_name': 'web_nlg', 'dataset_config_name': 'release_v3.0_en', 'p': 0.5, 'validation_split': 'test'}, {'dataset_type': 'text', 'dataset_name': 'wikitext', 'dataset_config_name': 'wikitext-2-v1', 'p': 1, 'train_split': 'train'}, {'dataset_type': 'text', 'dataset_name': 'bookcorpus', 'dataset_config_name': None, 'p': 1, 'train_split': 'train'}]


In [10]:
ds_pth = "/share/edc/home/antonis/datasets/huggingface/merged_datasets/dataset_1/dataset_validation.arrow"
dataset = load_from_disk(ds_pth)

In [12]:
dataset[0:3]['text']

["tripleset: [['Mars Hill College', 'JOINED', '1973'], ['Mars Hill College', 'LOCATION', 'Mars Hill, North Carolina']] annotations: 'source': ['WikiSQL_decl_sents'], 'text': ['A school from Mars Hill, North Carolina, joined in 1973.']",
 'tripleset: [[\'Newberry College\', \'NICKNAME\', \'Wolves\']] annotations: \'source\': [\'WikiSQL_decl_sents\'], \'text\': ["Newberry College\'s nickname is the wolves."]',
 "tripleset: [['Presbyterian College', 'TYPE', 'Private']] annotations: 'source': ['WikiSQL_decl_sents'], 'text': ['Presbyterian College is a private school.']"]

In [24]:
import re
import json

def generate_qa_prompt(example):
    # Extract the tripleset from the input example
    tripleset_match = re.search("tripleset: (.+?) annotations", example)
    if tripleset_match:
        tripleset_str = tripleset_match.group(1)
        tripleset = json.loads(tripleset_str.replace("\'", "\""))
    else:
        return "Error: Could not extract tripleset from example."
    
    # Extract the text (answer) from the input example
    text_match = re.search("text': \[(.+?)]", example)
    if text_match:
        text_str = text_match.group(1)
        # replace escaped single quotes with actual single quotes
        answer = text_str.replace("\\'", "'")
    else:
        return "Error: Could not extract text from example."
    
    # Construct the prompt using the information in the tripleset
    info_list = ["- {} {} {}".format(triple[0], triple[1], triple[2]) for triple in tripleset]
    info_text = "\n".join(info_list)
    prompt = ("Create a concise and grammatically correct sentence that "
              "incorporates the information provided in the triple set. "
              "Please ensure that your sentence naturally integrates this "
              "information:\n{}\nMake sure your sentence reads naturally and is informative."
              .format(info_text))
    
    return {'prompt': prompt, 'answer': answer}


# Example usage:
example = "tripleset: [['Mars Hill College', 'JOINED', '1973'], ['Mars Hill College', 'LOCATION', 'Mars Hill, North Carolina']] annotations: 'source': ['WikiSQL_decl_sents'], 'text': ['A school from Mars Hill, North Carolina, joined in 1973.']"

result = generate_qa_prompt(example)
print("Prompt:", result['prompt'])
print("Answer:", result['answer'])


Prompt: Create a concise and grammatically correct sentence that incorporates the information provided in the triple set. Please ensure that your sentence naturally integrates this information:
- Mars Hill College JOINED 1973
- Mars Hill College LOCATION Mars Hill, North Carolina
Make sure your sentence reads naturally and is informative.
Answer: 'A school from Mars Hill, North Carolina, joined in 1973.'


In [28]:
import re
import json
import random

def generate_qa_prompt(example):
    # Extract the tripleset from the input example
    tripleset_match = re.search("tripleset: (.+?) annotations", example)
    if tripleset_match:
        tripleset_str = tripleset_match.group(1)
        tripleset = json.loads(tripleset_str.replace("\'", "\""))
    else:
        return "Error: Could not extract tripleset from example."
    
    # Extract the text (answer) from the input example
    text_match = re.search("text': \[(.+?)]", example)
    if text_match:
        text_str = text_match.group(1)
        # replace escaped single quotes with actual single quotes
        answer = text_str.replace("\\'", "'")
    else:
        return "Error: Could not extract text from example."
    
    # Construct the info list
    info_list = ["- {} {} {}".format(triple[0], triple[1], triple[2]) for triple in tripleset]
    info_text = "\n".join(info_list)
    
    # Define 10 different prompt structures
    prompt_structures = [
        "Create a concise and grammatically correct sentence that incorporates the information provided in the triple set. Please ensure that your sentence naturally integrates this information:\n{}\nMake sure your sentence reads naturally and is informative.",
        "Compose a short, grammatically accurate sentence that seamlessly includes the following details:\n{}\nEnsure the sentence flows naturally.",
        "Using the following information, write a clear and concise sentence:\n{}\nThe sentence should be grammatically correct and easy to understand.",
        "Construct a sentence using the information given below. Your sentence should be brief and grammatically correct:\n{}",
        "Your task is to create a sentence that communicates the information below in a clear and natural way:\n{}",
        "Write a grammatical sentence that integrates the following information in a coherent manner:\n{}",
        "Please formulate a sentence using the information provided below. Ensure it is grammatically correct and makes logical sense:\n{}",
        "Using the data points below, create a single sentence that is grammatically correct and effectively communicates the information:\n{}",
        "Construct a grammatically accurate and informative sentence using the details given below:\n{}",
        "Combine the information provided into a single, grammatically correct sentence that reads naturally:\n{}"
    ]
    
    # Randomly select a prompt structure
    prompt = random.choice(prompt_structures).format(info_text)
    
    return {'prompt': prompt, 'answer': answer}


# Example usage:
example = "tripleset: [['Mars Hill College', 'JOINED', '1973'], ['Mars Hill College', 'LOCATION', 'Mars Hill, North Carolina']] annotations: 'source': ['WikiSQL_decl_sents'], 'text': ['A school from Mars Hill, North Carolina, joined in 1973.']"

result = generate_qa_prompt(example)
print("Prompt:", result['prompt'])
print("Answer:", result['answer'])

Prompt: Please formulate a sentence using the information provided below. Ensure it is grammatically correct and makes logical sense:
- Mars Hill College JOINED 1973
- Mars Hill College LOCATION Mars Hill, North Carolina
Answer: 'A school from Mars Hill, North Carolina, joined in 1973.'


In [15]:
ds_dart = load_dataset('web_nlg', 'release_v3.0_en')

Downloading builder script: 100%|██████████| 11.5k/11.5k [00:00<00:00, 10.8MB/s]
Downloading metadata: 100%|██████████| 31.7k/31.7k [00:00<00:00, 28.1MB/s]
Downloading readme: 100%|██████████| 22.1k/22.1k [00:00<00:00, 21.4MB/s]


Downloading and preparing dataset web_nlg/release_v3.0_en to /local/home/antonis/.cache/huggingface/datasets/web_nlg/release_v3.0_en/0.0.0/28ffb892f7f42450dd9558684aa43bcaf44b1b3bf0d77cb8d73534646af88dda...


Downloading data: 25.5MB [00:00, 65.6MB/s]
                                                                                      

Dataset web_nlg downloaded and prepared to /local/home/antonis/.cache/huggingface/datasets/web_nlg/release_v3.0_en/0.0.0/28ffb892f7f42450dd9558684aa43bcaf44b1b3bf0d77cb8d73534646af88dda. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 575.32it/s]


In [16]:
ds_dart

DatasetDict({
    train: Dataset({
        features: ['category', 'size', 'eid', 'original_triple_sets', 'modified_triple_sets', 'shape', 'shape_type', 'lex', 'test_category', 'dbpedia_links', 'links'],
        num_rows: 13211
    })
    dev: Dataset({
        features: ['category', 'size', 'eid', 'original_triple_sets', 'modified_triple_sets', 'shape', 'shape_type', 'lex', 'test_category', 'dbpedia_links', 'links'],
        num_rows: 1667
    })
    test: Dataset({
        features: ['category', 'size', 'eid', 'original_triple_sets', 'modified_triple_sets', 'shape', 'shape_type', 'lex', 'test_category', 'dbpedia_links', 'links'],
        num_rows: 5713
    })
})

In [None]:
dataset_files = [
    "/share/edc/home/antonis/datasets/huggingface/merged_datasets/dataset_1/dataset_train.arrow",
    "/share/edc/home/antonis/datasets/huggingface/merged_datasets/dataset_0/dataset_train.arrow",
]

for ds_file in dataset_files:
    ds = load_from_disk(ds_file)
    print(len(ds))

In [None]:

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    print("preds1, ", preds.shape)
    print("labels1, ", labels.shape)
    decoded_preds, decoded_labels = tokenizer.batch_decode((preds, labels), skip_special_tokens=True)
    labels = labels[:, 1:].tolist()
    preds = preds[:, :-1].tolist()

    print(f"labels: {labels.shape}")
    print(f"preds: {preds.shape}")
    print(f"decoded_labels: {decoded_labels.shape}")
    print(f"decoded_preds: {decoded_preds.shape}")

    metric1 = evaluate.load("accuracy")
    metric2 = evaluate.load("f1")
    metric3 = evaluate.load("bleu")
    metric4 = evaluate.load("bertscore")

    accuracy, f1 = [], []
    for i in range(len(labels)):
        accuracy.append(metric1.compute(predictions=preds[i], references=labels[i]))
        f1.append(metric2.compute(predictions=preds[i], references=labels[i], average='macro'))
    accuracy = np.mean(accuracy)
    f1 = np.mean(f1)
    # Specifying the average method for multiclass F1 score
    return {
        "accuracy": accuracy,
        "f1": f1,
        "bleu": metric3.compute(predictions=preds, references=labels),
        "bertscore": metric4.compute(predictions=preds, references=labels)
    }

In [None]:
dataset[0].keys()

In [None]:
dataset[2]

In [None]:
# print(dataset[1])
idx = 1
for k, v in dataset[idx].items():
    print(k, v)

In [None]:
ds_conf_1 = dataset_configs[1]
ds_1 = load_dataset(ds_conf_1['dataset_name'], ds_conf_1['dataset_config_name'], cache_dir=CACHE_DIR)

In [None]:
ds_conf_0 = dataset_configs[0]
ds_0 = load_dataset(ds_conf_0['dataset_name'], ds_conf_0['dataset_config_name'], cache_dir=CACHE_DIR)

In [None]:
from merge_datasets import concatenate_columns

ds_0_train = ds_0['train']
ds_0_text = ds_0_train.map(lambda x: concatenate_columns(x, new_col_name='text'))

In [None]:
ds_conf_2 = dataset_configs[2]
ds_2 = load_dataset(ds_conf_2['dataset_name'], ds_conf_2['dataset_config_name'], cache_dir=CACHE_DIR)