In [1]:
from datasets import load_dataset, load_from_disk, concatenate_datasets, DatasetDict
# from promptsource import templates
import json
import pickle
import collections
from itertools import combinations
import random
from tqdm import tqdm
import argparse

# from incidental-supervision.src.utils import concatenate_columns, count_gpt2_tokens

CACHE_DIR = "/share/edc/home/antonis/datasets/huggingface"
import os
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
DEVICE_ID = 3

import mauve

# # Get a list of all supported datasets
# datasets = templates.get_dataset_names()
# print(datasets)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
max_len = 1000
n_groups = 10

mauve_scaling_factor = 1
## pre-sampling data

from datasets import DatasetDict

# Define the file paths
task_samples_path = f"/share/edc/home/antonis/datasets/huggingface/flan_v1/task_ds_sampled_{max_len}_{n_groups}.pkl"
dataset_dict_path = f"/share/edc/home/antonis/datasets/huggingface/flan_v1/ds_c4_small_sampled_{max_len}_{n_groups}"

# Check if the task_samples file exists
if not os.path.exists(task_samples_path):
    ds_task = load_from_disk('/share/edc/home/antonis/datasets/huggingface/flan_v1_task_ds')
    task_samples_dict = collections.defaultdict(dict)
    # Pre-sample the ds_task['text']
    for task in tqdm(ds_task.keys(), desc="Tasks"):
        for n in tqdm(range(n_groups), desc="Groups"):
            task_samples_dict[task][str(n)] = random.sample(ds_task[task]['text'], max_len)

    # save 
    with open(task_samples_path, 'wb') as f:
        pickle.dump(dict(task_samples_dict), f)
else:
    # load
    with open(task_samples_path, 'rb') as f:
        task_samples_dict = collections.defaultdict(dict, pickle.load(f))

tasks = ['NLI', 'QA', 'Summarization', 'Commonsense']
# tasks = list(task_samples_dict.keys())

# Check if the dataset_dict file exists
if not os.path.exists(dataset_dict_path):
    # Pre-sample the ds['text']
    ds = load_from_disk("/share/edc/home/antonis/datasets/huggingface/flan_v1/ds_c4_small")
    dataset_dict = {str(n): ds.select(indices=random.sample(range(len(ds)), max_len)) for n in tqdm(range(n_groups), desc="Pre-sampling")}

    # Convert to DatasetDict
    dataset_dict = DatasetDict(dataset_dict)

    # Save the dataset_dict
    dataset_dict.save_to_disk(dataset_dict_path)
else:
    dataset_dict = load_from_disk(dataset_dict_path)


# # # Create the parser
# parser = argparse.ArgumentParser(description='Compute Mauve results')
# parser.add_argument('--mode', type=str, choices=['results', 'results_tasks', 'results_corpus', 'all'], required=True, help='Mode to run the script in')

# # Parse the arguments
# args = parser.parse_args()

# # args = argparse.Namespace()
# # args.mode = 'results_tasks'

# print(f"--- RUNNING IN MODE: {args.mode} ---")

args = argparse.Namespace()
args.mode = 'results'

In [7]:
task_samples_dict['NLI']['0'][0]

"Choose from options: Determine if the sentence is true based on the text below:\nTo cure astigmatism, you should go see a teacher\n\nHow to cure astigmatism<br>Speak to your optometrist about getting eyeglasses. Eyeglasses are a good option for correcting astigmatism. They help to correct the irregular curve of your cornea and allow your retina to focus properly.\nOptions:\n *Yes;\n *It's impossible to say;\n *No; No"

In [10]:
print(task_samples_dict.keys())

dict_keys(['NLI', 'QA', 'Summarization', 'Commonsense', 'Sentiment', 'Paraphrase', 'Reading Comp.', 'Reading Comp. w/ Commonsense', 'Coreference', 'Misc', 'Math'])


In [8]:
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

text_1 = dataset_dict['6']['text']
text_2 = task_samples_dict['Sentiment']['0'][0]

save_path = f"./incidental-supervision/results/mauve/corpus_task_sim"

out =  mauve.compute_mauve(p_text=text_1, q_text=text_2, 
                            device_id=0, max_text_length=512,
                            batch_size=32, verbose=True, 
                            featurize_model_name='gpt2-large',
                            mauve_scaling_factor=mauve_scaling_factor)



Tokenizing text...
Featurizing tokens


Featurizing p: 100%|██████████| 32/32 [00:54<00:00,  1.70s/it]


Tokenizing text...
Featurizing tokens


Featurizing q: 100%|██████████| 13/13 [00:00<00:00, 32.41it/s]


seed = 25
performing clustering in lower dimension = 273
Clustering 1413 points in 274D to 41 clusters, redo 5 times, 500 iterations
  Preprocessing in 0.00 s
Outer iteration 0 / 5
  Iteration 499 (0.12 s, search 0.04 s): objective=384.529 imbalance=1.442 nsplit=0       
Objective improved: keep new clusters
Outer iteration 1 / 5
  Iteration 344 (0.19 s, search 0.07 s): objective=388.364 imbalance=1.613 nsplit=0       



  Iteration 499 (0.22 s, search 0.08 s): objective=388.364 imbalance=1.613 nsplit=0       
Outer iteration 2 / 5
  Iteration 499 (0.34 s, search 0.12 s): objective=370.784 imbalance=1.415 nsplit=0       
Objective improved: keep new clusters
Outer iteration 3 / 5
  Iteration 499 (0.44 s, search 0.15 s): objective=383.556 imbalance=1.403 nsplit=0       
Outer iteration 4 / 5
  Iteration 499 (0.54 s, search 0.19 s): objective=385.008 imbalance=1.453 nsplit=0       
kmeans time: 0.55 s
total discretization time: 0.87 seconds


In [None]:
# Compute results or results_tasks based on the mode

# compute mauve between pretraining corpus and task groups
if args.mode == 'results' or args.mode == 'all':
    print("Computing results")
    if os.path.exists('mauve_results.pkl'):
        with open('mauve_results.pkl', 'rb') as f:
            results = collections.defaultdict(dict, pickle.load(f))
    else:
        results = collections.defaultdict(dict)
    for task in tqdm(tasks, desc=f"Tasks"):
        for n in tqdm(range(n_groups), desc=f"{task}_Groups"):
            if int(n) in results[task]:
                continue
            text_1 = dataset_dict[str(n)]['text']
            text_2 = task_samples_dict[task][str(n)]
            task_samples_dict[task][n] = text_2
            print(f"Computing Mauve for {task} {n}")
            out =  mauve.compute_mauve(p_text=text_1, q_text=text_2, 
                            device_id=DEVICE_ID, max_text_length=512,
                            batch_size=1, verbose=True, 
                            featurize_model_name='gpt2-large',
                            mauve_scaling_factor=mauve_scaling_factor)
            results[task][n] = out.__dict__
            with open('mauve_results.pkl', 'wb') as f:
                pickle.dump(results, f)

    # save 
    with open('mauve_results.pkl', 'wb') as f:
        pickle.dump(results, f)