In [33]:
import os
import glob
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

# Fine-tuning dataset

In [2]:
# Source JSONL files for the fine-tuning dataset (paths relative to the working directory).
train_file = "train.jsonl"
valid_file = "valid.jsonl"
test_file = "test.jsonl"

# Seed the shared `random` generator so the `random.sample` calls in this
# notebook draw the same subsets on a fresh Restart & Run All.
SEED = 42
random.seed(SEED)

DATA_SIZE = 0.1 # we will use only 10% of the total data

In [3]:
import json


def sample_data(file_path, data_size=None, seed=None):
    """Return a random subset of the lines of a text (JSONL) file.

    Args:
        file_path: Path of the file to read; every line is treated as one record.
        data_size: Fraction of the lines to keep, within [0, 1]. When omitted,
            the module-level ``DATA_SIZE`` is looked up at call time.
        seed: Optional seed. When given, sampling uses a private
            ``random.Random(seed)`` so the result is reproducible; otherwise
            the shared ``random`` module state is used.

    Returns:
        A list of ``int(n_lines * data_size)`` raw lines drawn without
        replacement. Line endings are not stripped.

    Raises:
        ValueError: If the sampling fraction is outside [0, 1].
    """
    fraction = DATA_SIZE if data_size is None else data_size
    if not 0 <= fraction <= 1:
        raise ValueError(f"data_size must be within [0, 1], got {fraction!r}")

    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open(file_path, 'r') as json_file:
        json_list = list(json_file)

    size = int(len(json_list) * fraction)
    sampler = random if seed is None else random.Random(seed)
    return sampler.sample(json_list, size)


def write_samples_to_file(samples, dataset_type, output_dir="dataset_splits", data_size=None):
    """Write JSON-encoded samples to ``<output_dir>/<dataset_type>_<size>.jsonl``.

    ``<size>`` is the sampling fraction with its decimal point replaced by an
    underscore (for example ``0.1`` becomes ``0_1``).

    Args:
        samples: Iterable of JSON-encoded strings, one record each.
        dataset_type: Label used as the file-name prefix (e.g. "train").
        output_dir: Directory that receives the file; created if missing.
        data_size: Fraction used to build the file name. When omitted, the
            module-level ``DATA_SIZE`` is looked up at call time.

    Raises:
        json.JSONDecodeError: If an element of ``samples`` is not valid JSON.
    """
    fraction = DATA_SIZE if data_size is None else data_size
    size_tag = '_'.join(str(fraction).split('.'))

    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, f"{dataset_type}_{size_tag}.jsonl")

    # Open with 'w' rather than 'a': in append mode, re-running the calling
    # cell would silently duplicate every record in the output file.
    # The `with` block closes the file, so no explicit close() is needed.
    with open(out_path, 'w') as outfile:
        for json_str in samples:
            # Round-trip through json so each record is validated and
            # written in a normalised single-line form.
            sample = json.loads(json_str)
            json.dump(sample, outfile)
            outfile.write('\n')

In [22]:
# Sample each fine-tuning file in train / valid / test order.
train_samples, valid_samples, test_samples = (
    sample_data(split_path) for split_path in (train_file, valid_file, test_file)
)

In [30]:
# Persist each sampled split under its own dataset_type label.
for split_label, split_samples in (
    ("train", train_samples),
    ("valid", valid_samples),
    ("test", test_samples),
):
    write_samples_to_file(split_samples, dataset_type=split_label)

# Comparison dataset

In [35]:
# Every comparison batch file except batch0_cnndm.json.
# Filtering on the base name (instead of list.remove with a hard-coded path)
# does not raise when that file is absent and does not depend on the path
# separator glob returns. Sorting fixes the file order, which glob leaves
# unspecified, so later sampling sees the records in a stable order.
EXCLUDED_COMPARISON_FILE = "batch0_cnndm.json"
comparison_files = sorted(
    path
    for path in glob.glob("comparisons/batch*.json")
    if os.path.basename(path) != EXCLUDED_COMPARISON_FILE
)

In [36]:
# Gather the raw lines of every comparison file into one flat list.
comparisons_list = []

for batch_path in tqdm(comparison_files):
    with open(batch_path, 'r') as batch_handle:
        comparisons_list += batch_handle.readlines()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [00:00<00:00, 26.76it/s]


In [58]:
# One bucket per split; records are kept as their raw JSON strings.
comparison_list_train_split = []
comparison_list_valid1_split = []
comparison_list_valid2_split = []

split_names = []

for raw_record in tqdm(comparisons_list):
    split_label = json.loads(raw_record)["split"]
    # Anything that is neither "train" nor "valid1" lands in the valid2 bucket.
    if split_label == "train":
        bucket = comparison_list_train_split
    elif split_label == "valid1":
        bucket = comparison_list_valid1_split
    else:
        bucket = comparison_list_valid2_split
    bucket.append(raw_record)

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 176655/176655 [00:02<00:00, 75157.36it/s]


In [59]:
# Record counts per split, in train / valid1 / valid2 order.
tuple(map(len, (comparison_list_train_split, comparison_list_valid1_split, comparison_list_valid2_split)))

(92858, 33082, 50715)

In [60]:
DATA_SIZE = 0.4 # Use 40% for comparison

def sample_data(split, data_size=None, seed=None):
    """Return a random subset of an in-memory list of records.

    NOTE: this rebinds the name ``sample_data``, which earlier in the notebook
    refers to a function that takes a file path; after this cell runs, the
    earlier cells that pass a path no longer work without re-defining it.

    Args:
        split: Sequence of records to sample from; it is not modified.
        data_size: Fraction of the records to keep, within [0, 1]. When
            omitted, the module-level ``DATA_SIZE`` is looked up at call time.
        seed: Optional seed. When given, sampling uses a private
            ``random.Random(seed)`` so the result is reproducible; otherwise
            the shared ``random`` module state is used.

    Returns:
        A new list of ``int(len(split) * data_size)`` records drawn without
        replacement.

    Raises:
        ValueError: If the sampling fraction is outside [0, 1].
    """
    fraction = DATA_SIZE if data_size is None else data_size
    if not 0 <= fraction <= 1:
        raise ValueError(f"data_size must be within [0, 1], got {fraction!r}")

    size = int(len(split) * fraction)
    sampler = random if seed is None else random.Random(seed)
    return sampler.sample(split, size)

In [61]:
# NOTE: each name is rebound to its own sample, so re-running this cell
# samples the already-reduced lists again.
(
    comparison_list_train_split,
    comparison_list_valid1_split,
    comparison_list_valid2_split,
) = map(
    sample_data,
    (comparison_list_train_split, comparison_list_valid1_split, comparison_list_valid2_split),
)

# Sizes after sampling, in train / valid1 / valid2 order.
tuple(map(len, (comparison_list_train_split, comparison_list_valid1_split, comparison_list_valid2_split)))

(37143, 13232, 20286)

In [62]:
def write_samples_to_file(samples, dataset_type, output_dir="comparison_splits", data_size=None):
    """Write JSON-encoded samples to ``<output_dir>/<dataset_type>_<size>.jsonl``.

    ``<size>`` is the sampling fraction with its decimal point replaced by an
    underscore (for example ``0.4`` becomes ``0_4``).

    NOTE: this rebinds the name ``write_samples_to_file``; the definition
    earlier in the notebook writes under ``dataset_splits/`` instead.

    Args:
        samples: Iterable of JSON-encoded strings, one record each.
        dataset_type: Label used as the file-name prefix (e.g. "valid1").
        output_dir: Directory that receives the file; created if missing.
        data_size: Fraction used to build the file name. When omitted, the
            module-level ``DATA_SIZE`` is looked up at call time.

    Raises:
        json.JSONDecodeError: If an element of ``samples`` is not valid JSON.
    """
    fraction = DATA_SIZE if data_size is None else data_size
    size_tag = '_'.join(str(fraction).split('.'))

    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, f"{dataset_type}_{size_tag}.jsonl")

    # Open with 'w' rather than 'a': in append mode, re-running the calling
    # cell would silently duplicate every record in the output file.
    # The `with` block closes the file, so no explicit close() is needed.
    with open(out_path, 'w') as outfile:
        for json_str in samples:
            # Round-trip through json so each record is validated and
            # written in a normalised single-line form.
            sample = json.loads(json_str)
            json.dump(sample, outfile)
            outfile.write('\n')

In [63]:
# Persist each sampled comparison split under its own dataset_type label.
for split_label, split_samples in (
    ("train", comparison_list_train_split),
    ("valid1", comparison_list_valid1_split),
    ("valid2", comparison_list_valid2_split),
):
    write_samples_to_file(split_samples, dataset_type=split_label)