In [1]:
## load packages
import pandas as pd
import numpy as np
import os
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from datasets import ClassLabel
from sklearn.model_selection import train_test_split
from IPython.display import display

## Fix one global seed so results are reproducible and not cherry-picked across seeds.
## np.random.seed seeds NumPy's legacy global RNG; SEED_GLOBAL is also passed
## explicitly to the train/test split further down.
SEED_GLOBAL = 42
np.random.seed(SEED_GLOBAL)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Empty containers, presumably meant to collect train/test datasets by name.
# NOTE(review): nothing in the visible cells reads or fills them — confirm they are still needed.
dic_datasets_train = {}
dic_datasets_test = {}

In [35]:
def fix_label(example: dict):
    """Invert the binary labels of one batch (0 becomes 1, 1 becomes 0).

    Meant to be used with ``Dataset.map(..., batched=True)``, where
    ``example['label']`` is a list holding one integer label per row.
    Each label ``x`` is replaced by ``abs(x - 1)``.

    Args:
        example: A batch dict containing a ``'label'`` list of integers.

    Returns:
        The same dict object, with its ``'label'`` list replaced.
    """
    flipped = []
    for value in example['label']:
        # abs(value - 1) swaps 0 and 1.
        flipped.append(abs(value - 1))
    example['label'] = flipped
    return example

In [None]:
# Load the Chinese NLI corpus from the Hub; only its 'train' split is processed here.
dataset_zh_nli = load_dataset('asadfgglie/nli-zh-tw-all')  # split='train'

# Build the cleaned table under its own name instead of overwriting
# dataset_zh_nli['train'] step by step: rename the text columns to the NLI
# convention, then invert the binary labels with fix_label.
zh_nli_full = (
    dataset_zh_nli['train']
    .rename_columns({'text1': 'premise', 'text2': 'hypothesis'})
    .map(fix_label, batched=True, batch_size=100)
)

# Same schema as the cleaned table, except 'label' becomes a named ClassLabel
# (index 0 = "entailment", index 1 = "not_entailment").
feature = zh_nli_full.features.copy()
feature['label'] = ClassLabel(names=["entailment", "not_entailment"])

# Cast once, then split with the datasets-native API. sklearn's
# train_test_split would hand back plain column dicts that have to be rebuilt
# with Dataset.from_dict and re-cast; Dataset.train_test_split returns Dataset
# objects directly and is seeded explicitly for reproducibility.
zh_nli_split = zh_nli_full.cast(feature).train_test_split(
    train_size=0.8, test_size=0.2, seed=SEED_GLOBAL
)
dataset_zh_nli['train'] = zh_nli_split['train']
dataset_zh_nli['test'] = zh_nli_split['test']

Casting the dataset: 100%|██████████| 8500/8500 [00:00<00:00, 1693822.88 examples/s]
Casting the dataset: 100%|██████████| 34000/34000 [00:00<00:00, 3777251.05 examples/s]


In [43]:
# Publish the train/test DatasetDict to the Hub.
# NOTE(review): this is the same repo id the loading cell reads from, so the push
# appears to replace the raw data that cell expects (columns 'text1'/'text2').
# Re-running the preprocessing afterwards would likely fail on the rename or
# invert the labels a second time — confirm, and consider a separate target repo.
dataset_zh_nli.push_to_hub('asadfgglie/nli-zh-tw-all')

Creating parquet from Arrow format: 100%|██████████| 34/34 [00:00<00:00, 539.69ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:02<00:00,  2.98s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  2.68it/s]
Creating parquet from Arrow format: 100%|██████████| 9/9 [00:00<00:00, 600.05ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:02<00:00,  2.71s/it]
Downloading metadata: 100%|██████████| 643/643 [00:00<?, ?B/s] 


In [33]:
# Display the DatasetDict summary (split names, columns, row counts) as a sanity check.
dataset_zh_nli

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 34000
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 8500
    })
})

In [2]:
# Load the 'train' split of the facial-expression dataset.
# The convert() generator reads this module-level `dataset` variable.
dataset = load_dataset('asadfgglie/BanBan_2024-10-17-facial_expressions')['train']

Downloading readme: 100%|██████████| 527/527 [00:00<?, ?B/s] 
Downloading data: 100%|██████████| 84.6k/84.6k [00:00<00:00, 127kB/s]
Downloading data files: 100%|██████████| 1/1 [00:00<00:00,  1.48it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 181.41it/s]
Generating train split: 100%|██████████| 1441/1441 [00:00<00:00, 57888.76 examples/s]


In [6]:
# Hypothesis sentence; the `{}` placeholder receives one facial-expression label.
template = '這是一句會使用{}表情說出來的話。'
def convert():
    """Yield NLI examples built from the module-level ``dataset``.

    For every record, one example is produced per entry of
    ``'candidate_labels'`` (label 0) followed by one per entry of
    ``'not_candidate_labels'`` (label 1). The record's ``'sequences'`` value
    is the premise and ``template`` filled with the expression name is the
    hypothesis.

    Yields:
        dict: ``{'premise': ..., 'hypothesis': ..., 'label': 0 or 1}``.
    """
    # (source column, NLI label) pairs, in the order examples must be emitted.
    label_sources = (('candidate_labels', 0), ('not_candidate_labels', 1))
    for record in dataset:
        for column, nli_label in label_sources:
            for expression in record[column]:
                yield {
                    'premise': record['sequences'],
                    'hypothesis': template.format(expression),
                    'label': nli_label
                }

In [4]:
# Take the feature schema of the NLI dataset on the Hub, to be applied to the generated examples.
# NOTE(review): this loads the whole dataset just to read its schema, and presumably
# relies on the processed version (premise/hypothesis/label) already having been
# pushed to this repo — confirm.
features = load_dataset('asadfgglie/nli-zh-tw-all')['train'].features

In [8]:
# Materialize the convert() generator into a Dataset using the `features` schema
# (passed positionally as the second argument), then publish it to the Hub.
nli = Dataset.from_generator(convert, features)
nli.push_to_hub('asadfgglie/BanBan_2024-10-17-facial_expressions-nli')

Generating train split: 4726 examples [00:00, 25284.49 examples/s]
Creating parquet from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 1630.50ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:02<00:00,  2.63s/it]
