In [None]:
import numpy as np
import os
from opacus import PrivacyEngine
from tqdm import tqdm
from datasets import load_dataset
from transformers import BertTokenizer
from datasets import load_from_disk

In [None]:
# Tokenizer for the "prajjwal1/bert-tiny" checkpoint; `tokenize_function` below
# calls this module-level object for every dataset.
tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-tiny")

# Tokenize function
def tokenize_function(examples):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    The text column(s) are picked from the keys present in ``examples``,
    trying these layouts in order:

    * ``question`` + ``sentence``   -> sentence pair (e.g. QNLI)
    * ``sentence``                  -> single sentence (e.g. SST-2)
    * ``premise`` + ``hypothesis``  -> sentence pair (e.g. MNLI)
    * ``question1`` + ``question2`` -> sentence pair (e.g. QQP)

    Args:
        examples: Mapping from column name to a list of values, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever ``tokenizer`` returns for the selected column(s), called with
        ``padding='max_length'``, ``truncation=True``, ``max_length=128`` and
        ``return_token_type_ids=True``.

    Raises:
        KeyError: If none of the known column layouts is present.
    """
    # Order matters: a batch holding both 'question' and 'sentence' must be
    # treated as a pair. Testing for 'sentence' alone first would silently
    # drop the question column.
    known_layouts = (
        ('question', 'sentence'),
        ('sentence',),
        ('premise', 'hypothesis'),
        ('question1', 'question2'),
    )
    for candidate in known_layouts:
        if all(field in examples for field in candidate):
            text_fields = candidate
            break
    else:
        raise KeyError(
            "examples has none of the supported text column layouts: "
            + ", ".join("+".join(layout) for layout in known_layouts)
        )

    return tokenizer(
        *[examples[field] for field in text_fields],
        padding='max_length',
        truncation=True,
        max_length=128,
        return_token_type_ids=True,
    )

# Each entry is (key used in `tokenized_datasets`, `load_dataset` path,
# `load_dataset` configuration name).
dataset_info = [
    ('sst2', 'glue', 'sst2'),
    ('qnli', 'glue', 'qnli'),
    ('mnli', 'glue', 'mnli'),
    ('qqp', 'glue', 'qqp'),
]

# Load and tokenize every dataset, keyed by its short name.
tokenized_datasets = {}
for dataset_name, library_name, dataset_type in dataset_info:
    # Indexed by split name below, so this is expected to hold all splits.
    dataset = load_dataset(library_name, dataset_type)

    # Adds the tokenizer outputs as new columns on every split.
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # Rename 'label' -> 'labels'. `rename_column` does this directly, instead
    # of copying the column with a second `map` pass and then dropping the
    # original.
    if "label" in tokenized_dataset["train"].column_names:
        tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

    # Expose only the model inputs and labels, formatted as PyTorch tensors.
    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])

    tokenized_datasets[dataset_name] = tokenized_dataset

In [None]:
# Directory that receives one sub-directory per tokenized dataset.
output_dir = "/kaggle/working/tokenized_datasets"
os.makedirs(output_dir, exist_ok=True)

# Write every tokenized dataset to "<output_dir>/<name>.dataset".
for dataset_name in tokenized_datasets:
    tokenized_dataset = tokenized_datasets[dataset_name]
    save_path = os.path.join(output_dir, f"{dataset_name}.dataset")
    tokenized_dataset.save_to_disk(save_path)