In [7]:
import os
import json
import argparse
import pandas as pd
import numpy as np

CACHE_DIR = "/share/edc/home/antonis/datasets/huggingface"
import os
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR

from datasets import load_dataset

In [3]:
def print_stats(token_counts_dir):
    # Load the token counts
    print(f"Loading token counts from {token_counts_dir}")
    with open(token_counts_dir, "r") as f:
        token_counts = json.load(f)

    # Print the token counts with scientific notation
    for val in token_counts.values():
        print(f"{val:.2e}")

    # Calculate and print the statistics
    n_tokens_training = token_counts["train"]
    n_params_gpt2 = 124e6

    print(f"Number of tokens in training set: {n_tokens_training:.2e}")
    print(f"Number of parameters in GPT2: {n_params_gpt2:.2e}")
    print(f"Ratio: {n_tokens_training/n_params_gpt2}")

    # Calculate required parameters and tokens
    required_ratio = 20
    n_tokens_required = required_ratio * n_params_gpt2
    n_parameters_required = n_tokens_training / required_ratio

    print(f"No. Parameters required: {n_parameters_required:.2e}")
    return token_counts

# Usage:
ds_dir = "/share/edc/home/antonis/datasets/huggingface/merged_datasets/sentiment_c4/P_1_PQA_5_promptsource_True/dataset_0"
token_counts_dir = os.path.join(ds_dir, "token_counts.json")
token_counts = print_stats(token_counts_dir)

ds_dir_2 = "/share/edc/home/antonis/datasets/huggingface/merged_datasets/sentiment_c4/P_1_PQA_5_promptsource_True/dataset_1"
token_counts_dir_2 = os.path.join(ds_dir_2, "token_counts.json")
token_counts_2 = print_stats(token_counts_dir_2)

token_diff = token_counts["train"] - token_counts_2["train"]
print(f"Token difference: {token_diff:.2e}")

token_diff_ratio = token_diff / token_counts["train"]
print(f"Token difference ratio: {token_diff_ratio * 100}%")

Loading token counts from /share/edc/home/antonis/datasets/huggingface/merged_datasets/sentiment_c4/P_1_PQA_5_promptsource_True/dataset_0/token_counts.json
3.41e+10
6.70e+06
Number of tokens in training set: 3.41e+10
Number of parameters in GPT2: 1.24e+08
Ratio: 275.2673945887097
No. Parameters required: 1.71e+09
Loading token counts from /share/edc/home/antonis/datasets/huggingface/merged_datasets/sentiment_c4/P_1_PQA_5_promptsource_True/dataset_1/token_counts.json
3.39e+10
6.70e+06
Number of tokens in training set: 3.39e+10
Number of parameters in GPT2: 1.24e+08
Ratio: 273.6619662096774
No. Parameters required: 1.70e+09
Token difference: 1.99e+08
Token difference ratio: 0.5832250424831484%


In [8]:
from transformers import GPT2Tokenizer

def count_gpt2_tokens(dataset, text_column):
    """
    Tokenize a Hugging Face Dataset using GPT-2 tokenizer and count the total number of tokens.

    Parameters:
    dataset (datasets.Dataset): Hugging Face Dataset to tokenize.
    text_column (str): Name of the column in the dataset that contains the text to tokenize.

    Returns:
    int: Total number of tokens.
    """

    # Load pre-trained GPT-2 tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # Define a function to tokenize each example and return the number of tokens
    def count_tokens(example):
        tokens = tokenizer.encode(example[text_column], truncation=True)
        return {"num_tokens": len(tokens)}

    # Map the count_tokens function to the dataset
    dataset = dataset.map(count_tokens, remove_columns=dataset.column_names)

    # Sum the num_tokens column to get the total number of tokens
    num_tokens = sum(dataset['num_tokens'])

    return num_tokens

In [9]:
ds_yelp_review = load_dataset("yelp_review_full", split="train", cache_dir=CACHE_DIR)
ds_sentiment140 = load_dataset("sentiment140", split="train", cache_dir=CACHE_DIR)

Downloading builder script: 100%|██████████| 4.41k/4.41k [00:00<00:00, 4.21MB/s]
Downloading metadata: 100%|██████████| 2.04k/2.04k [00:00<00:00, 2.37MB/s]
Downloading readme: 100%|██████████| 6.55k/6.55k [00:00<00:00, 7.96MB/s]
Found cached dataset yelp_review_full (/share/edc/home/antonis/datasets/huggingface/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)
Downloading builder script: 100%|██████████| 4.03k/4.03k [00:00<00:00, 2.36MB/s]
Downloading metadata: 100%|██████████| 1.59k/1.59k [00:00<00:00, 1.28MB/s]
Downloading readme: 100%|██████████| 6.84k/6.84k [00:00<00:00, 7.19MB/s]
Found cached dataset sentiment140 (/share/edc/home/antonis/datasets/huggingface/sentiment140/sentiment140/1.0.0/f81c014152931b776735658d8ae493b181927de002e706c4d5244ecb26376997)


In [12]:
count_gpt2_tokens(ds_yelp_review, "text")

                                                                     

113910634

In [19]:
ds_yelp_review_tokens = 113910634
ds_sentiment140_tokens = 34122355

"""
ds_yelp_review_tokens = 113910634
ds_sentiment140_tokens = 34122355
"""

full_xy = ds_yelp_review_tokens + ds_sentiment140_tokens

with open(token_counts_dir, "r") as f:
    token_counts = json.load(f)

xy_ratio = full_xy / token_counts["train"]
print(f"Ratio of full dataset to training dataset: {xy_ratio * 100}%")

Ratio of full dataset to training dataset: 0.43369263882600073%


In [20]:
print(f"full_xy: {full_xy:.2e}")

full_xy: 1.48e+08


In [13]:
ds_sentiment140_tokens = count_gpt2_tokens(ds_sentiment140, "text")

                                                                       

In [14]:
print(ds_sentiment140_tokens)

34122355


In [None]:
len_xy = len(ds_yelp_review) + len(ds_sentiment140)