# Prepare Hugging Face Dataset card  

In [None]:
"""Dataset card."""

import math
import os
import random

import jsonlines
import matplotlib.pyplot as plt
import seaborn as sns

# Seed Python's `random` module so any later use of it is repeatable across runs.
random.seed(42)

# Apply seaborn's "whitegrid" style to the figures drawn in this notebook.
sns.set_style("whitegrid")

In [None]:
def load_jsonl(file_name: str) -> list:
    """Load every record of a JSON Lines file into memory.

    Args:
        file_name (str):
            Path of the JSON Lines file to read.

    Returns:
        list of dict:
            The decoded value of each line, in file order.
    """
    with jsonlines.open(file_name, mode="r") as reader:
        # The reader is iterable, so it can be materialized directly
        # instead of appending one object at a time.
        return list(reader)

In [None]:
# Relative path of the JSON Lines dataset file that this notebook loads and measures.
DATA_PATH = "dataset.jsonl"

In [None]:
# Read the whole dataset into memory as a list with one entry per JSONL line.
data = load_jsonl(DATA_PATH)

## Dataset Statistics

Number of samples

In [None]:
# `data` holds one entry per line of the file, so its length is the sample count.
len(data)

Size of dataset

In [None]:
# On-disk size of the dataset file, converted from bytes to megabytes
# (decimal MB, i.e. 10^6 bytes).
dataset_bytes = os.path.getsize(DATA_PATH)
size = dataset_bytes / 1e6

print(f"Total size: {size:.2f} MB")

### PDF text length distribution

In [None]:
# Text length recorded for each PDF entry (its "text_len" field).
text_lengths = [pdf["text_len"] for pdf in data]
# log10 is undefined for 0, so entries with no text are left out of the
# log-scale values (they remain in text_lengths); without this guard
# math.log10 raises ValueError and the cell aborts.
text_lengths_log10 = [
    math.log10(text_len) for text_len in text_lengths if text_len > 0
]

In [None]:
# Histogram of the log10-transformed text lengths, saved as a PNG.
plt.figure(figsize=(10, 5))
sns.histplot(text_lengths_log10, bins=150)
plt.title("PDF Text Length Distribution")
plt.ylabel("Frequency")
plt.xlabel("Number of characters in text (log10 scale)")

# savefig does not create missing directories, so make sure the target
# folder exists before writing the image.
figure_path = "../figures/text_length_distribution.png"
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
plt.savefig(figure_path)

In [None]:
# Smallest and largest raw (non-log) text lengths in the dataset.
min(text_lengths), max(text_lengths)