In [1]:
# Data generator settings

# Number of floats stored in each list of the "arrFloat" column
# (the column is only created when this value is greater than 0).
arr_length = 1_000

# Row count of the initial partition (partition 0).
nb_rows = 10_000

# Total number of partitions, partition 0 included.
# Every further partition is a copy of the rows accumulated so far
# (at most the last 200_000 of them), tagged with a new partition id
# and appended to the dataset.
nb_part = 13

In [2]:
import random
import string
from time import time

import numpy as np
import pandas as pd

# How many distinct values each data type can take:
# the integer part of sqrt(nb_rows), limited to 100.
nb_cats = min(100, int(nb_rows**0.5))
# NOTE(review): shuffle_cols is not read anywhere in the visible cells.
shuffle_cols = False

# Column layout of the generated dataframe: how many columns of each kind.
nb_cols_int = 34
nb_cols_date = 33
nb_cols_str = 33

# Map every column name to the kind of value it holds, in the order
# int0..intN, dttime0..dttimeN, str0..strN.
table_spec = {
    f"{prefix}{i}": kind
    for prefix, kind, count in (
        ("int", "int", nb_cols_int),
        ("dttime", "datetime", nb_cols_date),
        ("str", "str", nb_cols_str),
    )
    for i in range(count)
}

# Build the first partition (partition 0) of the dataframe; the array column
# is added in a later step.

# Seed both random sources so that every run produces the same data.
# random.Random(0) is already seeded with 0, so no extra seed() call is needed.
np.random.seed(0)
myrnd = random.Random(0)

# Bounds used for the random integers and the random datetimes.
max_int = 10**6
date_min, date_max = pd.to_datetime("2018-01-01"), pd.to_datetime("2050-12-31")

# Start the timer and announce the work; no newline is emitted so that the
# duration can be printed on the same line once generation is finished.
t0 = time()
print(
    f" --> Dataframe generation with {nb_rows:,d} rows x {len(table_spec)} cols and {'no ' if arr_length==0 else f'a {arr_length}-'}array col ... ",
    end="",
)


# Pools of distinct values ("categories") that the columns of each data type
# draw from. The draw order (str, then int, then datetime) matters for
# reproducibility because np.random uses a single global seeded stream.
chars = string.ascii_letters
cats = {}

# nb_cats strings made of random ASCII letters, each with a random length
# between 5 and 19 characters.
cats["str"] = [
    "".join(myrnd.choice(chars) for _ in range(length))
    for length in np.random.randint(low=5, high=20, size=nb_cats)
]

# nb_cats random int64 values in [-max_int, max_int).
cats["int"] = np.random.randint(
    low=-max_int, high=max_int, size=nb_cats, dtype=np.int64
)

# nb_cats random instants between date_min and date_max: drawn as whole
# seconds (Timestamp.value is in nanoseconds, hence the // 10**9) and scaled
# back to nanoseconds so they can later be viewed as datetime64[ns].
cats["datetime"] = (
    np.random.randint(
        low=date_min.value // 10**9,
        high=date_max.value // 10**9,
        size=nb_cats,
        dtype=np.int64,
    )
    * 10**9
)

def _random_column(typ):
    """Draw nb_rows values for one column of the given type.

    "str" and "int" columns are sampled (with replacement) from the matching
    pool in ``cats``; "datetime" columns are sampled the same way and then
    reinterpreted as datetime64[ns]. Any other type yields NaN.
    """
    if typ not in ("str", "int", "datetime"):
        return np.nan
    picks = np.random.choice(cats[typ], size=nb_rows)
    return picks.view("M8[ns]") if typ == "datetime" else picks


# Generate one column per entry of table_spec, in table_spec order
# (the order fixes the sequence of random draws).
result = pd.DataFrame(
    {name: _random_column(typ) for name, typ in table_spec.items()}
)
# Expose the row number as a regular "index" column.
result = result.reset_index()

# Optional array column: one Python list of arr_length random floats
# in [0, 1e6) per row.
if arr_length > 0:
    result["arrFloat"] = (1e6 * np.random.rand(nb_rows, arr_length)).tolist()

# Every row generated so far belongs to partition 0.
result["partition"] = 0

# Report the elapsed time (completing the line started before generation)
# and the deep memory footprint of the dataframe.
t1 = time()
print(
    f" --> done in {t1 - t0:.2f} sec",
    f" --> Memory used = {result.memory_usage(deep=True).sum() / 1e6:,.1f} MBytes",
    sep="\n",
)

 --> Dataframe generation with 10,000 rows x 100 cols and a 1000-array col ...  --> done in 0.53 sec
 --> Memory used = 108.5 MBytes


In [3]:
# Dataset duplication: each new partition is a copy of the rows accumulated
# so far, limited to the last max_copied_rows of them, tagged with its own
# partition id and appended to the dataset.
max_copied_rows = 200_000

for part in range(1, nb_part):
    # Slice first, then copy: only the rows that are kept get copied, instead
    # of deep-copying the whole accumulated dataframe on every iteration.
    # The explicit copy also keeps the assignment below from touching `result`.
    duplicate = result.tail(max_copied_rows).copy()
    duplicate["partition"] = part
    result = pd.concat([result, duplicate])

print(
    f" --> Duplicated dataset generated with {len(result.index):,d} rows x {len(table_spec)} cols and {'no ' if arr_length==0 else f'a {arr_length}-'}array col ... ",
)

 --> Duplicated dataset generated with 1,720,000 rows x 100 cols and a 1000-array col ... 


In [4]:
# Export the whole dataframe to "dataset.csv"; the pandas index is left out.
result.to_csv(path_or_buf="dataset.csv", index=False)

In [5]:
# Write dataframe to a parquet file,
# using pyarrow to specify the array column as float[].

import pyarrow as pa

# Infer the Arrow schema from a one-row sample. Converting the full dataframe
# just to read its schema would materialise every row (including all the
# arrFloat lists) as an Arrow table that is immediately thrown away;
# to_parquet performs the real conversion below.
schema = pa.Table.from_pandas(result.head(1), preserve_index=False).schema

# Same fields, except that "arrFloat" is declared as a list of 32-bit floats
# instead of the inferred type.
new_schema = pa.schema(
    [
        pa.field("arrFloat", pa.list_(pa.float32())) if f.name == "arrFloat" else f
        for f in schema
    ]
)

# schema and row_group_size are extra keyword arguments handed to the parquet
# engine (presumably pyarrow, given the schema object — confirm the configured engine).
result.to_parquet(
    "dataset.parquet", index=False, schema=new_schema, row_group_size=10000
)