# Atoti Value-at-Risk Benchmark Data Generator

A data generator to simulate the data used in a [blog post by Altinity on calculating Value-at-Risk (VaR)](https://altinity.com/blog/clickhouse-vs-redshift-performance-for-fintech-risk-management).

💡 **Note:** Generating VaR benchmark data with default parameters will produce a `33GB` CSV data file and a `6.7GB` parquet data file. This may take approximately `15` minutes or more depending on your machine specifications.

In [1]:
import random
import string
from time import time
import numpy as np
import pandas as pd

# Let pandas display up to 500 columns before truncating wide frames
pd.options.display.max_columns = 500

In [2]:
# DATA GENERATOR parameters

# Number of float values stored in the arrFloat column of every row
arr_length = 1_000

# Number of rows in the initial partition (partition 0)
nb_rows = 10_000

# Total number of partitions, partition 0 included.
# Every further partition is copied from the rows accumulated so far,
# relabelled, and appended to the dataset.
# At most 200_000 rows are copied per partition.
nb_part = 13

In [3]:
# Size of each pool of distinct values: sqrt(nb_rows), capped at 100
nb_cats = min(int(nb_rows**0.5), 100)
shuffle_cols = False

# Column layout of the generated DataFrame
nb_cols_int, nb_cols_date, nb_cols_str = 34, 33, 33

# Map every column name to its value type, in int -> datetime -> str order
table_spec = {}
table_spec.update({f"int{i}": "int" for i in range(nb_cols_int)})
table_spec.update({f"dttime{i}": "datetime" for i in range(nb_cols_date)})
table_spec.update({f"str{i}": "str" for i in range(nb_cols_str)})

# Generate the first partition of data (partition=0) in a DataFrame.
# Both random sources are seeded with 0 before any value is drawn.
myrnd = random.Random(0)  # the constructor argument already seeds the generator
np.random.seed(0)
t0 = time()
print(
    f" --> Dataframe generation with {nb_rows:,d} rows x {len(table_spec)} cols and {'no ' if arr_length==0 else f'a {arr_length}-'}array col ... ",
    end="",
)
max_int = 10**6
date_min = pd.to_datetime("2018-01-01")
date_max = pd.to_datetime("2050-12-31")


# Create the pools of categorical values (decorations / attributes), one per data type.
# The pools are drawn in the order str -> int -> datetime.
chars = string.ascii_letters

# nb_cats strings, each made of 5 to 19 random ASCII letters
str_pool = [
    "".join(myrnd.choice(chars) for _ in range(length))
    for length in np.random.randint(5, 20, size=nb_cats)
]

# nb_cats random int64 values in [-max_int, max_int)
int_pool = np.random.randint(-max_int, max_int, size=nb_cats, dtype=np.int64)

# nb_cats random timestamps between date_min and date_max, drawn as whole
# seconds and scaled back to nanoseconds
datetime_pool = (
    np.random.randint(
        date_min.value // 10**9,
        date_max.value // 10**9,
        size=nb_cats,
        dtype=np.int64,
    )
    * 10**9
)

cats = {"str": str_pool, "int": int_pool, "datetime": datetime_pool}


def _sample_column(typ):
    """Draw nb_rows values from the pool of type `typ` (NaN for an unknown type)."""
    if typ == "datetime":
        # reinterpret the int64 nanosecond values as datetime64[ns]
        return np.random.choice(cats[typ], size=nb_rows).view("M8[ns]")
    if typ in ("str", "int"):
        return np.random.choice(cats[typ], size=nb_rows)
    return np.nan


# Generate data: columns are sampled one after the other, in table_spec order.
# reset_index() exposes the row number as a leading "index" column.
result = pd.DataFrame(
    {name: _sample_column(typ) for name, typ in table_spec.items()}
).reset_index()

# Generate arrays column: one list of arr_length floats in [0, 1e6) per row
if arr_length > 0:
    result["arrFloat"] = (np.random.rand(nb_rows, arr_length) * 1e6).tolist()

# Add a partition column
result["partition"] = 0
t1 = time()
print(f" --> done in {t1-t0:.2f} sec")
print(f" --> Memory used = {result.memory_usage(deep=True).sum()/1e6:,.1f} MBytes")

 --> Dataframe generation with 10,000 rows x 100 cols and a 1000-array col ...  --> done in 0.21 sec
 --> Memory used = 108.5 MBytes


In [4]:
# Dataset duplication: each new partition is a relabelled copy of the rows
# accumulated so far, capped at the last max_copied_rows rows.
max_copied_rows = 200000

# Restart from partition 0 so that re-running this cell rebuilds the same
# dataset instead of stacking extra partitions on top of a previous run.
result = result[result["partition"] == 0]

for part in range(1, nb_part):
    # Slice first, then copy: only the rows that are kept get duplicated,
    # rather than the whole accumulated frame on every iteration.
    duplicate = result.tail(max_copied_rows).assign(partition=part)
    # ignore_index gives the combined frame a unique 0..n-1 row index
    result = pd.concat([result, duplicate], ignore_index=True)

print(
    f" --> Duplicated dataset generated with {len(result.index):,d} rows x {len(table_spec)} cols and {'no ' if arr_length==0 else f'a {arr_length}-'}array col ... ",
)

 --> Duplicated dataset generated with 1,720,000 rows x 100 cols and a 1000-array col ... 


In [5]:
# Preview the first five rows of the dataset
result.head(5)

Unnamed: 0,index,int0,int1,int2,int3,int4,int5,int6,int7,int8,int9,int10,int11,int12,int13,int14,int15,int16,int17,int18,int19,int20,int21,int22,int23,int24,int25,int26,int27,int28,int29,int30,int31,int32,int33,dttime0,dttime1,dttime2,dttime3,dttime4,dttime5,dttime6,dttime7,dttime8,dttime9,dttime10,dttime11,dttime12,dttime13,dttime14,dttime15,dttime16,dttime17,dttime18,dttime19,dttime20,dttime21,dttime22,dttime23,dttime24,dttime25,dttime26,dttime27,dttime28,dttime29,dttime30,dttime31,dttime32,str0,str1,str2,str3,str4,str5,str6,str7,str8,str9,str10,str11,str12,str13,str14,str15,str16,str17,str18,str19,str20,str21,str22,str23,str24,str25,str26,str27,str28,str29,str30,str31,str32,arrFloat,partition
0,0,-138909,475712,-990641,-207047,-658155,-467426,-161629,490474,420514,793119,145566,319472,-161629,-161629,-227342,-147059,403368,-891517,-621960,213186,-604366,-667018,-59186,666654,545390,38359,-658155,-604366,-147059,-467426,234801,-467426,810688,-63768,2023-03-04 13:01:59,2047-02-27 23:49:30,2023-03-04 13:01:59,2043-04-19 09:29:10,2032-06-24 20:03:07,2047-10-20 15:33:44,2024-06-16 02:29:16,2038-05-05 02:46:21,2025-01-07 19:09:09,2039-10-20 06:28:29,2033-09-23 11:56:03,2022-07-09 16:36:05,2047-10-20 15:33:44,2037-11-19 14:02:07,2032-02-29 04:30:21,2023-01-15 09:35:01,2024-07-28 03:03:39,2035-06-26 09:26:00,2034-01-01 00:46:34,2044-03-15 19:08:09,2045-08-05 16:59:24,2018-03-07 20:03:02,2040-02-12 13:37:06,2041-01-25 00:38:00,2040-01-10 02:56:57,2037-11-19 14:02:07,2022-07-30 12:17:58,2039-01-18 13:56:44,2030-01-15 14:57:20,2032-06-24 20:03:07,2033-08-23 08:45:34,2041-01-25 00:38:00,2044-12-08 23:07:23,RjkOKyZO,jtgUe,jFMTfRSjZwAcNDyDd,CffuGFgtJsThJv,nRqJuxKcVSMPFTPDOBx,TVuucHjqMYjyLsT,nIgAOIzVXY,YOeccFqbHQKKn,anQRUhVasxSbMoj,InZMJLsC,YxTfvXNc,TZEeZfHce,mJOfUiazRAuana,RzTHrHZpnRLA,ssJOul,GthjBK,WRmhF,LrCFQPSYwfu,sWRmMfceYqtIvhHpWke,UpdtlH,WzrObhrQcaqzHLTz,sxAQciMbzeSeiA,gBDTvykvA,IlnyL,YmcYzCxWmDw,yYXXFhdNS,ehwSbwwlaoxeMjn,YxTfvXNc,PdCtjVFdNnbwEza,oxYkvBdgYjSoc,gBDTvykvA,BNgqeo,anQRUhVasxSbMoj,"[626854.9420948013, 79977.70645339353, 396823....",0
1,1,373476,-99861,-604366,-518622,-625686,223907,213186,989562,513123,-241085,-227342,-99861,-230387,914136,-898739,319472,-59186,-467426,132931,194775,-891517,-701633,824443,319472,113146,-241085,-658155,672802,-147059,697698,914136,145566,-701633,-181783,2027-08-31 07:00:40,2022-07-30 12:17:58,2038-11-07 15:41:32,2041-08-30 09:08:48,2047-11-03 22:27:02,2037-12-11 20:54:07,2032-06-24 20:03:07,2031-02-14 12:01:08,2027-08-31 07:00:40,2048-05-12 10:45:08,2025-10-20 08:11:00,2037-03-24 06:24:03,2039-10-20 06:28:29,2037-04-07 17:00:51,2025-01-07 19:09:09,2026-09-07 09:45:49,2042-09-10 08:37:23,2043-04-19 09:29:10,2047-10-20 15:33:44,2031-08-23 07:20:00,2035-12-06 14:24:21,2045-08-05 16:59:24,2022-07-09 16:36:05,2018-03-07 20:03:02,2023-02-11 02:05:28,2019-02-18 08:43:07,2049-05-17 20:36:35,2044-03-15 19:08:09,2025-01-07 19:09:09,2049-11-08 07:15:47,2047-02-27 23:49:30,2019-05-30 01:12:30,2045-02-10 04:09:04,rsCxKOikhShyzLDiJ,UuTemKopZjZI,tJAVjLBtOwfpCO,EzOfb,UuCnxsEflYg,gSzmqwUEKkSR,ofXOXGSHA,xchNarOSsUojWKsm,lcNQqEe,BfgAegAXj,czSKAXQTckCeqSkCHF,yYXXFhdNS,TWaRHNgmh,rhJMSjYSCzlX,RjkOKyZO,UuCnxsEflYg,yYXXFhdNS,InZMJLsC,YxTfvXNc,DxGyHGcK,iWgNZqITZM,BfgAegAXj,KzORBHFRuFFOQm,MPmtrSlg,DxGyHGcK,UuCnxsEflYg,YOeccFqbHQKKn,YmcYzCxWmDw,lCwYyBZFy,EfRFWoI,nNjgmDyxIjgMFj,fMZyuKpslm,jtgUe,"[96260.4474961476, 985831.9856156815, 459644.8...",0
2,2,129153,824443,-676286,259508,-701633,-975223,-975223,872245,132931,-593941,672802,-659160,545390,513123,-604366,513123,960319,319472,872245,917999,-467426,186660,517543,-891517,796107,545390,-604366,-241085,475712,-207047,654621,129153,-782544,-658155,2037-08-06 18:59:45,2025-01-07 19:09:09,2048-08-09 05:24:11,2037-03-24 06:24:03,2020-08-12 02:22:49,2047-04-27 09:33:34,2028-05-13 19:14:27,2025-01-19 19:12:01,2032-02-29 04:30:21,2033-09-23 11:56:03,2043-04-19 09:29:10,2025-01-07 19:09:09,2027-07-10 23:23:27,2031-08-23 07:20:00,2037-08-06 18:59:45,2026-08-09 14:24:29,2030-05-28 14:24:37,2034-03-04 00:08:04,2022-07-30 12:17:58,2045-08-05 16:59:24,2047-11-03 22:27:02,2034-01-01 00:46:34,2022-07-09 16:36:05,2039-10-20 06:28:29,2047-11-03 22:27:02,2019-02-18 08:43:07,2023-06-16 06:27:04,2042-09-10 08:37:23,2026-08-09 14:24:29,2019-02-18 08:43:07,2043-04-19 09:29:10,2042-09-10 08:37:23,2034-01-29 01:52:47,MPmtrSlg,BfgAegAXj,gSzmqwUEKkSR,KzORBHFRuFFOQm,NsRwLO,fMZyuKpslm,PRBjCTjHuin,DxGyHGcK,lCwYyBZFy,kyjiZouGppWl,nXdYRkkvHqhM,sWRmMfceYqtIvhHpWke,dkPipsUvdcEA,CQlaERAKGtPwyQqjJSa,yagzJHsCFYLTRnBfxoq,DNOvPhRTNs,oxYkvBdgYjSoc,DxGyHGcK,CgVqw,LfgILt,YxTfvXNc,yagzJHsCFYLTRnBfxoq,RjkOKyZO,ssJOul,lcNQqEe,nRqJuxKcVSMPFTPDOBx,LfgILt,INoav,nIgAOIzVXY,gJEEvZZvhEhSFBctvV,yYXXFhdNS,CQlaERAKGtPwyQqjJSa,CgVqw,"[792784.0747212492, 882513.4057647634, 723408....",0
3,3,490474,-99861,960319,-659078,531028,-497521,-134640,259689,793119,139504,666654,113146,308554,62830,-667018,-467426,403368,872245,62216,933619,113146,672802,186660,793119,-631085,697698,697698,673660,824443,62830,373476,-134640,-891517,960319,2032-06-24 20:03:07,2035-12-06 14:24:21,2045-02-24 04:02:53,2046-01-18 11:21:33,2035-06-26 09:26:00,2047-05-30 02:57:35,2031-07-21 14:32:43,2037-08-06 18:59:45,2030-05-15 07:25:45,2026-08-09 14:24:29,2032-02-29 04:30:21,2033-09-23 11:56:03,2031-07-21 14:32:43,2041-11-17 02:58:19,2040-01-10 02:56:57,2020-08-12 02:22:49,2031-07-21 14:32:43,2019-05-30 01:12:30,2030-01-15 14:57:20,2023-06-16 06:27:04,2039-10-20 06:28:29,2030-05-28 14:24:37,2034-03-04 00:08:04,2038-05-05 02:46:21,2049-02-09 07:44:03,2035-11-05 04:12:40,2037-12-11 20:54:07,2046-01-18 11:21:33,2047-10-20 15:33:44,2037-03-24 06:24:03,2023-03-04 13:01:59,2040-01-10 02:56:57,2034-11-08 02:59:35,LrCFQPSYwfu,gBDTvykvA,DxGyHGcK,TZEeZfHce,KOIMRebhOmMKhzfxh,nIgAOIzVXY,LXkBmwheSbHC,JMWacFutDdZZA,UuTemKopZjZI,LfgILt,nXdYRkkvHqhM,rhJMSjYSCzlX,kyjiZouGppWl,rsCxKOikhShyzLDiJ,YQOaNFvp,AssHiKHO,WRmhF,rsCxKOikhShyzLDiJ,yYXXFhdNS,fMZyuKpslm,LfgILt,ePtwBldGDcM,NsRwLO,HeRfRVQza,lCwYyBZFy,YQOaNFvp,lCwYyBZFy,TWaRHNgmh,sxAQciMbzeSeiA,DVfvVcIripWEw,AssHiKHO,yWAcqGFzYtEwLnGis,iWgNZqITZM,"[371241.94865894347, 837067.6999089712, 402934...",0
4,4,-975223,-215760,-891517,373476,194775,545390,-621960,-63768,960319,513123,234801,62216,-138909,810688,-207047,917999,113146,673660,697698,-181783,308554,-461888,-631085,-766862,403368,697698,545390,145566,-658155,914136,-230387,259689,960319,475712,2031-05-05 00:22:11,2039-10-20 06:28:29,2025-01-19 19:12:01,2048-02-12 00:39:46,2035-06-26 09:26:00,2039-09-25 03:50:17,2037-12-11 20:54:07,2025-10-20 08:11:00,2047-10-20 15:33:44,2048-08-09 05:24:11,2027-11-01 15:52:34,2037-11-16 14:35:06,2037-08-06 18:59:45,2042-07-04 14:39:57,2037-12-11 20:54:07,2047-12-17 19:55:50,2030-01-15 14:57:20,2036-03-01 10:38:16,2045-02-24 04:02:53,2037-11-16 14:35:06,2023-01-15 09:35:01,2025-01-19 19:12:01,2030-05-15 07:25:45,2048-05-12 10:45:08,2041-11-17 02:58:19,2039-10-20 06:28:29,2037-04-07 17:00:51,2046-01-18 11:21:33,2020-08-12 02:22:49,2043-04-19 09:29:10,2040-02-12 13:37:06,2040-01-10 02:56:57,2039-01-18 13:56:44,rhJMSjYSCzlX,lDhEwTqibnxvE,ehwSbwwlaoxeMjn,xOHdyAaA,NhFLOvmpbUrhT,ssJOul,YxTfvXNc,DxGyHGcK,UuTemKopZjZI,gEXjbcM,UuCnxsEflYg,xOHdyAaA,qtXAyZy,anQRUhVasxSbMoj,KOIMRebhOmMKhzfxh,CgVqw,cMbmlThEnUZdRbI,PdCtjVFdNnbwEza,RzTHrHZpnRLA,INoav,AssHiKHO,UuCnxsEflYg,tJAVjLBtOwfpCO,LXkBmwheSbHC,ePtwBldGDcM,oictaWCvkZjP,BNgqeo,zqnPcZ,LfgILt,KOIMRebhOmMKhzfxh,YiyZsVRZhH,LrCFQPSYwfu,sxAQciMbzeSeiA,"[579798.5940131244, 639977.9452908025, 53056.7...",0


In [6]:
# Export the dataset to "dataset.csv"; the DataFrame index is not written
result.to_csv(path_or_buf="dataset.csv", index=False)

In [7]:
# Write dataframe to a parquet file,
# using pyarrow to specify the array column as float[].

import pyarrow as pa

# Infer the Arrow schema from a one-row slice: the column types are the same
# as for the full frame, but the whole dataset is not converted to Arrow
# just to read its schema.
schema = pa.Table.from_pandas(result.iloc[:1], preserve_index=False).schema

# Same fields as inferred, except arrFloat which is forced to a list of float32
new_schema = pa.schema(
    [
        pa.field("arrFloat", pa.list_(pa.float32())) if f.name == "arrFloat" else f
        for f in schema
    ]
)
result.to_parquet(
    "dataset.parquet", index=False, schema=new_schema, row_group_size=10000
)