In [39]:
import pandas as pd

In [55]:
# Load the full synthetic census dataset and preview the first rows.
# NOTE: data/census_2M.csv is written by the "Generate Synthetic Data" section
# further down (df_synth.to_csv), so that section has to be run first.
df = pd.read_csv("data/census_2M.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,29,Private,170491,Assoc-voc,11,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,60,Yugoslavia,>50K
1,35,Private,205681,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,55,United-States,>50K
2,26,Local-gov,58344,HS-grad,9,Married-spouse-absent,Transport-moving,Not-in-family,White,Male,0,0,50,United-States,<=50K
3,56,State-gov,28368,HS-grad,9,Married-spouse-absent,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,<=50K
4,29,Private,284241,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,55,United-States,>50K


In [56]:
# Row count of the full dataset (2,000,000 in the recorded run).
len(df)

2000000

In [57]:
# The first 10,000 rows form the "base" split, which is saved to
# data/census_base.csv and later used to create the Delta table.
df_base = df.iloc[:10_000]
len(df_base)

10000

In [58]:
# Every row after the first 10,000 is the "new" split, saved to
# data/census_new.csv for the incremental-write experiments.
df_rest = df.iloc[10_000:]
len(df_rest)

1990000

In [59]:
# Persist both splits; index=False keeps the pandas row index out of the CSVs.
df_base.to_csv("data/census_base.csv", index=False)
df_rest.to_csv("data/census_new.csv", index=False)

In [60]:
# Sanity check: re-read the "new" split and confirm its row count.
# Note that this rebinds `df`, which previously held the full dataset.
df = pd.read_csv("data/census_new.csv")
len(df)

1990000

## Create Delta table

In [1]:
import pyspark
# Import only the helper this notebook uses instead of `from delta import *`,
# so it is clear where every name in the namespace comes from.
from delta import configure_spark_with_delta_pip

# Session builder with the Delta Lake SQL extension and catalog enabled.
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp2")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip adds the Delta Lake package for the
# pip-installed delta version to the builder before the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
# Read the base split with Spark. Only header=True is passed (no schema and no
# inferSchema), so every column is read as a string; see the dtypes in the
# debugging section at the end of the notebook.
df = spark.read.csv("data/census_base.csv", header=True)

In [3]:
# Preview 5 rows; truncate=9 shortens longer string values (e.g. "Marrie...")
# so the wide table stays readable.
df.show(5, truncate=9)

+---+---------+------+---------+-------------+--------------+----------+------------+-----+------+------------+------------+--------------+--------------+------+
|age|workclass|fnlwgt|education|education_num|marital_status|occupation|relationship| race|   sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+---+---------+------+---------+-------------+--------------+----------+------------+-----+------+------------+------------+--------------+--------------+------+
| 29|  Private|170491|Assoc-voc|           11|     Marrie...| Tech-s...|     Husband|White|  Male|           0|           0|            60|     Yugosl...|  >50K|
| 35|  Private|205681|Bachelors|           13|     Marrie...|     Sales|     Husband|White|  Male|           0|           0|            55|     United...|  >50K|
| 26|Local-gov| 58344|  HS-grad|            9|     Marrie...| Transp...|   Not-in...|White|  Male|           0|           0|            50|     United...| <=50K|
| 56|State-gov| 28368|  HS-g

In [4]:
# Should equal the size of the base split (10,000 rows).
df.count()

10000

In [5]:
df.columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education_num',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'native_country',
 'income']

In [6]:
# partitioned write on education col
# No save mode is set, so Spark's default (raise an error if the target path
# already exists) applies: remove delta/census_table before re-running this cell.
df.write.format("delta").partitionBy("education").save("delta/census_table")

In [7]:
# read back in to confirm
# Rebinds `df` to the Delta-backed DataFrame; show() prints the first rows.
df = spark.read.format("delta").load("delta/census_table")
df.show()

+---+------------+------+------------+-------------+--------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+
|age|   workclass|fnlwgt|   education|education_num|      marital_status|       occupation| relationship| race|   sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+---+------------+------+------------+-------------+--------------------+-----------------+-------------+-----+------+------------+------------+--------------+--------------+------+
| 29|     Private|284241|Some-college|           10|  Married-civ-spouse|     Adm-clerical|      Husband|White|  Male|           0|           0|            55| United-States|  >50K|
| 30|Self-emp-inc|224938|Some-college|           10|           Separated|  Exec-managerial|Not-in-family|White|  Male|           0|           0|            60| United-States|  >50K|
| 36|     Private|231026|Some-college|           10|  Married-civ-spouse|     Craft-repair

In [8]:
# The Delta table should contain exactly the 10,000 base rows.
df.count()

10000

Nice, the write worked: the table reads back with all 10,000 rows.

Our Delta table looks like this on disk:

In [9]:
!ls delta/census_table/

[34m_delta_log[m[m             [34meducation=7th-8th[m[m      [34meducation=HS-grad[m[m
[34meducation=10th[m[m         [34meducation=9th[m[m          [34meducation=Masters[m[m
[34meducation=11th[m[m         [34meducation=Assoc-acdm[m[m   [34meducation=Preschool[m[m
[34meducation=12th[m[m         [34meducation=Assoc-voc[m[m    [34meducation=Prof-school[m[m
[34meducation=1st-4th[m[m      [34meducation=Bachelors[m[m    [34meducation=Some-college[m[m
[34meducation=5th-6th[m[m      [34meducation=Doctorate[m[m


## Generate Synthetic Data

In [14]:
!pip install mostlyai

Collecting mostlyai
  Downloading mostlyai-0.3.8-py3-none-any.whl.metadata (3.2 kB)
Collecting pydantic<3.0.0,>=2.4.2 (from mostlyai)
  Downloading pydantic-2.7.2-py3-none-any.whl.metadata (108 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.5/108.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rich>=13.7.0 (from mostlyai)
  Using cached rich-13.7.1-py3-none-any.whl.metadata (18 kB)
Collecting annotated-types>=0.4.0 (from pydantic<3.0.0,>=2.4.2->mostlyai)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.18.3 (from pydantic<3.0.0,>=2.4.2->mostlyai)
  Downloading pydantic_core-2.18.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.5 kB)
Collecting markdown-it-py>=2.2.0 (from rich>=13.7.0->mostlyai)
  Using cached markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)
Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich>=13.7.0->mostlyai)
  Using cached mdurl-0.1.2-py3-none-any.whl.meta

In [18]:
import getpass

from mostlyai import MostlyAI

# Ask for the API key interactively so it never ends up in the notebook source.
api_key = getpass.getpass()

# Client for the MOSTLY AI platform.
mostly = MostlyAI(
    base_url='https://app.mostly.ai',
    api_key=api_key,
)

# Look up the census generator by its id and display its configuration.
g = mostly.generators.get('32a4e79d-e592-4e90-a1a9-e9e3470813d9')
config = g.config()
config

 ········


{'name': 'Sample Census Data Generator',
 'description': 'Sample synthetic data generator based on the Census 1994 dataset, commonly known as the "Adult" dataset. Originating from the U.S. Census Bureau database, it was extracted from the 1994 census. The generator has been trained from a subset (48K rows) of the original dataset.',
 'tables': [{'name': 'census',
   'sourceConnectorId': '856b7b1d-0d5e-4c95-a614-db5c58e6d3c0',
   'location': None,
   'data': None,
   'modelConfiguration': {'maxSampleSize': 48842,
    'batchSize': None,
    'modelSize': 'M',
    'maxTrainingTime': 10,
    'maxEpochs': 100,
    'maxSequenceWindow': 100,
    'enableFlexibleGeneration': True,
    'valueProtection': True,
    'rareCategoryReplacementMethod': 'CONSTANT'},
   'textModelConfiguration': None,
   'primaryKey': None,
   'foreignKeys': None,
   'columns': [{'name': 'age',
     'included': True,
     'modelEncodingType': 'NUMERIC_AUTO'},
    {'name': 'workclass',
     'included': True,
     'modelEn

In [21]:
# probe for some samples
# Requests 10 rows from the generator as a quick look at the output format
# before generating the full dataset.
mostly.probe(g, size=10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,23,Self-emp-inc,129423,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Male,0,0,50,United-States,<=50K
1,54,Private,194063,Masters,14,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,>50K
2,36,Private,294103,Some-college,10,Married-civ-spouse,Exec-managerial,Wife,Amer-Indian-Eskimo,Female,0,0,43,United-States,<=50K
3,50,Private,279291,Doctorate,16,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,65,United-States,>50K
4,30,Private,118517,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,35,United-States,<=50K
5,45,Private,285876,HS-grad,9,Divorced,Transport-moving,Not-in-family,White,Male,0,0,40,United-States,<=50K
6,53,Self-emp-not-inc,141405,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,55,United-States,<=50K
7,46,Private,222176,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,<=50K
8,38,Private,72099,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,2894,0,45,United-States,<=50K
9,52,Private,29764,HS-grad,9,Divorced,Sales,Unmarried,White,Female,0,0,40,United-States,<=50K


In [22]:
# use generator to create a synthetic dataset
# Requests 2,000,000 rows; the returned object is read via sd.data() below.
sd = mostly.generate(g, size=2_000_000)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,29,Private,170491,Assoc-voc,11,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,60,Yugoslavia,>50K
1,35,Private,205681,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,55,United-States,>50K
2,26,Local-gov,58344,HS-grad,9,Married-spouse-absent,Transport-moving,Not-in-family,White,Male,0,0,50,United-States,<=50K
3,56,State-gov,28368,HS-grad,9,Married-spouse-absent,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,<=50K
4,29,Private,284241,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,55,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,18,Private,148801,11th,7,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,18,United-States,<=50K
99996,25,Private,212180,Bachelors,13,Never-married,Sales,Own-child,White,Male,675,0,50,United-States,<=50K
99997,32,State-gov,452849,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,<=50K
99998,53,Private,282557,Some-college,10,Divorced,Sales,Not-in-family,Black,Male,0,0,40,Jamaica,<=50K


In [23]:
# Fetch the generated rows.
# NOTE(review): presumably a pandas DataFrame, since it is written with
# .to_csv below; confirm against the mostlyai client docs.
df_synth = sd.data()

In [52]:
df_synth

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,29,Private,170491,Assoc-voc,11,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,60,Yugoslavia,>50K
1,35,Private,205681,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,55,United-States,>50K
2,26,Local-gov,58344,HS-grad,9,Married-spouse-absent,Transport-moving,Not-in-family,White,Male,0,0,50,United-States,<=50K
3,56,State-gov,28368,HS-grad,9,Married-spouse-absent,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,<=50K
4,29,Private,284241,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,55,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,18,Private,148801,11th,7,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,18,United-States,<=50K
99996,25,Private,212180,Bachelors,13,Never-married,Sales,Own-child,White,Male,675,0,50,United-States,<=50K
99997,32,State-gov,452849,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,<=50K
99998,53,Private,282557,Some-college,10,Divorced,Sales,Not-in-family,Black,Male,0,0,40,Jamaica,<=50K


In [54]:
# Persist the synthetic data. This is the file the first cell of the notebook
# reads (data/census_2M.csv); index=False keeps the row index out of the CSV.
df_synth.to_csv("data/census_2M.csv", index=False)

## Incremental Writes
Now let's move on to appending data in little increments.

We'll compare a few different options:

1. vanilla writes with no configs
2. `optimized write` (needs to be distributed?)
3. `auto compaction`

To start, let's split the new data into chunks of 1,000 rows and append each chunk to the partitioned table.

### 1. vanilla append write

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the "new" split and cast every column to str in one chained expression,
# so the values line up with the all-string columns Spark reads from the base CSV.
df_new = pd.read_csv("data/census_new.csv").astype(str)

In [3]:
# chunking parameters: the new rows are written in fixed-size pieces
chunk_size = 1000
n = len(df_new)
# ceiling division, so a final partial chunk is still counted
num_chunks = (n + chunk_size - 1) // chunk_size
# filled by the next cell
chunks = []

In [4]:
# Slice df_new into consecutive pieces of chunk_size rows (the last piece may
# be shorter). The list is rebuilt from scratch rather than appended to, so
# re-running this cell no longer duplicates every chunk.
chunks = [
    df_new.iloc[chunk_size * i : chunk_size * (i + 1)]
    for i in range(num_chunks)
]

### Parallelize the for-loop write with Dask

In [5]:
# create local dask cluster
import dask

from dask.distributed import Client, LocalCluster
# Start a cluster with 4 workers on this machine.
cluster = LocalCluster(n_workers=4)
# Connect a client to that cluster.
client = Client(cluster)

In [9]:
import dask.array as da

# load chunks array onto dask cluster
# NOTE: chunks=num_chunks keeps everything in a single dask chunk (the recorded
# output shows shape (1990, 1000, 15) with 1 chunk), and chunks_dask is not
# used by the write cells below, which iterate over the `chunks` list instead.
chunks_dask = da.from_array(chunks, chunks=num_chunks)
chunks_dask

Unnamed: 0,Array,Chunk
Bytes,227.74 MiB,227.74 MiB
Shape,"(1990, 1000, 15)","(1990, 1000, 15)"
Dask graph,1 chunks in 1 graph layer,1 chunks in 1 graph layer
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 227.74 MiB 227.74 MiB Shape (1990, 1000, 15) (1990, 1000, 15) Dask graph 1 chunks in 1 graph layer Data type object numpy.ndarray",15  1000  1990,

Unnamed: 0,Array,Chunk
Bytes,227.74 MiB,227.74 MiB
Shape,"(1990, 1000, 15)","(1990, 1000, 15)"
Dask graph,1 chunks in 1 graph layer,1 chunks in 1 graph layer
Data type,object numpy.ndarray,object numpy.ndarray


In [6]:
# write chunks
@dask.delayed
def write_chunk(chunk, chunk_n):
    """Write one chunk to ``results/<chunk_n>.csv`` and return its number.

    Parameters
    ----------
    chunk
        Rows to write; must provide ``to_csv`` (the notebook passes slices of
        ``df_new``).
    chunk_n
        Position of the chunk; used as the output file name.

    Returns
    -------
    ``chunk_n`` unchanged, so the computed results list shows which chunks
    were written.
    """
    # Local import keeps the task self-contained wherever it executes.
    import os

    # to_csv does not create missing parent directories, so make sure the
    # output folder exists; exist_ok makes this safe across parallel tasks.
    os.makedirs("results", exist_ok=True)
    chunk.to_csv(f"results/{chunk_n}.csv")
    return chunk_n

In [7]:
# One delayed write task per chunk, keyed by its position in the list.
# Nothing is written yet; the tasks run when dask.compute is called.
results = [write_chunk(chunk, chunk_n) for chunk_n, chunk in enumerate(chunks)]

    
# # iterate over chunks    
# for i in range(num_chunks):
#     pandas_df = chunks[i]
#     df = spark.createDataFrame(pandas_df)
#     df.write.format("delta").partitionBy("education").mode("append").save("delta/census_table/")
#     print(f"Write {i+1} succesfull.")

In [8]:
%%time
# Run all delayed write tasks and collect their return values (chunk numbers).
dask.compute(results)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


CPU times: user 9.56 s, sys: 2.88 s, total: 12.4 s
Wall time: 19.8 s


([0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157,
  15

In [None]:
# damn that was fast
# serial writes with pyspark are taking ~1 hour
# to be fair - I'm writing 3 duplicate files at the same time
# but still

In [None]:
# just for fun
# what happens if I try to parallelize a spark write

In [9]:
# write chunks
# NOTE: this redefines write_chunk and shadows the CSV-writing version above.
@dask.delayed
def write_chunk(chunk, chunk_n):
    """Append one chunk to the Delta table at delta/census_table_from_dask/.

    Converts ``chunk`` to a Spark DataFrame, appends it to the table
    partitioned by ``education``, prints a progress line, and returns
    ``chunk_n``.
    """
    # `spark` is resolved as a global when the task runs; in the recorded run
    # it was not defined and the compute call failed with NameError.
    df = spark.createDataFrame(chunk)
    df.write.format("delta").partitionBy("education").mode("append").save("delta/census_table_from_dask/")
    print(f"Write {chunk_n + 1} complete.")
    return chunk_n

# One delayed task per (index, chunk) pair; replaces the earlier `results`.
results = []
for chunk in enumerate(chunks):
    results.append(write_chunk(chunk[1], chunk[0]))

In [10]:
%%time
# Attempt to run the Spark-based write tasks; the recorded run raised
# NameError: name 'spark' is not defined.
dask.compute(results)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


NameError: name 'spark' is not defined

In [None]:
# doesn't work out of the box
# need to have spark running on my dask cluster
# not trivial
# https://github.com/mrocklin/dask-spark

### debugging (some useful info about dtypes)

In [37]:
# ok we're getting a `DELTA_FAILED_TO_MERGE_FIELDS` error
# strange because these tables should have the same schema
# although the base file is older, maybe there's some difference
# let's see if any files got written

In [38]:
# read back in to confirm
# If any append had succeeded, the count would exceed the 10,000 base rows.
df = spark.read.format("delta").load("delta/census_table")
df.count()

10000

In [73]:
# no
# so probably something has changed about the schema

# i've updated the `base` file to be a part of the newly generated data
# so schema discrepancy shouldn't be an issue
# let's see if there's something changing between pandas/pyspark

In [74]:
# Take the first chunk as a small sample to inspect the pandas -> Spark conversion.
pandas_df = chunks[0]
pandas_df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,25,Private,249331,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0,0,45,United-States,<=50K
1,47,Self-emp-not-inc,171732,Assoc-voc,11,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,56,United-States,>50K
2,19,Private,61497,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,35,United-States,<=50K
3,51,Local-gov,262724,11th,7,Married-civ-spouse,Other-service,Husband,Black,Male,4410,0,40,United-States,>50K
4,31,Private,190364,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,4415,0,45,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,38,Private,80899,Bachelors,13,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,1907,45,?,>50K
996,31,Private,851983,Assoc-acdm,12,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,Mexico,<=50K
997,60,Private,173089,Doctorate,16,Widowed,Exec-managerial,Unmarried,White,Female,0,0,40,United-States,>50K
998,45,Private,145912,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,3112,0,40,United-States,>50K


In [75]:
# Convert the sample chunk to a Spark DataFrame; column types are derived from
# the pandas dtypes (inspected below).
spark_df = spark.createDataFrame(pandas_df)

In [76]:
spark_df.count()

1000

In [77]:
spark_df.columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education_num',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'native_country',
 'income']

In [78]:
# In the recorded run the numeric columns come through as bigint here, unlike
# the all-string columns of the base table (see df_base.dtypes below).
spark_df.dtypes

[('age', 'bigint'),
 ('workclass', 'string'),
 ('fnlwgt', 'bigint'),
 ('education', 'string'),
 ('education_num', 'bigint'),
 ('marital_status', 'string'),
 ('occupation', 'string'),
 ('relationship', 'string'),
 ('race', 'string'),
 ('sex', 'string'),
 ('capital_gain', 'bigint'),
 ('capital_loss', 'bigint'),
 ('hours_per_week', 'bigint'),
 ('native_country', 'string'),
 ('income', 'string')]

In [79]:
# Re-read the base CSV the same way the Delta table was created (header only,
# no schema) so its dtypes can be compared. Rebinds `df_base`, which earlier
# held the pandas base split.
df_base = spark.read.csv("data/census_base.csv", header=True)

In [80]:
# Every column is a string here; this is the mismatch with the bigint columns
# of the pandas-derived Spark DataFrame above.
df_base.dtypes

[('age', 'string'),
 ('workclass', 'string'),
 ('fnlwgt', 'string'),
 ('education', 'string'),
 ('education_num', 'string'),
 ('marital_status', 'string'),
 ('occupation', 'string'),
 ('relationship', 'string'),
 ('race', 'string'),
 ('sex', 'string'),
 ('capital_gain', 'string'),
 ('capital_loss', 'string'),
 ('hours_per_week', 'string'),
 ('native_country', 'string'),
 ('income', 'string')]

In [82]:
# spark is reading all the cols as string dtype
# interesting and inefficient
# quick fix: set all cols to string in pandas import
# longer fix: should probably provide schema upon spark read