## Create Delta table

In [1]:
import pyspark
from delta import *

builder = pyspark.sql.SparkSession.builder.appName("delta-write") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
df = spark.read.csv("data/census_base.csv", header=True)

In [3]:
df.show(5, truncate=9)

+---+---------+------+---------+-------------+--------------+----------+------------+-----+------+------------+------------+--------------+--------------+------+
|age|workclass|fnlwgt|education|education_num|marital_status|occupation|relationship| race|   sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+---+---------+------+---------+-------------+--------------+----------+------------+-----+------+------------+------------+--------------+--------------+------+
| 29|  Private|170491|Assoc-voc|           11|     Marrie...| Tech-s...|     Husband|White|  Male|           0|           0|            60|     Yugosl...|  >50K|
| 35|  Private|205681|Bachelors|           13|     Marrie...|     Sales|     Husband|White|  Male|           0|           0|            55|     United...|  >50K|
| 26|Local-gov| 58344|  HS-grad|            9|     Marrie...| Transp...|   Not-in...|White|  Male|           0|           0|            50|     United...| <=50K|
| 56|State-gov| 28368|  HS-g

In [4]:
df.count()

10000

In [5]:
df.columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education_num',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'native_country',
 'income']

In [6]:
# partitioned write on education col
df.write.format("delta").partitionBy("education").save("delta/census_table_raw")

In [7]:
# read back in to confirm
df = spark.read.format("delta").load("delta/census_table_raw")
df.show()

+---+------------+------+---------+-------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+
|age|   workclass|fnlwgt|education|education_num|      marital_status|       occupation| relationship|              race|   sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+---+------------+------+---------+-------------+--------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+
| 26|   Local-gov| 58344|  HS-grad|            9|Married-spouse-ab...| Transport-moving|Not-in-family|             White|  Male|           0|           0|            50| United-States| <=50K|
| 56|   State-gov| 28368|  HS-grad|            9|Married-spouse-ab...|Machine-op-inspct|    Unmarried|             White|Female|           0|           0|            40| United-States| <=50K|
| 43|           ?|253250|  HS-grad|     

In [8]:
df.count()

10000

Nice. This is working and done.

Our Delta table looks like this on disk:

In [9]:
!ls delta/census_table_raw/

[34m_delta_log[m[m             [34meducation=7th-8th[m[m      [34meducation=HS-grad[m[m
[34meducation=10th[m[m         [34meducation=9th[m[m          [34meducation=Masters[m[m
[34meducation=11th[m[m         [34meducation=Assoc-acdm[m[m   [34meducation=Preschool[m[m
[34meducation=12th[m[m         [34meducation=Assoc-voc[m[m    [34meducation=Prof-school[m[m
[34meducation=1st-4th[m[m      [34meducation=Bachelors[m[m    [34meducation=Some-college[m[m
[34meducation=5th-6th[m[m      [34meducation=Doctorate[m[m


## Generate Synthetic Data

In [14]:
!pip install mostlyai

Collecting mostlyai
  Downloading mostlyai-0.3.8-py3-none-any.whl.metadata (3.2 kB)
Collecting pydantic<3.0.0,>=2.4.2 (from mostlyai)
  Downloading pydantic-2.7.2-py3-none-any.whl.metadata (108 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.5/108.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rich>=13.7.0 (from mostlyai)
  Using cached rich-13.7.1-py3-none-any.whl.metadata (18 kB)
Collecting annotated-types>=0.4.0 (from pydantic<3.0.0,>=2.4.2->mostlyai)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.18.3 (from pydantic<3.0.0,>=2.4.2->mostlyai)
  Downloading pydantic_core-2.18.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.5 kB)
Collecting markdown-it-py>=2.2.0 (from rich>=13.7.0->mostlyai)
  Using cached markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)
Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich>=13.7.0->mostlyai)
  Using cached mdurl-0.1.2-py3-none-any.whl.meta

In [18]:
from mostlyai import MostlyAI
import getpass

api_key = getpass.getpass()

# initialize client
mostly = MostlyAI(api_key=api_key, base_url='https://app.mostly.ai')

# fetch configuration via API
g = mostly.generators.get('32a4e79d-e592-4e90-a1a9-e9e3470813d9')
config = g.config()
config

 ········


{'name': 'Sample Census Data Generator',
 'description': 'Sample synthetic data generator based on the Census 1994 dataset, commonly known as the "Adult" dataset. Originating from the U.S. Census Bureau database, it was extracted from the 1994 census. The generator has been trained from a subset (48K rows) of the original dataset.',
 'tables': [{'name': 'census',
   'sourceConnectorId': '856b7b1d-0d5e-4c95-a614-db5c58e6d3c0',
   'location': None,
   'data': None,
   'modelConfiguration': {'maxSampleSize': 48842,
    'batchSize': None,
    'modelSize': 'M',
    'maxTrainingTime': 10,
    'maxEpochs': 100,
    'maxSequenceWindow': 100,
    'enableFlexibleGeneration': True,
    'valueProtection': True,
    'rareCategoryReplacementMethod': 'CONSTANT'},
   'textModelConfiguration': None,
   'primaryKey': None,
   'foreignKeys': None,
   'columns': [{'name': 'age',
     'included': True,
     'modelEncodingType': 'NUMERIC_AUTO'},
    {'name': 'workclass',
     'included': True,
     'modelEn

In [21]:
# probe for some samples
mostly.probe(g, size=10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,23,Self-emp-inc,129423,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Male,0,0,50,United-States,<=50K
1,54,Private,194063,Masters,14,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,>50K
2,36,Private,294103,Some-college,10,Married-civ-spouse,Exec-managerial,Wife,Amer-Indian-Eskimo,Female,0,0,43,United-States,<=50K
3,50,Private,279291,Doctorate,16,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,65,United-States,>50K
4,30,Private,118517,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,35,United-States,<=50K
5,45,Private,285876,HS-grad,9,Divorced,Transport-moving,Not-in-family,White,Male,0,0,40,United-States,<=50K
6,53,Self-emp-not-inc,141405,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,55,United-States,<=50K
7,46,Private,222176,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,<=50K
8,38,Private,72099,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,2894,0,45,United-States,<=50K
9,52,Private,29764,HS-grad,9,Divorced,Sales,Unmarried,White,Female,0,0,40,United-States,<=50K


In [22]:
# use generator to create a synthetic dataset
sd = mostly.generate(g, size=2_000_000)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,29,Private,170491,Assoc-voc,11,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,60,Yugoslavia,>50K
1,35,Private,205681,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,55,United-States,>50K
2,26,Local-gov,58344,HS-grad,9,Married-spouse-absent,Transport-moving,Not-in-family,White,Male,0,0,50,United-States,<=50K
3,56,State-gov,28368,HS-grad,9,Married-spouse-absent,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,<=50K
4,29,Private,284241,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,55,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,18,Private,148801,11th,7,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,18,United-States,<=50K
99996,25,Private,212180,Bachelors,13,Never-married,Sales,Own-child,White,Male,675,0,50,United-States,<=50K
99997,32,State-gov,452849,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,<=50K
99998,53,Private,282557,Some-college,10,Divorced,Sales,Not-in-family,Black,Male,0,0,40,Jamaica,<=50K


In [23]:
df_synth = sd.data()

In [52]:
df_synth

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,29,Private,170491,Assoc-voc,11,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,60,Yugoslavia,>50K
1,35,Private,205681,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,55,United-States,>50K
2,26,Local-gov,58344,HS-grad,9,Married-spouse-absent,Transport-moving,Not-in-family,White,Male,0,0,50,United-States,<=50K
3,56,State-gov,28368,HS-grad,9,Married-spouse-absent,Machine-op-inspct,Unmarried,White,Female,0,0,40,United-States,<=50K
4,29,Private,284241,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,55,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,18,Private,148801,11th,7,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,18,United-States,<=50K
99996,25,Private,212180,Bachelors,13,Never-married,Sales,Own-child,White,Male,675,0,50,United-States,<=50K
99997,32,State-gov,452849,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,<=50K
99998,53,Private,282557,Some-college,10,Divorced,Sales,Not-in-family,Black,Male,0,0,40,Jamaica,<=50K


In [54]:
df_synth.to_csv("data/census_2M.csv", index=False)

## Incremental Writes
Now let's move on to appending data in little increments.

We'll compare a few different options:

1. vanilla writes with no configs
2. `optimized write` (needs to be distributed?)
3. `auto compaction`

To start, let's iterate over every 1000 rows and write to the partitioned table.

### 1. vanilla append write

In [10]:
import pandas as pd
import numpy as np

In [11]:
# read in all new data
df_new = pd.read_csv("data/census_new.csv")
df_new = df_new.astype(str)

In [12]:
# def chunking params
n = len(df_new)
chunk_size = 1000
chunks = []
num_chunks = int(np.ceil(n / chunk_size))

In [13]:
# def chunks
for i in range(num_chunks):
    start = chunk_size * i
    stop = start + chunk_size
    chunks.append(df_new.iloc[start:stop])

In [14]:
# iterate over chunks    
for i in range(num_chunks):
    pandas_df = chunks[i]
    df = spark.createDataFrame(pandas_df)
    df.write.format("delta").partitionBy("education").mode("append").save("delta/census_table_raw/")
    print(f"Write {i+1} succesfull.")

Write 1 succesfull.
Write 2 succesfull.
Write 3 succesfull.
Write 4 succesfull.
Write 5 succesfull.
Write 6 succesfull.
Write 7 succesfull.
Write 8 succesfull.
Write 9 succesfull.
Write 10 succesfull.
Write 11 succesfull.
Write 12 succesfull.
Write 13 succesfull.
Write 14 succesfull.
Write 15 succesfull.
Write 16 succesfull.
Write 17 succesfull.
Write 18 succesfull.
Write 19 succesfull.
Write 20 succesfull.
Write 21 succesfull.
Write 22 succesfull.
Write 23 succesfull.
Write 24 succesfull.
Write 25 succesfull.
Write 26 succesfull.
Write 27 succesfull.
Write 28 succesfull.
Write 29 succesfull.
Write 30 succesfull.
Write 31 succesfull.
Write 32 succesfull.
Write 33 succesfull.
Write 34 succesfull.
Write 35 succesfull.
Write 36 succesfull.
Write 37 succesfull.
Write 38 succesfull.
Write 39 succesfull.
Write 40 succesfull.
Write 41 succesfull.
Write 42 succesfull.
Write 43 succesfull.
Write 44 succesfull.
Write 45 succesfull.
Write 46 succesfull.
Write 47 succesfull.
Write 48 succesfull.
W

In [None]:
# this works
# but is not efficient
# what if we could write this data in parallel?

In [None]:
# also:
# let's take a look at file sizes once this is done

# i don't really understand how append operations can lead to small files
# aren't we just operating on the same parquet file and extending it?

# no
# parquet files are immutable
# so we're adding new parquet files each time, currently ~4KB each

In [18]:
# let's take a look at one partition directory
!ls delta/census_table/education\=10th

part-00000-000cf89b-8135-4d31-8621-2f508d65ff25.c000.snappy.parquet
part-00000-006f37b5-bd0c-40d3-977b-2c817c5fbe79.c000.snappy.parquet
part-00000-0079fed5-da16-4226-9e97-0df4c747ff1a.c000.snappy.parquet
part-00000-008a914f-febe-4c69-8ac6-bbc84276f02d.c000.snappy.parquet
part-00000-00ab2d0a-9732-4150-ab3a-b7d62d868466.c000.snappy.parquet
part-00000-00b9d031-2335-4544-8efc-1064b9072a54.c000.snappy.parquet
part-00000-00cb4c75-ca47-4359-ab08-35be55bfaf1e.c000.snappy.parquet
part-00000-01177e68-8ed5-4b3a-a7d0-2a169a68edea.c000.snappy.parquet
part-00000-0175f78a-fec8-46ae-80fe-6ad22ef14e27.c000.snappy.parquet
part-00000-01b80afe-b072-40be-8ff1-60543e4e6546.c000.snappy.parquet
part-00000-01de72d2-852c-4985-97e4-6b829ba444e2.c000.snappy.parquet
part-00000-021ca9ed-4eb0-4f14-b27b-ca869d717afc.c000.snappy.parquet
part-00000-0246b652-bd8f-44c3-80ee-ca9892961b89.c000.snappy.parquet
part-00000-02955cd0-91e1-4eca-96a4-e778b3604fbf.c000.snappy.parquet
part-00000-02cc2214-b7f0-488f-9090-7a9d8993e8d4.

In [None]:
# that's a LOT of files (one per every write that had this column value in it at least once)

In [20]:
# and the size of these files
!ls -lhS delta/census_table/education\=10th

total 113392
-rw-r--r--  1 rpelgrim  staff   8.2K  3 Jun 13:21 part-00000-fb8732d8-0292-48bc-937d-e2bbf9a5aa9f.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.6K  3 Jun 13:37 part-00004-45a6386e-0bed-4af0-b8f7-94e35a644c28.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.6K  3 Jun 13:32 part-00006-d91bb325-65ce-4294-9e35-91f3a04105af.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.5K  3 Jun 13:26 part-00000-64eab389-4786-45c6-a4a4-3f5ffb4ac621.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.5K  3 Jun 13:46 part-00007-a60ebc05-1bd7-471f-9a83-fb5749d94ab1.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.5K  3 Jun 13:30 part-00003-6850e747-7a23-4dae-930f-63a9fc85cf52.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.5K  3 Jun 13:33 part-00005-bcbaea4c-9d3e-4c18-818a-4775529746db.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.5K  3 Jun 13:27 part-00007-b058577f-4b7d-4453-b7b4-2b1135e98800.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.5K  3 Jun

In [None]:
# these files are all around 4.5kb

In [None]:
# let's see how this affects performance

### Performance check

In [None]:
%%time
df.where()

In [None]:
# let's run small-file compaction to clear this up

### Run compaction manually

In [21]:
from delta.tables import *

deltaTable = DeltaTable.forPath(spark, "delta/census_table/")
deltaTable.optimize().executeCompaction()

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterPar

In [None]:
# this optimization takes a while to run (~1-2mins in this case)
# when is it worth it?
# should show performance slowdown before running compaction

In [22]:
# let's look at n and size of files now
!ls -lhS delta/census_table/education\=10th

total 114032
-rw-r--r--  1 rpelgrim  staff   320K  3 Jun 13:59 part-00000-1ad91a86-9cda-4157-b6e4-41f7b103f822.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   8.2K  3 Jun 13:21 part-00000-fb8732d8-0292-48bc-937d-e2bbf9a5aa9f.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.6K  3 Jun 13:37 part-00004-45a6386e-0bed-4af0-b8f7-94e35a644c28.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.6K  3 Jun 13:32 part-00006-d91bb325-65ce-4294-9e35-91f3a04105af.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.5K  3 Jun 13:26 part-00000-64eab389-4786-45c6-a4a4-3f5ffb4ac621.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.5K  3 Jun 13:46 part-00007-a60ebc05-1bd7-471f-9a83-fb5749d94ab1.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.5K  3 Jun 13:30 part-00003-6850e747-7a23-4dae-930f-63a9fc85cf52.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.5K  3 Jun 13:33 part-00005-bcbaea4c-9d3e-4c18-818a-4775529746db.c000.snappy.parquet
-rw-r--r--  1 rpelgrim  staff   4.5K  3 Jun

In [None]:
# still lots of files
# but notice the large file at the top
# all the data has been moved to this file
# older data is still there to enable time travel

In [30]:
spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled=false") 

DataFrame[key: string, value: string]

In [31]:
# let's vacuum
# param = n of last hours to preserve
deltaTable.vacuum(0)

DataFrame[]

In [None]:
# vacuum also takes a while (~2mins) in this case

In [32]:
# let's look at n and size of files now
!ls -lhS delta/census_table/education\=10th

total 640
-rw-r--r--  1 rpelgrim  staff   320K  3 Jun 13:59 part-00000-1ad91a86-9cda-4157-b6e4-41f7b103f822.c000.snappy.parquet


In [None]:
# all data has been moved to 1 Parquet file

### 

### 3. Optimized write (distributed write)

### debugging (some useful info about dtypes)

In [37]:
# ok we're getting a `DELTA_FAILED_TO_MERGE_FIELDS` error
# strange because these tables should have the same schema
# although the base file is older, maybe there's some difference
# let's see if any files got written

In [38]:
# read back in to confirm
df = spark.read.format("delta").load("delta/census_table")
df.count()

10000

In [73]:
# no
# so probably something has changed about the schema

# i've updated the `base` file to be a part of the newly generated data
# so schema discrepancy shouldn't be an issue
# let's see if there's something changing between pandas/pyspark

In [74]:
pandas_df = chunks[0]
pandas_df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,25,Private,249331,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0,0,45,United-States,<=50K
1,47,Self-emp-not-inc,171732,Assoc-voc,11,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,56,United-States,>50K
2,19,Private,61497,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,35,United-States,<=50K
3,51,Local-gov,262724,11th,7,Married-civ-spouse,Other-service,Husband,Black,Male,4410,0,40,United-States,>50K
4,31,Private,190364,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,4415,0,45,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,38,Private,80899,Bachelors,13,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,1907,45,?,>50K
996,31,Private,851983,Assoc-acdm,12,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,Mexico,<=50K
997,60,Private,173089,Doctorate,16,Widowed,Exec-managerial,Unmarried,White,Female,0,0,40,United-States,>50K
998,45,Private,145912,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,3112,0,40,United-States,>50K


In [75]:
spark_df = spark.createDataFrame(pandas_df)

In [76]:
spark_df.count()

1000

In [77]:
spark_df.columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education_num',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'native_country',
 'income']

In [78]:
spark_df.dtypes

[('age', 'bigint'),
 ('workclass', 'string'),
 ('fnlwgt', 'bigint'),
 ('education', 'string'),
 ('education_num', 'bigint'),
 ('marital_status', 'string'),
 ('occupation', 'string'),
 ('relationship', 'string'),
 ('race', 'string'),
 ('sex', 'string'),
 ('capital_gain', 'bigint'),
 ('capital_loss', 'bigint'),
 ('hours_per_week', 'bigint'),
 ('native_country', 'string'),
 ('income', 'string')]

In [79]:
df_base = spark.read.csv("data/census_base.csv", header=True)

In [80]:
df_base.dtypes

[('age', 'string'),
 ('workclass', 'string'),
 ('fnlwgt', 'string'),
 ('education', 'string'),
 ('education_num', 'string'),
 ('marital_status', 'string'),
 ('occupation', 'string'),
 ('relationship', 'string'),
 ('race', 'string'),
 ('sex', 'string'),
 ('capital_gain', 'string'),
 ('capital_loss', 'string'),
 ('hours_per_week', 'string'),
 ('native_country', 'string'),
 ('income', 'string')]

In [82]:
# spark is reading all the cols as string dtype
# interesting and inefficient
# quick fix: set all cols to string in pandas import
# longer fix: should probably provide schema upon spark read