# Delta Lake for ETL: Snippets

In [1]:
# Start a local Spark session with the Delta Lake extensions enabled.

import pyspark

# Explicit names instead of `from delta import *`, so it is obvious where
# `configure_spark_with_delta_pip` comes from. `DeltaTable` is imported as well
# so the notebook namespace keeps the other public name the wildcard provided.
from delta import DeltaTable, configure_spark_with_delta_pip

# Local master with 4 threads; register Delta's SQL extension and catalog on
# the builder before the session is created.
builder = (
    pyspark.sql.SparkSession.builder.master("local[4]")
    .appName("parallel")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# The helper wraps the builder; the session itself is created by getOrCreate().
spark = configure_spark_with_delta_pip(builder).getOrCreate()

## Query Performance: ELT vs ETL

### JSON

### S3

In [None]:
%%time

# Load the census Parquet dataset directly from S3.
# NOTE(review): the row count is not established by this cell — confirm the
# dataset size before quoting it.
df = spark.read.parquet(
    "s3://avriiil/census-large.parquet"
)

# Keep the rows whose income label equals "=>50K" and collect them to the driver.
# NOTE(review): the same filter on the local CSV returned [] in the recorded
# output — confirm that "=>50K" is a label that actually occurs in the data.
df.filter(df["income"] == "=>50K").collect()

In [None]:
# Load the raw census CSV; the first line holds the column names.
df2 = spark.read.csv("data/census_16M.csv", header=True)

# Write it out as a Delta table partitioned by income.
# mode("overwrite") keeps the cell re-runnable: without an explicit save mode
# the write is rejected once "data/delta_census_part" already exists.
(
    df2.write.format("delta")
    .mode("overwrite")
    .partitionBy("income")
    .save("data/delta_census_part")
)

In [None]:
%%time
# Read the income-partitioned Delta table back and apply the income filter,
# collecting the matching rows to the driver.
# NOTE(review): confirm "=>50K" is a label that occurs in the data; the same
# filter returned [] in the recorded local runs.
df4 = spark.read.load("data/delta_census_part", format="delta")
df4.filter(df4["income"] == "=>50K").collect()

### Local

In [2]:
%%time

# Load the raw census CSV; the first line holds the column names.
df = spark.read.csv(
    "data/census_16M.csv",
    header=True,
)

# Filter on the income label and collect the matching rows to the driver.
# NOTE(review): the recorded output of this cell is [], i.e. no row matched
# "=>50K" — confirm the label, otherwise the timing measures an empty result.
df.filter(df["income"] == "=>50K").collect()

CPU times: user 5.8 ms, sys: 3.19 ms, total: 8.99 ms
Wall time: 21.2 s


[]

In [3]:
#

In [3]:
# Load the raw census CSV; the first line holds the column names.
df2 = spark.read.csv("data/census_16M.csv", header=True)

# Write it out as a Delta table partitioned by income.
# The S3 section of this notebook writes to the same path, so without an
# explicit save mode a top-to-bottom run is rejected here because
# "data/delta_census_part" already exists. mode("overwrite") makes the cell
# re-runnable.
(
    df2.write.format("delta")
    .mode("overwrite")
    .partitionBy("income")
    .save("data/delta_census_part")
)

In [2]:
%%time
# Read the income-partitioned Delta table back and apply the income filter,
# collecting the matching rows to the driver.
# NOTE(review): the recorded output of this cell is [], i.e. no row matched
# "=>50K" — confirm the label before comparing timings.
df4 = spark.read.load("data/delta_census_part", format="delta")
df4.filter(df4["income"] == "=>50K").collect()

CPU times: user 4.07 ms, sys: 2.61 ms, total: 6.67 ms
Wall time: 20.7 s


[]

In [3]:
# Load the raw census CSV; the first line holds the column names.
df = spark.read.csv("data/census_16M.csv", header=True)

# Write it out as a Delta table partitioned by income.
# A later cell rewrites this same path partitioned by "age", so a re-run has to
# both replace the existing table (mode "overwrite") and be allowed to switch
# the partition layout back (overwriteSchema), mirroring that later cell.
(
    df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("income")
    .save("data/delta_census")
)

In [6]:
# Load the Delta table in its current state.
df2 = spark.read.load("data/delta_census", format="delta")

# Rewrite the table in place with "age" as the partition column;
# overwriteSchema is set so the overwrite may change the table layout.
(
    df2.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("age")
    .save("data/delta_census")
)

# Build a filter on the new partition column. No action is called, so the cell
# only displays the resulting DataFrame's schema.
df3 = spark.read.load("data/delta_census", format="delta")
df3.filter(df3["age"] == 25)

DataFrame[age: string, workclass: string, fnlwgt: string, education: string, education_num: string, marital_status: string, occupation: string, relationship: string, race: string, sex: string, capital_gain: string, capital_loss: string, hours_per_week: string, native_country: string, income: string]

In [8]:
# Time travel: load version 0 of the table instead of the latest one.
df = (
    spark.read.format("delta")
    .option("versionAsOf", 0)
    .load("data/delta_census")
)

# Build a filter on the income column of that older version. No action is
# called, so the cell only displays the DataFrame's schema.
df.filter(df["income"] == "=>50K")

DataFrame[age: string, workclass: string, fnlwgt: string, education: string, education_num: string, marital_status: string, occupation: string, relationship: string, race: string, sex: string, capital_gain: string, capital_loss: string, hours_per_week: string, native_country: string, income: string]

## Create Datasets

In [3]:
import dask.dataframe as dd

In [4]:
# Load the census CSV as a Dask DataFrame (`dd` is dask.dataframe).
ddf = dd.read_csv("data/census_16M.csv")

In [13]:
# Double the dataset by stacking two copies of it.
# NOTE(review): this rebinds `ddf`, so every execution doubles it again. The
# recorded length below (256,000,000) implies the cell was run several times;
# a single top-to-bottom run only doubles once — confirm the intended size.
ddf = dd.concat([ddf] * 2)

In [14]:
# Row count of the concatenated Dask DataFrame.
len(ddf)

256000000

In [18]:
# Write the enlarged dataset to S3 as Parquet.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# the upload may be incomplete — verify the S3 object before reading from it.
ddf.to_parquet("s3://avriiil/census-large.parquet")

KeyboardInterrupt: 

In [None]:
# Write the same enlarged dataset to S3 in CSV form.
ddf.to_csv("s3://avriiil/census-large.csv")

In [3]:
import pandas as pd

In [4]:
# Load the smaller census sample with pandas.
df = pd.read_csv("data/census_2M.csv")

In [5]:
# Double the dataset by stacking two copies of it.
# NOTE: this rebinds `df`, so running the cell again doubles it once more.
df = pd.concat([df] * 2)

In [6]:
# Row count after doubling.
len(df)

4000000

In [7]:
# Persist the doubled frame; index=False leaves the pandas index out of the file.
df.to_csv("data/census_4M.csv", index=False)

## Schema Evolution

In [2]:
# Seed a small Delta table with two columns: first_name and age.
df = spark.createDataFrame([("bob", 47), ("li", 23), ("leonard", 51)]).toDF(
    "first_name", "age"
)

# Replace both data and schema so the cell is re-runnable: without a save mode
# the write is rejected once "data/toy_data" exists, and after a previous run
# the table already carries the extra column appended further down. Resetting
# the schema here keeps the schema-mismatch demonstration reproducible.
(
    df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("data/toy_data")
)

In [3]:
# Local import so this cell stays self-contained.
from pyspark.sql.utils import AnalysisException

# New rows that carry an extra "country" column the table does not have yet.
df = spark.createDataFrame([("frank", 68, "usa"), ("jordana", 26, "brasil")]).toDF(
    "first_name", "age", "country"
)

# A plain append is expected to be rejected with a schema-mismatch
# AnalysisException. The error is the point of the demonstration, so it is
# caught and printed instead of aborting a top-to-bottom run of the notebook.
try:
    df.write.format("delta").mode("append").save("data/toy_data")
except AnalysisException as schema_error:
    print(schema_error)

AnalysisException: [_LEGACY_ERROR_TEMP_DELTA_0007] A schema mismatch detected when writing to the Delta table (Table ID: e5484814-093f-4565-a22e-1d7124c3755e).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- first_name: string (nullable = true)
-- age: long (nullable = true)


Data schema:
root
-- first_name: string (nullable = true)
-- age: long (nullable = true)
-- country: string (nullable = true)

         

In [4]:
# Retry the append with mergeSchema enabled, the option the schema-mismatch
# error message points to for schema migration.
(
    df.write.format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .save("data/toy_data")
)
