## Using Delta Lake from a Jupyter Notebook

You want to use Delta Lake in a Jupyter Notebook. This means you’re working in Python. 

To use Delta Lake from Python, you have two choices:
1. Use PySpark for the most complete Delta Lake experience.
2. Use delta-rs with other libraries and query engines (Polars, Daft, pandas, Dask, DuckDB). Depending on the library, not all Delta Lake features may be supported.


## Use Delta Lake with pandas

In [1]:
%%capture
!pip install deltalake pandas

In [2]:
import pandas as pd
from deltalake import write_deltalake, DeltaTable

In [3]:
# First batch of rows: used to create each table.
data = dict(
    first_name=["bob", "li", "leah"],
    age=[47, 23, 51],
)

# Second batch of rows: appended afterwards to produce a new table version.
data_2 = dict(
    first_name=["suh", "anais"],
    age=[33, 68],
)

In [4]:
# Build a pandas DataFrame from the first batch of rows.
df = pd.DataFrame(data)

# Create the Delta table. mode="overwrite" keeps this cell re-runnable: the
# default mode ("error") raises if tmp/pandas-table already exists, which
# breaks Restart & Run All on any run after the first.
write_deltalake("tmp/pandas-table", df, mode="overwrite")

In [5]:
# Open the table at its latest version and print it as a pandas DataFrame.
table_now = DeltaTable("tmp/pandas-table/")
print(table_now.to_pandas())

  first_name  age
0        bob   47
1         li   23
2       leah   51


In [6]:
# Second batch of rows as a DataFrame.
df2 = pd.DataFrame(data_2)

# Append them to the existing table.
write_deltalake(
    "tmp/pandas-table",
    df2,
    mode="append",
)

In [7]:
# Re-open the table at its latest version and print all rows.
table_latest = DeltaTable("tmp/pandas-table/")
print(table_latest.to_pandas())

  first_name  age
0        suh   33
1      anais   68
2        bob   47
3         li   23
4       leah   51


In [28]:
# Time travel: open the table pinned to version 0 and print that snapshot.
table_v0 = DeltaTable("tmp/pandas-table/", version=0)
print(table_v0.to_pandas())

  first_name  age
0        bob   47
1         li   23
2       leah   51


## Use Delta Lake with Polars

In [10]:
%%capture
!pip install polars

Collecting polars
  Downloading polars-0.20.26-cp38-abi3-macosx_11_0_arm64.whl.metadata (14 kB)
Downloading polars-0.20.26-cp38-abi3-macosx_11_0_arm64.whl (24.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.1/24.1 MB 1.6 MB/s eta 0:00:00
Installing collected packages: polars
Successfully installed polars-0.20.26


In [11]:
import polars as pl

In [13]:
# Build a Polars DataFrame from the first batch of rows.
df = pl.DataFrame(data)

# Create the Delta table. mode="overwrite" keeps the cell re-runnable: the
# default mode ("error") raises when tmp/polars_table already exists.
df.write_delta("tmp/polars_table", mode="overwrite")

In [14]:
# Read the latest version of the table into Polars and print it.
polars_now = pl.read_delta("tmp/polars_table")
print(polars_now)

shape: (3, 2)
┌────────────┬─────┐
│ first_name ┆ age │
│ ---        ┆ --- │
│ str        ┆ i64 │
╞════════════╪═════╡
│ bob        ┆ 47  │
│ li         ┆ 23  │
│ leah       ┆ 51  │
└────────────┴─────┘


In [18]:
# Second batch of rows as a Polars DataFrame (rebinds `df`).
df = pl.DataFrame(data_2)

# Append them to the existing table.
df.write_delta(
    "tmp/polars_table",
    mode="append",
)

In [21]:
# Read the table again at its latest version and print all rows.
polars_latest = pl.read_delta("tmp/polars_table")
print(polars_latest)

shape: (5, 2)
┌────────────┬─────┐
│ first_name ┆ age │
│ ---        ┆ --- │
│ str        ┆ i64 │
╞════════════╪═════╡
│ suh        ┆ 33  │
│ anais      ┆ 68  │
│ bob        ┆ 47  │
│ li         ┆ 23  │
│ leah       ┆ 51  │
└────────────┴─────┘


In [20]:
# Time travel: read the table pinned to version 0 and print that snapshot.
polars_v0 = pl.read_delta("tmp/polars_table", version=0)
print(polars_v0)

shape: (3, 2)
┌────────────┬─────┐
│ first_name ┆ age │
│ ---        ┆ --- │
│ str        ┆ i64 │
╞════════════╪═════╡
│ bob        ┆ 47  │
│ li         ┆ 23  │
│ leah       ┆ 51  │
└────────────┴─────┘


## Use Delta Lake with Daft

In [22]:
%%capture
!pip install getdaft

Collecting getdaft
  Downloading getdaft-0.2.24-cp38-abi3-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting fsspec (from getdaft)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting tqdm (from getdaft)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 57.6/57.6 kB 298.1 kB/s eta 0:00:00
Downloading getdaft-0.2.24-cp38-abi3-macosx_11_0_arm64.whl (16.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.2/16.2 MB 1.2 MB/s eta 0:00:00
Downloading fsspec-2024.5.0-py3-none-any.whl (316 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.1/316.1 kB 1.7 MB/s eta 0:00:00
Downloading tqdm-4.66.4-py3-none-any.whl (78 kB)
... (output truncated)

In [23]:
import daft

## Use Delta Lake with Dask

In [24]:
%%capture
!pip install dask-deltatable

Collecting dask-deltatable
  Using cached dask_deltatable-0.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting dask[dataframe] (from dask-deltatable)
  Downloading dask-2024.5.1-py3-none-any.whl.metadata (3.8 kB)
Collecting click>=8.1 (from dask[dataframe]->dask-deltatable)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting cloudpickle>=1.5.0 (from dask[dataframe]->dask-deltatable)
  Using cached cloudpickle-3.0.0-py3-none-any.whl.metadata (7.0 kB)
Collecting partd>=1.2.0 (from dask[dataframe]->dask-deltatable)
  Downloading partd-1.4.2-py3-none-any.whl.metadata (4.6 kB)
Collecting toolz>=0.10.0 (from dask[dataframe]->dask-deltatable)
  Using cached toolz-0.12.1-py3-none-any.whl.metadata (5.1 kB)
Collecting dask-expr<1.2,>=1.1 (from dask[dataframe]->dask-deltatable)
  Downloading dask_expr-1.1.1-py3-none-any.whl.metadata (2.4 kB)
Collecting locket (from partd>=1.2.0->dask[dataframe]->dask-deltatable)
  Using cached locket-1.0.0-py2.py3-none-any.whl.metadata (2.8 kB

In [2]:
!pip install deltalake==0.13

Collecting deltalake==0.13
  Using cached deltalake-0.13.0-cp37-abi3-macosx_11_0_arm64.whl.metadata (5.0 kB)
Using cached deltalake-0.13.0-cp37-abi3-macosx_11_0_arm64.whl (18.6 MB)
Installing collected packages: deltalake
  Attempting uninstall: deltalake
    Found existing installation: deltalake 0.17.0
    Uninstalling deltalake-0.17.0:
      Successfully uninstalled deltalake-0.17.0
Successfully installed deltalake-0.13.0


In [1]:
import dask_deltatable as ddt

TypeError: descriptor '__call__' for 'type' objects doesn't apply to a 'property' object

In [2]:
# Display the dask version that this kernel imports.
import dask
dask.__version__

'2024.2.1'

In [None]:
# Switch off the dataframe query-planning option.
# NOTE(review): dask expects this to be set before dask.dataframe is first
# imported; confirm this cell runs ahead of the dask_deltatable import.
dask.config.set({"dataframe.query-planning": False})

<dask.config.set at 0x103e89290>

In [4]:
!pip install dask==2024.2.1

Collecting dask==2024.2.1
  Downloading dask-2024.2.1-py3-none-any.whl.metadata (3.7 kB)
Downloading dask-2024.2.1-py3-none-any.whl (1.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 3.6 MB/s eta 0:00:00
Installing collected packages: dask
  Attempting uninstall: dask
    Found existing installation: dask 2024.5.1
    Uninstalling dask-2024.5.1:
      Successfully uninstalled dask-2024.5.1
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dask-expr 1.1.1 requires dask==2024.5.1, but you have dask 2024.2.1 which is incompatible.
Successfully installed dask-2024.2.1


In [3]:
# Read the Delta table written in the pandas section into a Dask DataFrame.
# `ddt` is the dask_deltatable module imported in an earlier cell.
delta_path = "tmp/pandas-table/"
ddf = ddt.read_deltalake(delta_path)

NotImplementedError: dask_expr does not support a token argument.

## Use Delta Lake with PySpark

In [1]:
!pip install pyspark delta-spark



### Create a Spark session with Delta Lake enabled

In [8]:
import pyspark
# Explicit imports instead of `from delta import *`, so it is clear where each
# name comes from. DeltaTable is not used in this cell; it is imported so that
# cells relying on the names the wildcard import provided keep working.
from delta import DeltaTable, configure_spark_with_delta_pip

# Configure a session builder with the Delta SQL extension and Delta catalog.
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip wraps the builder for a pip-installed
# delta-spark; getOrCreate() then starts the session or reuses a running one.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [9]:
# First batch of rows, one dict per person.
data = [
    dict(first_name="bob", age=47),
    dict(first_name="li", age=23),
    dict(first_name="leah", age=51),
]

# Turn the rows into a Spark DataFrame.
df = spark.createDataFrame(data)

In [10]:
# Write the DataFrame as a Delta table. mode("overwrite") keeps this cell
# re-runnable: the default save mode raises an error when tmp/spark-table
# already exists, which breaks Restart & Run All after the first run.
df.write.format("delta").mode("overwrite").save("tmp/spark-table")

In [11]:
# Load the latest version of the Delta table and display its rows.
df = (
    spark.read
    .format("delta")
    .load("tmp/spark-table")
)
df.show()

+---+----------+
|age|first_name|
+---+----------+
| 51|      leah|
| 47|       bob|
| 23|        li|
+---+----------+



In [24]:
# Second batch of rows, appended to the table in a later cell.
data2 = [
    dict(first_name="suh", age=33),
    dict(first_name="anais", age=68),
]

# Turn the rows into a Spark DataFrame.
df2 = spark.createDataFrame(data2)

In [25]:
# Display the rows of the second batch.
df2.show()

+---+----------+
|age|first_name|
+---+----------+
| 33|       suh|
| 68|     anais|
+---+----------+



In [27]:
# Append the second batch to the existing Delta table.
(
    df2.write
    .format("delta")
    .mode("append")
    .save("tmp/spark-table/")
)

In [28]:
# Reload the table at its latest version and display all rows.
df = (
    spark.read
    .format("delta")
    .load("tmp/spark-table")
)
df.show()

+---+----------+
|age|first_name|
+---+----------+
| 68|     anais|
| 51|      leah|
| 33|       suh|
| 47|       bob|
| 23|        li|
+---+----------+



In [30]:
# Time travel: load the table pinned to version 0 and display that snapshot.
df = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load("tmp/spark-table")
)
df.show()

+---+----------+
|age|first_name|
+---+----------+
| 51|      leah|
| 47|       bob|
| 23|        li|
+---+----------+

