In [1]:
!pip install lakefs_client

Collecting lakefs_client
  Downloading lakefs_client-1.7.0-py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.6/328.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: lakefs_client
Successfully installed lakefs_client-1.7.0


In [47]:
! pip install deltalake

Collecting deltalake
  Downloading deltalake-0.15.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting pyarrow-hotfix
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Installing collected packages: pyarrow-hotfix, deltalake
Successfully installed deltalake-0.15.1 pyarrow-hotfix-0.6


In [None]:
lakefsEndPoint = 'http://lakefs.lakefs.svc.cluster.local:80'
lakefsAccessKey = 'xxxxxxxxxxxx'
lakefsSecretKey = 'xxxxxxxxxxxxxxxx'

In [50]:
storageNamespace = 's3://mlflow'
repo_name = "delta-lake-demo3"

## Create lakeFSClient

In [51]:
import lakefs_client
from lakefs_client.models import *
from lakefs_client.client import LakeFSClient

# lakeFS credentials and endpoint
configuration = lakefs_client.Configuration()
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey
configuration.host = lakefsEndPoint

lakefs = LakeFSClient(configuration)

## Create lakeFS Repository

In [52]:
repo= (lakefs.repositories
             .create_repository(
                  repository_creation=RepositoryCreation(
                      name=repo_name,
                      storage_namespace=f"{storageNamespace}/{repo_name}"
                  )
             )
      )
        
print(f"Created new repo {repo.id} using storage namespace {repo.storage_namespace}")

Created new repo delta-lake-demo3 using storage namespace s3://mlflow/delta-lake-demo3


## Verify lakeFS Repository

In [53]:
repo=lakefs.repositories.get_repository(repo_name)
print(f"Found existing repo {repo.id} using storage namespace {repo.storage_namespace}")

Found existing repo delta-lake-demo3 using storage namespace s3://mlflow/delta-lake-demo3


# Lake Python

## Load some test data

In [54]:
import pandas as pd
import deltalake

storage_options = {"AWS_ACCESS_KEY_ID": lakefsAccessKey, 
                   "AWS_SECRET_ACCESS_KEY":lakefsSecretKey,
                   "AWS_ENDPOINT": lakefsEndPoint,
                   "AWS_REGION": "us-east-1",
                   "AWS_ALLOW_HTTP": "true",
                   "AWS_S3_ALLOW_UNSAFE_RENAME": "true"
                  }

In [55]:
df = pd.read_parquet('/home/jovyan/userdata1.parquet')

In [56]:
subset

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments
492,2016-02-03 11:32:52,493,Anne,Warren,awarrendo@oaic.gov.au,Female,36.74.153.243,3538261440495771.0,United States,,76983.81,,
383,2016-02-03 14:23:23,384,Victor,Cunningham,vcunninghaman@mapy.cz,Male,211.58.176.112,5602219786393162.0,Serbia,10/22/1983,265265.73,Geologist II,
856,2016-02-03 07:00:28,857,Paul,Anderson,pandersonns@wufoo.com,,111.188.66.182,3585323481696329.0,Brunei,9/15/1958,,Software Consultant,
834,2016-02-03 00:18:47,835,Sean,Castillo,scastillon6@altervista.org,,211.77.61.195,,Portugal,6/15/1979,,Quality Control Specialist,
603,2016-02-03 23:49:09,604,Christine,Wilson,cwilsongr@answers.com,Female,41.182.6.194,4508325302658042.0,China,11/30/1960,87826.74,Financial Analyst,
712,2016-02-03 22:11:23,713,Eric,Owens,eowensjs@vk.com,Male,207.176.76.46,5.602242131534404e+17,Indonesia,9/2/1958,220176.18,Paralegal,
126,2016-02-03 23:01:52,127,Deborah,Porter,dporter3i@istockphoto.com,Female,171.36.77.142,4.903389517897807e+18,China,5/18/1959,271474.26,Engineer II,Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕͞...
290,2016-02-03 00:27:06,291,Julia,Medina,jmedina82@cbc.ca,Female,43.27.110.171,30163835573619.0,Russia,8/12/1991,109927.88,Software Engineer II,
549,2016-02-03 00:52:08,550,Cheryl,Evans,cevansf9@yolasite.com,Female,244.155.129.93,,Japan,7/24/1955,12380.49,Budget/Accounting Analyst II,
169,2016-02-03 04:13:57,170,Anne,Reed,areed4p@plala.or.jp,Female,223.33.106.169,,Russia,,244488.5,,


## Write the test data to the main branch as a Delta table

In [57]:
subset = df.sample(frac=0.011, random_state=42)
print(f"There are {subset.shape[0]} rows in the sample dataset")

There are 11 rows in the sample dataset


In [58]:
deltalake.write_deltalake(table_or_uri=f's3a://{repo_name}/main/userdata/', 
                          data = subset,
                          mode='overwrite',
                          storage_options=storage_options)

## Read Deltalake from lakeFS and Python

In [59]:
my_new_dt = deltalake.DeltaTable(f's3a://{repo_name}/main/userdata/', storage_options=storage_options)

In [60]:
my_new_dt.history()

[{'timestamp': 1705253778853,
  'operation': 'CREATE TABLE',
  'operationParameters': {'metadata': '{"configuration":{},"created_time":1705253778845,"description":null,"format":{"options":{},"provider":"parquet"},"id":"ee1d7b48-32c7-41fe-ae32-ac88b5b04413","name":null,"partition_columns":[],"schema":{"fields":[{"metadata":{},"name":"registration_dttm","nullable":true,"type":"timestamp"},{"metadata":{},"name":"id","nullable":true,"type":"integer"},{"metadata":{},"name":"first_name","nullable":true,"type":"string"},{"metadata":{},"name":"last_name","nullable":true,"type":"string"},{"metadata":{},"name":"email","nullable":true,"type":"string"},{"metadata":{},"name":"gender","nullable":true,"type":"string"},{"metadata":{},"name":"ip_address","nullable":true,"type":"string"},{"metadata":{},"name":"cc","nullable":true,"type":"string"},{"metadata":{},"name":"country","nullable":true,"type":"string"},{"metadata":{},"name":"birthdate","nullable":true,"type":"string"},{"metadata":{},"name":"sala

In [61]:
my_new_dt.version()

0

In [62]:
print(f"{my_new_dt.to_pandas().shape[0]} rows read in the table")

11 rows read in the table


## Write some more data to the table

In [63]:
subset = df.sample(frac=0.011, random_state=21)
print(f"There are {subset.shape[0]} rows in the sample dataset")

There are 11 rows in the sample dataset


In [64]:
subset

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments
492,2016-02-03 11:32:52,493,Anne,Warren,awarrendo@oaic.gov.au,Female,36.74.153.243,3538261440495771.0,United States,,76983.81,,
383,2016-02-03 14:23:23,384,Victor,Cunningham,vcunninghaman@mapy.cz,Male,211.58.176.112,5602219786393162.0,Serbia,10/22/1983,265265.73,Geologist II,
856,2016-02-03 07:00:28,857,Paul,Anderson,pandersonns@wufoo.com,,111.188.66.182,3585323481696329.0,Brunei,9/15/1958,,Software Consultant,
834,2016-02-03 00:18:47,835,Sean,Castillo,scastillon6@altervista.org,,211.77.61.195,,Portugal,6/15/1979,,Quality Control Specialist,
603,2016-02-03 23:49:09,604,Christine,Wilson,cwilsongr@answers.com,Female,41.182.6.194,4508325302658042.0,China,11/30/1960,87826.74,Financial Analyst,
712,2016-02-03 22:11:23,713,Eric,Owens,eowensjs@vk.com,Male,207.176.76.46,5.602242131534404e+17,Indonesia,9/2/1958,220176.18,Paralegal,
126,2016-02-03 23:01:52,127,Deborah,Porter,dporter3i@istockphoto.com,Female,171.36.77.142,4.903389517897807e+18,China,5/18/1959,271474.26,Engineer II,Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕͞...
290,2016-02-03 00:27:06,291,Julia,Medina,jmedina82@cbc.ca,Female,43.27.110.171,30163835573619.0,Russia,8/12/1991,109927.88,Software Engineer II,
549,2016-02-03 00:52:08,550,Cheryl,Evans,cevansf9@yolasite.com,Female,244.155.129.93,,Japan,7/24/1955,12380.49,Budget/Accounting Analyst II,
169,2016-02-03 04:13:57,170,Anne,Reed,areed4p@plala.or.jp,Female,223.33.106.169,,Russia,,244488.5,,


In [65]:
deltalake.write_deltalake(table_or_uri=f's3a://{repo_name}/main/userdata/', 
                          data = subset,
                          mode='append',
                          storage_options=storage_options)

## Re-Read the Deltalake table

In [67]:
my_new_dt = deltalake.DeltaTable(f's3a://{repo_name}/main/userdata/', storage_options=storage_options)

In [68]:
my_new_dt.history()

[{'timestamp': 1705253785799,
  'operation': 'WRITE',
  'operationParameters': {'partitionBy': '[]', 'mode': 'Append'},
  'clientVersion': 'delta-rs.0.17.0',
  'version': 1},
 {'timestamp': 1705253778853,
  'operation': 'CREATE TABLE',
  'operationParameters': {'mode': 'ErrorIfExists',
   'metadata': '{"configuration":{},"created_time":1705253778845,"description":null,"format":{"options":{},"provider":"parquet"},"id":"ee1d7b48-32c7-41fe-ae32-ac88b5b04413","name":null,"partition_columns":[],"schema":{"fields":[{"metadata":{},"name":"registration_dttm","nullable":true,"type":"timestamp"},{"metadata":{},"name":"id","nullable":true,"type":"integer"},{"metadata":{},"name":"first_name","nullable":true,"type":"string"},{"metadata":{},"name":"last_name","nullable":true,"type":"string"},{"metadata":{},"name":"email","nullable":true,"type":"string"},{"metadata":{},"name":"gender","nullable":true,"type":"string"},{"metadata":{},"name":"ip_address","nullable":true,"type":"string"},{"metadata":{},"

In [69]:
my_new_dt.version()

1

In [70]:
my_new_dt.file_uris()

['s3a://delta-lake-demo3/main/userdata/0-91a074e4-1990-4ec1-a106-cc87050c3127-0.parquet',
 's3a://delta-lake-demo3/main/userdata/1-2e0f6d34-8d16-4b20-8a7a-028efa40a101-0.parquet']

In [71]:
print(f"{my_new_dt.to_pandas().shape[0]} rows read in the table")

22 rows read in the table


## Commit the data in lakeFS

In [72]:
lakefs.commits.commit(repo.id, "main", CommitCreation(
    message="Initial data load",
    metadata={'author': 'rmoff'}
) )

{'committer': 'admin',
 'creation_date': 1705253798,
 'id': 'c98c6a4a89c40eb5b4544540fd56fcff87ed2c52f11f90786bcc305033c66078',
 'message': 'Initial data load',
 'meta_range_id': '',
 'metadata': {'author': 'rmoff'},
 'parents': ['ae6ab42de01f1720d130b769d91ddf3b25dcc88eb6302a326858e1206010ad4c']}