In [1]:
!pip install lakefs_client

Collecting lakefs_client
  Downloading lakefs_client-1.7.0-py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.6/328.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: lakefs_client
Successfully installed lakefs_client-1.7.0


In [None]:
import os

# Connection settings for the lakeFS server. Each value is read from the
# environment when the variable is set, so real keys never have to be saved in
# the notebook; the literals are the original placeholders, kept as fallbacks.
lakefsEndPoint = os.environ.get('LAKEFS_ENDPOINT', 'http://lakefs.lakefs.svc.cluster.local:80')
lakefsAccessKey = os.environ.get('LAKEFS_ACCESS_KEY', 'xxxxxxxxxx')
lakefsSecretKey = os.environ.get('LAKEFS_SECRET_KEY', 'xxxxxxxxxxxxxxx')

In [14]:
# Base storage location; the repository's storage namespace is built from it
# by appending the repository name.
storageNamespace = "s3://lakehouse"

# Name of the lakeFS repository this notebook works with.
repo_name = "spark-demo3"

## Create the lakeFS client

In [15]:
import lakefs_client
from lakefs_client.models import *
from lakefs_client.client import LakeFSClient

# API configuration: target the lakeFS endpoint and authenticate with the
# access key as the username and the secret key as the password.
configuration = lakefs_client.Configuration()
configuration.host = lakefsEndPoint
configuration.username = lakefsAccessKey
configuration.password = lakefsSecretKey

# Client object used for the lakeFS API calls in the cells below.
lakefs = LakeFSClient(configuration)

## Create lakeFS Repository

In [16]:
# Create the repository, or reuse it when it already exists, so that this cell
# can be re-run (e.g. after a kernel restart) without failing.
try:
    repo = lakefs.repositories.create_repository(
        repository_creation=RepositoryCreation(
            name=repo_name,
            storage_namespace=f"{storageNamespace}/{repo_name}"
        )
    )
    print(f"Created new repo {repo.id} using storage namespace {repo.storage_namespace}")
except lakefs_client.exceptions.ApiException as err:
    # HTTP 409 (Conflict) is treated as "repository already exists";
    # any other API error is a real failure and is re-raised unchanged.
    if err.status != 409:
        raise
    repo = lakefs.repositories.get_repository(repo_name)
    print(f"Found existing repo {repo.id} using storage namespace {repo.storage_namespace}")

Created new repo spark-demo3 using storage namespace s3://lakehouse/spark-demo3


## Verify lakeFS Repository

In [17]:
# Look the repository up by name and report what the server returned.
repo = lakefs.repositories.get_repository(repo_name)
print(f"Found existing repo {repo.id} using storage namespace {repo.storage_namespace}")

Found existing repo spark-demo3 using storage namespace s3://lakehouse/spark-demo3


## Spark

In [18]:
from pyspark.sql import SparkSession

# Spark session whose S3A filesystem points at the lakeFS endpoint, with the
# Delta Lake SQL extension and catalog enabled.
spark = (
    SparkSession.builder
    .appName("lakeFS / Jupyter")
    # Extra packages: hadoop-aws (S3A), the AWS SDK bundle and Delta Lake.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.12.481,io.delta:delta-core_2.12:2.3.0")
    # Serve s3:// URIs with the S3A implementation.
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # S3A connection: lakeFS endpoint, path-style addressing, lakeFS keys.
    .config("spark.hadoop.fs.s3a.endpoint", lakefsEndPoint)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.access.key", lakefsAccessKey)
    .config("spark.hadoop.fs.s3a.secret.key", lakefsSecretKey)
    # Delta Lake integration.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("INFO")

spark

In [44]:
# Stopping the session at this point would break the later cells that use
# `spark` on a top-to-bottom run (this cell was last executed out of order, as
# In [44]). Set the flag to True and re-run this cell once you are done.
stopSparkSession = False
if stopSparkSession:
    spark.stop()

## Spark imports, table names, helpers and schemas

In [19]:
from pyspark.sql.types import ByteType, IntegerType, LongType, StringType, StructType, StructField
from pyspark.sql.functions import *

In [20]:
# Branch names.
mainBranch = "main"
deltaLakeETLBranch = "delta-lake-etl-branch"

# Table names; each is used as the path component below a branch.
customersTable = "customers"
ordersTable = "orders"
orderUpdatesTable = "order_updates"

In [21]:
def delta_table_compare_branches(table, refs):
  """Show the row count of one Delta table on each of the given refs.

  For every ref, the Delta table at `s3a://<repo.id>/<ref>/<table>` is read
  with the notebook-level `spark` session (using the notebook-level `repo`)
  and counted; the result is displayed as a two-column table.

  Args:
    table: Table name, i.e. the path component below the ref.
    refs: Iterable of refs (e.g. branch names) to read the table from.

  Returns:
    None. The (Branch, Count) table is printed via `DataFrame.show`.
  """
  # Build the rows eagerly, one ref at a time, so every ref is paired with its
  # own count even when `refs` is a one-shot iterator.
  rows = [
    (ref, spark.read.format('delta').load(f's3a://{repo.id}/{ref}/{table}').count())
    for ref in refs
  ]
  spark.createDataFrame(
    data=rows,
    schema=StructType([
      StructField("Branch", StringType(), True),
      # Row counts are 64-bit values; LongType avoids overflow on large tables.
      StructField("Count", LongType(), True)
    ])
  ).show(truncate=False)

In [22]:
def print_diff_refs(diff_refs):
    """Summarise a diff response as a list of rows.

    Args:
        diff_refs: Object whose `results` attribute is an iterable of entries
            exposing `path`, `path_type`, `size_bytes` and `type`.

    Returns:
        A list with one `[path, path_type, size_bytes, type]` list per entry,
        in the order of `diff_refs.results`. Despite the name, nothing is
        printed; the caller decides how to display the rows.
    """
    # A concrete list (rather than a lazy map object) displays its contents in
    # a notebook and can be iterated more than once.
    return [
        [entry.path, entry.path_type, entry.size_bytes, entry.type]
        for entry in diff_refs.results
    ]

In [23]:
# Schema applied when reading the customers CSV.
# Each entry is (column name, Spark type, nullable).
customersSchema = StructType([
  StructField(column, dataType, nullable)
  for column, dataType, nullable in [
    ("Customer_ID", IntegerType(), False),
    ("Country", StringType(), False),
    ("Gender", StringType(), False),
    ("Personal_ID", IntegerType(), True),
    ("Customer_Name", StringType(), False),
    ("Customer_FirstName", StringType(), False),
    ("Customer_LastName", StringType(), False),
    ("Birth_Date", StringType(), False),
    ("Customer_Address", StringType(), False),
    ("Street_ID", LongType(), False),
    ("Street_Number", IntegerType(), False),
    ("Customer_Type_ID", IntegerType(), False),
  ]
])

In [24]:
# Schema for the orders data.
# Each entry is (column name, Spark type, nullable).
ordersSchema = StructType([
  StructField(column, dataType, nullable)
  for column, dataType, nullable in [
    ("Customer_ID", IntegerType(), False),
    ("Employee_ID", IntegerType(), False),
    ("Street_ID", LongType(), False),
    ("Order_Date", StringType(), False),
    ("Delivery_Date", StringType(), False),
    ("Order_ID", LongType(), True),
    ("Order_Type", ByteType(), False),
    ("Product_ID", LongType(), False),
    ("Quantity", ByteType(), False),
    ("Total_Retail_Price", StringType(), False),
    ("CostPrice_Per_Unit", StringType(), False),
    ("Discount", LongType(), False),
  ]
])

In [25]:
# Path of the customers table on the main branch,
# laid out as s3a://<repository id>/<branch>/<table>.
customersTablePath = f"s3a://{repo.id}/{mainBranch}/{customersTable}"
print(customersTablePath)

s3a://spark-demo3/main/customers


In [27]:
# Read the customers CSV (first line is the header) with the explicit schema.
df = spark.read.csv(
    '/home/jovyan/CUSTOMER.csv',
    header=True,
    schema=customersSchema,
)

# Write it as a Delta table at the customers path, replacing any existing data.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save(customersTablePath)
)

# Preview the first 10 rows.
df.show(10)

+-----------+-------+------+-----------+-----------------+------------------+-----------------+----------+--------------------+----------+-------------+----------------+
|Customer_ID|Country|Gender|Personal_ID|    Customer_Name|Customer_FirstName|Customer_LastName|Birth_Date|    Customer_Address| Street_ID|Street_Number|Customer_Type_ID|
+-----------+-------+------+-----------+-----------------+------------------+-----------------+----------+--------------------+----------+-------------+----------------+
|          4|     US|     M|       null|    James Kvarniq|             James|          Kvarniq| 27JUN1974|      4382 Gralyn Rd|9260106519|         4382|            1020|
|          5|     US|     F|       null|Sandrina Stephano|          Sandrina|         Stephano| 09JUL1979|    6468 Cog Hill Ct|9260114570|         6468|            2020|
|          9|     DE|     F|       null|   Cornelia Krahl|          Cornelia|            Krahl| 27FEB1974|   Kallstadterstr. 9|3940106659|            