In [0]:
# Scenario: connect to OneLake from Azure Databricks.
# Prerequisite: enable Azure Data Lake Storage credential passthrough on the
# cluster (under Advanced Options).

# ABFSS path of the lakehouse folder that the later cells write to and read from.
oneLakePath = (
    "abfss://AmberFabricWS@msit-onelake.dfs.fabric.microsoft.com"
    "/AmberFabricWSLakehouse.Lakehouse/Files/nyctaxi"
)

In [0]:
# Read the gzipped yellow-taxi trip CSV from the Databricks public datasets.
# The first row is treated as the header and column types are inferred.
yellowTaxiDF = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load("/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2019-12.csv.gz")
)


In [0]:
# Keep only trips whose fare is below 4 and that carried exactly 4 passengers,
# then render the result. The fare condition has no lower bound, so rows with
# negative fares pass the filter too.
filteredTaxiDF = yellowTaxiDF.filter(
    (yellowTaxiDF["fare_amount"] < 4) & (yellowTaxiDF["passenger_count"] == 4)
)
display(filteredTaxiDF)

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
2,2019-12-01T00:00:31.000+0000,2019-12-01T00:02:17.000+0000,4,0.38,1,N,163,237,2,3.5,0.5,0.5,0.0,0.0,0.3,7.3,2.5
2,2019-12-01T00:19:42.000+0000,2019-12-01T00:20:09.000+0000,4,0.14,1,N,260,260,2,2.5,0.5,0.5,0.0,0.0,0.3,3.8,0.0
2,2019-12-01T00:28:55.000+0000,2019-12-01T00:36:20.000+0000,4,0.94,1,N,230,161,4,-6.5,-0.5,-0.5,0.0,0.0,-0.3,-10.3,-2.5
1,2019-12-01T01:30:43.000+0000,2019-12-01T01:32:45.000+0000,4,0.4,1,N,68,246,1,3.5,3.0,0.5,1.45,0.0,0.3,8.75,2.5
2,2019-12-01T01:52:02.000+0000,2019-12-01T01:53:24.000+0000,4,0.45,1,N,107,170,1,3.5,0.5,0.5,1.0,0.0,0.3,8.3,2.5
1,2019-12-01T01:42:55.000+0000,2019-12-01T01:44:35.000+0000,4,0.0,1,N,234,234,4,3.0,3.0,0.5,0.0,0.0,0.3,6.8,2.5
1,2019-12-01T02:15:56.000+0000,2019-12-01T02:17:50.000+0000,4,0.3,1,N,79,79,1,3.5,3.0,0.5,1.45,0.0,0.3,8.75,2.5
2,2019-12-01T02:06:05.000+0000,2019-12-01T02:07:54.000+0000,4,0.49,1,N,68,68,1,3.5,0.5,0.5,1.82,0.0,0.3,9.12,2.5
2,2019-12-01T02:44:06.000+0000,2019-12-01T02:45:15.000+0000,4,0.49,1,N,230,100,1,3.5,0.5,0.5,2.19,0.0,0.3,9.49,2.5
1,2019-12-01T02:54:03.000+0000,2019-12-01T02:55:57.000+0000,4,0.4,1,N,148,79,1,3.5,3.0,0.5,1.45,0.0,0.3,8.75,2.5


In [0]:
# Write the filtered dataframe as CSV, with a header row, to the Fabric Lakehouse
# location stored in oneLakePath. "overwrite" mode replaces any existing output there.
# The terminal .csv() call selects the CSV format itself, so a separate
# .format("csv") call is not needed.
filteredTaxiDF.write.mode("overwrite").option("header", "true").csv(oneLakePath)

In [0]:
# Sanity check: read back the CSV output that was just written to OneLake and
# show its first 10 rows. Only the header option is set on this read
# (no inferSchema, unlike the initial load).
lakehouseRead = spark.read.option("header", "true").csv(oneLakePath)
display(lakehouseRead.limit(10))

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
2,2019-12-01T00:00:31.000Z,2019-12-01T00:02:17.000Z,4,0.38,1,N,163,237,2,3.5,0.5,0.5,0.0,0.0,0.3,7.3,2.5
2,2019-12-01T00:19:42.000Z,2019-12-01T00:20:09.000Z,4,0.14,1,N,260,260,2,2.5,0.5,0.5,0.0,0.0,0.3,3.8,0.0
2,2019-12-01T00:28:55.000Z,2019-12-01T00:36:20.000Z,4,0.94,1,N,230,161,4,-6.5,-0.5,-0.5,0.0,0.0,-0.3,-10.3,-2.5
1,2019-12-01T01:30:43.000Z,2019-12-01T01:32:45.000Z,4,0.4,1,N,68,246,1,3.5,3.0,0.5,1.45,0.0,0.3,8.75,2.5
2,2019-12-01T01:52:02.000Z,2019-12-01T01:53:24.000Z,4,0.45,1,N,107,170,1,3.5,0.5,0.5,1.0,0.0,0.3,8.3,2.5
1,2019-12-01T01:42:55.000Z,2019-12-01T01:44:35.000Z,4,0.0,1,N,234,234,4,3.0,3.0,0.5,0.0,0.0,0.3,6.8,2.5
1,2019-12-01T02:15:56.000Z,2019-12-01T02:17:50.000Z,4,0.3,1,N,79,79,1,3.5,3.0,0.5,1.45,0.0,0.3,8.75,2.5
2,2019-12-01T02:06:05.000Z,2019-12-01T02:07:54.000Z,4,0.49,1,N,68,68,1,3.5,0.5,0.5,1.82,0.0,0.3,9.12,2.5
2,2019-12-01T02:44:06.000Z,2019-12-01T02:45:15.000Z,4,0.49,1,N,230,100,1,3.5,0.5,0.5,2.19,0.0,0.3,9.49,2.5
1,2019-12-01T02:54:03.000Z,2019-12-01T02:55:57.000Z,4,0.4,1,N,148,79,1,3.5,3.0,0.5,1.45,0.0,0.3,8.75,2.5
