## Overview

This notebook shows how to use the Databand SDK to observe pipeline execution in Databand. It assumes that the **Spark Listener** has been configured on the cluster used to run this notebook. The Spark Listener configuration contains the URL and credentials for connecting to Databand. The Spark Listener also provides automatic tracking of datasets.

In [0]:
# Import Databand libraries
from dbnd import dbnd_tracking, task

In [0]:
@task
def read_raw_data():
    """Load the retail products-and-customers CSV and return it as a DataFrame.

    Relies on the notebook-provided global ``spark`` session. The file is read
    with a header row, comma separator, and schema inference enabled. The
    loaded data is also displayed via ``show()`` for inspection in the notebook.
    """
    source_path = "/FileStore/tables/Retail_Products_and_Customers.csv"

    retail_df = spark.read.csv(
        source_path,
        inferSchema=True,
        header=True,
        sep=",",
    )

    # Print a preview of the loaded rows to the cell output.
    retail_df.show()

    return retail_df

In [0]:
@task
def filter_data(rawData):
    """Return only the rows of ``rawData`` whose LTV is 'MEDIUM VALUE'.

    The filtered result is also displayed via ``show()`` before being returned.
    """
    # NOTE: column pruning (dropping 'Buy', 'PROFESSION', 'EDUCATION') is
    # currently disabled; only the row filter below is applied.
    medium_ltv_predicate = "LTV = 'MEDIUM VALUE'"
    medium_value_customers = rawData.where(medium_ltv_predicate)

    # Print a preview of the filtered rows to the cell output.
    medium_value_customers.show()

    return medium_value_customers

In [0]:
@task
def write_data_by_state(filteredData):
    """Write the Oregon and Arizona subsets of ``filteredData`` as CSV.

    For each state, the rows matching the state's filter are selected and
    written in CSV format to that state's output location, overwriting any
    existing output there. Nothing is returned.
    """
    # (row filter, output location) for every state that gets its own output.
    state_outputs = (
        ("State = 'Oregon'", "dbfs:/retail_data/us/oregon/oregon_sales.csv"),
        ("State = 'Arizona'", "dbfs:/retail_data/us/arizona/arizona_sales.csv"),
    )

    for state_filter, output_path in state_outputs:
        state_sales = filteredData.where(state_filter)
        # "overwrite" replaces whatever was written by a previous run.
        state_sales.write.format("csv").mode("overwrite").save(output_path)

In [0]:
def prepareRetailData():
    """Run the retail pipeline end to end: read, filter, then write per state.

    Each stage is a separate function call; the output of one stage is passed
    as the input of the next. A completion message is printed at the end.
    """
    # Stage 1: load the raw retail data.
    raw_df = read_raw_data()

    # Stage 2: narrow the raw data down to the rows of interest.
    filtered_df = filter_data(raw_df)

    # Stage 3: persist the filtered data, one output per state.
    write_data_by_state(filtered_df)

    # NOTE: the Spark session is not stopped here.
    print("Finished running the pipeline")

In [0]:
import os
from datetime import datetime

# Set before the tracking context is opened.
# NOTE(review): presumably enables dbnd to pick up its tracking context from
# the Spark environment — confirm against the Databand documentation.
os.environ["DBND__ENABLE__SPARK_CONTEXT_ENV"] = "True"

# Timestamp used to give every run a distinct name.
# isoformat(sep=" ") yields the same text as str() on a datetime.
current_time = datetime.now().isoformat(sep=" ")

# Run the whole pipeline inside a Databand tracking context.
with dbnd_tracking(
    job_name="simple_retail_demo_spark_DB",
    run_name=f"retail_run_{current_time}",
):
    prepareRetailData()

In [0]:
#spark.stop()