<span style="color:red">**Insert the Project Token (use the menu in the top right corner). If you skip this step, write_data_by_product_line() will fail with an error.**</span>

# Simple Retail Pipeline with Databand SDK

In [None]:
# Install databand - run once
!pip install databand

In [None]:
# Import databand libraries
from dbnd import dbnd_tracking, task, dataset_op_logger

In [None]:
# Run once - optional if you want to understand the data
# !pip install pandas-profiling

In [None]:
# Global variables

# Databand connection settings passed to dbnd_tracking() in prepare_retail_data().
# Both values are placeholders and must be replaced before running the pipeline.
databand_url = 'insert_url'
databand_access_token = 'insert_token'

# Data used in this pipeline (CSV loaded by read_raw_data())
RETAIL_FILE = "https://raw.githubusercontent.com/elenalowery/data-samples/main/Retail_Products_and_Customers.csv"

# Provide a unique suffix that will be added to various assets tracked in Databand. We use this approach because
# in a workshop many users are running the same sample pipelines. For example '_mi'
# The suffix is appended to dataset names, the job name and the project name.
unique_suffix = '_mi'


In [None]:
@task
def read_raw_data():
    """Load the retail CSV into a DataFrame and log the read to Databand.

    The file location comes from the module-level ``RETAIL_FILE``; the dataset
    name reported to Databand is that location with ``unique_suffix`` appended.

    Returns:
        The pandas DataFrame read from ``RETAIL_FILE``.
    """
    import pandas as pd

    # Unique name for logging
    unique_file_name = RETAIL_FILE + unique_suffix

    # Read the file exactly once, inside the logger context, so the logged
    # operation wraps the actual I/O instead of a second, redundant download.
    with dataset_op_logger(
        unique_file_name,
        "read",
        with_schema=True,
        with_preview=True,
        with_stats=True,
        with_histograms=True,
    ) as logger:
        retailData = pd.read_csv(RETAIL_FILE)
        logger.set(data=retailData)

    return retailData

In [None]:
@task
def filter_data(rawData):
    """Drop the columns not needed downstream and log the result to Databand.

    Args:
        rawData: DataFrame containing at least the 'Buy', 'PROFESSION' and
            'EDUCATION' columns.

    Returns:
        A new DataFrame without those three columns.
    """
    # Dataset name under which the filtered frame is reported
    dataset_name = 'script://Weekly_Sales/Filtered_df' + unique_suffix

    # Columns removed from the raw data
    dropped_columns = ['Buy', 'PROFESSION', 'EDUCATION']
    trimmed = rawData.drop(columns=dropped_columns)

    # Report the filtered frame (schema and preview only)
    with dataset_op_logger(
        dataset_name,
        "read",
        with_schema=True,
        with_preview=True,
    ) as op_logger:
        op_logger.set(data=trimmed)

    return trimmed

In [None]:
@task
def write_data_by_product_line(filteredData):
    """Write one CSV per selected product line and log each write to Databand.

    For 'Camping Equipment' and 'Golf Equipment', the matching rows of
    ``filteredData`` are saved through ``project.save_data`` and the write is
    logged with ``dataset_op_logger``.

    Note: ``project`` is not defined in this function; it must already exist in
    the notebook (it is created by inserting the project token). Without it
    this function fails with an error.

    Args:
        filteredData: DataFrame with a 'Product line' column.
    """
    # One entry per output: (value in 'Product line', Databand dataset name,
    # file name passed to project.save_data)
    outputs = (
        (
            'Camping Equipment',
            'local://Weekly_Sales/Camping_Equipment.csv' + unique_suffix,
            "CampingEquipment.csv",
        ),
        (
            'Golf Equipment',
            'local://Weekly_Sales/Golf_Equipment.csv' + unique_suffix,
            "GolfEquipment.csv",
        ),
    )

    for product_line, dataset_name, asset_name in outputs:
        # Select the rows for this product line
        subset = filteredData.loc[filteredData['Product line'] == product_line]

        # Log the write of this product line's csv
        with dataset_op_logger(dataset_name, "write", with_schema=True, with_preview=True) as logger:
            # Write the csv file
            project.save_data(asset_name, subset.to_csv(index=False), overwrite=True)
            logger.set(data=subset)


In [None]:
# Call and track all steps in a pipeline

# TODO: 
# - Update databand URL and token
# - Update project and job name (add your initials)

def prepare_retail_data():
    """Run the read -> filter -> write pipeline inside a Databand tracking context.

    The connection settings come from the module-level ``databand_url`` and
    ``databand_access_token``; the job and project names carry ``unique_suffix``.
    """
    # Connection settings handed to Databand
    tracking_conf = {
        "core": {
            "databand_url": databand_url,
            "databand_access_token": databand_access_token,
        }
    }

    with dbnd_tracking(
        conf=tracking_conf,
        job_name="prepare_sales_data" + unique_suffix,
        run_name="weekly",
        project_name="Retail Analytics" + unique_suffix,
    ):
        # Step 1 and 2: read the raw data, then filter it
        filteredData = filter_data(read_raw_data())

        # Step 3: write one file per product line
        write_data_by_product_line(filteredData)

        print("Finished running the pipeline")


# Invoke the main function: runs the whole tracked pipeline when this cell is executed
prepare_retail_data()

In [None]:
# Optional if you want to understand the data
#import pandas_profiling
#retailData.profile_report()

In [None]:
# Optional if you want to understand the data
#retailData.columns

In [None]:
# Optional if you want to understand the data
# Unique values in Product Line column
# print(retailData['Product line'].unique())