### AWS Glue DataBrew demo notebook

In [None]:
import boto3
import json 
import pprint
import time
import datetime
from aws_orbit_sdk.common import get_workspace

In [None]:
# One boto3 client per AWS service this notebook talks to (SSM first, then DataBrew).
ssm, databrew_client = (boto3.client(service) for service in ("ssm", "databrew"))

# Workspace description returned by the Orbit SDK; the bare name echoes it as the cell output.
workspace = get_workspace()
workspace

In [None]:
# Pull the environment name, team space and pod role ARN out of the workspace in one pass.
env_name, team_name, pod_role_arn = (
    workspace[key] for key in ("env_name", "team_space", "EksPodRoleArn")
)

In [None]:
# Load the demo configuration JSON stored in SSM for this environment.
demo_parameter = ssm.get_parameter(Name=f"/orbit/{env_name}/demo")
demo_config = json.loads(demo_parameter['Parameter']['Value'])

# Index the key directly so a missing "LakeBucket" fails with a KeyError naming
# the key, instead of an AttributeError on None from `.get(...)`.
# NOTE(review): the value is presumably an S3 ARN ("arn:aws:s3:::<bucket>");
# only the part after ":::" is kept — confirm against the parameter's producer.
lake_bucket = demo_config["LakeBucket"].split(':::')[1]
lake_bucket

In [None]:
# Every DataBrew resource created by this notebook shares one name prefix.
resource_prefix = 'Beneficiary-Summary'

dataset_name = f'{resource_prefix}-Dataset'
data_profile_job_name = f'{resource_prefix}-Data-Profile-Job'
recipe_name = f'{resource_prefix}-Recipe'
project_name = f'{resource_prefix}-Project'
project_recipe_job_name = f'{resource_prefix}-Project-Recipe-Job'

## Clean Account 


In [None]:
# Delete the data profile job left over from a previous run, if there is one.
# Only "resource not found" is treated as "nothing to clean up"; any other
# failure (permissions, throttling, conflicts) propagates instead of being
# reported as a missing job.
try:
    delete_profile_job_response = databrew_client.delete_job(
        Name=data_profile_job_name,
    )
except databrew_client.exceptions.ResourceNotFoundException:
    print(f"Missing {data_profile_job_name}, create new data profiling job.")
else:
    pprint.pprint(delete_profile_job_response)

In [None]:
# Delete the project recipe job left over from a previous run, if there is one.
# Only "resource not found" is treated as "nothing to clean up"; any other
# failure propagates instead of being reported as a missing job.
try:
    delete_project_recipe_job_response = databrew_client.delete_job(
        Name=project_recipe_job_name
    )
except databrew_client.exceptions.ResourceNotFoundException:
    print(f"Missing {project_recipe_job_name}, create new project recipe job")
else:
    pprint.pprint(delete_project_recipe_job_response)

In [None]:
# Delete the project left over from a previous run, if there is one.
# Only "resource not found" is treated as "nothing to clean up"; any other
# failure propagates instead of being reported as a missing project.
try:
    delete_project_response = databrew_client.delete_project(
        Name=project_name
    )
except databrew_client.exceptions.ResourceNotFoundException:
    print(f"Missing {project_name}, create new project")
else:
    pprint.pprint(delete_project_response)

In [None]:
# Delete the recipe versions left over from a previous run, if the recipe exists.
# Only "resource not found" is treated as "nothing to clean up"; any other
# failure propagates instead of being reported as a missing recipe.
# The response is printed on success, so anything it reports stays visible.
try:
    batch_delete_recipe_version_response = databrew_client.batch_delete_recipe_version(
        Name=recipe_name,
        RecipeVersions=[
            '1.0',  # Published version
            '1.1',  # Working version
        ]
    )
except databrew_client.exceptions.ResourceNotFoundException:
    print(f"Missing {recipe_name}, create new recipe and publish.")
else:
    pprint.pprint(batch_delete_recipe_version_response)

In [None]:
# Delete Dataset
try:
    dataset_delete_response = databrew_client.delete_dataset(
        Name=dataset_name
    )
    pprint.pprint(dataset_delete_response)
except databrew_client.exceptions.ResourceNotFoundException as rnfe:
    print(f"Missing {dataset_name}, create new dataset. ")    
except Exception as e:    
    raise e

## Create dataset

In [None]:

# Where the source CSV lives inside the lake bucket.
dataset_input = {
    'S3InputDefinition': {
        'Bucket': lake_bucket,
        'Key': 'extracted/Beneficiary_Summary/DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv',
    },
}

# Comma-delimited file whose first row is a header.
csv_format_options = {'Csv': {'Delimiter': ',', 'HeaderRow': True}}

# Register the CSV as a DataBrew dataset, tagged with the environment and team.
create_dataset_response = databrew_client.create_dataset(
    Name=dataset_name,
    Format='CSV',
    FormatOptions=csv_format_options,
    Input=dataset_input,
    Tags={'env': env_name, 'team': team_name},
)
pprint.pprint(create_dataset_response)

## Create Data profiling job

In [None]:
# Profile results are written under this prefix in the lake bucket.
profile_output_location = {
    'Bucket': lake_bucket,
    'Key': 'databrew/dataprofileoutput/',
}

# Profile a custom sample of 1000 rows.
profile_sample = {'Mode': 'CUSTOM_ROWS', 'Size': 1000}

# Create the profile job for the dataset, running under the pod role.
# NOTE(review): Timeout is 2800 here but 2880 on the recipe job — confirm
# whether the difference is intentional.
data_profile_response = databrew_client.create_profile_job(
    Name=data_profile_job_name,
    DatasetName=dataset_name,
    RoleArn=pod_role_arn,
    OutputLocation=profile_output_location,
    JobSample=profile_sample,
    LogSubscription='ENABLE',
    MaxCapacity=5,
    MaxRetries=0,
    Timeout=2800,
    Tags={'env': env_name, 'team': team_name},
)
pprint.pprint(data_profile_response)

## Start data profiling job

In [None]:
# Kick off a run of the data profile job and show the service response.
start_job_run_response = databrew_client.start_job_run(Name=data_profile_job_name)
pprint.pprint(start_job_run_response)

## Create Recipe

In [None]:

# Column names used by the recipe steps.
birth_date_column = "BENE_BIRTH_DT"
formatted_birth_date_column = "BENE_BIRTH_DT_FORMAT"

# Steps, in order: duplicate the birth-date column, change the copy's data
# type to string, then apply the target date format to the copy.
recipe_steps = [
    {
        "Action": {
            "Operation": "DUPLICATE",
            "Parameters": {
                "sourceColumn": birth_date_column,
                "targetColumn": formatted_birth_date_column,
            },
        },
    },
    {
        "Action": {
            "Operation": "CHANGE_DATA_TYPE",
            "Parameters": {
                "columnDataType": "string",
                "sourceColumn": formatted_birth_date_column,
            },
        },
    },
    {
        "Action": {
            "Operation": "FORMAT_DATE",
            "Parameters": {
                "sourceColumn": formatted_birth_date_column,
                "targetDateFormat": "dd*month*yyyy",
            },
        },
    },
]

# Create the recipe from those steps, tagged with the environment and team.
create_recipe_response = databrew_client.create_recipe(
    Name=recipe_name,
    Description='Demo Data Transformation - convert birth date format ',
    Steps=recipe_steps,
    Tags={'env': env_name, 'team': team_name},
)
pprint.pprint(create_recipe_response)

## Publish recipe

In [None]:
# Publish the recipe and show the service response.
publish_description = 'Publishing Demo Data Transformation - convert birth date format '
publish_recipe_response = databrew_client.publish_recipe(
    Name=recipe_name,
    Description=publish_description,
)
pprint.pprint(publish_recipe_response)

## Create Project with a recipe that doesn't have an associated job.


In [None]:

# Create the project that ties the dataset to the recipe.
# The role comes from `pod_role_arn`, the same variable the job-creation cells
# use, rather than re-reading workspace["EksPodRoleArn"] here.
create_project_response = databrew_client.create_project(
    DatasetName=dataset_name,
    Name=project_name,
    RecipeName=recipe_name,
    # Sample the first 500 rows of the dataset.
    Sample={
        'Size': 500,
        'Type': 'FIRST_N'
    },
    RoleArn=pod_role_arn,
    Tags={
        'env': env_name,
        'team': team_name
    },
)

pprint.pprint(create_project_response)

In [None]:
# Create the project-based recipe job.
# Only ProjectName is passed; no DatasetName or RecipeReference is given explicitly.

# Single CSV output written under this prefix in the lake bucket,
# replacing any previous output.
recipe_job_output = {
    'Format': 'CSV',
    'Location': {
        'Bucket': lake_bucket,
        'Key': 'databrew/recipejob/output/Beneficiary_Summary/',
    },
    'Overwrite': True,
    'FormatOptions': {'Csv': {'Delimiter': ','}},
}

create_recipe_job_response = databrew_client.create_recipe_job(
    Name=project_recipe_job_name,
    ProjectName=project_name,
    RoleArn=pod_role_arn,
    Outputs=[recipe_job_output],
    LogSubscription='ENABLE',
    MaxCapacity=5,
    MaxRetries=0,
    Timeout=2880,
    Tags={'env': env_name, 'team': team_name},
)
pprint.pprint(create_recipe_job_response)

## Start job run

In [None]:
# Kick off a run of the project recipe job and show the service response.
start_job_run_response = databrew_client.start_job_run(Name=project_recipe_job_name)
pprint.pprint(start_job_run_response)

In [None]:
# Keep the run id from the start response; the describe/polling cells need it.
run_id = start_job_run_response["RunId"]
print(run_id)

In [None]:
# Fetch and show the current description of the recipe job run.
describe_job_run_response = databrew_client.describe_job_run(Name=project_recipe_job_name, RunId=run_id)
pprint.pprint(describe_job_run_response)

In [None]:
# Wait for job completion
# Possible job run states - 'STARTING'|'RUNNING'|'STOPPING'|'STOPPED'|'SUCCEEDED'|'FAILED'|'TIMEOUT'

from datetime import datetime

wait_time_seconds = 60

# States in which the run has not finished yet. Polling must continue through
# all of them: a run that is still 'STARTING' has not failed, it simply has not
# begun, so checking for 'RUNNING' alone would stop waiting too early.
in_progress_states = {'STARTING', 'RUNNING', 'STOPPING'}

# Always read a fresh state from the service instead of reusing an earlier
# describe_job_run response, which may be out of date by the time this runs.
job_status = databrew_client.describe_job_run(Name=project_recipe_job_name, RunId=run_id)["State"]
while job_status in in_progress_states:
    print(f"{datetime.now()} - Sleeping {wait_time_seconds} seconds ")
    time.sleep(wait_time_seconds)
    job_status = databrew_client.describe_job_run(Name=project_recipe_job_name, RunId=run_id)["State"]


In [None]:
!aws s3 ls s3://$lake_bucket/databrew/recipejob/output/Beneficiary_Summary/ --recursive 

In [None]:
!aws s3 cp s3://$lake_bucket/databrew/recipejob/output/Beneficiary_Summary/Beneficiary-Summary-Project-Recipe-Job_part00000.csv - | head -n 5

In [None]:
# Fail the notebook unless the recipe job run finished successfully, and say
# which state it actually ended in.
assert job_status == 'SUCCEEDED', (
    f"Recipe job {project_recipe_job_name} run {run_id} ended in state {job_status}"
)