### Databrew demo notebook

In [99]:
import boto3
import json 
import pprint
import time
import datetime
from aws_orbit_sdk.common import get_workspace

In [100]:
ssm = boto3.client("ssm")
databrew_client = boto3.client("databrew")

workspace = get_workspace()
workspace

{'BaseImageAddress': '750821335976.dkr.ecr.us-west-2.amazonaws.com/orbit-dev-env-jupyter-user:latest',
 'BootstrapS3Prefix': 'teams/dev-env/bootstrap/',
 'ContainerDefaults': {'cpu': 4, 'memory': 16384},
 'ContainerRunnerArn': None,
 'EcsClusterName': None,
 'EfsApId': 'fsap-0eb1a1cd92fb90a0a',
 'EfsId': 'fs-1901281e',
 'EfsLifeCycle': 'AFTER_7_DAYS',
 'EksK8SApiArn': None,
 'EksPodRoleArn': 'arn:aws:iam::750821335976:role/orbit-dev-env-lake-user-role',
 'Elbs': {'lake-user/jupyterhub-public': {'AvailabilityZones': ['us-west-2b',
    'us-west-2a'],
   'DNSName': 'a014b72dea49c4e5192e4a0b2f19d9ac-1247107658.us-west-2.elb.amazonaws.com',
   'Instances': [{'InstanceId': 'i-0f7129a311215afe3'},
    {'InstanceId': 'i-0c6d4c9bc4267c5e9'},
    {'InstanceId': 'i-085b4da3cf97c2a33'},
    {'InstanceId': 'i-069cd7ac22ad2c935'},
    {'InstanceId': 'i-064f5478e9c0b0fc9'},
    {'InstanceId': 'i-027177c067b12a152'},
    {'InstanceId': 'i-01955b4e63b346da7'}],
   'ListenerDescriptions': [{'Listener': 

In [101]:
env_name = workspace["env_name"]
team_name = workspace["team_space"]
pod_role_arn = workspace["EksPodRoleArn"]

In [102]:
demo_config = json.loads(ssm.get_parameter(Name=f"/orbit/{env_name}/demo")['Parameter']['Value'])
lake_bucket = demo_config.get("LakeBucket").split(':::')[1]
lake_bucket

'orbit-dev-env-demo-lake-750821335976-b82a3f'

In [103]:
dataset_name = 'Beneficiary-Summary-Dataset'
data_profile_job_name = 'Beneficiary-Summary-Data-Profile-Job'
recipe_name = 'Beneficiary-Summary-Recipe'
project_name = 'Beneficiary-Summary-Project'
project_recipe_job_name = 'Beneficiary-Summary-Project-Recipe-Job'

## Clean Account 


In [107]:
# Delete Data Profile job 
try:
    delete_profile_job_response = databrew_client.delete_job(
        Name=data_profile_job_name,      
    )
    pprint.pprint(delete_profile_job_response)
except Exception as e:
    print(f"Missing {data_profile_job_name}, create new data profiling job.")

Missing Beneficiary-Summary-Data-Profile-Job, create new data profiling job.


In [108]:
# Delete project recipe job
try:
    delete_project_recipe_job_response = databrew_client.delete_job(
        Name=project_recipe_job_name
    )
    pprint.pprint(delete_project_recipe_job_response)
except Exception as e:
    print(f"Missing {project_recipe_job_name}, create new project recipe job")

Missing Beneficiary-Summary-Project-Recipe-Job, create new project recipe job


In [109]:
# Delete project
try:
    delete_project_response = databrew_client.delete_project(
        Name=project_name
    )
    pprint.pprint(delete_project_response)
except Exception as e:
    print(f"Mising {project_name}, create new project")

Mising Beneficiary-Summary-Project, create new project


In [110]:
# Delete recipe and versions
try:
    batch_delete_recipe_version_response = databrew_client.batch_delete_recipe_version(
        Name=recipe_name,
        RecipeVersions=[
            '1.0', # Published version
            '1.1'  # Working version      
        ]
    )
    pprint.pprint(batch_delete_recipe_version_response)
except Exception as e:
    print(f"Missing {recipe_name}, create new recipe and publish.")

Missing Beneficiary-Summary-Recipe, create new recipe and publish.


In [111]:
# Delete Dataset
try:
    dataset_delete_response = databrew_client.delete_dataset(
        Name=dataset_name
    )
    pprint.pprint(dataset_delete_response)
except databrew_client.exceptions.ResourceNotFoundException as rnfe:
    print(f"Missing {dataset_name}, create new dataset. ")    
except Exception as e:    
    raise e

Missing Beneficiary-Summary-Dataset, create new dataset. 


## Create data set 

In [112]:

create_dataset_response = databrew_client.create_dataset(
    Name=dataset_name,
    Format='CSV',
    FormatOptions={
        'Csv': {
            'Delimiter': ',',
            'HeaderRow': True
        }
    },
    Input={
        'S3InputDefinition': {
            'Bucket': lake_bucket,
            'Key': 'extracted/Beneficiary_Summary/DE1_0_2008_Beneficiary_Summary_File_Sample_1.csv'
        }
    },
    Tags={
        'env': env_name,
        'team': team_name
    }
)
pprint.pprint(create_dataset_response)

{'Name': 'Beneficiary-Summary-Dataset',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '38',
                                      'content-type': 'application/json',
                                      'date': 'Tue, 20 Apr 2021 15:55:09 GMT',
                                      'x-amz-apigw-id': 'eFvmuEsRvHcF7cQ=',
                                      'x-amzn-requestid': '87f78086-aba1-4375-9c1d-11d34ad8d0ce',
                                      'x-amzn-trace-id': 'Root=1-607ef95e-0fd12327e14ebb2dede889ab'},
                      'HTTPStatusCode': 200,
                      'RequestId': '87f78086-aba1-4375-9c1d-11d34ad8d0ce',
                      'RetryAttempts': 0}}


## Create Data profiling job

In [113]:
data_profile_response = databrew_client.create_profile_job(
    DatasetName=dataset_name,
    Name=data_profile_job_name,
    LogSubscription='ENABLE',
    MaxCapacity=5,
    MaxRetries=0,
    OutputLocation={
        'Bucket': lake_bucket,
        'Key': 'databrew/dataprofileoutput/'
    },
    RoleArn=pod_role_arn,
    Tags={
        'env': env_name,
        'team': team_name
    },
    Timeout=2800,
    JobSample={
        'Mode': 'CUSTOM_ROWS',
        'Size': 1000
    }
)
pprint.pprint(data_profile_response)

{'Name': 'Beneficiary-Summary-Data-Profile-Job',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '47',
                                      'content-type': 'application/json',
                                      'date': 'Tue, 20 Apr 2021 15:56:02 GMT',
                                      'x-amz-apigw-id': 'eFvu2FcKvHcFurg=',
                                      'x-amzn-requestid': 'da6ec332-8eb6-410f-8e98-864da59784ca',
                                      'x-amzn-trace-id': 'Root=1-607ef992-4a0455ae5eefb3afe7898632'},
                      'HTTPStatusCode': 200,
                      'RequestId': 'da6ec332-8eb6-410f-8e98-864da59784ca',
                      'RetryAttempts': 0}}


## Start data profiling job

In [114]:
start_job_run_response = databrew_client.start_job_run(
    Name=data_profile_job_name
)
pprint.pprint(start_job_run_response)

{'ResponseMetadata': {'HTTPHeaders': {'content-length': '79',
                                      'content-type': 'application/json',
                                      'date': 'Tue, 20 Apr 2021 15:56:29 GMT',
                                      'x-amz-apigw-id': 'eFvzEGvhPHcFRiQ=',
                                      'x-amzn-requestid': '2dcf9250-04b0-466a-88c5-543ea70acaf4',
                                      'x-amzn-trace-id': 'Root=1-607ef9ad-7ebc7fd829237650580303ab'},
                      'HTTPStatusCode': 200,
                      'RequestId': '2dcf9250-04b0-466a-88c5-543ea70acaf4',
                      'RetryAttempts': 0},
 'RunId': 'db_f2beead8f7fed118d7456bab6827fb265622f2c0befeda99e506632d726c4eca'}


## Create Recipe

In [115]:

create_recipe_response = databrew_client.create_recipe(
    Description='Demo Data Transformation - convert birth date format ',
    Name=recipe_name,
    Steps=[
      {
        "Action": {
          "Operation": "DUPLICATE",
          "Parameters": {
            "sourceColumn": "BENE_BIRTH_DT",
            "targetColumn": "BENE_BIRTH_DT_FORMAT"
          }
        }
      },
      {
        "Action": {
          "Operation": "CHANGE_DATA_TYPE",
          "Parameters": {
            "columnDataType": "string",
            "sourceColumn": "BENE_BIRTH_DT_FORMAT"
          }
        }
      },
      {
        "Action": {
          "Operation": "FORMAT_DATE",
          "Parameters": {
            "sourceColumn": "BENE_BIRTH_DT_FORMAT",
            "targetDateFormat": "dd*month*yyyy"
          }
        }
      }
    ],
    Tags={
        'env': env_name,
        'team': team_name
    }
)
pprint.pprint(create_recipe_response)

{'Name': 'Beneficiary-Summary-Recipe',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '37',
                                      'content-type': 'application/json',
                                      'date': 'Tue, 20 Apr 2021 15:56:48 GMT',
                                      'x-amz-apigw-id': 'eFv2BEcRPHcFRiQ=',
                                      'x-amzn-requestid': '297b26d0-efd7-479d-bb87-fc6102477b01',
                                      'x-amzn-trace-id': 'Root=1-607ef9c0-2d8f32b7dc8a80e5bcd1d65f'},
                      'HTTPStatusCode': 200,
                      'RequestId': '297b26d0-efd7-479d-bb87-fc6102477b01',
                      'RetryAttempts': 0}}


## Publish recipe

In [116]:
publish_recipe_response = databrew_client.publish_recipe(
    Description='Publishing Demo Data Transformation - convert birth date format ',
    Name=recipe_name
)
pprint.pprint(publish_recipe_response)

{'Name': 'Beneficiary-Summary-Recipe',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '37',
                                      'content-type': 'application/json',
                                      'date': 'Tue, 20 Apr 2021 15:56:50 GMT',
                                      'x-amz-apigw-id': 'eFv2WEoJvHcFRiQ=',
                                      'x-amzn-requestid': '29186ee7-10b1-4f4a-a7b4-3af94b25f44c',
                                      'x-amzn-trace-id': 'Root=1-607ef9c2-fa9d8b00480a669005cf1b8b'},
                      'HTTPStatusCode': 200,
                      'RequestId': '29186ee7-10b1-4f4a-a7b4-3af94b25f44c',
                      'RetryAttempts': 0}}


## Create Project with a recipe that doesn't have an associated job.


In [117]:

create_project_response = databrew_client.create_project(
    DatasetName=dataset_name,    
    Name=project_name,
    RecipeName=recipe_name,
    Sample={
        'Size': 500,
        'Type': 'FIRST_N'
    },
    RoleArn= workspace["EksPodRoleArn"],
    Tags={
        'env': env_name,
        'team': team_name
    },
)

pprint.pprint(create_project_response)

{'Name': 'Beneficiary-Summary-Project',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '38',
                                      'content-type': 'application/json',
                                      'date': 'Tue, 20 Apr 2021 15:57:05 GMT',
                                      'x-amz-apigw-id': 'eFv4vGCMvHcFRiQ=',
                                      'x-amzn-requestid': '7b2751ac-5c83-490e-a0ab-a1cbe985c6af',
                                      'x-amzn-trace-id': 'Root=1-607ef9d1-e85c637684836633e45e5ad7'},
                      'HTTPStatusCode': 200,
                      'RequestId': '7b2751ac-5c83-490e-a0ab-a1cbe985c6af',
                      'RetryAttempts': 0}}


In [118]:
# Create Project based Recipe job 

create_recipe_job_response = databrew_client.create_recipe_job(
    #DatasetName='Beneficiary-Summary-Dataset',    
    Name=project_recipe_job_name,
    LogSubscription='ENABLE',
    MaxCapacity=5,
    MaxRetries=0,
    Outputs=[
        {
           'Format': 'CSV',            
            'Location': {
                'Bucket': lake_bucket,
                'Key': 'databrew/recipejob/output/Beneficiary_Summary/'
            },
            'Overwrite': True,
            'FormatOptions': {
                'Csv': {
                    'Delimiter': ','
                }
            }
        },
    ],
    ProjectName=project_name,
    #RecipeReference={
        #'Name': 'Beneficiary-Summary-Sample-2-Recipe',
        #'RecipeVersion': '1.0'
    #},
    RoleArn=pod_role_arn,
    Tags={
        'env': env_name,
        'team': team_name
    },
    Timeout=2880
)
pprint.pprint(create_recipe_job_response)

{'Name': 'Beneficiary-Summary-Project-Recipe-Job',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '49',
                                      'content-type': 'application/json',
                                      'date': 'Tue, 20 Apr 2021 15:57:22 GMT',
                                      'x-amz-apigw-id': 'eFv7UFTuvHcFurg=',
                                      'x-amzn-requestid': 'b0990b68-bb30-41ab-8e53-b94f87b43448',
                                      'x-amzn-trace-id': 'Root=1-607ef9e1-60278bb8fae7ca9a33136dbf'},
                      'HTTPStatusCode': 200,
                      'RequestId': 'b0990b68-bb30-41ab-8e53-b94f87b43448',
                      'RetryAttempts': 0}}


## Start job run

In [123]:
start_job_run_response = databrew_client.start_job_run(
    Name=project_recipe_job_name
)
pprint.pprint(start_job_run_response)

{'ResponseMetadata': {'HTTPHeaders': {'content-length': '79',
                                      'content-type': 'application/json',
                                      'date': 'Tue, 20 Apr 2021 16:06:22 GMT',
                                      'x-amz-apigw-id': 'eFxPrGXhvHcF61Q=',
                                      'x-amzn-requestid': '7a636b61-7440-4d60-abfa-0d8de0759bde',
                                      'x-amzn-trace-id': 'Root=1-607efbfd-40079143706679ad10435dd4'},
                      'HTTPStatusCode': 200,
                      'RequestId': '7a636b61-7440-4d60-abfa-0d8de0759bde',
                      'RetryAttempts': 0},
 'RunId': 'db_d8ce2ca3d0a9c254db327abf24fea1f663e2c03f4174827c7f081106e0543a20'}


In [124]:
run_id = start_job_run_response["RunId"]
print(run_id)

db_d8ce2ca3d0a9c254db327abf24fea1f663e2c03f4174827c7f081106e0543a20


In [125]:
describe_job_run_response = databrew_client.describe_job_run(
    Name=project_recipe_job_name,
    RunId= run_id
)
pprint.pprint(describe_job_run_response)

{'Attempt': 0,
 'DatasetName': 'Beneficiary-Summary-Dataset',
 'ExecutionTime': 0,
 'JobName': 'Beneficiary-Summary-Project-Recipe-Job',
 'LogGroupName': '/aws-glue-databrew/jobs-Beneficiary-Summary-Project-Recipe-Job',
 'LogSubscription': 'ENABLE',
 'Outputs': [{'Format': 'CSV',
              'FormatOptions': {'Csv': {'Delimiter': ','}},
              'Location': {'Bucket': 'orbit-dev-env-demo-lake-750821335976-b82a3f',
                           'Key': 'databrew/recipejob/output/Beneficiary_Summary/'},
              'Overwrite': True}],
 'RecipeReference': {'Name': 'Beneficiary-Summary-Recipe',
                     'RecipeVersion': 'LATEST_WORKING'},
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '873',
                                      'content-type': 'application/json',
                                      'date': 'Tue, 20 Apr 2021 16:06:22 GMT',
                                      'x-amz-apigw-id': 'eFxP0Gc2PHcF61Q=',
                                      'x-amzn-r

In [126]:
# Wait for job completion 
# Possible job run states - 'STARTING'|'RUNNING'|'STOPPING'|'STOPPED'|'SUCCEEDED'|'FAILED'|'TIMEOUT'

import time 
from datetime import datetime
wait_time_seconds = 60
time.sleep(wait_time_seconds)

    
job_status = describe_job_run_response["State"]
while job_status == 'RUNNING':
    time.sleep(wait_time_seconds)
    print(f"{datetime.now()} - Sleeping {wait_time_seconds} seconds ")
    job_status = databrew_client.describe_job_run(Name=project_recipe_job_name, RunId= run_id)["State"]


2021-04-20 16:08:23.032180 - Sleeping 60 seconds 


In [128]:
!aws s3 ls s3://$lake_bucket/databrew/recipejob/output/Beneficiary_Summary/ --recursive 

2021-04-20 16:07:46   15324305 databrew/recipejob/output/Beneficiary_Summary/Beneficiary-Summary-Project-Recipe-Job_part00000.csv


In [130]:
!aws s3 cp s3://$lake_bucket/databrew/recipejob/output/Beneficiary_Summary/Beneficiary-Summary-Project-Recipe-Job_part00000.csv - | head -n 5

DESYNPUF_ID,BENE_BIRTH_DT,BENE_BIRTH_DT_FORMAT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,BENE_HMO_CVRAGE_TOT_MONS,PLAN_CVRG_MOS_NUM,SP_ALZHDMTA,SP_CHF,SP_CHRNKIDN,SP_CNCR,SP_COPD,SP_DEPRESSN,SP_DIABETES,SP_ISCHMCHT,SP_OSTEOPRS,SP_RA_OA,SP_STRKETIA,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR
00013D2EFD8E45D1,19230501,"01 May 1923",,1,1,0,26,950,12,12,12,12,2,2,2,2,2,2,2,2,2,2,2,0.0,0.0,0.0,50.0,10.0,0.0,0.0,0.0,0.0
00016F745862898F,19430101,"01 January 1943",,1,1,0,39,230,12,12,0,0,2,2,2,2,2,2,2,2,2,2,2,0.0,0.0,0.0,0.0,0.0,0.0,700.0,240.0,0.0
0001FDD721E223DC,19360901,"01 September 1936",,2,1,0,39,280,12,12,0,12,2,2,2,2,2,2,2,2,2,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00021CA6FF03E670,19410601,"01 June 1941",,1,5,0,6,290,0,0,0,0,2,2,2,2,2,2,2,2,2,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
download failed: s3://orbit-dev-env-demo-lake-7508213

In [127]:
assert job_status == 'SUCCEEDED'