# EMR on EKS Example

## Demonstration of running Spark jobs with Amazon EMR on EKS on your cluster


### Author: AWS Professional Services Emerging Technology and Intelligent Platforms Group
### Date: Mar 29 2021

In [1]:
import boto3
from aws_orbit_sdk.common import get_workspace,get_properties
from aws_orbit_sdk import emr as orbit_emr
# Fetch the workspace description through the Orbit SDK helper imported above.
workspace = get_workspace()
# Bare expression on the last line so the notebook displays the returned value.
workspace

{'BaseImageAddress': '495869084367.dkr.ecr.us-west-2.amazonaws.com/orbit-dev-env2-jupyter-user:latest',
 'BootstrapS3Prefix': 'teams/dev-env2/bootstrap/',
 'ContainerDefaults': {'cpu': 4, 'memory': 16384},
 'ContainerRunnerArn': None,
 'EcsClusterName': None,
 'EfsApId': 'fsap-0379712f3188879e5',
 'EfsId': 'fs-2854b42f',
 'EfsLifeCycle': 'AFTER_7_DAYS',
 'EksK8SApiArn': None,
 'EksPodRoleArn': 'arn:aws:iam::495869084367:role/orbit-dev-env2-lake-user-role',
 'Elbs': {'lake-user/jupyterhub-public': {'AvailabilityZones': ['us-west-2b',
    'us-west-2a'],
   'DNSName': 'afe82996085924da38cc4bf82db2c99f-959051000.us-west-2.elb.amazonaws.com',
   'Instances': [{'InstanceId': 'i-0fa19a516a528f8af'},
    {'InstanceId': 'i-0c18236c7afa274f6'},
    {'InstanceId': 'i-0ad2217263d8bebd5'},
    {'InstanceId': 'i-0a0e9972a765c5405'},
    {'InstanceId': 'i-0813ad717d454ac3e'},
    {'InstanceId': 'i-03eb196d6920cba51'}],
   'ListenerDescriptions': [{'Listener': {'InstancePort': 32254,
      'InstancePr

In [2]:
# Pull the values used by the rest of the notebook out of the workspace description.
env = workspace['env_name']
team = workspace['team_space']
role = workspace['EksPodRoleArn']

# Team-specific prefix inside the workspace's scratch bucket.
ScratchBucket = "s3://{}/{}".format(workspace['ScratchBucket'], team)

# Display everything at once as a tuple.
env, team, role, ScratchBucket

('dev-env2',
 'lake-user',
 'arn:aws:iam::495869084367:role/orbit-dev-env2-lake-user-role',
 's3://orbit-foundation-dev-env-scratch-495869084367-77f116/lake-user')

### If your team was deployed with the EMR_ON_EKS plugin, the following cell returns your virtual cluster ID:

In [3]:
# Look up the virtual cluster id through the Orbit EMR helper and echo it.
virtualClusterId = orbit_emr.get_virtual_cluster_id()
print("Virtual cluster id: {}".format(virtualClusterId))

Virtual cluster id: q7c92nsczd321hlreve947pym


In [35]:
!aws s3 rm --recursive --quiet $ScratchBucket/$virtualClusterId/jobs/

In [36]:
# Client for the "emr-containers" API.
emr = boto3.client('emr-containers')

# Spark driver definition: script to run, its arguments, and spark-submit flags.
spark_submit_driver = {
    'entryPoint': "local:///efs/shared/samples/notebooks/B-DataAnalyst/pi.py",
    'entryPointArguments': ['10'],
    'sparkSubmitParameters': "--conf spark.executor.instances=2 --conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1",
}

# spark-defaults overrides: mount the "jupyterhub" persistent volume claim
# read-write at /efs on the driver (the entry point above lives under /efs).
spark_defaults = {
    "classification": "spark-defaults",
    "properties": {
        "spark.kubernetes.driver.volumes.persistentVolumeClaim.sparkdata.options.claimName": "jupyterhub",
        "spark.kubernetes.driver.volumes.persistentVolumeClaim.sparkdata.mount.path": "/efs",
        "spark.kubernetes.driver.volumes.persistentVolumeClaim.sparkdata.mount.readOnly": "false",
    },
}

# Log destinations: a CloudWatch log group named after env/team, and the team scratch location on S3.
monitoring = {
    'persistentAppUI': 'ENABLED',
    'cloudWatchMonitoringConfiguration': {
        'logGroupName': f'/orbit/emr/{env}-{team}',
        'logStreamNamePrefix': 'spark2',
    },
    's3MonitoringConfiguration': {
        'logUri': ScratchBucket,
    },
}

# Submit the job run; the response is used by later cells to poll its state.
response = emr.start_job_run(
    name='myjob1',
    virtualClusterId=virtualClusterId,
    executionRoleArn=role,
    releaseLabel='emr-6.2.0-latest',
    jobDriver={'sparkSubmitJobDriver': spark_submit_driver},
    configurationOverrides={
        "applicationConfiguration": [spark_defaults],
        'monitoringConfiguration': monitoring,
    },
    tags={'env': env},
)

In [37]:
# Show the id of the job run returned by start_job_run.
response['id']

'00000002u3ugdk56qgl'

In [38]:
%%time

import time
while True:
    describe_response = emr.describe_job_run(
        id=response['id'],
        virtualClusterId=response['virtualClusterId']
    )
    print(f"STATE: {describe_response['jobRun']['state']}")
    if 'jobRun' in describe_response and 'state' in describe_response['jobRun'] and describe_response['jobRun']['state'] in ['COMPLETED', 'FAILED', 'CANCELLED']:
        break
    time.sleep(5)    
describe_response['jobRun']['state']

STATE: PENDING
STATE: SUBMITTED
STATE: SUBMITTED
STATE: SUBMITTED
STATE: SUBMITTED
STATE: SUBMITTED
STATE: SUBMITTED
STATE: SUBMITTED
STATE: SUBMITTED
STATE: RUNNING
STATE: RUNNING
STATE: RUNNING
STATE: RUNNING
STATE: RUNNING
STATE: RUNNING
STATE: COMPLETED


'COMPLETED'

In [39]:
# Keep the job-run description at hand, remember its id, and display the description.
job_run = describe_response['jobRun']
job_id = job_run['id']
job_run

{'id': '00000002u3ugdk56qgl',
 'name': 'myjob1',
 'virtualClusterId': 'q7c92nsczd321hlreve947pym',
 'arn': 'arn:aws:emr-containers:us-west-2:495869084367:/virtualclusters/q7c92nsczd321hlreve947pym/jobruns/00000002u3ugdk56qgl',
 'state': 'COMPLETED',
 'clientToken': '4a263790-7829-4a31-8cb5-144a745f7100',
 'executionRoleArn': 'arn:aws:iam::495869084367:role/orbit-dev-env2-lake-user-role',
 'releaseLabel': 'emr-6.2.0-latest',
 'configurationOverrides': {'applicationConfiguration': [{'classification': 'spark-defaults',
    'properties': {'spark.kubernetes.driver.volumes.persistentVolumeClaim.sparkdata.mount.readOnly': 'false',
     'spark.kubernetes.driver.volumes.persistentVolumeClaim.sparkdata.options.claimName': 'jupyterhub',
     'spark.kubernetes.driver.volumes.persistentVolumeClaim.sparkdata.mount.path': '/efs'}}],
  'monitoringConfiguration': {'persistentAppUI': 'ENABLED',
   'cloudWatchMonitoringConfiguration': {'logGroupName': '/orbit/emr/dev-env2-lake-user',
    'logStreamNamePr

In [40]:
!mkdir -p ~/private/spark/logs
!rm -fR ~/private/spark/logs/*

In [41]:
!aws s3 sync $ScratchBucket/$virtualClusterId/jobs/ ~/private/spark/logs

download: s3://orbit-foundation-dev-env-scratch-495869084367-77f116/lake-user/q7c92nsczd321hlreve947pym/jobs/00000002u3ugdk56qgl/containers/spark-cd968f83a98742dc834c76e54940ef50/pythonpi-4699ce787e83b576-exec-1/stdout.gz to ../../../../../home/jovyan/private/spark/logs/00000002u3ugdk56qgl/containers/spark-cd968f83a98742dc834c76e54940ef50/pythonpi-4699ce787e83b576-exec-1/stdout.gz
download: s3://orbit-foundation-dev-env-scratch-495869084367-77f116/lake-user/q7c92nsczd321hlreve947pym/jobs/00000002u3ugdk56qgl/containers/spark-cd968f83a98742dc834c76e54940ef50/pythonpi-4699ce787e83b576-exec-2/stderr.gz to ../../../../../home/jovyan/private/spark/logs/00000002u3ugdk56qgl/containers/spark-cd968f83a98742dc834c76e54940ef50/pythonpi-4699ce787e83b576-exec-2/stderr.gz
download: s3://orbit-foundation-dev-env-scratch-495869084367-77f116/lake-user/q7c92nsczd321hlreve947pym/jobs/00000002u3ugdk56qgl/containers/spark-cd968f83a98742dc834c76e54940ef50/spark-00000002u3ugdk56qgl-driver/stderr.gz to ../../.

In [42]:
!find ~/private/spark/logs -name "*.gz"  -exec gzip -d {} -f \;

In [43]:
# Stop the notebook here unless the job run finished successfully,
# and report the state it actually ended in.
final_state = describe_response['jobRun']['state']
assert final_state == 'COMPLETED', f"EMR job run ended in state {final_state!r}, expected 'COMPLETED'"

In [44]:
# Print the output file from the shared directory
# (presumably written by the pi.py job — confirm against the script).
!cat ~/shared/jobs/output.txt

Pi is roughly 3.144544env:PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
HADOOP_CONF_DIR=/etc/hadoop/conf
SPARK_MASTER_WEBUI_PORT=8080
JUPYTERHUB_PRIVATE_PORT_8081_TCP_PROTO=tcp
PYSPARK_MAJOR_PYTHON_VERSION=3
SPARK_ENV_LOADED=1
JUPYTERHUB_PUBLIC_PORT_80_TCP_PROTO=tcp
LD_LIBRARY_PATH=/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native
SPARK_SUBMIT_OPTS=
JUPYTERHUB_PRIVATE_PORT_8081_TCP_ADDR=172.20.173.240
PYTHON_VERSION=3.7.9
JUPYTERHUB_PUBLIC_PORT_80_TCP_PORT=80
JUPYTERHUB_PRIVATE_SERVICE_HOST=172.20.173.240
PWD=/home/hadoop
KUBERNETES_PORT_443_TCP=tcp://172.20.0.1:443
PYTHONPATH=/usr/lib/spark/python/lib/pyspark.zip:/usr/lib/spark/python/lib/py4j-0.10.9-src.zip:/usr/lib/spark/jars/spark-core_2.12-3.0.1-amzn-0.jar
JUPYTERHUB_API_PORT_8001_TCP_PROTO=tcp
AWS_REGION=us-west-2
HIVE_SERVER2_THRIFT_BIND_HOST=0.0.0.0
LIVY_HOME=/usr/lib/livy
PYSPARK_DRIVER_PYTHON=/usr/bin/python3
SPARK_MASTER_PORT=