# EMR on EKS Example

## Demonstration of running Spark jobs with AWS EMR on your EKS cluster


### Author: AWS Professional Services Emerging Technology and Intelligent Platforms Group
### Date: Mar 29 2021

In [None]:
import boto3
from aws_orbit_sdk.common import get_workspace,get_properties
from aws_orbit_sdk import emr as orbit_emr
# Fetch the workspace description from the Orbit SDK and display it.
# Later cells read the 'team_space', 'env_name', 'EksPodRoleArn' and
# 'ScratchBucket' entries from this mapping.
workspace = get_workspace()
workspace

In [None]:
# Pull out the workspace entries used throughout the rest of the notebook.
team = workspace["team_space"]
env = workspace["env_name"]
role = workspace["EksPodRoleArn"]   # passed as the job's execution role
ScratchBucket = str(workspace["ScratchBucket"])   # used as the S3 log URI and in the aws s3 commands

# Show the collected values.
env, team, role, ScratchBucket

### If your team has been deployed with the EMR_ON_EKS plugin, the following cell retrieves your virtual cluster ID:

In [None]:
# Ask the Orbit EMR helper for the virtual cluster id; it is needed to
# submit the job run and to build the S3 log prefix below.
virtualClusterId=orbit_emr.get_virtual_cluster_id()
print(f"Virtual cluster id: {virtualClusterId}")

In [None]:
# Start from a clean slate. IPython expands $ScratchBucket and
# $virtualClusterId from the notebook's Python variables.
# 1) Remove everything under this virtual cluster's jobs/ prefix in the scratch bucket.
!aws s3 rm --recursive --quiet $ScratchBucket/$virtualClusterId/jobs/
# 2) Make sure the shared jobs directory exists, then empty it.
#    NOTE: this deletes every file directly under /home/jovyan/shared/jobs.
!mkdir -p /home/jovyan/shared/jobs
!rm -fR /home/jovyan/shared/jobs/*

In [None]:
# Read the USERNAME environment variable through the %env line magic;
# the value is attached to the Spark driver pod as a label when the job is submitted.
username = %env USERNAME
username

In [None]:
# Client for the EMR on EKS ("emr-containers") API.
emr = boto3.client('emr-containers')

# What spark-submit should run: the script, its single argument and the
# executor/driver sizing flags.
spark_submit_driver = {
    'entryPoint': "local:///home/jovyan/shared/samples/notebooks/B-DataAnalyst/pi.py",
    'entryPointArguments': ['10'],
    'sparkSubmitParameters': "--conf spark.executor.instances=2 --conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1",
}

# Labels set on the Spark driver pod.
driver_labels = {
    "spark.kubernetes.driver.label.orbit/node-type": "ec2",
    "spark.kubernetes.driver.label.username": username,
    "spark.kubernetes.driver.label.orbit/attach-security-group": "yes",
}

# Persistent volume claims mounted into the driver pod (read-write).
# NOTE(review): the team volume is mounted at "/home/jovyan/share", while the
# entry point above and the other cells use "/home/jovyan/shared" -- confirm
# whether this path is intentional.
driver_volumes = {
    "spark.kubernetes.driver.volumes.persistentVolumeClaim.team.options.claimName": "team-efs",
    "spark.kubernetes.driver.volumes.persistentVolumeClaim.team.mount.path": "/home/jovyan/share",
    "spark.kubernetes.driver.volumes.persistentVolumeClaim.team.mount.readOnly": "false",
    "spark.kubernetes.driver.volumes.persistentVolumeClaim.user.options.claimName": "user-efs",
    "spark.kubernetes.driver.volumes.persistentVolumeClaim.user.mount.path": "/home/jovyan",
    "spark.kubernetes.driver.volumes.persistentVolumeClaim.user.mount.readOnly": "false",
}

# Single "spark-defaults" classification carrying the labels followed by the volumes.
spark_defaults = {
    "classification": "spark-defaults",
    "properties": {**driver_labels, **driver_volumes},
}

# Where logs go: a per env/team CloudWatch log group and the scratch bucket.
monitoring = {
    'persistentAppUI': 'ENABLED',
    'cloudWatchMonitoringConfiguration': {
        'logGroupName': f'/orbit/emr/{env}-{team}',
        'logStreamNamePrefix': 'spark2',
    },
    's3MonitoringConfiguration': {
        'logUri': ScratchBucket,
    },
}

# Submit the job run; `response` carries the ids used by the polling cell.
response = emr.start_job_run(
    name='myjob1',
    virtualClusterId=virtualClusterId,
    executionRoleArn=role,
    releaseLabel='emr-6.2.0-latest',
    jobDriver={'sparkSubmitJobDriver': spark_submit_driver},
    configurationOverrides={
        "applicationConfiguration": [spark_defaults],
        'monitoringConfiguration': monitoring,
    },
    tags={'env': env},
)

In [None]:
# Show the id of the job run that was just submitted.
response['id']

In [None]:
%%time

import time
while True:
    describe_response = emr.describe_job_run(
        id=response['id'],
        virtualClusterId=response['virtualClusterId']
    )
    print(f"STATE: {describe_response['jobRun']['state']}")
    if 'jobRun' in describe_response and 'state' in describe_response['jobRun'] and describe_response['jobRun']['state'] in ['COMPLETED', 'FAILED', 'CANCELLED']:
        break
    time.sleep(5)    
describe_response['jobRun']['state']

In [None]:
# Keep the job run id from the last describe_job_run response and display
# the full job run description.
job_id = describe_response['jobRun']['id']
describe_response['jobRun']

In [None]:
# Create the local log directory if needed, then remove anything left in it
# so only the files synced in the next cell are present.
!mkdir -p ~/private/spark/logs
!rm -fR ~/private/spark/logs/*

In [None]:
# Copy this virtual cluster's jobs/ prefix from the scratch bucket into the
# local log directory (the scratch bucket is the job's S3 log URI, so this
# is presumably where the job-run logs are written).
!aws s3 sync $ScratchBucket/$virtualClusterId/jobs/ ~/private/spark/logs

In [None]:
# Decompress every *.gz file under the local log directory in place;
# -f makes gzip overwrite an existing decompressed file.
!find ~/private/spark/logs -name "*.gz"  -exec gzip -d {} -f \;

In [None]:
# Fail the notebook unless the job run finished in the COMPLETED state.
assert describe_response['jobRun']['state'] == 'COMPLETED'

In [None]:
# Print the output file from the shared jobs directory
# (presumably written by the submitted pi.py job -- confirm against the script).
!cat ~/shared/jobs/output.txt