[![AWS Data Wrangler](_static/logo.png "AWS Data Wrangler")](https://github.com/awslabs/aws-data-wrangler)

# 16 - EMR & Docker

In [2]:
import time

import awswrangler as wr
import boto3

## Enter your bucket name:

In [3]:
import getpass
# Prompt for the S3 bucket name with getpass so the typed value is not echoed
# into the saved notebook output.
bucket = getpass.getpass()

 ··········································


## Enter your Subnet ID:

In [4]:
# Prompt for the subnet ID with getpass so the typed value is not echoed
# into the saved notebook output.
subnet = getpass.getpass()

 ························


## Build and Upload Docker Image to ECR repository

Replace the `{ACCOUNT_ID}` placeholder with your AWS account ID before running the build and push cell below.

In [None]:
%%writefile Dockerfile

# Base image for the container that will run the PySpark job.
FROM amazoncorretto:8

# Refresh OS packages and install the yum helpers plus the "development" package group.
RUN yum -y update
RUN yum -y install yum-utils
RUN yum -y groupinstall development

# List the python3* packages known to yum (the result shows up in the build log).
RUN yum list python3*
# NOTE(review): "python3-dev" looks like a Debian-style package name; yum-based
# images usually call it "python3-devel" - confirm which one is intended.
RUN yum -y install python3 python3-dev python3-pip python3-virtualenv

# Print the interpreter versions to the build log.
RUN python -V
RUN python3 -V

# Tell PySpark to use python3 as the interpreter.
ENV PYSPARK_DRIVER_PYTHON python3
ENV PYSPARK_PYTHON python3

# Upgrade pip, then install awswrangler into the image.
RUN pip3 install --upgrade pip
RUN pip3 install awswrangler

# Fail the image build early if awswrangler cannot be imported.
RUN python3 -c "import awswrangler as wr"

In [None]:
%%bash

# Build the image from the Dockerfile in the current directory.
docker build -t 'local/emr-wrangler' .
# Create the ECR repository that will hold the image.
aws ecr create-repository --repository-name emr-wrangler
# Tag the local image with the ECR repository URI.
docker tag local/emr-wrangler {ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler
# Authenticate Docker against ECR. `aws ecr get-login` was removed in AWS CLI v2;
# `get-login-password` works on current CLI versions and passes the token over
# stdin instead of eval-ing a generated command line.
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin {ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com
# Push the tagged image to ECR.
docker push {ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler

## Creating EMR Cluster

In [5]:
# URI of the image pushed to ECR above, resolved for the current AWS account.
account_id = wr.get_account_id()
DOCKER_IMAGE = f"{account_id}.dkr.ecr.us-east-1.amazonaws.com/emr-wrangler:emr-wrangler"

# Cluster settings: run Spark inside the Docker image and add the ECR
# credentials step.
cluster_options = dict(
    subnet_id=subnet,
    spark_docker=True,
    spark_docker_image=DOCKER_IMAGE,
    ecr_credentials_step=True,
)
cluster_id = wr.emr.create_cluster(**cluster_options)

## Uploading our PySpark script to Amazon S3

In [6]:
# Minimal PySpark job: start a Spark session, then import awswrangler inside
# the container and print its version.
script = """
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("docker-awswrangler").getOrCreate()
sc = spark.sparkContext

print("Spark Initialized")

import awswrangler as wr

print(f"Wrangler version: {wr.__version__}")
"""

# Upload the job to the bucket; the response is bound to `_` so the cell shows no output.
s3_client = boto3.client("s3")
_ = s3_client.put_object(Bucket=bucket, Key="test_docker.py", Body=script)

## Submit PySpark step

In [7]:
# Run the uploaded script from S3 with spark-submit in cluster deploy mode.
spark_submit_command = f"spark-submit --deploy-mode cluster s3://{bucket}/test_docker.py"
step_id = wr.emr.submit_step(cluster_id, command=spark_submit_command)

## Wait for the Step to Finish

In [8]:
# Poll the step until it reaches a terminal state.
# - Sleep between polls so the EMR API is not hammered in a tight loop.
# - Stop on every terminal state, not only "COMPLETED"; otherwise a failed or
#   cancelled step would keep this cell spinning forever.
TERMINAL_STEP_STATES = {"COMPLETED", "CANCELLED", "FAILED", "INTERRUPTED"}
POLL_INTERVAL_SECONDS = 10

step_state = wr.emr.get_step_state(cluster_id, step_id)
while step_state not in TERMINAL_STEP_STATES:
    time.sleep(POLL_INTERVAL_SECONDS)
    step_state = wr.emr.get_step_state(cluster_id, step_id)

# Report instead of raising, so the following cell can still terminate the
# cluster when the step did not succeed.
print(f"Step {step_id} finished with state: {step_state}")

## Terminate Cluster

In [9]:
# Terminate the cluster created earlier in this notebook.
wr.emr.terminate_cluster(cluster_id)