# Automated Data Transformations notebook

### Author: AWS Professional Services Emerging Technology and Intelligent Platforms Group
### Date: January 30, 2019

In this notebook, we demonstrate how users can transform and profile tables in different databases.

## This job will perform data profiling for the given table.

In [1]:
%%local
import logging
import os
import datetime

# Root logger: timestamped records at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
logger = logging.getLogger()

# The target cluster is taken from the `cluster_name` environment variable
# when it is set; otherwise the default test cluster name is used.
clusterName = os.environ.get('cluster_name', 'lake-user-TestCluster')
clusterName

INFO:root:Starting profiling...


'lake-user-TestCluster'

In [4]:
%%spark config 

{"executorMemory": "1000M", "executorCores": 1, "numExecutors": 1}

In [None]:
%%local
import sparkmagic.utils.configuration as conf
# Override the sparkmagic option named after the accessor function, setting
# the Livy session start-up timeout to 180 (seconds, per the option name).
startup_timeout_option = conf.livy_session_startup_timeout_seconds.__name__
conf.override(startup_timeout_option, 180)

# Read the option back as the cell's final expression.
conf.livy_session_startup_timeout_seconds()


In [None]:
%%local
import aws.utils.notebooks.spark.emr as sparkConnection

logger.info("connecting to EMR")

# Ask the project helper for a Livy endpoint on `clusterName`.
# NOTE(review): the flags presumably mean "reuse a running cluster, otherwise
# start one" -- confirm against connect_to_spark.
connection = sparkConnection.connect_to_spark(
    clusterName,
    reuseCluster=True,
    startCluster=True,
    clusterArgs={},
)
livy_url, cluster_id, started = connection

logger.info("received connection (%s %s %s)", livy_url, cluster_id, started)

In [None]:
%spark -s spark -c spark -l python -u $livy_url -t None ADD

In [None]:
# Sample user parameters
# Placeholder values only; presumably replaced by papermill when the notebook
# is executed as a job -- TODO confirm how the runner injects them.

# Workbook name; becomes part of the uploaded report's file name.
PAPERMILL_WORKBOOK_NAME = ''
# Not used by this notebook beyond being forwarded inside `args`.
PAPERMILL_INPUT_PATH = ''
# Not used by this notebook beyond being forwarded inside `args`.
PAPERMILL_OUTPUT_PATH = ''
# S3 directory of the run output; the report is written to its parent folder.
PAPERMILL_OUTPUT_DIR_PATH = ''
# Database selected with `use <database_name>` before the table is read.
database_name = ''
# Table whose rows are read and profiled.
table_to_profile = ''
# Passed as the second (fraction) argument of DataFrame.sample on the table.
# NOTE(review): 0 presumably selects no rows, so a real run must override it.
samplingRatio = 0


In [None]:
%%local
# Progress marker for the run log, emitted after the session-creation cell above.
logger.info("Spark session established")

In [None]:
%%spark 

# Make the requested database the current one, so the table can later be
# referenced by its bare name.
use_statement = "use {0}".format(database_name)
spark.sql(use_statement)

In [None]:
%%spark -o data_to_profile -m sample -n 100000
# Select every column of the table, then keep a random subset of the rows:
# sampling without replacement, fraction `samplingRatio`, fixed seed 42.
profile_query = "select * from {}".format(table_to_profile)
data_to_profile = spark.sql(profile_query).sample(
    withReplacement=False, fraction=samplingRatio, seed=42
)


In [None]:
%%spark -o args

# Collect every notebook parameter in one mapping so that all of them can be
# exported together through the `-o args` option of this cell.
notebook_params = {
    'PAPERMILL_WORKBOOK_NAME': PAPERMILL_WORKBOOK_NAME,
    'PAPERMILL_INPUT_PATH': PAPERMILL_INPUT_PATH,
    'PAPERMILL_OUTPUT_PATH': PAPERMILL_OUTPUT_PATH,
    'PAPERMILL_OUTPUT_DIR_PATH': PAPERMILL_OUTPUT_DIR_PATH,
    'database_name': database_name,
    'table_to_profile': table_to_profile,
    'samplingRatio': samplingRatio,
}

# One row with a single "params" column holding the whole mapping.
args = spark.createDataFrame([{"params": notebook_params}])

In [None]:
%%local

# Every parameter sits in the "params" cell of the first row of `args`;
# look that cell up once and unpack the individual values from it.
params = args.at[0, 'params']

database_name = params['database_name']
table_to_profile = params['table_to_profile']
samplingRatio = params['samplingRatio']
PAPERMILL_INPUT_PATH = params['PAPERMILL_INPUT_PATH']
PAPERMILL_OUTPUT_PATH = params['PAPERMILL_OUTPUT_PATH']
PAPERMILL_OUTPUT_DIR_PATH = params['PAPERMILL_OUTPUT_DIR_PATH']
PAPERMILL_WORKBOOK_NAME = params['PAPERMILL_WORKBOOK_NAME']


In [None]:
%%local 

import pandas_profiling as pp

# Build the profiling report for the sampled rows.
# NOTE(review): `profile_report()` is presumably attached to DataFrames by the
# `pandas_profiling` import in this cell -- confirm before removing that import.
report = data_to_profile.profile_report()

# Render the report to a local HTML file; the following bash cell copies it to S3.
report.to_file(output_file="profile.html")

# Writing the reports to parent directory.
# Trailing slashes are stripped before splitting: otherwise the last path
# component would be an empty string, and removing it would only drop the
# slash instead of moving up one level.
# NOTE: this rebinds PAPERMILL_OUTPUT_DIR_PATH, so re-running this cell without
# re-running the parameter-extraction cell moves up one more level each time.
output_array = PAPERMILL_OUTPUT_DIR_PATH.replace('s3://', '').rstrip('/').split('/')
del output_array[-1]  # Move to parent folder
PAPERMILL_OUTPUT_DIR_PATH = 's3://' + '/'.join(output_array)

# Destination object: <parent dir>/<database>-<table>.<workbook name>.html
s3Path = "{}/{}-{}.{}.html".format(PAPERMILL_OUTPUT_DIR_PATH, database_name, table_to_profile, PAPERMILL_WORKBOOK_NAME)

print(s3Path)



In [None]:
%%bash --out output --err error -s "$s3Path"
# $1 is the destination S3 URI supplied through `-s "$s3Path"` on the magic line.
# Echo the exact command being run so it appears in the captured stdout.
echo "aws s3 cp profile.html $1"
# Quote the destination so a path containing spaces stays a single argument.
aws s3 cp profile.html "$1"



In [None]:
%%local 
# Show both streams captured from the upload cell before checking them.
for captured_stream in (output, error):
    print(captured_stream)

# The copy counts as successful when stdout mentions "upload" and
# nothing at all was written to stderr.
assert "upload" in output
assert not len(error)

In [None]:
%spark delete -s spark

In [None]:
%%local
# Stop the cluster only when `started` is truthy.
# NOTE(review): presumably that flag means this run launched the cluster,
# so a reused cluster keeps running -- confirm against connect_to_spark.
if started:
    sparkConnection.stop_cluster(cluster_id)

logger.info("Done profiling %s...", table_to_profile)