<!--
#  Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
#    Licensed under the Apache License, Version 2.0 (the "License").
#    You may not use this file except in compliance with the License.
#    You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS,
#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#    See the License for the specific language governing permissions and
#    limitations under the License.
-->

# Data Discovery using Athena

* Functions:  https://docs.aws.amazon.com/redshift/latest/dg/c_SQL_functions.html
* UDF: https://docs.aws.amazon.com/redshift/latest/dg/user-defined-functions.html
* Stored Procedure: https://docs.aws.amazon.com/redshift/latest/dg/stored-procedure-overview.html

Using CMS Data at: https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/Medicare-Provider-Charge-Data/Inpatient2016.html

## Contents
1. [Reference Links](#Reference-Links)
2. [Setup](#Setup)
  1. [Import Libraries](#Import-Libraries)
  2. [Initialize Functions](#Initialize-Functions)
  3. [Define Athena Parameters](#Define-Athena-Parameters)
  4. [Establish Athena Connection](#Establish-Athena-Connection)
  5. [Use SQL Query to Grab Sample Database Data](#Use-SQL-Query-to-Grab-Sample-Database-Data)
3. [Data Analysis](#Data-Analysis)
  1. [Select all Elements from the Database Sample File](#Select-all-Elements-from-the-Database-Sample-File)
  2. [Provide an Input Dataset](#Provide-an-Input-Dataset)
  3. [Error with missing column](#Error-with-missing-column)
  4. [Visualize Data](#Visualize-Data)
  5. [Populate Data](#Populate-Data)
4. [Create New Table with Analysis](#Create-New-Table-with-Analysis)
  1. [Run Analysis](#Run-Analysis)
  2. [Display Analysis](#Display-Analysis)
  3. [Test Code](#Test-Code)

In [1]:
# Load (or reload) the `sql` IPython extension so the %sql / %%sql magics used below are available.
%reload_ext sql

## Setup

#### Import Athena Libraries

In [2]:
from aws_orbit_sdk.database import get_athena
from aws_orbit_sdk.common import get_workspace,get_scratch_database
import aws_orbit_sdk.glue_catalog as datamaker_catalog_api
import matplotlib.pyplot as plt

#### Initialize Athena, workspace, and scratch database functions

In [3]:
# Create the Athena helper via the Orbit SDK (its return type is not shown in this notebook).
athena = get_athena()
%config SqlMagic.autocommit=False # for engines that do not support autocommit
# Fetch the workspace description and the scratch database for this session.
workspace = get_workspace()
scratch_glue_db = get_scratch_database()
# The workspace is indexed like a dict; keep the team space entry handy.
team_space = workspace['team_space']
# DO NOT RUN THIS NOTEBOOK IN THE LAKE CREATOR TEAM SPACE
#assert team_space == 'lake-user'
# Display the whole workspace description as this cell's output.
# NOTE(review): the rendered output contains account-specific identifiers
# (role ARNs, instance IDs) - consider clearing it before sharing the notebook.
workspace

{'BaseImageAddress': '495869084367.dkr.ecr.us-west-2.amazonaws.com/orbit-dev-env-jupyter-user',
 'BootstrapS3Prefix': 'teams/dev-env/bootstrap/',
 'ContainerDefaults': {'cpu': 4, 'memory': 16384},
 'ContainerRunnerArn': None,
 'EcsClusterName': None,
 'EfsApId': 'fsap-0894c868f87247e58',
 'EfsId': 'fs-2854b42f',
 'EfsLifeCycle': 'AFTER_7_DAYS',
 'EksK8SApiArn': None,
 'EksPodRoleArn': 'arn:aws:iam::495869084367:role/orbit-dev-env-lake-creator-role',
 'Elbs': {'lake-creator/jupyterhub-public': {'AvailabilityZones': ['us-west-2b',
    'us-west-2a'],
   'DNSName': 'af34c68f9392e4c5ea51e556806fb7fd-499719595.us-west-2.elb.amazonaws.com',
   'Instances': [{'InstanceId': 'i-0dc2f63685d54e49a'},
    {'InstanceId': 'i-0af50b7c421fb1b3f'},
    {'InstanceId': 'i-09ee9d125abd5ecfb'},
    {'InstanceId': 'i-08c0d3df6659813be'},
    {'InstanceId': 'i-08b2d3f31f79f19bc'}],
   'ListenerDescriptions': [{'Listener': {'InstancePort': 30986,
      'InstanceProtocol': 'TCP',
      'LoadBalancerPort': 80,
 

#### Define Athena parameters

In [None]:
# `glue_db` is the database this notebook connects to and queries;
# `target_db` names the database in which the notebook creates its result table.
glue_db, target_db = "cms_raw_db", "users"

#### Establish Athena Connection

In [None]:
# Establish the Athena connection for the database named in `glue_db`
# (IPython expands `$glue_db` to the variable's value before the magic runs).
%connect_to_athena -database $glue_db

#### Use SQL Query to Grab Sample Database Data

%%sql

SELECT 1 AS "Test"

In [None]:
# Browse the catalog for the database named in `glue_db`.
# NOTE(review): `%catalog` is not defined in this notebook; presumably it renders
# the database's tables and columns - confirm against the magic's implementation.
%catalog -database $glue_db

## Now let's start Data Analysis

In [None]:
# Bind a Python variable into a SQL statement: `:ben_id` in the query below
# refers to the `ben_id` variable defined in this cell.

ben_id = "F72554149E321FF9"	

%sql select * from cms_raw_db.beneficiary_summary where desynpuf_id = :ben_id

***Maybe we want to write multi-line SQL directly and output it into a variable:***

#### Run the Dataset SQL Query to Select all Elements from the Database Sample File

In [None]:
%%sql dataset <<

SELECT *
FROM cms_raw_db.beneficiary_summary
LIMIT 1

#### Provide an Input Dataset

In [None]:
# Display the result captured in `dataset` by the preceding %%sql cell.
dataset

#### Showing what an error looks like when a column is missing (see below)

In [None]:
%%sql population_by_age_rs <<

SELECT age, count(desynpuf_id) AS pop_size
FROM (
    SELECT least(year(current_date), year(bene_death_dt)) - year(bene_birth_dt) AS age
    FROM cms_raw_db.beneficiary_summary
)
GROUP BY age
ORDER BY age

#### With a bit of Python, we can also visualize data

In [None]:
%%sql population_by_age_rs <<
SELECT age, count(desynpuf_id) AS pop_size
FROM (
    SELECT desynpuf_id,
           least(year(current_date), year(bene_death_dt)) - year(bene_birth_dt) AS age
    FROM cms_raw_db.beneficiary_summary
) A
GROUP BY age
ORDER BY age


#### Populate the Data into a Chart with Age and Population Size Columns

In [None]:
# Convert the SQL result set into a DataFrame and preview its first rows.

population_by_age = population_by_age_rs.DataFrame()
population_by_age.head()

#### Visualize the Dataset Using a Scatter Plot

In [None]:
# Scatter plot of population size (y axis) against age (x axis), drawn in dark blue.

ax1 = population_by_age.plot.scatter(x='age', y='pop_size', c='DarkBlue')

## Let's create a new table with our analysis

In [None]:
%%sql

DROP TABLE IF EXISTS users.population_by_age

#### The Following SQL Query Creates a New Table

In [None]:
%%sql

CREATE TABLE users.population_by_age
WITH (format = 'PARQUET')
AS
SELECT age, count(desynpuf_id) AS pop_size
FROM (
    SELECT desynpuf_id,
           least(year(current_date), year(bene_death_dt)) - year(bene_birth_dt) AS age
    FROM cms_raw_db.beneficiary_summary
) A
GROUP BY age
ORDER BY age


#### Run an Analysis SQL Query on the New Table

In [None]:
%%sql analysis <<

SELECT *
FROM users.population_by_age

#### Display the Analysis as Input on a Grid

In [None]:
# Render the `analysis` result set as a DataFrame for display.
analysis.DataFrame()

#### Let's test our code

In [None]:
# Sanity check on the age distribution held in `population_by_age`.
# The query that produced it orders rows by age, so row 0 is the first age
# bucket returned. Guard against an empty result first so the failure is a
# readable assertion rather than a KeyError from `.at[0, 'age']`.
assert not population_by_age.empty, "population_by_age query returned no rows"
first_age = population_by_age.at[0, 'age']
assert first_age > 20., "expected the first age bucket to be above 20, got %r" % (first_age,)