
## Quick Demo using LakeFormation without direct access
Below is a quick diagram showing the architecture:

<img src="images/AthenaArchitecture.png"/>


In [26]:
import boto3

database_name = 'sagemaker_demo'
table_name = 'flight_delays_csv'

glue_client = boto3.client('glue')


response = glue_client.get_table(
    DatabaseName=database_name,
    Name=table_name
)
print('Table location for', table_name, response['Table']['StorageDescriptor']['Location'])

Table location for flight_delays_csv s3://snively-datalake-us-west-2/flights


## We currently have ls permissions but not get permissions directly

We expect to see the file listed, but not be able to read it.

In [28]:
!aws s3 ls {response['Table']['StorageDescriptor']['Location']}/
!aws s3 cp {response['Table']['StorageDescriptor']['Location']}/2016-cleaned.csv .

2019-10-28 01:35:46          0 
2019-10-28 01:36:16  392680672 2016-cleaned.csv
fatal error: An error occurred (403) when calling the HeadObject operation: Forbidden


## let's try to read it anyways

In [30]:
import boto3
import pandas as pd
data = "didn't read anything"
try:
    data = pd.read_csv('s3://snively-datalake-us-west-2/flights/2016-cleaned.csv')
except:
    print('We expect this to fail...')

print(data)

We expect this to fail...
didn't read anything


In [None]:
print('installing the PyAthena Drivers to perform SQL queries natively')
print('alternatives include using the boto3 libraries or other Athena Data APIs')
!pip install --upgrade pip > /dev/null
!pip install PyAthena > /dev/null

In [2]:
import sagemaker
sagemaker_session = sagemaker.Session()
athena_data_bucket = sagemaker_session.default_bucket()
print('using the athena data bucket:', athena_data_bucket)
print('running in region: ', sagemaker_session.boto_region_name)

using the athena data bucket: sagemaker-us-west-2-783526147575
running in region:  us-west-2


In [3]:
from pyathena import connect
import pandas as pd

sagemaker_session = sagemaker.Session()

conn = connect(s3_staging_dir="s3://" + athena_data_bucket,
               region_name=sagemaker_session.boto_region_name)

In [10]:
df = pd.read_sql("SELECT * FROM sagemaker_demo.flight_delays_csv limit 5", conn)
df

Unnamed: 0,yr,quarter,month,dayofmonth,dayofweek,flightdate,carrier,tailnum,flightnum,originairportid,...,div4wheelsoff,div4tailnum,div5airport,div5airportid,div5airportseqid,div5wheelson,div5totalgtime,div5longestgtime,div5wheelsoff,div5tailnum
0,2016,1,1,21,4,2016-01-21,"""WN""","""N432WN""","""163""",15376,...,"""""","""""","""""",,,"""""",,,"""""",""""""
1,2016,1,1,21,4,2016-01-21,"""WN""","""N602SW""","""1094""",15376,...,"""""","""""","""""",,,"""""",,,"""""",""""""
2,2016,1,1,21,4,2016-01-21,"""WN""","""N707SA""","""1334""",15376,...,"""""","""""","""""",,,"""""",,,"""""",""""""
3,2016,1,1,21,4,2016-01-21,"""WN""","""N704SW""","""2225""",15376,...,"""""","""""","""""",,,"""""",,,"""""",""""""
4,2016,1,1,21,4,2016-01-21,"""WN""","""N768SW""","""276""",15376,...,"""""","""""","""""",,,"""""",,,"""""",""""""
