# ADS-508-01-SP23 Team 8: Final Project

# Setup Database and Athena Tables

Much of the code is modified from `Fregly, C., & Barth, A. (2021). Data science on AWS: Implementing end-to-end, continuous AI and machine learning pipelines. O’Reilly.`

## Install missing dependencies

[PyAthena](https://pypi.org/project/PyAthena/) is a Python DB API 2.0 (PEP 249) compliant client for Amazon Athena.

In [21]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0

[0m

## Globally import libraries

In [22]:
import boto3
from botocore.client import ClientError
import sagemaker
import pandas as pd
from pyathena import connect
from IPython.core.display import display, HTML

## Instantiate AWS SageMaker session

In [23]:
session = boto3.session.Session()
region = session.region_name
sagemaker_session = sagemaker.Session()
def_bucket = sagemaker_session.default_bucket()
bucket = 'sagemaker-us-east-ads508-sp23-t8'

s3 = boto3.Session().client(service_name="s3",
                            region_name=region)

In [24]:
setup_s3_bucket_passed = False
ingest_create_athena_db_passed = False
ingest_create_athena_table_tsv_passed = False

In [25]:
print(f"Default bucket: {def_bucket}")
print(f"Public T8 bucket: {bucket}")

Default bucket: sagemaker-us-east-1-657724983756
Public T8 bucket: sagemaker-us-east-ads508-sp23-t8


## Verify S3 Bucket Creation

In [26]:
%%bash

aws s3 ls s3://${bucket}/

2023-03-16 17:05:02 aws-athena-query-results-657724983756-us-east-1
2023-03-02 16:56:48 sagemaker-studio-657724983756-5nh7ydsouq7
2023-03-02 17:25:41 sagemaker-studio-657724983756-7yc8bp8xk0b
2023-03-02 17:01:51 sagemaker-us-east-1-657724983756
2023-03-17 05:19:31 sagemaker-us-east-ads508-sp23-t8


In [27]:
response = None

try:
    response = s3.head_bucket(Bucket=bucket)
    print(response)
    setup_s3_bucket_passed = True
except ClientError as e:
    print(f"[ERROR] Cannot find bucket {bucket} in {response} due to {e}.")

{'ResponseMetadata': {'RequestId': 'FT4H4196DCHNBFZG', 'HostId': 'cPyVKQZKCz8x486G/bZv+Dz+jVz97hAyXBolUxjS6fY5x2j/irXtVKSs13rmFzlo62IIvZ2QBnY=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'cPyVKQZKCz8x486G/bZv+Dz+jVz97hAyXBolUxjS6fY5x2j/irXtVKSs13rmFzlo62IIvZ2QBnY=', 'x-amz-request-id': 'FT4H4196DCHNBFZG', 'date': 'Fri, 24 Mar 2023 23:05:09 GMT', 'x-amz-bucket-region': 'us-east-1', 'x-amz-access-point-alias': 'false', 'content-type': 'application/xml', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}


In [28]:
%store setup_s3_bucket_passed

Stored 'setup_s3_bucket_passed' (bool)


## Create Athena Database

In [29]:
database_name = "ads508_t8"

In [30]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = f"s3://{bucket}/athena/staging"
print(s3_staging_dir)

s3://sagemaker-us-east-ads508-sp23-t8/athena/staging


In [31]:
conn = connect(region_name=region,
               s3_staging_dir=s3_staging_dir)

### Verify The Database Has Been Created Succesfully

In [32]:
show_db_stmnt = "SHOW DATABASES"

df_show = pd.read_sql(show_db_stmnt,
                      conn)
df_show.head(17)

Unnamed: 0,database_name
0,ads508_t8
1,default
2,dsoaws
3,sagemaker_featurestore


In [33]:
if database_name in df_show.values:
    ingest_create_athena_db_passed = True

In [34]:
%store ingest_create_athena_db_passed

Stored 'ingest_create_athena_db_passed' (bool)


In [35]:
cen_tsv_tbl_name = 'census'
ceb_tsv_tbl_name = 'census_block'
evi_tsv_tbl_name = 'evictions'
cri_pqt_tbl_name = 'crime_pqt'

In [36]:
# Set S3 path to Parquet data
abt_s3_data_path = f"s3://{bucket}/ABT"

# Execute Statement
_This can take a few minutes.  Please be patient._

cen_tsv_tbl_name = 'census'
abt_tbl_name = 'abt'
drop_abt_tbl_stmnt = f"""DROP TABLE IF EXISTS {database_name}.{abt_tbl_name}"""

# SQL statement to execute
create_abt_tble_stmnt = f"""
CREATE TABLE IF NOT EXISTS {database_name}.{abt_tbl_name}
WITH (
    external_location = '{abt_s3_data_path}'
    )
AS
SELECT
        cen.censustract,
        cen.borough,
        cen.totalpop,
        cen.men,
        cen.women,
        cen.hispanic,
        cen.white,
        cen.black,
        cen.native,
        cen.asian,
        cen.citizen,
        cen.income,
        cen.poverty,
        cen.childpoverty,
        cen.professional,
        cen.service,
        cen.office,
        cen.construction,
        cen.production,
        cen.drive,
        cen.carpool,
        cen.transit,
        cen.walk,
        cen.othertransp,
        cen.workathome,
        cen.meancommute,
        cen.employed,
        cen.privatework,
        cen.publicwork,
        cen.selfemployed,
        cen.familywork,
        cen.unemployment,
        ceb.blockCode,
        ceb.min_lat,
        ceb.max_lat,
        ceb.min_long,
        ceb.max_long
FROM {database_name}.{cen_tsv_tbl_name} AS cen
LEFT JOIN (
    SELECT
        substr(blockCode,1,11) AS blockCode,
        count(*),
        min(latitude) AS min_lat,
        max(latitude) AS max_lat,
        min(longitude) AS min_long,
        max(longitude) AS max_long     
    FROM {database_name}.{ceb_tsv_tbl_name}
    GROUP BY substr(blockCode,1,11)
    ORDER BY count(*) DESC
    LIMIT 50000
    ) AS ceb
    ON cen.censustract = substr(ceb.blockCode,1,11)
"""

print(f'Create table statement:\n{create_abt_tble_stmnt}')

pd.read_sql(drop_abt_tbl_stmnt,
            conn)

pd.read_sql(create_abt_tble_stmnt,
            conn)

In [37]:
abt_select_to_join_stmnt01 = f"""
SELECT
    cen.censustract,
    cen.borough,
    cen.totalpop,
    cen.men,
    cen.women,
    cen.hispanic,
    cen.white,
    cen.black,
    cen.native,
    cen.asian,
    cen.citizen,
    cen.income,
    cen.poverty,
    cen.childpoverty,
    cen.professional,
    cen.service,
    cen.office,
    cen.construction,
    cen.production,
    cen.drive,
    cen.carpool,
    cen.transit,
    cen.walk,
    cen.othertransp,
    cen.workathome,
    cen.meancommute,
    cen.employed,
    cen.privatework,
    cen.publicwork,
    cen.selfemployed,
    cen.familywork,
    cen.unemployment,
    cvi.blockCode,
    cvi.eviction_count_x_lat_long
FROM {database_name}.{cen_tsv_tbl_name} AS cen
LEFT JOIN (
    SELECT
        ceb.blockCode AS blockCode,
        sum(evi.eviction_count_x_lat_long) AS eviction_count_x_lat_long
    FROM (
        SELECT
            substr(blockCode,1,11) AS blockCode,
            count(*),
            min(latitude) AS min_lat,
            max(latitude) AS max_lat,
            min(longitude) AS min_long,
            max(longitude) AS max_long     
        FROM {database_name}.{ceb_tsv_tbl_name}
        GROUP BY substr(blockCode,1,11)
        ORDER BY count(*) DESC
        ) AS ceb
    INNER JOIN (
        SELECT
            CAST(latitude AS DOUBLE) AS latitude,
            CAST(longitude AS DOUBLE) AS longitude,
            count(*) AS eviction_count_x_lat_long
        FROM {database_name}.{evi_tsv_tbl_name}
        WHERE latitude != ''
        GROUP BY latitude, longitude
        ORDER BY count(*) DESC
        ) AS evi
    ON evi.latitude >= ceb.min_lat
        AND evi.latitude <= ceb.max_lat
        AND evi.longitude >= ceb.min_long
        AND evi.longitude <= ceb.max_long
    GROUP BY ceb.blockCode
    LIMIT 50000
    ) AS cvi
    ON cen.censustract = cvi.blockCode
ORDER BY cen.censustract
"""

print(abt_select_to_join_stmnt01)

abt_select_to_join_df01 = pd.read_sql(abt_select_to_join_stmnt01,
                                      conn)
print(abt_select_to_join_df01.shape)
display(abt_select_to_join_df01.head(15))


SELECT
    cen.censustract,
    cen.borough,
    cen.totalpop,
    cen.men,
    cen.women,
    cen.hispanic,
    cen.white,
    cen.black,
    cen.native,
    cen.asian,
    cen.citizen,
    cen.income,
    cen.poverty,
    cen.childpoverty,
    cen.professional,
    cen.service,
    cen.office,
    cen.construction,
    cen.production,
    cen.drive,
    cen.carpool,
    cen.transit,
    cen.walk,
    cen.othertransp,
    cen.workathome,
    cen.meancommute,
    cen.employed,
    cen.privatework,
    cen.publicwork,
    cen.selfemployed,
    cen.familywork,
    cen.unemployment,
    cvi.blockCode,
    cvi.eviction_count_x_lat_long
FROM ads508_t8.census AS cen
LEFT JOIN (
    SELECT
        ceb.blockCode AS blockCode,
        sum(evi.eviction_count_x_lat_long) AS eviction_count_x_lat_long
    FROM (
        SELECT
            substr(blockCode,1,11) AS blockCode,
            count(*),
            min(latitude) AS min_lat,
            max(latitude) AS max_lat,
            min(longitude)

Unnamed: 0,censustract,borough,totalpop,men,women,hispanic,white,black,native,asian,...,workathome,meancommute,employed,privatework,publicwork,selfemployed,familywork,unemployment,blockCode,eviction_count_x_lat_long
0,36005000100,Bronx,7703,7133,570,29.9,6.1,60.9,0.2,1.6,...,,,0,,,,,,,
1,36005000200,Bronx,5403,2659,2744,75.8,2.3,16.0,0.0,4.2,...,0.0,43.0,2308,80.8,16.2,2.9,0.0,7.7,36005000200.0,31.0
2,36005000400,Bronx,5915,2896,3019,62.7,3.6,30.7,0.0,0.3,...,2.1,45.0,2675,71.7,25.3,2.5,0.6,9.5,36005000400.0,46.0
3,36005001600,Bronx,5879,2558,3321,65.1,1.6,32.4,0.0,0.0,...,1.7,38.8,2120,75.0,21.3,3.8,0.0,8.7,36005001600.0,10.0
4,36005001900,Bronx,2591,1206,1385,55.4,9.0,29.0,0.0,2.1,...,6.2,45.4,1083,76.8,15.5,7.7,0.0,19.2,36005001900.0,230.0
5,36005002000,Bronx,8516,3301,5215,61.1,1.6,31.1,0.3,3.3,...,0.0,46.0,2508,71.0,21.3,7.7,0.0,17.2,36005002000.0,69.0
6,36005002300,Bronx,4774,2130,2644,62.3,0.2,36.5,1.0,0.0,...,4.1,42.7,1191,74.2,16.1,9.7,0.0,18.9,,
7,36005002400,Bronx,150,109,41,0.0,52.0,48.0,0.0,0.0,...,0.0,,113,62.8,37.2,0.0,0.0,0.0,36005002400.0,169.0
8,36005002500,Bronx,5355,2338,3017,76.5,1.5,18.9,0.0,3.0,...,2.7,35.5,1691,85.1,8.3,6.1,0.5,9.4,36005002500.0,22.0
9,36005002701,Bronx,3016,1375,1641,68.0,0.0,31.2,0.0,0.0,...,1.6,42.8,1102,86.9,8.5,4.5,0.0,15.2,,


In [38]:
evi_ceb_join_select_stmnt01 = f"""
SELECT
    ceb.blockCode,
    ceb.min_lat,
    ceb.max_lat,
    ceb.min_long,
    ceb.max_long,
    evi.latitude,
    evi.longitude,
    evi.eviction_count_x_lat_long
FROM (SELECT
    substr(blockCode,1,11) AS blockCode,
    count(*),
    min(latitude) AS min_lat,
    max(latitude) AS max_lat,
    min(longitude) AS min_long,
    max(longitude) AS max_long     
FROM {database_name}.{ceb_tsv_tbl_name}
GROUP BY substr(blockCode,1,11)
ORDER BY count(*) DESC) AS ceb
INNER JOIN (
SELECT
    CAST(latitude AS DOUBLE) AS latitude,
    CAST(longitude AS DOUBLE) AS longitude,
    count(*) AS eviction_count_x_lat_long
FROM {database_name}.{evi_tsv_tbl_name}
WHERE latitude != ''
GROUP BY latitude, longitude
ORDER BY count(*) DESC
) AS evi
ON evi.latitude >= ceb.min_lat
    AND evi.latitude <= ceb.max_lat
    AND evi.longitude >= ceb.min_long
    AND evi.longitude <= ceb.max_long
ORDER BY ceb.blockCode
LIMIT 50000
"""

print(evi_ceb_join_select_stmnt01)

evi_ceb_join_select_df01 = pd.read_sql(evi_ceb_join_select_stmnt01,
                           conn)
print(evi_ceb_join_select_df01.shape)
display(evi_ceb_join_select_df01.head(15))


SELECT
    ceb.blockCode,
    ceb.min_lat,
    ceb.max_lat,
    ceb.min_long,
    ceb.max_long,
    evi.latitude,
    evi.longitude,
    evi.eviction_count_x_lat_long
FROM (SELECT
    substr(blockCode,1,11) AS blockCode,
    count(*),
    min(latitude) AS min_lat,
    max(latitude) AS max_lat,
    min(longitude) AS min_long,
    max(longitude) AS max_long     
FROM ads508_t8.census_block
GROUP BY substr(blockCode,1,11)
ORDER BY count(*) DESC) AS ceb
INNER JOIN (
SELECT
    CAST(latitude AS DOUBLE) AS latitude,
    CAST(longitude AS DOUBLE) AS longitude,
    count(*) AS eviction_count_x_lat_long
FROM ads508_t8.evictions
WHERE latitude != ''
GROUP BY latitude, longitude
ORDER BY count(*) DESC
) AS evi
ON evi.latitude >= ceb.min_lat
    AND evi.latitude <= ceb.max_lat
    AND evi.longitude >= ceb.min_long
    AND evi.longitude <= ceb.max_long
ORDER BY ceb.blockCode
LIMIT 50000

(19535, 8)


Unnamed: 0,blockCode,min_lat,max_lat,min_long,max_long,latitude,longitude,eviction_count_x_lat_long
0,34003013001,40.798844,40.823719,-73.99191,-73.966583,40.80168,-73.96871,1
1,34003013001,40.798844,40.823719,-73.99191,-73.966583,40.801043,-73.967901,1
2,34003013001,40.798844,40.823719,-73.99191,-73.966583,40.800299,-73.968049,1
3,34003013001,40.798844,40.823719,-73.99191,-73.966583,40.799468,-73.96939,1
4,34003013001,40.798844,40.823719,-73.99191,-73.966583,40.804287,-73.966985,3
5,34003013001,40.798844,40.823719,-73.99191,-73.966583,40.79998,-73.966702,1
6,34003013001,40.798844,40.823719,-73.99191,-73.966583,40.800041,-73.968107,1
7,34003013001,40.798844,40.823719,-73.99191,-73.966583,40.801334,-73.968778,3
8,34003013001,40.798844,40.823719,-73.99191,-73.966583,40.802632,-73.968792,1
9,34003013001,40.798844,40.823719,-73.99191,-73.966583,40.799407,-73.967335,1


In [53]:
# Display full Evictions table for review
evi_full_select_stmnt01 = f"""
    SELECT
        *
    FROM {database_name}.{evi_tsv_tbl_name}
    WHERE executed_date <> ''
    LIMIT 1000
    """

# Display SQL statement
print(evi_full_select_stmnt01)

# Run SQL statement against Athena table
evi_full_select_df01 = pd.read_sql(evi_full_select_stmnt01,
                           conn)
# Display results
print(evi_full_select_df01.shape)
display(evi_full_select_df01.head(11))


    SELECT
        *
    FROM ads508_t8.evictions
    WHERE executed_date <> ''
    LIMIT 1000
    
(1000, 20)


Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,marshal_first_name,marshal_last_name,residential_or_commercial,borough,eviction_postcode,ejectment,eviction_or_legal_possession,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta
0,56037/17,339568,547 EAST 168TH STREET,3H,02/26/2018,Thomas,Bia,Residential,BRONX,10456,Not an Ejectment,Possession,40.830857,-73.905191,3,16,145,2004227,2026100065,Claremont-Bathgate
1,B047517/19,409031,4014 CARPENTER AVENUE,4B,11/16/2022,Richard,McCoy,Residential,BRONX,10466,Not an Ejectment,Possession,40.889878,-73.862686,12,12,408,2063060,2048280031,Williamsbridge-Olinville
2,15068/17,334442,655 EAST 224TH STREET,1,09/29/2017,Thomas,Bia,Residential,BRONX,10467,Not an Ejectment,Possession,40.887599,-73.862391,12,12,394,2062985,2048260028,Williamsbridge-Olinville
3,58273/18,25388,1551 DEAN STREET,1ST FLOOR,07/12/2018,Gary,Rose,Residential,BROOKLYN,11213,Not an Ejectment,Possession,40.676166,-73.936661,8,36,311,3388499,3013400049,Crown Heights North
4,14866/19A,97278,718 PENFIELD STREET,2-F,10/24/2019,Justin,Grossman,Residential,BRONX,10470,Not an Ejectment,Possession,40.904888,-73.849089,12,11,442,2071873,2051130039,Woodlawn-Wakefield
5,66703/18BX,90391,2032 EAST 177TH ST A /K/A 2032 CROSS BRONX EXP...,1E,07/30/2019,Justin,Grossman,Residential,BRONX,10472,Not an Ejectment,Possession,40.831685,-73.856168,9,18,78,2026230,2038030019,Westchester-Unionport
6,98925/17,75402,175 WOODRUFF AVENUE,GARDEN APARTMENT,06/01/2018,Justin,Grossman,Residential,BROOKLYN,11226,Not an Ejectment,Possession,40.654641,-73.960291,14,40,50803,3115933,3050540052,Flatbush
7,304057/20,107717,555 TENTH AVENUE,32I,04/18/2022,Justin,Grossman,Residential,MANHATTAN,10018,Not an Ejectment,Possession,40.758888,-73.996022,4,3,117,1089722,1010697501,Hudson Yards-Chelsea-Flatiron-Union Square
8,210706/18,85502,2201 FIRST AVENUE,05B,03/14/2019,Henry,Daley,Residential,MANHATTAN,10029,Not an Ejectment,Possession,40.794176,-73.936754,11,8,180,1081091,1016840001,East Harlem North
9,B806500/18,396012,281 EAST 143RD STREET,07A,01/17/2019,Richard,McCoy,Residential,BRONX,10451,Not an Ejectment,Possession,40.814845,-73.924083,1,8,51,2091116,2023240001,Mott Haven-Port Morris


In [55]:
# Aggregate Evictions table based on borough and event year
evi_date_stmnt01_borough = f"""
    SELECT
        borough,
        CAST(YEAR(date_parse(executed_date, '%m/%d/%Y')) AS INT) AS year,
        count(*) AS annual_eviction_counts
    FROM {database_name}.{evi_tsv_tbl_name}
    WHERE executed_date <> ''
    GROUP BY borough, YEAR(date_parse(executed_date, '%m/%d/%Y'))
    ORDER BY borough, YEAR(date_parse(executed_date, '%m/%d/%Y'))
    LIMIT 10000
    """

# Display SQL statement
print(evi_date_stmnt01_borough)

# Run SQL statement against Athena table
evi_df01_s06_borough = pd.read_sql(evi_date_stmnt01_borough,
                           conn)
# Display results
print(evi_df01_s06_borough.shape)
display(evi_df01_s06_borough.head(35))

evi_df01_s07_borough = evi_df01_s06_borough.pivot_table(index = 'borough', columns = 'year', values = 'annual_eviction_counts', aggfunc = 'sum', fill_value = 0)
print(evi_df01_s07_borough.shape)
display(evi_df01_s07_borough.head(35))


    SELECT
        borough,
        CAST(YEAR(date_parse(executed_date, '%m/%d/%Y')) AS INT) AS year,
        count(*) AS annual_eviction_counts
    FROM ads508_t8.evictions
    WHERE executed_date <> ''
    GROUP BY borough, YEAR(date_parse(executed_date, '%m/%d/%Y'))
    ORDER BY borough, YEAR(date_parse(executed_date, '%m/%d/%Y'))
    LIMIT 10000
    
(35, 3)


Unnamed: 0,borough,year,annual_eviction_counts
0,BRONX,2017,7658
1,BRONX,2018,7140
2,BRONX,2019,6244
3,BRONX,2020,1088
4,BRONX,2021,29
5,BRONX,2022,1174
6,BRONX,2023,186
7,BROOKLYN,2017,6355
8,BROOKLYN,2018,6157
9,BROOKLYN,2019,5312


(5, 7)


year,2017,2018,2019,2020,2021,2022,2023
borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BRONX,7658,7140,6244,1088,29,1174,186
BROOKLYN,6355,6157,5312,1005,100,1864,273
MANHATTAN,3450,3390,2818,521,68,930,144
QUEENS,4325,4452,3705,696,36,811,57
STATEN ISLAND,734,691,636,112,35,271,21


In [57]:
# Run query to review a sample of records
cri_law_cat_cd01 = "misdemeanor"
cri_bourough01 = "bronx"

cri_select_borough_stmnt01 = f"""
    SELECT
        *
    FROM {database_name}.{cri_pqt_tbl_name}
    TABLESAMPLE BERNOULLI(10)
    LIMIT 10000
    """

# Display SQL statement
print(cri_select_borough_stmnt01)

# Run SQL statement against Athena table
cri_df01_s01 = pd.read_sql(cri_select_borough_stmnt01,
                           conn)
# Display results
print(cri_df01_s01.shape)
display(cri_df01_s01.head(5))


    SELECT
        *
    FROM ads508_t8.crime_pqt
    TABLESAMPLE BERNOULLI(10)
    LIMIT 10000
    
(10000, 35)


Unnamed: 0,cmplnt_num,cmplnt_fr_dt,cmplnt_fr_tm,cmplnt_to_dt,cmplnt_to_tm,addr_pct_cd,rpt_dt,ky_cd,ofns_desc,pd_cd,...,latitude,longitude,lat_lon,patrol_boro,station_name,vic_age_group,vic_race,vic_sex,law_cat_cd,borough
0,702196756,09/30/2019,13:49:00,,,23,09/30/2019,101,MURDER & NON-NEGL. MANSLAUGHTER,,...,40.799665264000055,-73.94719977999995,"(40.799665264000055, -73.94719977999995)",,,18-24,BLACK,M,FELONY,
1,738418369,10/12/2019,06:55:00,,,77,10/12/2019,101,MURDER & NON-NEGL. MANSLAUGHTER,,...,40.676216571000055,-73.93042172099996,"(40.676216571000055, -73.93042172099996)",,,25-44,BLACK,M,FELONY,
2,675691126,09/13/2019,17:05:00,,,46,09/13/2019,101,MURDER & NON-NEGL. MANSLAUGHTER,,...,40.85839609900006,-73.90056609399994,"(40.85839609900006, -73.90056609399994)",,,25-44,WHITE HISPANIC,M,FELONY,
3,481542706,06/17/2019,14:47:00,,,81,06/17/2019,101,MURDER & NON-NEGL. MANSLAUGHTER,,...,40.69266153900003,-73.93697840199997,"(40.69266153900003, -73.93697840199997)",,,25-44,BLACK,M,FELONY,
4,899249176,04/13/2019,01:50:00,,,110,04/13/2019,101,MURDER & NON-NEGL. MANSLAUGHTER,,...,40.74987737400005,-73.86272726099996,"(40.74987737400005, -73.86272726099996)",,,25-44,WHITE HISPANIC,M,FELONY,


# Show the Tables

In [41]:
show_tbl_stmnt = f"SHOW TABLES in {database_name}"

In [42]:
df_tables = pd.read_sql(show_tbl_stmnt,
                        conn)
df_tables.head(17)

Unnamed: 0,tab_name
0,abt
1,census
2,census_block
3,crime
4,crime_pqt
5,evictions
6,grad_outcomes
7,hs_info
8,jobs


if abt_tbl_name in df_tables.values:
    ingest_create_athena_table_parquet_passed = True

%store ingest_create_athena_table_parquet_passed

# Run Sample Query

abt_borough01 = 'bronx'

abt_select_borough_stmnt02 = f"""
    SELECT * FROM {database_name}.{abt_tbl_name}
    WHERE blockcode IS NOT NULL
    LIMIT 10000
    """

print(abt_select_borough_stmnt02)

abt_df02_s01 = pd.read_sql(abt_select_borough_stmnt02,
                           conn)

print(abt_df02_s01.shape)
display(abt_df02_s01.head(17))

path_balanced = "./data-clarify/amazon_reviews_us_giftcards_software_videogames_balanced.csv"
df_balanced.to_csv(path_balanced, index=False, header=True)

if not abt_df02_s01.empty:
    print("[OK]")
else:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOUR DATA HAS NOT BEEN CONVERTED TO PARQUET. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")

## Review the New Athena Table in the Glue Catalog

In [43]:
display(
    HTML(
        f'<b>Review <a target="top" href="https://console.aws.amazon.com/glue/home?region={region}#">AWS Glue Catalog</a></b>'
    )
)

## Store Variables for the Next Notebooks

In [None]:
%store

## Release Resources

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}