# ADS-508-01-SP23 Team 8: Final Project

# Setup Database and Athena Tables

Much of the code is modified from `Fregly, C., & Barth, A. (2021). Data science on AWS: Implementing end-to-end, continuous AI and machine learning pipelines. O’Reilly.`

## Install missing dependencies

[PyAthena](https://pypi.org/project/PyAthena/) is a Python DB API 2.0 (PEP 249) compliant client for Amazon Athena.

In [2]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0

[0m

## Globally import libraries

In [3]:
import boto3
from botocore.client import ClientError
import sagemaker
import pandas as pd
from pyathena import connect
from IPython.core.display import display, HTML

%matplotlib inline

## Instantiate AWS SageMaker session

In [4]:
# One boto3 session is shared by every client below so they all resolve the
# same credentials and region.
session = boto3.session.Session()
region = session.region_name

# SageMaker session, its default bucket, and the team's shared source bucket.
sagemaker_session = sagemaker.Session()
def_bucket = sagemaker_session.default_bucket()
bucket = 'sagemaker-us-east-ads508-sp23-t8'

s3 = session.client(service_name="s3",
                    region_name=region)

role = sagemaker.get_execution_role()
account_id = session.client("sts").get_caller_identity().get("Account")

sm = session.client(service_name="sagemaker",
                    region_name=region)

In [5]:
# Checkpoint flags for the setup stages of this notebook; every stage starts
# out as "not passed".
(
    setup_s3_bucket_passed,
    ingest_create_athena_db_passed,
    ingest_create_athena_table_tsv_passed,
) = (False, False, False)

In [6]:
print(f"Default bucket: {def_bucket}")
print(f"Public T8 bucket: {bucket}")

Default bucket: sagemaker-us-east-1-657724983756
Public T8 bucket: sagemaker-us-east-ads508-sp23-t8


## Verify S3 Bucket Creation

In [7]:
%%bash

aws s3 ls s3://${bucket}/

2023-03-16 17:05:02 aws-athena-query-results-657724983756-us-east-1
2023-03-02 16:56:48 sagemaker-studio-657724983756-5nh7ydsouq7
2023-03-02 17:25:41 sagemaker-studio-657724983756-7yc8bp8xk0b
2023-03-02 17:01:51 sagemaker-us-east-1-657724983756
2023-03-17 05:19:31 sagemaker-us-east-ads508-sp23-t8


In [8]:
# The flag is only set to True after head_bucket returns without raising.
response = None

try:
    response = s3.head_bucket(Bucket=bucket)
    print(response)
    setup_s3_bucket_passed = True
except ClientError as e:
    # Reset explicitly so a re-run after an earlier success cannot leave a stale True.
    setup_s3_bucket_passed = False
    # `response` is still None in this branch, so only the error itself is reported.
    print(f"[ERROR] Cannot find bucket {bucket} due to {e}.")

{'ResponseMetadata': {'RequestId': 'SVMXN6X37MEBHKWD', 'HostId': 'RXgRFVHFnPcOr1MLGT3ZVZzuS5ZyT09xU8/1usp6nCDCivyey/7QG2Q/A5Z9fyNsL0et/C5+S2g=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'RXgRFVHFnPcOr1MLGT3ZVZzuS5ZyT09xU8/1usp6nCDCivyey/7QG2Q/A5Z9fyNsL0et/C5+S2g=', 'x-amz-request-id': 'SVMXN6X37MEBHKWD', 'date': 'Thu, 13 Apr 2023 16:48:46 GMT', 'x-amz-bucket-region': 'us-east-1', 'x-amz-access-point-alias': 'false', 'content-type': 'application/xml', 'server': 'AmazonS3'}, 'RetryAttempts': 0}}


In [9]:
%store setup_s3_bucket_passed

Stored 'setup_s3_bucket_passed' (bool)


# Set S3 Source Location (Public S3 Bucket)

In [10]:
s3_public_path_tsv = f"s3://{bucket}"

In [11]:
%store s3_public_path_tsv

Stored 's3_public_path_tsv' (str)


# Set S3 Destination Location (Our Private S3 Bucket)

In [12]:
s3_private_path_tsv = f"s3://{def_bucket}/team_8_data"
print(s3_private_path_tsv)

s3://sagemaker-us-east-1-657724983756/team_8_data


In [13]:
%store s3_private_path_tsv

Stored 's3_private_path_tsv' (str)


# Copy Data From the Public S3 Bucket to our Private S3 Bucket in this Account

In [14]:
!aws s3 cp --recursive $s3_public_path_tsv/ $s3_private_path_tsv/

copy: s3://sagemaker-us-east-ads508-sp23-t8/raw_data/census_block/census_block_loc.csv to s3://sagemaker-us-east-1-657724983756/team_8_data/raw_data/census_block/census_block_loc.csv
copy: s3://sagemaker-us-east-ads508-sp23-t8/raw_data/census/nyc_census_tracts.csv to s3://sagemaker-us-east-1-657724983756/team_8_data/raw_data/census/nyc_census_tracts.csv
copy: s3://sagemaker-us-east-ads508-sp23-t8/raw_data/grad_outcomes/2005-2010_Graduation_Outcomes_-_School_Level.tsv to s3://sagemaker-us-east-1-657724983756/team_8_data/raw_data/grad_outcomes/2005-2010_Graduation_Outcomes_-_School_Level.tsv
copy: s3://sagemaker-us-east-ads508-sp23-t8/raw_data/hs_dir/2014_-_2015_DOE_High_School_Directory.tsv to s3://sagemaker-us-east-1-657724983756/team_8_data/raw_data/hs_dir/2014_-_2015_DOE_High_School_Directory.tsv
copy: s3://sagemaker-us-east-ads508-sp23-t8/raw_data/jobs/NYC_Jobs.tsv to s3://sagemaker-us-east-1-657724983756/team_8_data/raw_data/jobs/NYC_Jobs.tsv
copy: s3://sagemaker-us-east-ads508-sp2

# List Files in our Private S3 Bucket in this Account

In [15]:
print(s3_private_path_tsv)

s3://sagemaker-us-east-1-657724983756/team_8_data


In [16]:
!aws s3 ls $s3_private_path_tsv/

                           PRE raw_data/


In [17]:
from IPython.display import display, HTML

# Link to the team_8_data/ prefix that the files were copied into
# (see s3_private_path_tsv), inside the default SageMaker bucket.
display(
    HTML(
        f'<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{def_bucket}/team_8_data/?region={region}&tab=overview">S3 Bucket</a></b>'
    )
)

## Create Athena Database and Tables

In [18]:
database_name = "ads508_t8"

In [19]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = f"s3://{def_bucket}/team_8_data/athena/staging"
print(s3_staging_dir)

s3://sagemaker-us-east-1-657724983756/team_8_data/athena/staging


In [20]:
conn = connect(region_name=region,
               s3_staging_dir=s3_staging_dir)

In [21]:
create_db_stmnt = f"CREATE DATABASE IF NOT EXISTS {database_name}"
print(create_db_stmnt)

CREATE DATABASE IF NOT EXISTS ads508_t8


In [22]:
pd.read_sql(create_db_stmnt,
            conn)

### Verify The Database Has Been Created Successfully

In [23]:
show_db_stmnt = "SHOW DATABASES"

df_show = pd.read_sql(show_db_stmnt,
                      conn)

df_show.head(17)

Unnamed: 0,database_name
0,ads508_t8
1,default
2,dsoaws
3,sagemaker_featurestore


In [24]:
if database_name in df_show.values:
    ingest_create_athena_db_passed = True

In [25]:
%store ingest_create_athena_db_passed

Stored 'ingest_create_athena_db_passed' (bool)


## Define custom function to create tables in existing database

In [26]:
def create_athena_tbl_tsv(conn=None,
                          db=None,
                          tbl_name=None,
                          fields='',
                          s3_path=None,
                          delim=',',
                          ret='',
                          comp='',
                          skip=''):
    """Drop and re-create an external delimited-text table, then check it is listed.

    The function issues three statements through ``pd.read_sql``: a
    ``DROP TABLE IF EXISTS``, a ``CREATE EXTERNAL TABLE IF NOT EXISTS`` and a
    ``SHOW TABLES IN <db>``. The create statement is printed and the table
    listing is displayed.

    Parameters
    ----------
    conn : object
        Connection handed to ``pd.read_sql`` (the notebook passes the object
        returned by ``pyathena.connect``).
    db : str
        Database that owns the table. Required.
    tbl_name : str
        Name of the table to (re)create. Required.
    fields : str
        Column definitions placed between the parentheses of the create
        statement, e.g. ``"dbn string, school_name string"``.
    s3_path : str
        Value used for the ``LOCATION`` clause. Required.
    delim : str
        Text placed in ``FIELDS TERMINATED BY '...'``.
    ret : str
        Text placed before the newline escape in ``LINES TERMINATED BY '...'``.
    comp : str
        Table-property text placed before ``skip`` inside ``TBLPROPERTIES``
        (callers include the trailing ``", "`` separator themselves).
    skip : str
        Table-property text placed after ``comp`` inside ``TBLPROPERTIES``.

    Raises
    ------
    ValueError
        If ``db``, ``tbl_name`` or ``s3_path`` is missing or empty.

    Side effects
    ------------
    Sets the notebook-level flag ``ingest_create_athena_table_tsv_passed`` to
    ``True`` when the table shows up in the listing; the flag is never set
    back to ``False`` here.
    """
    # The flag lives at notebook level; without this declaration the assignment
    # below would only create a function-local name.
    global ingest_create_athena_table_tsv_passed

    required = (("db", db), ("tbl_name", tbl_name), ("s3_path", s3_path))
    missing = [arg for arg, value in required if not value]
    if missing:
        raise ValueError(f"Missing required argument(s): {', '.join(missing)}")

    # An empty TBLPROPERTIES () list is not valid DDL, so the clause is only
    # emitted when at least one property was supplied.
    tbl_props = f"{comp}{skip}"
    tbl_props_clause = f"TBLPROPERTIES ({tbl_props})" if tbl_props.strip() else ""

    # SQL statements to execute
    drop_tsv_tbl_stmnt = f"""DROP TABLE IF EXISTS {db}.{tbl_name}"""

    create_tsv_tbl_stmnt = f"""
        CREATE EXTERNAL TABLE IF NOT EXISTS {db}.{tbl_name}({fields})
        ROW FORMAT DELIMITED
            FIELDS
                TERMINATED BY '{delim}'
            LINES
                TERMINATED BY '{ret}\\n'
        LOCATION '{s3_path}'
        {tbl_props_clause}
        """

    print(f'Create table statement:\n{create_tsv_tbl_stmnt}')

    pd.read_sql(drop_tsv_tbl_stmnt,
                conn)

    pd.read_sql(create_tsv_tbl_stmnt,
                conn)

    # Verify the table has been created successfully
    show_tsv_tbl_stmnt = f"SHOW TABLES IN {db}"

    df_show = pd.read_sql(show_tsv_tbl_stmnt,
                          conn)
    display(df_show.head(17))

    # Keep the result in a local so the report below is always defined and
    # reflects this table only, not the outcome of an earlier call.
    tbl_found = tbl_name in df_show.values
    if tbl_found:
        ingest_create_athena_table_tsv_passed = True

    print(f'\nDataframe contains records: {tbl_found}')

## Create Athena Table from Local TSV File - `2005-2010_Graduation_Outcomes_-_School_Level.tsv`

In [27]:
grd_tsv_tbl_name = 'grad_outcomes'
grd_tsv_field_list = """
demographic string,
dbn string,
school_name string,
cohort string,
total_cohort string,
total_grads_n string,
total_grads_perc_cohort string,
total_regents_n string,
total_regents_perc_cohort string,
total_regents_perc_grads string,
advanced_regents_n string,
advanced_regents_perc_cohort string,
advanced_regents_perc_grads string,
regents_wo_advanced_n string,
regents_wo_advanced_perc_cohort string,
regents_wo_advanced_perc_grads string,
local_n string,
local_perc_cohort string,
local_perc_grads string,
still_enrolled_n string,
still_enrolled_perc_cohort string,
dropped_out_n string,
dropped_out_perc_cohort string
"""
grd_tsv_s3_raw_data_path = f"s3://{def_bucket}/team_8_data/raw_data/grad_outcomes"
print(grd_tsv_s3_raw_data_path)

create_athena_tbl_tsv(conn=conn,
                      db=database_name,
                      tbl_name=grd_tsv_tbl_name,
                      fields=grd_tsv_field_list,
                      s3_path=grd_tsv_s3_raw_data_path,
                      delim='\\t',
                      comp='',
                      skip="'skip.header.line.count'='1'")

s3://sagemaker-us-east-1-657724983756/team_8_data/raw_data/grad_outcomes
Create table statement:

        CREATE EXTERNAL TABLE IF NOT EXISTS ads508_t8.grad_outcomes(
demographic string,
dbn string,
school_name string,
cohort string,
total_cohort string,
total_grads_n string,
total_grads_perc_cohort string,
total_regents_n string,
total_regents_perc_cohort string,
total_regents_perc_grads string,
advanced_regents_n string,
advanced_regents_perc_cohort string,
advanced_regents_perc_grads string,
regents_wo_advanced_n string,
regents_wo_advanced_perc_cohort string,
regents_wo_advanced_perc_grads string,
local_n string,
local_perc_cohort string,
local_perc_grads string,
still_enrolled_n string,
still_enrolled_perc_cohort string,
dropped_out_n string,
dropped_out_perc_cohort string
)
        ROW FORMAT DELIMITED
            FIELDS
                TERMINATED BY '\t'
            LINES
                TERMINATED BY '\n'
        LOCATION 's3://sagemaker-us-east-1-657724983756/team_8_data/raw_d

Unnamed: 0,tab_name
0,census
1,census_block
2,crime
3,crime_pqt
4,evictions
5,grad_outcomes
6,hs_info
7,jobs



Dataframe contains records: True


### Run A Sample Query

In [28]:
grd_dbn_id01 = "01M448"

grd_select_dbn_stmnt = f"""
SELECT * FROM {database_name}.{grd_tsv_tbl_name}
WHERE dbn = '{grd_dbn_id01}'
LIMIT 17
"""

print(grd_select_dbn_stmnt)

grd_df01_s01 = pd.read_sql(grd_select_dbn_stmnt,
                           conn)

grd_df01_s01.head(17)


SELECT * FROM ads508_t8.grad_outcomes
WHERE dbn = '01M448'
LIMIT 17



Unnamed: 0,demographic,dbn,school_name,cohort,total_cohort,total_grads_n,total_grads_perc_cohort,total_regents_n,total_regents_perc_cohort,total_regents_perc_grads,...,regents_wo_advanced_n,regents_wo_advanced_perc_cohort,regents_wo_advanced_perc_grads,local_n,local_perc_cohort,local_perc_grads,still_enrolled_n,still_enrolled_perc_cohort,dropped_out_n,dropped_out_perc_cohort
0,Total Cohort,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,2001,64,46,71.9,32,50.0,69.6,...,25,39.1,54.3,14,21.9,30.4,10,15.6,6,9.4
1,Total Cohort,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,2002,52,33,63.5,19,36.5,57.6,...,11,21.2,33.3,14,26.9,42.4,16,30.8,1,1.9
2,Total Cohort,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,2003,87,67,77.0,39,44.8,58.2,...,28,32.2,41.8,28,32.2,41.8,9,10.3,11,12.6
3,Total Cohort,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,2004,112,75,67.0,36,32.1,48.0,...,30,26.8,40.0,39,34.8,52.0,33,29.5,4,3.6
4,Total Cohort,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,2005,121,64,52.9,35,28.9,54.7,...,31,25.6,48.4,29,24.0,45.3,41,33.9,11,9.1
5,Total Cohort,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,2006,124,53,42.7,42,33.9,79.2,...,34,27.4,64.2,11,8.9,20.8,46,37.1,20,16.1
6,Total Cohort,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,2006 Aug,124,60,48.4,42,33.9,70.0,...,34,27.4,56.7,18,14.5,30.0,39,31.5,20,16.1
7,English Language Learners,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,2002,1,s,,s,,,...,s,,,s,,,s,,s,
8,English Language Learners,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,2001,5,s,,s,,,...,s,,,s,,,s,,s,
9,English Language Learners,01M448,UNIVERSITY NEIGHBORHOOD HIGH SCHOOL,2003,1,s,,s,,,...,s,,,s,,,s,,s,


In [29]:
if not grd_df01_s01.empty:
    print("[OK]")
else:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")

[OK]


## Create Athena Table from Local TSV File - `2014_-_2015_DOE_High_School_Directory.tsv`

In [30]:
hsi_tsv_tbl_name = 'hs_info'
hsi_tsv_field_list = """
dbn string,
school_name string,
borough string,
building_code string,
phone_number string,
fax_number string,
grade_span_min string,
grade_span_max string,
expgrade_span_min string,
expgrade_span_max string,
bus string,
subway string,
primary_address_line_1 string,
city string,
state_code string,
postcode string,
website string,
total_students string,
campus_name string,
school_type string,
overview_paragraph string,
program_highlights string,
language_classes string,
advancedplacement_courses string,
online_ap_courses string,
online_language_courses string,
extracurricular_activities string,
psal_sports_boys string,
psal_sports_girls string,
psal_sports_coed string,
school_sports string,
partner_cbo string,
partner_hospital string,
partner_highered string,
partner_cultural string,
partner_nonprofit string,
partner_corporate string,
partner_financial string,
partner_other string,
addtl_info1 string,
addtl_info2 string,
start_time string,
end_time string,
se_services string,
ell_programs string,
school_accessibility_description string,
number_programs string,
priority01 string,
priority02 string,
priority03 string,
priority04 string,
priority05 string,
priority06 string,
priority07 string,
priority08 string,
priority09 string,
priority10 string,
location_1 string,
community_board string,
council_district string,
census_tract string,
bin string,
bbl string,
nta string
"""
hsi_tsv_s3_raw_data_path = f"s3://{def_bucket}/team_8_data/raw_data/hs_dir"
print(hsi_tsv_s3_raw_data_path)

create_athena_tbl_tsv(conn=conn,
                      db=database_name,
                      tbl_name=hsi_tsv_tbl_name,
                      fields=hsi_tsv_field_list,
                      s3_path=hsi_tsv_s3_raw_data_path,
                      delim='\\t',
                      comp='',
                      skip="'skip.header.line.count'='1'")

s3://sagemaker-us-east-1-657724983756/team_8_data/raw_data/hs_dir
Create table statement:

        CREATE EXTERNAL TABLE IF NOT EXISTS ads508_t8.hs_info(
dbn string,
school_name string,
borough string,
building_code string,
phone_number string,
fax_number string,
grade_span_min string,
grade_span_max string,
expgrade_span_min string,
expgrade_span_max string,
bus string,
subway string,
primary_address_line_1 string,
city string,
state_code string,
postcode string,
website string,
total_students string,
campus_name string,
school_type string,
overview_paragraph string,
program_highlights string,
language_classes string,
advancedplacement_courses string,
online_ap_courses string,
online_language_courses string,
extracurricular_activities string,
psal_sports_boys string,
psal_sports_girls string,
psal_sports_coed string,
school_sports string,
partner_cbo string,
partner_hospital string,
partner_highered string,
partner_cultural string,
partner_nonprofit string,
partner_corporate string,
p

Unnamed: 0,tab_name
0,census
1,census_block
2,crime
3,crime_pqt
4,evictions
5,grad_outcomes
6,hs_info
7,jobs



Dataframe contains records: True


### Run A Sample Query

In [31]:
hsi_dbn_id01 = "01M448"

hsi_select_dbn_stmnt = f"""
SELECT * FROM {database_name}.{hsi_tsv_tbl_name}
WHERE dbn = '{hsi_dbn_id01}'
LIMIT 17
"""

print(hsi_select_dbn_stmnt)

hsi_df01_s01 = pd.read_sql(hsi_select_dbn_stmnt,
                           conn)

hsi_df01_s01.head(17)


SELECT * FROM ads508_t8.hs_info
WHERE dbn = '01M448'
LIMIT 17



Unnamed: 0,dbn,school_name,borough,building_code,phone_number,fax_number,grade_span_min,grade_span_max,expgrade_span_min,expgrade_span_max,...,priority08,priority09,priority10,location_1,community_board,council_district,census_tract,bin,bbl,nta
0,01M448,University Neighborhood High School,Manhattan,M446,212-962-4341,212-267-5611,9,12,,,...,,,,"""200 Monroe Street",,,,,,


In [32]:
if not hsi_df01_s01.empty:
    print("[OK]")
else:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")

[OK]


## Create Athena Table from Local CSV File - `nyc_census_tracts.csv`

In [33]:
cen_tsv_tbl_name = 'census'
cen_tsv_field_list = """
censustract string,
county string,
borough string,
totalpop int,
men int,
women int,
hispanic double,
white double,
black double,
native double,
asian double,
citizen int,
income int,
incomeerr int,
incomepercap int,
incomepercaperr int,
poverty double,
childpoverty double,
professional double,
service double,
office double,
construction double,
production double,
drive double,
carpool double,
transit double,
walk double,
othertransp double,
workathome double,
meancommute double,
employed int,
privatework double,
publicwork double,
selfemployed double,
familywork double,
unemployment double
"""
cen_tsv_s3_raw_data_path = f"s3://{def_bucket}/team_8_data/raw_data/census"
print(cen_tsv_s3_raw_data_path)

create_athena_tbl_tsv(conn=conn,
                      db=database_name,
                      tbl_name=cen_tsv_tbl_name,
                      fields=cen_tsv_field_list,
                      s3_path=cen_tsv_s3_raw_data_path,
                      comp='',
                      skip="'skip.header.line.count'='1'")

s3://sagemaker-us-east-1-657724983756/team_8_data/raw_data/census
Create table statement:

        CREATE EXTERNAL TABLE IF NOT EXISTS ads508_t8.census(
censustract string,
county string,
borough string,
totalpop int,
men int,
women int,
hispanic double,
white double,
black double,
native double,
asian double,
citizen int,
income int,
incomeerr int,
incomepercap int,
incomepercaperr int,
poverty double,
childpoverty double,
professional double,
service double,
office double,
construction double,
production double,
drive double,
carpool double,
transit double,
walk double,
othertransp double,
workathome double,
meancommute double,
employed int,
privatework double,
publicwork double,
selfemployed double,
familywork double,
unemployment double
)
        ROW FORMAT DELIMITED
            FIELDS
                TERMINATED BY ','
            LINES
                TERMINATED BY '\n'
        LOCATION 's3://sagemaker-us-east-1-657724983756/team_8_data/raw_data/census'
        TBLPROPERTIES ('ski

Unnamed: 0,tab_name
0,census
1,census_block
2,crime
3,crime_pqt
4,evictions
5,grad_outcomes
6,hs_info
7,jobs



Dataframe contains records: True


### Run A Sample Query

In [34]:
cen_bourough_id01 = "Bronx"

cen_select_dbn_stmnt = f"""
SELECT * FROM {database_name}.{cen_tsv_tbl_name}
WHERE borough = '{cen_bourough_id01}'
LIMIT 17
"""

print(cen_select_dbn_stmnt)

cen_df01_s01 = pd.read_sql(cen_select_dbn_stmnt,
                           conn)

cen_df01_s01.head(17)


SELECT * FROM ads508_t8.census
WHERE borough = 'Bronx'
LIMIT 17



Unnamed: 0,censustract,county,borough,totalpop,men,women,hispanic,white,black,native,...,walk,othertransp,workathome,meancommute,employed,privatework,publicwork,selfemployed,familywork,unemployment
0,36005000100,Bronx,Bronx,7703,7133,570,29.9,6.1,60.9,0.2,...,,,,,0,,,,,
1,36005000200,Bronx,Bronx,5403,2659,2744,75.8,2.3,16.0,0.0,...,2.9,0.0,0.0,43.0,2308,80.8,16.2,2.9,0.0,7.7
2,36005000400,Bronx,Bronx,5915,2896,3019,62.7,3.6,30.7,0.0,...,1.4,0.5,2.1,45.0,2675,71.7,25.3,2.5,0.6,9.5
3,36005001600,Bronx,Bronx,5879,2558,3321,65.1,1.6,32.4,0.0,...,8.6,1.6,1.7,38.8,2120,75.0,21.3,3.8,0.0,8.7
4,36005001900,Bronx,Bronx,2591,1206,1385,55.4,9.0,29.0,0.0,...,3.0,2.4,6.2,45.4,1083,76.8,15.5,7.7,0.0,19.2
5,36005002000,Bronx,Bronx,8516,3301,5215,61.1,1.6,31.1,0.3,...,4.3,1.0,0.0,46.0,2508,71.0,21.3,7.7,0.0,17.2
6,36005002300,Bronx,Bronx,4774,2130,2644,62.3,0.2,36.5,1.0,...,14.0,1.5,4.1,42.7,1191,74.2,16.1,9.7,0.0,18.9
7,36005002400,Bronx,Bronx,150,109,41,0.0,52.0,48.0,0.0,...,0.0,0.0,0.0,,113,62.8,37.2,0.0,0.0,0.0
8,36005002500,Bronx,Bronx,5355,2338,3017,76.5,1.5,18.9,0.0,...,17.7,1.8,2.7,35.5,1691,85.1,8.3,6.1,0.5,9.4
9,36005002701,Bronx,Bronx,3016,1375,1641,68.0,0.0,31.2,0.0,...,18.0,0.0,1.6,42.8,1102,86.9,8.5,4.5,0.0,15.2


In [35]:
if not cen_df01_s01.empty:
    print("[OK]")
else:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")

[OK]


## Create Athena Table from Local TSV File - `NYPD_Complaint_Data_Historic (1).csv`

In [36]:
cri_tsv_tbl_name = 'crime'
cri_tsv_field_list = """
cmplnt_num string,
cmplnt_fr_dt string,
cmplnt_fr_tm string,
cmplnt_to_dt string,
cmplnt_to_tm string,
addr_pct_cd string,
rpt_dt string,
ky_cd string,
ofns_desc string,
pd_cd string,
pd_desc string,
crm_atpt_cptd_cd string,
law_cat_cd string,
borough string,
loc_of_occur_desc string,
prem_typ_desc string,
juris_desc string,
jurisdiction_code string,
parks_nm string,
hadevelopt string,
housing_psa string,
x_coord_cd string,
y_coord_cd string,
susp_age_group string,
susp_race string,
susp_sex string,
transit_district string,
latitude string,
longitude string,
lat_lon string,
patrol_boro string,
station_name string,
vic_age_group string,
vic_race string,
vic_sex string
"""
cri_tsv_s3_raw_data_path = f"s3://{def_bucket}/team_8_data/raw_data/crime"
print(cri_tsv_s3_raw_data_path)

create_athena_tbl_tsv(conn=conn,
                      db=database_name,
                      tbl_name=cri_tsv_tbl_name,
                      fields=cri_tsv_field_list,
                      s3_path=cri_tsv_s3_raw_data_path,
                      delim='\\t',
                      comp="'compressionType'='gzip', ",
                      skip="'skip.header.line.count'='1'")

s3://sagemaker-us-east-1-657724983756/team_8_data/raw_data/crime
Create table statement:

        CREATE EXTERNAL TABLE IF NOT EXISTS ads508_t8.crime(
cmplnt_num string,
cmplnt_fr_dt string,
cmplnt_fr_tm string,
cmplnt_to_dt string,
cmplnt_to_tm string,
addr_pct_cd string,
rpt_dt string,
ky_cd string,
ofns_desc string,
pd_cd string,
pd_desc string,
crm_atpt_cptd_cd string,
law_cat_cd string,
borough string,
loc_of_occur_desc string,
prem_typ_desc string,
juris_desc string,
jurisdiction_code string,
parks_nm string,
hadevelopt string,
housing_psa string,
x_coord_cd string,
y_coord_cd string,
susp_age_group string,
susp_race string,
susp_sex string,
transit_district string,
latitude string,
longitude string,
lat_lon string,
patrol_boro string,
station_name string,
vic_age_group string,
vic_race string,
vic_sex string
)
        ROW FORMAT DELIMITED
            FIELDS
                TERMINATED BY '\t'
            LINES
                TERMINATED BY '\n'
        LOCATION 's3://sagemaker-us

Unnamed: 0,tab_name
0,census
1,census_block
2,crime
3,crime_pqt
4,evictions
5,grad_outcomes
6,hs_info
7,jobs



Dataframe contains records: True


### Run A Sample Query

In [37]:
cri_law_cat_cd01 = "misdemeanor"
cri_borough01 = "bronx"

cri_select_dbn_stmnt01 = f"""
SELECT * FROM {database_name}.{cri_tsv_tbl_name}
WHERE LOWER(law_cat_cd) = '{cri_law_cat_cd01}'
    AND LOWER(borough) = '{cri_borough01}'
LIMIT 17
"""

print(cri_select_dbn_stmnt01)

cri_df01_s01 = pd.read_sql(cri_select_dbn_stmnt01,
                           conn)

cri_df01_s01.head(17)


SELECT * FROM ads508_t8.crime
WHERE LOWER(law_cat_cd) = 'misdemeanor'
    AND LOWER(borough) = 'bronx'
LIMIT 17



Unnamed: 0,cmplnt_num,cmplnt_fr_dt,cmplnt_fr_tm,cmplnt_to_dt,cmplnt_to_tm,addr_pct_cd,rpt_dt,ky_cd,ofns_desc,pd_cd,...,susp_sex,transit_district,latitude,longitude,lat_lon,patrol_boro,station_name,vic_age_group,vic_race,vic_sex
0,629632833,02/06/2018,23:15:00,,,52,02/07/2018,341,PETIT LARCENY,333,...,F,,40.87367103500002,-73.90801364899994,"(40.873671035, -73.908013649)",PATROL BORO BRONX,,UNKNOWN,UNKNOWN,D
1,377132404,08/04/2018,22:15:00,,,44,08/04/2018,344,ASSAULT 3 & RELATED OFFENSES,101,...,M,,40.82616961200006,-73.91683070899995,"(40.826169612, -73.916830709)",PATROL BORO BRONX,,25-44,WHITE HISPANIC,F
2,584276892,02/11/2018,17:30:00,02/12/2018,06:00:00,41,02/12/2018,351,CRIMINAL MISCHIEF & RELATED OF,254,...,U,,40.827049319000025,-73.89499419099997,"(40.827049319, -73.894994191)",PATROL BORO BRONX,,45-64,BLACK,F
3,599398393,05/23/2018,23:30:00,05/24/2018,02:00:00,47,05/24/2018,351,CRIMINAL MISCHIEF & RELATED OF,254,...,,,40.88261532500008,-73.85194765899996,"(40.882615325, -73.851947659)",PATROL BORO BRONX,,25-44,ASIAN / PACIFIC ISLANDER,F
4,955332763,02/23/2018,13:55:00,,,43,02/23/2018,351,CRIMINAL MISCHIEF & RELATED OF,259,...,F,,40.82870937100006,-73.87776995499998,"(40.828709371, -73.877769955)",PATROL BORO BRONX,,UNKNOWN,UNKNOWN,D
5,412087799,05/07/2018,15:00:00,05/19/2018,18:00:00,47,05/21/2018,361,OFF. AGNST PUB ORD SENSBLTY &,639,...,U,,40.88130091300008,-73.85433733899998,"(40.881300913, -73.854337339)",PATROL BORO BRONX,,<18,WHITE HISPANIC,F
6,692539256,08/30/2018,17:01:00,08/31/2018,17:41:00,52,09/01/2018,341,PETIT LARCENY,313,...,,,40.86840712200007,-73.89260767699994,"(40.868407122, -73.892607677)",PATROL BORO BRONX,,65+,BLACK HISPANIC,F
7,763109503,05/03/2018,16:55:00,,,44,05/03/2018,341,PETIT LARCENY,333,...,M,,40.83778161800007,-73.91945797099999,"(40.837781618, -73.919457971)",PATROL BORO BRONX,,UNKNOWN,UNKNOWN,D
8,472961714,08/01/2018,11:30:00,08/01/2018,11:33:00,49,08/16/2018,361,OFF. AGNST PUB ORD SENSBLTY &,639,...,M,,40.84670561500008,-73.86472139499993,"(40.846705615, -73.864721395)",PATROL BORO BRONX,,25-44,ASIAN / PACIFIC ISLANDER,M
9,249426294,06/14/2018,14:50:00,06/14/2018,14:55:00,49,06/14/2018,351,CRIMINAL MISCHIEF & RELATED OF,259,...,M,,40.844996090000045,-73.85167356799997,"(40.84499609, -73.851673568)",PATROL BORO BRONX,,45-64,WHITE,M


In [38]:
if not cri_df01_s01.empty:
    print("[OK]")
else:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")

[OK]


## Create Athena Table from Local TSV File - `Evictions.tsv`

In [39]:
evi_tsv_tbl_name = 'evictions'
evi_tsv_field_list = """
court_index_number string,
docket_number string,
eviction_address string,
eviction_apartment_number string,
executed_date string,
marshal_first_name string,
marshal_last_name string,
residential_or_commercial string,
borough string,
eviction_postcode string,
ejectment string,
eviction_or_legal_possession string,
latitude string,
longitude string,
community_board string,
council_district string,
census_tract string,
bin string,
bbl string,
nta string
"""
evi_tsv_s3_raw_data_path = f"s3://{def_bucket}/team_8_data/raw_data/evictions"
print(evi_tsv_s3_raw_data_path)

create_athena_tbl_tsv(conn=conn,
                      db=database_name,
                      tbl_name=evi_tsv_tbl_name,
                      fields=evi_tsv_field_list,
                      s3_path=evi_tsv_s3_raw_data_path,
                      delim='\\t',
                      comp='',
                      skip="'skip.header.line.count'='1'")

s3://sagemaker-us-east-1-657724983756/team_8_data/raw_data/evictions
Create table statement:

        CREATE EXTERNAL TABLE IF NOT EXISTS ads508_t8.evictions(
court_index_number string,
docket_number string,
eviction_address string,
eviction_apartment_number string,
executed_date string,
marshal_first_name string,
marshal_last_name string,
residential_or_commercial string,
borough string,
eviction_postcode string,
ejectment string,
eviction_or_legal_possession string,
latitude string,
longitude string,
community_board string,
council_district string,
census_tract string,
bin string,
bbl string,
nta string
)
        ROW FORMAT DELIMITED
            FIELDS
                TERMINATED BY '\t'
            LINES
                TERMINATED BY '\n'
        LOCATION 's3://sagemaker-us-east-1-657724983756/team_8_data/raw_data/evictions'
        TBLPROPERTIES ('skip.header.line.count'='1')
        


Unnamed: 0,tab_name
0,census
1,census_block
2,crime
3,crime_pqt
4,evictions
5,grad_outcomes
6,hs_info
7,jobs



Dataframe contains records: True


### Run A Sample Query

In [40]:
evi_borough01 = "BRONX"

evi_select_dbn_stmnt = f"""
SELECT * FROM {database_name}.{evi_tsv_tbl_name}
WHERE borough = '{evi_borough01}'
LIMIT 17
"""

print(evi_select_dbn_stmnt)

evi_df01_s01 = pd.read_sql(evi_select_dbn_stmnt,
                           conn)

evi_df01_s01.head(17)


SELECT * FROM ads508_t8.evictions
WHERE borough = 'BRONX'
LIMIT 17



Unnamed: 0,court_index_number,docket_number,eviction_address,eviction_apartment_number,executed_date,marshal_first_name,marshal_last_name,residential_or_commercial,borough,eviction_postcode,ejectment,eviction_or_legal_possession,latitude,longitude,community_board,council_district,census_tract,bin,bbl,nta
0,56037/17,339568,547 EAST 168TH STREET,3H,02/26/2018,Thomas,Bia,Residential,BRONX,10456,Not an Ejectment,Possession,40.830857,-73.905191,3.0,16.0,145.0,2004227.0,2026100065.0,Claremont-Bathgate
1,B047517/19,409031,4014 CARPENTER AVENUE,4B,11/16/2022,Richard,McCoy,Residential,BRONX,10466,Not an Ejectment,Possession,40.889878,-73.862686,12.0,12.0,408.0,2063060.0,2048280031.0,Williamsbridge-Olinville
2,15068/17,334442,655 EAST 224TH STREET,1,09/29/2017,Thomas,Bia,Residential,BRONX,10467,Not an Ejectment,Possession,40.887599,-73.862391,12.0,12.0,394.0,2062985.0,2048260028.0,Williamsbridge-Olinville
3,14866/19A,97278,718 PENFIELD STREET,2-F,10/24/2019,Justin,Grossman,Residential,BRONX,10470,Not an Ejectment,Possession,40.904888,-73.849089,12.0,11.0,442.0,2071873.0,2051130039.0,Woodlawn-Wakefield
4,66703/18BX,90391,2032 EAST 177TH ST A /K/A 2032 CROSS BRONX EXP...,1E,07/30/2019,Justin,Grossman,Residential,BRONX,10472,Not an Ejectment,Possession,40.831685,-73.856168,9.0,18.0,78.0,2026230.0,2038030019.0,Westchester-Unionport
5,B806500/18,396012,281 EAST 143RD STREET,07A,01/17/2019,Richard,McCoy,Residential,BRONX,10451,Not an Ejectment,Possession,40.814845,-73.924083,1.0,8.0,51.0,2091116.0,2023240001.0,Mott Haven-Port Morris
6,54026/17,341956,1211 SOUTHERN BOULEVARD,301,11/19/2018,Thomas,Bia,Residential,BRONX,10459,Not an Ejectment,Possession,40.828949,-73.891897,3.0,17.0,125.0,2113777.0,2029750037.0,Morrisania-Melrose
7,69137/18,10335,1351 BOSTON ROAD - APT 201,201,07/15/2019,Robert,Renzulli,Residential,BRONX,10456,Not an Ejectment,Possession,40.832166,-73.898808,3.0,16.0,151.0,2128618.0,2029340050.0,Morrisania-Melrose
8,18348/16,324092,2280 LORING PLACE NORTH,4B,05/22/2017,Thomas,Bia,Residential,BRONX,10468,Not an Ejectment,Possession,40.861277,-73.908723,7.0,14.0,255.0,2014918.0,2032250015.0,Kingsbridge Heights
9,75943/16A,60118,1551 WILLIAMSBRID GE ROAD,4-B,08/10/2017,Justin,Grossman,Residential,BRONX,10461,Not an Ejectment,Possession,,,,,,,,


In [41]:
if not evi_df01_s01.empty:
    print("[OK]")
else:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")

[OK]


## Create Athena Table from Local TSV File - `NYC_Jobs.tsv`

In [42]:
# Register the job-postings TSV as the external Athena table `jobs`.
job_tsv_tbl_name = 'jobs'
# Column list spliced verbatim into the CREATE EXTERNAL TABLE statement.
# All 30 columns are declared as `string`, including the numeric- and
# date-looking ones (num_of_positions, salary_range_*, posting_date, ...).
# NOTE(review): in the sample-query output further down, trailing columns such
# as posting_date / process_date hold free text instead of dates, which suggests
# fields shift when a record contains embedded tabs, newlines or quoted text --
# TODO confirm against the raw file.
job_tsv_field_list = """
job_id string,
agency string,
posting_type string,
num_of_positions string,
business_title string,
civil_service_title string,
title_classification string,
title_code_no string,
level string,
job_category string,
fulltime_or_parttime_indicator string,
career_level string,
salary_range_from string,
salary_range_to string,
salary_frequency string,
work_location string,
division_or_work_unit string,
job_description string,
minimum_qual_requirements string,
preferred_skills string,
additional_information string,
to_apply string,
hours_or_shift string,
work_location_1 string,
recruitment_contact string,
residency_requirement string,
posting_date string,
post_until string,
posting_updated string,
process_date string
"""
# The raw file is read from the SageMaker default bucket (`def_bucket`).
job_tsv_s3_raw_data_path = f"s3://{def_bucket}/team_8_data/raw_data/jobs"
print(job_tsv_s3_raw_data_path)

# `create_athena_tbl_tsv` is defined in an earlier cell (not shown here).
# delim='\\t' reaches the SQL as the two-character escape \t, as the printed
# statement shows. `comp` is passed empty.
# `skip` is presumably emitted as a table property so the header row is ignored;
# the printed statement is truncated before that clause -- TODO confirm.
create_athena_tbl_tsv(conn=conn,
                      db=database_name,
                      tbl_name=job_tsv_tbl_name,
                      fields=job_tsv_field_list,
                      s3_path=job_tsv_s3_raw_data_path,
                      delim='\\t',
                      comp='',
                      skip="'skip.header.line.count'='1'")

s3://sagemaker-us-east-1-657724983756/team_8_data/raw_data/jobs
Create table statement:

        CREATE EXTERNAL TABLE IF NOT EXISTS ads508_t8.jobs(
job_id string,
agency string,
posting_type string,
num_of_positions string,
business_title string,
civil_service_title string,
title_classification string,
title_code_no string,
level string,
job_category string,
fulltime_or_parttime_indicator string,
career_level string,
salary_range_from string,
salary_range_to string,
salary_frequency string,
work_location string,
division_or_work_unit string,
job_description string,
minimum_qual_requirements string,
preferred_skills string,
additional_information string,
to_apply string,
hours_or_shift string,
work_location_1 string,
recruitment_contact string,
residency_requirement string,
posting_date string,
post_until string,
posting_updated string,
process_date string
)
        ROW FORMAT DELIMITED
            FIELDS
                TERMINATED BY '\t'
            LINES
                TERMINATED B

Unnamed: 0,tab_name
0,census
1,census_block
2,crime
3,crime_pqt
4,evictions
5,grad_outcomes
6,hs_info
7,jobs



Dataframe contains records: True


### Run A Sample Query

In [43]:
# Substring matched against the `agency` column in the sample query.
job_agency01 = "HOUSING"

# Fetch a handful of postings whose agency name contains that substring.
job_select_dbn_stmnt = f"""
SELECT * FROM {database_name}.{job_tsv_tbl_name}
WHERE agency LIKE '%{job_agency01}%'
LIMIT 17
"""
print(job_select_dbn_stmnt)

job_df01_s01 = pd.read_sql(sql=job_select_dbn_stmnt, con=conn)
job_df01_s01.head(17)


SELECT * FROM ads508_t8.jobs
WHERE agency LIKE '%HOUSING%'
LIMIT 17



Unnamed: 0,job_id,agency,posting_type,num_of_positions,business_title,civil_service_title,title_classification,title_code_no,level,job_category,...,additional_information,to_apply,hours_or_shift,work_location_1,recruitment_contact,residency_requirement,posting_date,post_until,posting_updated,process_date
0,573469,HOUSING PRESERVATION & DVLPMNT,External,1,Strategic Program Development Analyst for the ...,CITY RESEARCH SCIENTIST,Non-Competitive-5,21744,02,"Policy, Research & Analysis",...,We engage New Yorkers to build and sustain nei...,Continue to work on the implementation of Loca...,Retrieve and review affordable housing regulat...,Research initiatives in other jurisdictions or...,Understand and leverage existing Agency datase...,Summarizing and communicating findingsâquali...,Contributing to the rollout of new initiatives...,"Conducting special research, analytical, or co...",Adhering to work plans and internal and extern...,"Participating in meetings, presentations, and ..."
1,568091,HOUSING PRESERVATION & DVLPMNT,External,5,Case Manager for the Division of Tenant Resources,COMMUNITY ASSOCIATE,Non-Competitive-5,56057,00,Constituent Services & Community Programs,...,Determination and verification of eligibility â¢,Client briefings {internal and external meetin...,May perform community outreach to assist Secti...,"Prepare and send appropriate correspondence, t...","Document case files and electronic records, fi...",Rent calculations â¢,Review of yearly recertificationâs of househ...,Demonstrate ability to manage multiple cases w...,"Attend mandatory trainings""",Qualification Requirements 1. High school gra...
2,576376,NYC HOUSING AUTHORITY,Internal,1,CARETAKER X,CARETAKER (HA),Labor-3,90645,00,Building Operations & Maintenance,...,Prepare apartments for move outs. Please rea...,Qualification Requirements There are no forma...,,"""1.",Possession of a valid driver's license is requ...,Preference will be given to employees who have...,"NYCHA residents are encouraged to apply.""",Click the Apply now button.,,
3,571769,HOUSING PRESERVATION & DVLPMNT,Internal,2,Case Manager for the Division of Tenant Resources,COMMUNITY ASSOCIATE,Non-Competitive-5,56057,00,Constituent Services & Community Programs,...,Determination and verification of eligibility â¢,Client briefings {internal and external meetin...,May perform community outreach to assist Secti...,"Prepare and send appropriate correspondence, t...","Document case files and electronic records, fi...",Rent calculations â¢,Review of yearly recertificationâs of househ...,Demonstrate ability to manage multiple cases w...,"Attend mandatory trainings""",Qualification Requirements 1. High school gra...
4,575854,HOUSING PRESERVATION & DVLPMNT,External,1,"Data & Analytics Manager, Division of Strategi...",CITY RESEARCH SCIENTIST,Non-Competitive-5,21744,02,"Policy, Research & Analysis",...,We engage New Yorkers to build and sustain nei...,"Gather, prepare, and merge large datasets from...",Create performance metrics for lottery and hom...,Manage and analyze eviction filing data to und...,Prepare analytic reports to inform program des...,Support the implementation of data-driven prog...,Develop strategies for data integration and au...,Assist SOA colleagues with quantitative analys...,"Respond to ad hoc data requests from programs,...","1. For Assignment Level I (only physical, bio..."
5,554300,NYC HOUSING AUTHORITY,External,1,RESIDENT RELOCATION SERVICES COMMUNITY COORDIN...,COMMUNITY COORDINATOR,Non-Competitive-5,56058,00,Constituent Services & Community Programs,...,"""1.",Preference will be given to employees who have...,"NYCHA residents are encouraged to apply.""",Click the Apply Now button.,,,,NYCHA has no residency requirements.,10/28/2022,
6,575870,HOUSING PRESERVATION & DVLPMNT,Internal,1,Director of Manhattan Planning for the Divisio...,CITY PLANNER,Competitive-1,22122,03,"Engineering, Architecture, & Planning",...,We engage New Yorkers to build and sustain nei...,Planning & Predevelopment (P&P) is central to ...,Neighborhood Development & Stabilization (ND&S...,Promote HPD and City policy objectives across ...,"Define, manage, and track team priorities and ...",Meet regularly with individual staff members a...,Identify staffing needs and advocate for resou...,Ensure that all projects move efficiently thro...,"Identify risks and troubleshoot problems, invo...","Create, implement, and maintain consistent, ef..."
7,440244,NYC HOUSING AUTHORITY,External,1,Senior Writer,AGENCY ATTORNEY,Non-Competitive-5,30087,03,"Legal Affairs Policy, Research & Analysis",...,Conducting operational analysis to understand ...,Conducting independent research of varying dif...,Organizing complex text and processes in seque...,Leading meetings with subject matter experts t...,Editing documents of a high degree of difficul...,Working with the Compliance Integration Report...,Working with the Compliance Monitoring Unit to...,Working with Compliance Inquiry Review and Ass...,Coordinating with NYCHA department heads regar...,Drafting complex documents based on important ...
8,570222,NYC HOUSING AUTHORITY,Internal,1,"SENIOR PROJECT MANAGER, ADULT EDUCATION & TRAI...",ASSOCIATE JOB OPPORTUNITY SPEC,Competitive-1,52316,02,Constituent Services & Community Programs,...,"""1.",Preference will be given to employees who have...,"NYCHA residents are encouraged to apply.""",Click the Apply Now button.,,,,NYCHA has no residency requirements.,02/14/2023,
9,576674,NYC HOUSING AUTHORITY,External,1,SUPERVISOR OF HOUSING CARETAKER,SUPERVISOR OF HOUSING CARETAKE,Competitive-1,82011,00,Building Operations & Maintenance Public Safet...,...,Handle tenant lockouts. 4.,Fill out work orders as a result of apartment ...,Report any hazardous conditions observed in an...,One year of permanent service in the title of ...,,"""1.",For NYCHA employees: This position is open as ...,For NYCHA employees: Preference will be given ...,"NYCHA residents are encouraged to apply.""",Click the Apply Now button.


In [44]:
# Sanity check: an empty `job_df01_s01` means the jobs sample query returned
# no rows.
if job_df01_s01.empty:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
else:
    print("[OK]")

[OK]


# Create Parquet Files from TSV Table

In [45]:
# Start pessimistic; a later cell sets this to True once `crime_pqt` is listed
# by SHOW TABLES.
ingest_create_athena_table_parquet_passed = False

In [46]:
# Reload the TSV-registration flag from IPython's %store so the checks that
# follow can read it.
%store -r ingest_create_athena_table_tsv_passed

In [47]:
# Guard: warn when the TSV-registration flag was never defined or restored
# in this kernel.
if "ingest_create_athena_table_tsv_passed" not in globals():
    print("++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS.  You did not register the TSV Data.")
    print("++++++++++++++++++++++++++++++++++++++++++++++")

In [48]:
# Show the flag's current value before it is checked in the next cell.
print(ingest_create_athena_table_tsv_passed)

True


In [49]:
# The Parquet conversion reads from the TSV-backed table, so it only makes
# sense once that registration step has reported success.
if ingest_create_athena_table_tsv_passed:
    print("[OK]")
else:
    print("++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS.  You did not register the TSV Data.")
    print("++++++++++++++++++++++++++++++++++++++++++++++")

[OK]


In [50]:
# S3 prefix (in the SageMaker default bucket) used as `external_location` for
# the Parquet table created by the CTAS statement below.
cri_pqt_s3_data_path = f"s3://{def_bucket}/team_8_data/columnar"

# Execute Statement

In [51]:
cri_pqt_tbl_name = 'crime_pqt'
drop_pqt_tbl_stmnt = f"""DROP TABLE IF EXISTS {database_name}.{cri_pqt_tbl_name}"""

# CTAS statement: copies a sample of the TSV-backed crime table into a
# Parquet table partitioned by `law_cat_cd` and `borough`.
# - The partition columns are listed last in the SELECT, as `partitioned_by`
#   requires.
# - TABLESAMPLE BERNOULLI(2) keeps roughly 2% of the rows, chosen at random,
#   so the table contents differ from one run to the next.
create_pqt_tble_stmnt = f"""
CREATE TABLE IF NOT EXISTS {database_name}.{cri_pqt_tbl_name}
WITH (
    format = 'PARQUET',
    external_location = '{cri_pqt_s3_data_path}',
    partitioned_by = ARRAY['law_cat_cd', 'borough']
    )
AS
SELECT
    cmplnt_num,
    cmplnt_fr_dt,
    cmplnt_fr_tm,
    cmplnt_to_dt,
    cmplnt_to_tm,
    addr_pct_cd,
    rpt_dt,
    ky_cd,
    ofns_desc,
    pd_cd,
    pd_desc,
    crm_atpt_cptd_cd,
    loc_of_occur_desc,
    prem_typ_desc,
    juris_desc,
    jurisdiction_code,
    parks_nm,
    hadevelopt,
    housing_psa,
    x_coord_cd,
    y_coord_cd,
    susp_age_group,
    susp_race,
    susp_sex,
    transit_district,
    latitude,
    longitude,
    lat_lon,
    patrol_boro,
    station_name,
    vic_age_group,
    vic_race,
    vic_sex,
    law_cat_cd,
    borough
FROM {database_name}.{cri_tsv_tbl_name}
TABLESAMPLE BERNOULLI(2)
"""

print(f'Create table statement:\n{create_pqt_tble_stmnt}')

# Step 1: remove the catalog entry left by a previous run, if any.
pd.read_sql(drop_pqt_tbl_stmnt,
            conn)

# Step 2: DROP TABLE leaves the Parquet objects in S3, and a CTAS with
# `external_location` fails when that location is not empty. Purge the prefix
# so the cell can be re-run (Restart & Run All). The trailing "/" keeps the
# delete from touching sibling prefixes that merely start with the same text.
_, _, cri_pqt_bucket, cri_pqt_prefix = cri_pqt_s3_data_path.split("/", 3)
cri_pqt_prefix = cri_pqt_prefix.rstrip("/") + "/"
(boto3.resource("s3", region_name=region)
      .Bucket(cri_pqt_bucket)
      .objects.filter(Prefix=cri_pqt_prefix)
      .delete())

# Step 3: create the Parquet table; the returned frame is the cell's output.
pd.read_sql(create_pqt_tble_stmnt,
            conn)

Create table statement:

CREATE TABLE IF NOT EXISTS ads508_t8.crime_pqt
WITH (
    format = 'PARQUET',
    external_location = 's3://sagemaker-us-east-1-657724983756/team_8_data/columnar',
    partitioned_by = ARRAY['law_cat_cd', 'borough']
    )
AS
SELECT
    cmplnt_num,
    cmplnt_fr_dt,
    cmplnt_fr_tm,
    cmplnt_to_dt,
    cmplnt_to_tm,
    addr_pct_cd,
    rpt_dt,
    ky_cd,
    ofns_desc,
    pd_cd,
    pd_desc,
    crm_atpt_cptd_cd,
    loc_of_occur_desc,
    prem_typ_desc,
    juris_desc,
    jurisdiction_code,
    parks_nm,
    hadevelopt,
    housing_psa,
    x_coord_cd,
    y_coord_cd,
    susp_age_group,
    susp_race,
    susp_sex,
    transit_district,
    latitude,
    longitude,
    lat_lon,
    patrol_boro,
    station_name,
    vic_age_group,
    vic_race,
    vic_sex,
    law_cat_cd,
    borough
FROM ads508_t8.crime
TABLESAMPLE BERNOULLI(2)



Unnamed: 0,rows


# Load partitions by running `MSCK REPAIR TABLE`

In [52]:
# Statement that asks Athena to register the partitions of the Parquet table;
# printed so the exact SQL is visible before it is executed in the next cell.
partition_pqt_stmnt = f"MSCK REPAIR TABLE {database_name}.{cri_pqt_tbl_name}"

print(partition_pqt_stmnt)

MSCK REPAIR TABLE ads508_t8.crime_pqt


In [53]:
# Run the repair statement; whatever Athena returns is kept in `cri_df02`
# and previewed.
cri_df02 = pd.read_sql(sql=partition_pqt_stmnt, con=conn)
cri_df02.head(17)

# Show the Partitions

In [54]:
# Statement listing the partitions registered for the Parquet table; printed
# for reference and executed in the next cell.
show_part_stmnt = f"SHOW PARTITIONS {database_name}.{cri_pqt_tbl_name}"

print(show_part_stmnt)

SHOW PARTITIONS ads508_t8.crime_pqt


In [55]:
# One row per partition of the Parquet table; preview up to 31 of them.
cri_df02_part = pd.read_sql(sql=show_part_stmnt, con=conn)
cri_df02_part.head(31)

Unnamed: 0,partition
0,law_cat_cd=MISDEMEANOR/borough=BROOKLYN
1,law_cat_cd=VIOLATION/borough=MANHATTAN
2,law_cat_cd=MISDEMEANOR/borough=__HIVE_DEFAULT_...
3,law_cat_cd=FELONY/borough=__HIVE_DEFAULT_PARTI...
4,law_cat_cd=FELONY/borough=BROOKLYN
5,law_cat_cd=MISDEMEANOR/borough=MANHATTAN
6,law_cat_cd=MISDEMEANOR/borough=STATEN ISLAND
7,law_cat_cd=VIOLATION/borough=QUEENS
8,law_cat_cd=MISDEMEANOR/borough=BRONX
9,law_cat_cd=VIOLATION/borough=__HIVE_DEFAULT_PA...


# Show the Tables

In [56]:
# Statement listing every table in the project database; executed in the next cell.
show_tbl_stmnt = f"SHOW TABLES in {database_name}"

In [57]:
# Table names currently registered in the database (column `tab_name`).
df_tables = pd.read_sql(sql=show_tbl_stmnt, con=conn)
df_tables.head(17)

Unnamed: 0,tab_name
0,census
1,census_block
2,crime
3,crime_pqt
4,evictions
5,grad_outcomes
6,hs_info
7,jobs


In [58]:
# Mark the Parquet step as passed only when the table name appears in the
# SHOW TABLES result. `in df_tables.values` tests against every cell of the
# frame; when the name is absent the flag keeps whatever value it already had.
if cri_pqt_tbl_name in df_tables.values:
    ingest_create_athena_table_parquet_passed = True

In [59]:
# Persist the flag with IPython's %store.
%store ingest_create_athena_table_parquet_passed

Stored 'ingest_create_athena_table_parquet_passed' (bool)


# Run Sample Query

In [60]:
# Sample the Parquet table using the law-category / borough filter values
# (`cri_law_cat_cd01`, `cri_borough01`) set in an earlier cell.
cri_select_dbn_stmnt02 = f"""
SELECT * FROM {database_name}.{cri_pqt_tbl_name}
WHERE LOWER(law_cat_cd) = '{cri_law_cat_cd01}'
    AND LOWER(borough) = '{cri_borough01}'
LIMIT 17
"""
print(cri_select_dbn_stmnt02)

cri_df02_s01 = pd.read_sql(sql=cri_select_dbn_stmnt02, con=conn)
cri_df02_s01.head(17)


SELECT * FROM ads508_t8.crime_pqt
WHERE LOWER(law_cat_cd) = 'misdemeanor'
    AND LOWER(borough) = 'bronx'
LIMIT 17



Unnamed: 0,cmplnt_num,cmplnt_fr_dt,cmplnt_fr_tm,cmplnt_to_dt,cmplnt_to_tm,addr_pct_cd,rpt_dt,ky_cd,ofns_desc,pd_cd,...,latitude,longitude,lat_lon,patrol_boro,station_name,vic_age_group,vic_race,vic_sex,law_cat_cd,borough
0,590499848,07/14/2013,05:30:00,07/14/2013,05:43:00,43,07/14/2013,340,FRAUDS,707,...,40.823101299,-73.869690461,"(40.823101299, -73.869690461)",PATROL BORO BRONX,,,UNKNOWN,E,MISDEMEANOR,BRONX
1,675597625,05/27/2017,21:15:00,05/27/2017,21:25:00,45,05/27/2017,235,DANGEROUS DRUGS,511,...,40.848632895,-73.8279976,"(40.848632895, -73.8279976)",PATROL BORO BRONX,,UNKNOWN,UNKNOWN,E,MISDEMEANOR,BRONX
2,336036648,03/21/2012,13:00:00,,,45,03/21/2012,351,CRIMINAL MISCHIEF & RELATED OF,254,...,40.830889993,-73.82728462,"(40.830889993, -73.82728462)",PATROL BORO BRONX,,45-64,WHITE HISPANIC,M,MISDEMEANOR,BRONX
3,388744844,05/25/2015,04:25:00,05/25/2015,04:30:00,52,05/25/2015,351,CRIMINAL MISCHIEF & RELATED OF,259,...,40.868812402,-73.888723856,"(40.868812402, -73.888723856)",PATROL BORO BRONX,,,UNKNOWN,D,MISDEMEANOR,BRONX
4,706022394,02/07/2016,16:00:00,,,47,02/10/2016,344,ASSAULT 3 & RELATED OFFENSES,114,...,40.886936175,-73.85249861,"(40.886936175, -73.85249861)",PATROL BORO BRONX,,25-44,BLACK,F,MISDEMEANOR,BRONX
5,281742597,08/29/2013,00:25:00,08/29/2013,00:30:00,40,08/29/2013,344,ASSAULT 3 & RELATED OFFENSES,101,...,40.811116426,-73.927329309,"(40.811116426, -73.927329309)",PATROL BORO BRONX,,18-24,BLACK,F,MISDEMEANOR,BRONX
6,411962262,04/09/2013,16:55:00,,,46,04/09/2013,358,OFFENSES INVOLVING FRAUD,705,...,40.861886273,-73.89320749,"(40.861886273, -73.89320749)",PATROL BORO BRONX,,,UNKNOWN,D,MISDEMEANOR,BRONX
7,545346999,10/01/2017,18:00:00,10/01/2017,18:15:00,52,10/02/2017,341,PETIT LARCENY,339,...,40.865155015,-73.892996163,"(40.865155015, -73.892996163)",PATROL BORO BRONX,,65+,WHITE HISPANIC,F,MISDEMEANOR,BRONX
8,818313819,05/30/2017,15:15:00,05/30/2017,15:26:00,46,05/30/2017,352,CRIMINAL TRESPASS,205,...,40.849496743,-73.909315789,"(40.849496743, -73.909315789)",PATROL BORO BRONX,,UNKNOWN,WHITE HISPANIC,M,MISDEMEANOR,BRONX
9,890406291,03/11/2013,21:25:00,03/12/2013,12:00:00,45,03/12/2013,359,OFFENSES AGAINST PUBLIC ADMINI,748,...,40.827532802,-73.821613113,"(40.827532802, -73.821613113)",PATROL BORO BRONX,,25-44,WHITE,F,MISDEMEANOR,BRONX


In [61]:
# Sanity check: an empty `cri_df02_s01` means the Parquet sample query
# returned no rows.
if cri_df02_s01.empty:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOUR DATA HAS NOT BEEN CONVERTED TO PARQUET. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
else:
    print("[OK]")

[OK]


## Review the New Athena Table in the Glue Catalog

In [62]:
# Render a clickable link to the AWS Glue console for the notebook's region.
display(HTML(
    f'<b>Review <a target="top" href="https://console.aws.amazon.com/glue/home?region={region}#">AWS Glue Catalog</a></b>'
))

## Store Variables for the Next Notebooks

In [63]:
# With no arguments, %store only lists the variables already persisted and
# their stored values; it does not store anything new.
%store

Stored variables and their in-db values:
balance_dataset                                       -> True
balanced_bias_data_jsonlines_s3_uri                   -> 's3://sagemaker-us-east-1-657724983756/bias-detect
balanced_bias_data_s3_uri                             -> 's3://sagemaker-us-east-1-657724983756/bias-detect
bias_data_s3_uri                                      -> 's3://sagemaker-us-east-1-657724983756/bias-detect
experiment_name                                       -> 'Amazon-Customer-Reviews-BERT-Experiment-168013737
feature_group_name                                    -> 'reviews-feature-group-1680137375'
feature_store_offline_prefix                          -> 'reviews-feature-store-1680137375'
ingest_create_athena_db_passed                        -> True
ingest_create_athena_table_parquet_passed             -> True
ingest_create_athena_table_tsv_passed                 -> True
max_seq_length                                        -> 64
processed_test_data_s3_uri         

## Release Resources

In [64]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
// Click the hidden button above programmatically; the button carries the
// `kernelmenu:shutdown` command. Best effort only: any failure (e.g. no
// matching element) is swallowed by the catch.
try {
    // Declared with `const` so the lookup result does not leak onto the page
    // as an implicit global.
    const els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}
</script>

In [65]:
%%javascript

// Best effort: save a checkpoint, then delete the notebook session through
// the `Jupyter.notebook` object. Any error (for example if `Jupyter` is not
// defined in this front end) is swallowed by the catch.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>